Skip to content
Snippets Groups Projects
Commit d890b1a0 authored by mcoth's avatar mcoth
Browse files

Upload New File

parent 6f8ffe4d
No related branches found
No related tags found
No related merge requests found
Pipeline #38565 passed with warnings
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 16:04:29 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This function performs clustering analysis using K-Means on standardized and PCA-transformed data.
It incorporates methods to determine the optimal number of clusters and evaluate cluster quality.
Includes:
- Elbow method (WCSS), Calinski-Harabasz, Silhouette methods for determining k
- Clustering with k-means++ initialization heuristic
- Adds clusters to DataFrames with variables
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
def perform_clustering(std_data, principleComponents: pd.DataFrame, data_complete: pd.DataFrame,
                       questionnaireClusters: pd.DataFrame, questionnaireClusters_std: pd.DataFrame):
    """Run k-selection diagnostics, then fit a 3-cluster K-Means model.

    Steps:
      1. Elbow plot of within-cluster sum of squares for k = 1..9 on ``std_data``.
      2. Calinski-Harabasz scores (printed for k = 2..4, and plotted with
         yellowbrick for the k range ``(2, 10)``) on the selected components.
      3. Silhouette plots for k = 2..5 on the selected components.
      4. Final K-Means (k-means++, k = 3, random_state = 7) on the selected
         components.

    All PCA-based diagnostics use the same columns as the final model
    (``principleComponents.iloc[:, 1:4]``), so the scores describe the
    feature space that is actually clustered.

    Parameters
    ----------
    std_data
        Standardized data used for the elbow (WCSS) plot; anything
        ``KMeans.fit`` accepts.
    principleComponents
        DataFrame of PCA-transformed data; columns at positions 1..3 are used.
    data_complete, questionnaireClusters, questionnaireClusters_std
        DataFrames that are modified IN PLACE: a ``'clusters'`` column with the
        final labels is added to each. Labels are assigned by row position, so
        each frame must have the same number of rows, in the same order, as
        ``principleComponents``.

    Returns
    -------
    tuple
        ``(PC234, LABELS, clusterNames)``: the component columns that were
        clustered, the label array from the final model, and the list of
        display names for the clusters.
    """
    # Feature set shared by the PCA-based diagnostics and the final model.
    # A slice (1:4), not a list ([1, 4]): the latter would pick only two
    # single columns instead of a contiguous range.
    PC234 = principleComponents.iloc[:, 1:4]  # Change 1 to 0 for severity clusters

    # ---- Elbow method: within-cluster sum of squares across k ----
    # inertia_ is the sum of squared distances of samples to their closest
    # cluster center.
    wcss = {
        k: KMeans(n_clusters=k, random_state=0).fit(std_data).inertia_
        for k in range(1, 10)
    }

    sns.reset_defaults()
    plt.figure(figsize=(8, 8))
    sns.lineplot(x=list(wcss.keys()), y=list(wcss.values()))
    plt.xlabel('Number of Clusters')
    plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
    plt.ylabel('Within Cluster Sum of Squares')
    plt.title('Sum of Squared Errors Across K Clusters')
    plt.show()

    # ---- Calinski-Harabasz score for a range of k ----
    # The score must be computed on the same data the labels were fitted on.
    for k in range(2, 5):
        ch_labels = KMeans(n_clusters=k, random_state=1).fit(PC234).labels_
        print(k, metrics.calinski_harabasz_score(PC234, ch_labels))

    # Yellowbrick visualizer for k, Calinski-Harabasz; k is the range of
    # cluster counts to evaluate.
    visualizer = KElbowVisualizer(KMeans(), k=(2, 10), metric='calinski_harabasz',
                                  timings=True, locate_elbow=False)
    visualizer.fit(PC234)
    visualizer.show()

    # ---- Silhouette plots, one panel per k on a 2x2 grid ----
    _, ax = plt.subplots(2, 2, figsize=(15, 6))
    for i in [2, 3, 4, 5]:
        # Create a KMeans instance for each number of clusters
        km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
        # divmod maps k = 2, 3, 4, 5 onto grid cells (0,0), (0,1), (1,0), (1,1)
        q, mod = divmod(i, 2)
        panel = ax[q - 1][mod]
        visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
        visualizer.fit(PC234)
        # Remove labels, reduce label size
        panel.set_ylabel('')
        panel.yaxis.set_tick_params(labelsize=10)
        if i == 5:
            visualizer.set_title('')
    # Render once, after all four panels have been drawn
    visualizer.show()

    # ---- Final K-Means clustering ----
    kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto', max_iter=100, random_state=7)
    kmeans.fit(PC234)
    LABELS = kmeans.labels_

    # Add clusters to data. Assign the raw array (positional) rather than a
    # DataFrame: a DataFrame would be aligned on its 0..n-1 index and produce
    # NaN/misplaced labels whenever the target frame's index is not 0..n-1.
    data_complete['clusters'] = LABELS

    # Check amount and percentages of each cluster's size
    clusters_count = data_complete.clusters.value_counts()
    print('Total amount in each cluster:', clusters_count)
    clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
    print('Total percentage in each cluster', clusters_percent)

    # Cluster labels for plotting.
    # NOTE(review): presumably position i names cluster label i - K-Means label
    # numbering is arbitrary, so confirm this mapping against the fitted model.
    clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']

    # Add clusters to questionnaire subscales dfs (positional, as above)
    questionnaireClusters['clusters'] = LABELS
    questionnaireClusters_std['clusters'] = LABELS

    return PC234, LABELS, clusterNames
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment