#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 16:04:29 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This function performs clustering analysis using K-Means on standardized,
PCA-transformed data. It incorporates methods to determine the optimal
number of clusters and to evaluate cluster quality.

Includes:
- Elbow method (WCSS), Calinski-Harabasz, and Silhouette methods for determining k
- Clustering with the k-means++ initialization heuristic
- Adding cluster labels to the DataFrames holding the variables
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer


def perform_clustering(std_data, principleComponents, data_complete,
                       questionnaireClusters, questionnaireClusters_std):
    """Cluster the PCA scores with k-means and evaluate cluster quality.

    Returns the PC2-PC4 scores, the cluster labels, and the cluster names.
    """
    # Elbow method to decide how many clusters
    wcss = {}
    # Loop over a range of k
    for i in range(1, 10):
        # Run k-means
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(std_data)
        # Sum of squared distances of samples to their closest cluster center
        wcss[i] = kmeans.inertia_

    # Elbow plot for determining k
    sns.reset_defaults()
    plt.figure(figsize=(8, 8))
    sns.lineplot(x=list(wcss.keys()), y=list(wcss.values()))
    plt.xlabel('Number of Clusters')
    plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
    plt.ylabel('Within Cluster Sum of Squares')
    plt.title('Sum of Squared Errors Across K Clusters')
    plt.show()

    # Calinski-Harabasz method for a single k
    kmeans_model = KMeans(n_clusters=5, n_init=10, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
    labels = kmeans_model.labels_
    print(metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))

    # Calinski-Harabasz for a range of k
    # (the score is computed on the same columns the model was fit on)
    for k in range(2, 5):
        kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
        labels = kmeans_model.labels_
        print(k, metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))
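    # A hedged addition, not in the original script: the silhouette
    # coefficient can also be computed numerically (rather than only
    # visually, as below) with sklearn.metrics.silhouette_score, here on
    # the same two components used for the Calinski-Harabasz scores above.
    for k in range(2, 6):
        km_check = KMeans(n_clusters=k, n_init=10, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
        print(k, metrics.silhouette_score(principleComponents.iloc[:, [0, 4]], km_check.labels_))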
    # Yellowbrick visualizer for k, Calinski-Harabasz
    model = KMeans(n_init=10)
    # k is the range of numbers of clusters
    visualizer = KElbowVisualizer(model, k=(2, 10), metric='calinski_harabasz',
                                  timings=True, locate_elbow=False)
    visualizer.fit(principleComponents.iloc[:, [0, 4]])
    visualizer.show()

    # Silhouette visualizer
    fig, ax = plt.subplots(2, 2, figsize=(15, 6))
    for i in [2, 3, 4, 5]:
        # Create a KMeans instance for each number of clusters
        km = KMeans(n_clusters=i, init='k-means++', n_init=10,
                    max_iter=100, random_state=42)
        # Map i = 2..5 onto the 2x2 grid of axes
        q, mod = divmod(i, 2)
        visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q - 1][mod])
        visualizer.fit(principleComponents.iloc[:, [1, 4]])
        # Remove labels, reduce tick-label size
        ax[q - 1][mod].set_ylabel('')
        ax[q - 1][mod].yaxis.set_tick_params(labelsize=10)
        if i == 5:
            visualizer.set_title('')
    visualizer.show()

    # K-means clustering on PCs 2-4 (change 1 to 0 below for severity clusters)
    PC234 = principleComponents.iloc[:, 1:4]
    kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto',
                    max_iter=100, random_state=7)
    kmeans.fit(PC234)
    LABELS = kmeans.labels_

    # Add clusters to data (assign the label array directly so the values
    # are not realigned against a mismatched DataFrame index)
    data_complete['clusters'] = LABELS

    # Check the count and percentage of each cluster's size
    clusters_count = data_complete.clusters.value_counts()
    print('Total amount in each cluster:', clusters_count)
    clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
    print('Total percentage in each cluster:', clusters_percent)

    # Add cluster labels for plotting
    clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']

    # Add clusters to the questionnaire subscales DataFrames
    questionnaireClusters['clusters'] = LABELS
    questionnaireClusters_std['clusters'] = LABELS

    return PC234, LABELS, clusterNames
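

# A minimal usage sketch, not part of the original module. The data below
# are hypothetical stand-ins: `std_data` is assumed to be a standardized
# feature matrix and `principleComponents` a DataFrame of at least five
# PCA scores, produced here with sklearn's StandardScaler and PCA.
if __name__ == '__main__':
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    # Hypothetical example data: 200 observations, 12 questionnaire items
    rng = np.random.default_rng(0)
    raw = pd.DataFrame(rng.normal(size=(200, 12)))

    # Standardize, then project onto the first five principal components
    std_data = StandardScaler().fit_transform(raw)
    principleComponents = pd.DataFrame(PCA(n_components=5).fit_transform(std_data))

    # The remaining DataFrames are stand-ins for the study's own tables
    data_complete = raw.copy()
    questionnaireClusters = raw.copy()
    questionnaireClusters_std = pd.DataFrame(std_data)

    PC234, LABELS, clusterNames = perform_clustering(
        std_data, principleComponents, data_complete,
        questionnaireClusters, questionnaireClusters_std)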