#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep  1 16:04:29 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This function performs clustering analysis using K-Means on standardized and
PCA-transformed data. It incorporates methods to determine the optimal number
of clusters and evaluate cluster quality.

Includes:
- Elbow method (WCSS), Calinski-Harabasz, Silhouette methods for determining k
- Clustering with k-means++ initiation heuristic
- Adds clusters to DataFrames with variables
"""

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer


def perform_clustering(std_data, principleComponents, data_complete,
                       questionnaireClusters, questionnaireClusters_std):
    """Run K-Means diagnostics and the final 3-cluster solution.

    Parameters
    ----------
    std_data : array-like
        Standardized feature matrix used for the elbow (WCSS) diagnostic.
    principleComponents : pandas.DataFrame
        PCA scores; columns are principal components (PC1 in column 0).
    data_complete : pandas.DataFrame
        Complete dataset; gains a 'clusters' column in place.
    questionnaireClusters, questionnaireClusters_std : pandas.DataFrame
        Questionnaire subscale DataFrames (raw and standardized); each
        gains a 'clusters' column in place.

    Returns
    -------
    PC234 : pandas.DataFrame
        Principal components used for clustering (columns 1:4).
    LABELS : numpy.ndarray
        Cluster assignment for each observation.
    clusterNames : list of str
        Descriptive cluster labels for plotting.
    """
    # --- Elbow method (WCSS) to help decide how many clusters ---
    wcss = {}
    for k in range(1, 10):
        # n_init=10 made explicit (historical sklearn default) to keep
        # results stable across sklearn versions.
        km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(std_data)
        # inertia_: sum of squared distances of samples to their closest
        # cluster center.
        wcss[k] = km.inertia_

    # Elbow plot for determining k
    sns.reset_defaults()
    plt.figure(figsize=(8, 8))
    # Materialize the dict views as lists so seaborn receives plain sequences.
    sns.lineplot(x=list(wcss.keys()), y=list(wcss.values()))
    plt.xlabel('Number of Clusters')
    plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
    plt.ylabel('Within Cluster Sum of Squares')
    plt.title('Sum of Squared Errors Across K Clusters')
    plt.show()

    # --- Calinski-Harabasz scores for a range of k ---
    # Bug fix: score on the SAME columns the model was fit on (the original
    # fit on columns [0, 4] but scored on [1, 4]). The former standalone k=5
    # computation (whose result was discarded) is folded into the loop by
    # extending the range to 6.
    ch_data = principleComponents.iloc[:, [0, 4]]
    for k in range(2, 6):
        km = KMeans(n_clusters=k, n_init=10, random_state=1).fit(ch_data)
        print(k, metrics.calinski_harabasz_score(ch_data, km.labels_))

    # --- Yellowbrick visualizer for k, Calinski-Harabasz ---
    model = KMeans()
    # k is the range of number of clusters to evaluate.
    visualizer = KElbowVisualizer(model, k=(2, 10), metric='calinski_harabasz',
                                  timings=True, locate_elbow=False)
    visualizer.fit(principleComponents.iloc[:, [0, 4]])
    visualizer.show()

    # --- Silhouette visualizer for k in 2..5 ---
    fig, ax = plt.subplots(2, 2, figsize=(15, 6))
    for i in [2, 3, 4, 5]:
        # KMeans instance for each candidate number of clusters.
        km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100,
                    random_state=42)
        # Map i = 2..5 onto the 2x2 grid of axes.
        q, mod = divmod(i, 2)
        visualizer = SilhouetteVisualizer(km, colors='yellowbrick',
                                          ax=ax[q - 1][mod])
        # NOTE(review): this fits columns [1, 4] while the elbow/CH
        # diagnostics above use [0, 4] -- confirm which pair is intended.
        visualizer.fit(principleComponents.iloc[:, [1, 4]])

        # Remove labels, reduce label size.
        ax[q - 1][mod].set_ylabel('')
        ax[q - 1][mod].yaxis.set_tick_params(labelsize=10)
        if i == 5:
            visualizer.set_title('')
    visualizer.show()

    # --- Final K-Means clustering on PC2-PC4 ---
    PC234 = principleComponents.iloc[:, 1:4]  # Change 1 to 0 for severity clusters
    kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto',
                    max_iter=100, random_state=7)
    kmeans.fit(PC234)
    LABELS = kmeans.labels_

    # Assign the label array positionally. (Bug fix: assigning
    # pd.DataFrame(LABELS) aligns on index and silently yields NaNs when the
    # target DataFrame does not have a default RangeIndex.)
    data_complete['clusters'] = LABELS
    # Check amount and percentages of each cluster's size.
    clusters_count = data_complete.clusters.value_counts()
    print('Total amount in each cluster:', clusters_count)
    clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
    print('Total percentage in each cluster', clusters_percent)
    # Cluster labels for plotting.
    clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']

    # Add clusters to questionnaire subscale DataFrames.
    questionnaireClusters['clusters'] = LABELS
    questionnaireClusters_std['clusters'] = LABELS

    return PC234, LABELS, clusterNames