From d890b1a04c80ae451339ee61d9a8bdbb8227ec12 Mon Sep 17 00:00:00 2001
From: mcoth <mcoth@dtu.dk>
Date: Sun, 17 Nov 2024 16:48:16 +0100
Subject: [PATCH] Upload New File

---
 Perform_clustering.py | 116 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 Perform_clustering.py

diff --git a/Perform_clustering.py b/Perform_clustering.py
new file mode 100644
index 0000000..f38cf66
--- /dev/null
+++ b/Perform_clustering.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep  1 16:04:29 2024
+
+@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
+
+This function performs clustering analysis using K-Means on standardized,
+PCA-transformed data. It incorporates methods to determine the optimal number
+of clusters and to evaluate cluster quality.
+
+Includes:
+- Elbow method (WCSS), Calinski-Harabasz, and Silhouette methods for determining k
+- Clustering with the k-means++ initialization heuristic
+- Appends the resulting cluster labels to the input DataFrames
+
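+Parameters: std_data (standardized variables), principleComponents (DataFrame
+of PCA scores with at least five components), data_complete (full dataset),
+questionnaireClusters / questionnaireClusters_std (questionnaire subscale
+DataFrames, raw and standardized).
+
+Returns: PC234 (principal components 2-4), LABELS (cluster assignments),
+clusterNames (descriptive cluster names for plotting).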
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from yellowbrick.cluster import KElbowVisualizer
+from yellowbrick.cluster import SilhouetteVisualizer
+
+
+def perform_clustering(std_data, principleComponents, data_complete, questionnaireClusters, questionnaireClusters_std):
+
+    # Elbow method to decide how many clusters
+    wcss = {}
+    # Loop over a range of candidate k values
+    for i in range(1, 10):
+        # Run k-means; inertia_ is the sum of squared distances of samples
+        # to their closest cluster center (WCSS). n_init=10 pins the
+        # pre-1.4 scikit-learn default and avoids the FutureWarning.
+        kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(std_data)
+        wcss[i] = kmeans.inertia_
+
+    
+    # Elbow plot for determining k
+    sns.reset_defaults()
+    plt.figure(figsize=(8, 8))
+    sns.lineplot(x=list(wcss.keys()), y=list(wcss.values()))
+    plt.xlabel('Number of Clusters')
+    plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
+    plt.ylabel('Within Cluster Sum of Squares')
+    plt.title('Sum of Squared Errors Across K Clusters')
+    plt.show()
+    
+    
+    # Calinski-Harabasz method, single model at k=5
+    kmeans_model = KMeans(n_clusters=5, n_init=10, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
+    labels = kmeans_model.labels_
+    print(5, metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))
+
+    # Calinski-Harabasz for a range of k
+    for k in range(2, 5):
+        kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
+        labels = kmeans_model.labels_
+        # Score on the same columns the model was fitted on
+        print(k, metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))
+        
+
+    # Yellowbrick visualizer for k, Calinski-Harabasz
+
+    model = KMeans(n_init=10)
+    # k is the range of candidate cluster counts
+    visualizer = KElbowVisualizer(model, k=(2, 10), metric='calinski_harabasz', timings=True, locate_elbow=False)
+    visualizer.fit(principleComponents.iloc[:,[0,4]])
+    visualizer.show()
+    
+    
+    # Silhouette visualizer for k = 2 to 5
+    fig, ax = plt.subplots(2, 2, figsize=(15, 6))
+    for i in [2, 3, 4, 5]:
+        # Create a KMeans instance for each candidate number of clusters
+        km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
+        # Map i = 2..5 onto the 2x2 grid of axes
+        q, mod = divmod(i, 2)
+
+        visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
+        visualizer.fit(principleComponents.iloc[:, [1, 4]])
+        
+        # Remove labels, reduce label size
+        ax[q-1][mod].set_ylabel('') 
+        ax[q-1][mod].yaxis.set_tick_params(labelsize=10)
+
+        if i == 5:
+            visualizer.set_title('')
+
+    visualizer.show()
+    
+    # KMeans clustering on principal components 2-4
+    PC234 = principleComponents.iloc[:, 1:4]  # Change 1 to 0 for severity clusters
+    kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto', max_iter=100, random_state=7)
+    kmeans.fit(PC234)
+    LABELS = kmeans.labels_
+
+    # Add clusters to data (assign the raw label array so alignment is
+    # positional rather than by index)
+    data_complete['clusters'] = LABELS
+    # Check the size of each cluster, as a count and as a percentage
+    clusters_count = data_complete.clusters.value_counts()
+    print('Total number in each cluster:', clusters_count)
+    clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
+    print('Total percentage in each cluster:', clusters_percent)
+    # Descriptive cluster names for plotting
+    clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']
+    
+    # Add clusters to questionnaire subscales DataFrames
+    questionnaireClusters['clusters'] = LABELS
+    questionnaireClusters_std['clusters'] = LABELS
+    
+    return PC234, LABELS, clusterNames
+
+
+
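+# Minimal usage sketch on synthetic data. This is illustrative only: the real
+# pipeline passes standardized questionnaire data and its PCA scores, which are
+# produced elsewhere and not shown in this file.
+if __name__ == '__main__':
+    from sklearn.datasets import make_blobs
+    from sklearn.decomposition import PCA
+    from sklearn.preprocessing import StandardScaler
+
+    # Synthetic stand-in for the standardized study data (300 samples, 10 variables)
+    X, _ = make_blobs(n_samples=300, n_features=10, centers=3, random_state=0)
+    std_data = StandardScaler().fit_transform(X)
+
+    # Five principal components, matching the column indices used above
+    pcs = pd.DataFrame(PCA(n_components=5).fit_transform(std_data))
+
+    df = pd.DataFrame(X)
+    PC234, LABELS, clusterNames = perform_clustering(std_data, pcs, df, df.copy(), df.copy())
+    print(clusterNames, len(LABELS))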
-- 
GitLab