"git@lab.compute.dtu.dk:manli/FCN-CD-PyTorch.git" did not exist on "master"
Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 16:04:29 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This function performs clustering analysis using K-Means on standardized and PCA-transformed data.
It incorporates methods to determine the optimal number of clusters and evaluate cluster quality.
Includes:
- Elbow method (WCSS), Calinski-Harabasz, Silhouette methods for determining k
- Clustering with the k-means++ initialization heuristic
- Adds clusters to DataFrames with variables
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
def perform_clustering(std_data, principleComponents, data_complete, questionnaireClusters, questionnaireClusters_std):
    """Run k-selection diagnostics, then cluster the PCA scores with K-Means.

    Steps, in order:

    1. Elbow plot of within-cluster sum of squares (WCSS) for k = 1..9,
       fitted on ``std_data``.
    2. Calinski-Harabasz scores printed for k = 2..4, and a Yellowbrick
       ``KElbowVisualizer`` using the same metric.
    3. A 2x2 grid of silhouette plots for k = 2..5.
    4. Final K-Means (k-means++ init, k = 3) on the selected components;
       the labels are written into a ``'clusters'`` column of the three
       DataFrames passed in.

    The Calinski-Harabasz and silhouette diagnostics are evaluated on the
    same component subset (``PC234``) that the final model clusters, so
    the choice of k is assessed on the data that is actually clustered.

    Parameters
    ----------
    std_data : array-like accepted by ``KMeans.fit``
        Data used for the elbow (WCSS) plot only.
    principleComponents : object supporting ``.iloc`` column slicing
        PCA scores; columns 1..3 (zero-based) are clustered.
        Presumably a pandas DataFrame with one row per sample -- confirm
        against the caller.
    data_complete, questionnaireClusters, questionnaireClusters_std : DataFrame
        Modified in place: each receives a ``'clusters'`` column. Each
        must have as many rows, in the same order, as
        ``principleComponents``; otherwise the assignment raises
        ``ValueError``.

    Returns
    -------
    tuple
        ``(PC234, LABELS, clusterNames)``: the clustered component
        subset, the K-Means label array, and the list of cluster names
        used for plotting. How the names map onto the integer labels is
        not established inside this function.
    """
    # Component subset that is clustered; every PCA-based diagnostic below
    # uses this same subset.
    PC234 = principleComponents.iloc[:, 1:4]  # Change 1 to 0 for severity clusters

    # --- Elbow method (WCSS) to decide how many clusters -----------------
    wcss = {}
    for k in range(1, 10):
        # inertia_: sum of squared distances of samples to their closest
        # cluster center.
        wcss[k] = KMeans(n_clusters=k, random_state=0).fit(std_data).inertia_

    # Elbow plot for determining k
    sns.reset_defaults()
    plt.figure(figsize=(8, 8))
    # Pass plain lists rather than dict views so the inputs are ordinary
    # sequences.
    sns.lineplot(x=list(wcss), y=list(wcss.values()))
    plt.xlabel('Number of Clusters')
    plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
    plt.ylabel('Within Cluster Sum of Squares')
    plt.title('Sum of Squared Errors Across K Clusters')
    plt.show()

    # --- Calinski-Harabasz for a range of k ------------------------------
    # The score must be computed on the same data the model was fitted on;
    # otherwise the labels do not describe the points being scored.
    for k in range(2, 5):
        labels = KMeans(n_clusters=k, random_state=1).fit(PC234).labels_
        print(k, metrics.calinski_harabasz_score(PC234, labels))

    # Yellowbrick visualizer for k, Calinski-Harabasz
    model = KMeans()
    # k is range of number of clusters.
    visualizer = KElbowVisualizer(model, k=(2, 10), metric='calinski_harabasz', timings=True, locate_elbow=False)
    visualizer.fit(PC234)
    visualizer.show()

    # --- Silhouette plots for k = 2..5 on a 2x2 grid ---------------------
    _, ax = plt.subplots(2, 2, figsize=(15, 6))
    for k in [2, 3, 4, 5]:
        # Create a KMeans instance for each number of clusters
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)
        # divmod(k, 2) maps k = 2, 3, 4, 5 onto grid cells
        # (0, 0), (0, 1), (1, 0), (1, 1) after shifting the row by one.
        row, col = divmod(k, 2)
        panel = ax[row - 1][col]
        visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
        visualizer.fit(PC234)
        # Remove labels, reduce label size
        panel.set_ylabel('')
        panel.yaxis.set_tick_params(labelsize=10)
        if k == 5:
            visualizer.set_title('')
    visualizer.show()

    # --- Final KMeans clustering -----------------------------------------
    kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto', max_iter=100, random_state=7)
    kmeans.fit(PC234)
    LABELS = kmeans.labels_

    # Add clusters to data. The label array is assigned directly so rows
    # are matched by position; wrapping it in a DataFrame would align on
    # index labels and yield NaN or misplaced labels whenever the target
    # frame does not have a default 0..n-1 index.
    data_complete['clusters'] = LABELS

    # Check amount and percentages of each cluster's size
    clusters_count = data_complete.clusters.value_counts()
    print('Total amount in each cluster:', clusters_count)
    clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
    print('Total percentage in each cluster', clusters_percent)

    # Add cluster labels for plotting
    clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']

    # Add clusters to questionnaire subscales df (positional, as above)
    questionnaireClusters['clusters'] = LABELS
    questionnaireClusters_std['clusters'] = LABELS

    return PC234, LABELS, clusterNames