    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Sep  1 16:04:29 2024
    
    @author: Maya Coulson Theodorsen (mcoth@dtu.dk)
    
    This function performs K-Means clustering on standardized, PCA-transformed data.
    It includes methods for determining the optimal number of clusters and for
    evaluating cluster quality.
    
    Includes:
    - Elbow method (WCSS), Calinski-Harabasz, and Silhouette methods for determining k
    - Clustering with the k-means++ initialization heuristic
    - Addition of cluster labels to the DataFrames holding the study variables
    
    """
    
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.cluster import KMeans
    from sklearn import metrics
    from yellowbrick.cluster import KElbowVisualizer
    from yellowbrick.cluster import SilhouetteVisualizer
    
    
    def perform_clustering(std_data, principleComponents, data_complete, questionnaireClusters, questionnaireClusters_std):
    
        # Elbow method to decide how many clusters
        wcss = {}  # Within-cluster sum of squares for each k
        # Loop through a range of k
        for i in range(1, 10):
            # Run k-means
            kmeans = KMeans(n_clusters=i, random_state=0).fit(std_data)
            # inertia_ is the sum of squared distances of samples to their closest cluster center
            wcss[i] = kmeans.inertia_
    
        
        # Elbow plot for determining k
        sns.reset_defaults()
        plt.figure(figsize=(8, 8))
        sns.lineplot(x=list(wcss.keys()), y=list(wcss.values()))
        plt.xlabel('Number of Clusters')
        plt.axvline(x=3, c='grey', lw=1, linestyle='dashed')
        plt.ylabel('Within Cluster Sum of Squares')
        plt.title('Sum of Squared Errors Across K Clusters')
        plt.show()
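
        # Optional: locate the elbow programmatically rather than by eye.
        # A minimal sketch assuming the third-party `kneed` package is available;
        # it is not a dependency of the original script, so it is left commented out.
        # from kneed import KneeLocator
        # kl = KneeLocator(list(wcss.keys()), list(wcss.values()),
        #                  curve='convex', direction='decreasing')
        # print('Elbow located at k =', kl.elbow)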
        
        
        # Calinski-Harabasz method
        kmeans_model = KMeans(n_clusters=5, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
        labels = kmeans_model.labels_
        print(metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))
    
        # Calinski-Harabasz for a range of k
        for k in range(2, 5):
            kmeans_model = KMeans(n_clusters=k, random_state=1).fit(principleComponents.iloc[:, [0, 4]])
            labels = kmeans_model.labels_
            # Score on the same columns the model was fitted on
            print(k, metrics.calinski_harabasz_score(principleComponents.iloc[:, [0, 4]], labels))
            
    
        # Yellowbrick elbow visualizer over a range of k, using the Calinski-Harabasz metric
        model = KMeans()
        # k is the range of candidate numbers of clusters
        visualizer = KElbowVisualizer(model, k=(2, 10), metric='calinski_harabasz', timings=True, locate_elbow=False)
        visualizer.fit(principleComponents.iloc[:, [0, 4]])
        visualizer.show()
        
        
        # Silhouette visualizer
        fig, ax = plt.subplots(2, 2, figsize=(15, 6))
        for i in [2, 3, 4, 5]:
            # Create a KMeans instance for each number of clusters
            km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
            # Map each k to a subplot position in the 2x2 grid
            q, mod = divmod(i, 2)
            
            visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
            visualizer.fit(principleComponents.iloc[:, [1, 4]])
            
            # Remove labels, reduce label size
            ax[q-1][mod].set_ylabel('') 
            ax[q-1][mod].yaxis.set_tick_params(labelsize=10)
    
            if i == 5:
                visualizer.set_title('')
    
        visualizer.show()
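
        # Numeric complement to the silhouette plots: the mean silhouette score per k
        # (a sketch using sklearn's silhouette_score on the same columns as above;
        # higher scores indicate better-separated clusters).
        for k in [2, 3, 4, 5]:
            km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)
            k_labels = km.fit_predict(principleComponents.iloc[:, [1, 4]])
            print(k, metrics.silhouette_score(principleComponents.iloc[:, [1, 4]], k_labels))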
        
        # KMeans clustering on PCs 2-4
        PC234 = principleComponents.iloc[:, 1:4]  # Change 1 to 0 for severity clusters
        kmeans = KMeans(init='k-means++', n_clusters=3, n_init='auto', max_iter=100, random_state=7)
        kmeans.fit(PC234)
        LABELS = kmeans.labels_
    
        # Add cluster labels to the data
        # (assign by position to avoid index-alignment issues)
        data_complete['clusters'] = pd.Series(LABELS, index=data_complete.index)
        # Check the size of each cluster, as a count and as a percentage
        clusters_count = data_complete.clusters.value_counts()
        print('Total amount in each cluster:', clusters_count)
        clusters_percent = data_complete.clusters.value_counts(normalize=True).mul(100)
        print('Total percentage in each cluster:', clusters_percent)
        # Cluster names for plotting
        clusterNames = ['Tension', 'Intrusion/Avoidance', 'Anhedonia']
        
        # Add cluster labels to the questionnaire subscale DataFrames
        questionnaireClusters['clusters'] = pd.Series(LABELS, index=questionnaireClusters.index)
        questionnaireClusters_std['clusters'] = pd.Series(LABELS, index=questionnaireClusters_std.index)
        
        return PC234, LABELS, clusterNames
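

    # Minimal usage sketch (hypothetical). The input variables below are illustrative
    # assumptions about the surrounding pipeline, not part of the original script,
    # so the example is left commented out.
    # if __name__ == '__main__':
    #     from sklearn.preprocessing import StandardScaler
    #     from sklearn.decomposition import PCA
    #     std_data = StandardScaler().fit_transform(raw_data)  # raw_data: assumed input DataFrame
    #     principleComponents = pd.DataFrame(PCA(n_components=5).fit_transform(std_data))
    #     PC234, LABELS, clusterNames = perform_clustering(
    #         std_data, principleComponents, data_complete,
    #         questionnaireClusters, questionnaireClusters_std)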