From c2851099a05dcf8d7012364de98aa99f0ec06dd7 Mon Sep 17 00:00:00 2001
From: mcoth <mcoth@dtu.dk>
Date: Sun, 17 Nov 2024 16:48:43 +0100
Subject: [PATCH] Upload New File

---
 Compare_clusters.py | 263 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 Compare_clusters.py

diff --git a/Compare_clusters.py b/Compare_clusters.py
new file mode 100644
index 0000000..632f1fe
--- /dev/null
+++ b/Compare_clusters.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep  1 18:52:23 2024
+
+@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
+
+This function performs statistical comparisons across clusters derived from prior analysis.
+It evaluates differences between clusters using both categorical and continuous variables,
+applies appropriate statistical tests, and adjusts p-values for multiple testing.
+
+Overview:
+Comparison of Categorical Variables:
+  - Chi-square or Fisher's Exact Test
+  - Effect size calculations using Cramér's V
+  - Post-hoc chi-square comparisons for significant results
+
+Comparison of Continuous Variables:
+  - Tests for normality using Shapiro-Wilk
+  - Bartlett's test or Levene's test for homogeneity of variances
+  - Welch's ANOVA, One-way ANOVA, or Kruskal-Wallis depending on normality and variance equality
+  - Effect size calculations 
+  - Dunn's post-hoc test for significant results
+
+Multiple Testing Correction:
+  - False Discovery Rate (FDR) adjustment using the Benjamini-Hochberg method
+
+Output:
+  - P-values and corrected p-values for all variables
+  - Summary of significant variables
+  - Post-hoc comparisons
+
+"""
+
+import pandas as pd
+import pingouin as pg
+from scipy import stats
+from itertools import combinations
+from scipy.stats import bartlett, levene
+import scikit_posthocs as sp
+from statsmodels.stats.multitest import multipletests
+
def compare_clusters(data_complete, questionnaireClusters):
    """Compare clusters on categorical and continuous variables with FDR correction.

    Runs an omnibus test for every variable (chi-square-based for categorical;
    ANOVA / Welch's ANOVA / Kruskal-Wallis for continuous, chosen by normality
    and variance homogeneity), corrects all omnibus p-values with the
    Benjamini-Hochberg FDR procedure, then runs pairwise post-hoc tests
    (chi-square or Dunn's) for the variables that remain significant and
    FDR-corrects those pairwise p-values as well.

    Parameters
    ----------
    data_complete : pandas.DataFrame
        Per-participant data; must contain a 'clusters' column plus the raw
        questionnaire/demographic columns listed below.
    questionnaireClusters : pandas.DataFrame
        Subscale scores; must also contain a 'clusters' column.

    Returns
    -------
    tuple
        (p_values, posthoc_p_values, categorical_variables, continuous_variables)
        where p_values and posthoc_p_values are lists of result dicts, each
        carrying a 'corrected_p_value' key filled in after FDR adjustment.
    """
    # Each entry: (column name, human-readable label, dataframe holding the column)
    categorical_variables = [
        ('PTSD_t0_DSMIV', 'Probable PTSD diagnosis', data_complete),
        ('q0002', 'Sex (male)', data_complete),
        ('q0006', 'Children', data_complete),
        ('civil_status', 'Civil status (single)', data_complete),
        ('Psychoanaleptica', 'Psychoanaleptica', data_complete),
        ('Psycholeptica', 'Psycholeptica', data_complete),
        ('q0033_0001', 'Suicidal history', data_complete),
        ('binge', 'Excessive alcohol intake', data_complete),
        ('ADHD_total_GROUP_t0', 'Probable childhood ADHD', data_complete),
        ('drugs', 'Current drug usage', data_complete),
        ('Military_trauma', 'Exposed to war', data_complete),
        ('Unemployed', 'Unemployed', data_complete)
    ]

    continuous_variables = [
        ('q0003_0001', 'Age', data_complete),
        ('q0008_0001', 'Self-rated health', data_complete),
        ('PCL_t0', 'PCL total score', data_complete),
        ('Intrusion_t0', 'PCL Intrusion', data_complete),
        ('PCL Avoidance', 'PCL Avoidance', questionnaireClusters),
        ('PCL Numbing', 'PCL Numbing', questionnaireClusters),
        ('PCL Hyperarousal', 'PCL Hyperarousal', questionnaireClusters),
        ('DASS Anxiety', 'DASS Anxiety', questionnaireClusters),
        ('DASS Depression', 'DASS Depression', questionnaireClusters),
        ('DASS Stress', 'DASS Stress', questionnaireClusters),
        ('Traume_t0_total', 'Total traumas', data_complete),
        ('Antal_unikke_traumer_t0', 'Total unique traumas', data_complete)
    ]

    # Run every omnibus test, remembering which variable/dataframe produced
    # each result so the post-hoc step can re-run on the same data.
    p_values = []
    test_metadata = []
    for outcome_column, outcome_label, df in categorical_variables:
        p_values.append(compare_categorical_clusters(df, 'clusters', outcome_column, outcome_label))
        test_metadata.append((outcome_column, outcome_label, df, 'categorical'))
    for outcome_column, outcome_label, df in continuous_variables:
        p_values.append(compare_continuous_clusters(df, 'clusters', outcome_column, outcome_label))
        test_metadata.append((outcome_column, outcome_label, df, 'continuous'))

    # Benjamini-Hochberg FDR correction across all omnibus tests.
    raw_p = [result['p_value'] for result in p_values]
    _, corrected_p_values, _, _ = multipletests(raw_p, alpha=0.05, method='fdr_bh')
    for result, corr_p in zip(p_values, corrected_p_values):
        result['corrected_p_value'] = corr_p

    # Only variables still significant after correction get post-hoc tests.
    significant = [
        info for info, corr_p in zip(test_metadata, corrected_p_values)
        if corr_p < 0.05
    ]

    posthoc_p_values = []
    for outcome_column, outcome_label, df, var_type in significant:
        if var_type == 'categorical':
            posthoc_p_values.extend(posthoc_categorical(df, 'clusters', outcome_column, outcome_label))
        else:
            posthoc_p_values.extend(posthoc_continuous(df, 'clusters', outcome_column, outcome_label))

    # FDR-correct the pairwise post-hoc p-values as well. Guard the empty
    # case: if nothing survived the omnibus correction there is nothing to
    # correct, and multipletests cannot be called with an empty list.
    if posthoc_p_values:
        raw_posthoc_p = [result['p_value'] for result in posthoc_p_values]
        _, posthoc_pvals_corrected, _, _ = multipletests(raw_posthoc_p, alpha=0.05, method='fdr_bh')
        for result, corr_p in zip(posthoc_p_values, posthoc_pvals_corrected):
            result['corrected_p_value'] = corr_p

    return p_values, posthoc_p_values, categorical_variables, continuous_variables
+
+
+
def compare_categorical_clusters(df, cluster_column, outcome_column, outcome_label):
    """Compare a categorical outcome across clusters.

    Uses Pearson's chi-square test of independence. When any expected cell
    count is below 5 and the table is 2x2, Fisher's exact test is run instead
    (the original code only *labelled* such cases as Fisher's test but still
    reported the chi-square p-value). Effect size is Cramer's V, computed
    from the (uncorrected-for-Fisher) chi-square statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both `cluster_column` and `outcome_column`.
    cluster_column : str
        Column holding cluster labels.
    outcome_column : str
        Categorical outcome column to compare.
    outcome_label : str
        Human-readable name used in the result dict.

    Returns
    -------
    dict
        Keys: outcome_label, test_type, p_value, test_statistic, dof,
        effect_size, corrected_p_value (None until FDR correction).
        For Fisher's exact test, test_statistic is the odds ratio.
    """
    contingency_table = pd.crosstab(df[cluster_column], df[outcome_column])
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)

    # Cramer's V: sqrt(chi2 / (n * (min(rows, cols) - 1))).
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1
    cramers_v = (chi2_stat / (n * min_dim)) ** 0.5 if min_dim > 0 and n > 0 else 0.0

    test_statistic = chi2_stat
    if (expected >= 5).all():
        test_type = 'Chi-Square'
    elif contingency_table.shape == (2, 2):
        # Low expected counts in a 2x2 table: the exact test is valid here.
        test_type = 'Fishers Exact Test'
        test_statistic, p_value = stats.fisher_exact(contingency_table)
    else:
        # scipy's fisher_exact only supports 2x2; fall back to chi-square but
        # keep the label so the caller knows the approximation is strained.
        test_type = 'Fishers Exact Test'

    return {
        'outcome_label': outcome_label,
        'test_type': test_type,
        'p_value': p_value,
        'test_statistic': test_statistic,
        'dof': dof,
        'effect_size': cramers_v,
        'corrected_p_value': None  # Placeholder, filled in after FDR_bh
    }
+
+
def compare_continuous_clusters(df, cluster_column, outcome_column, outcome_label):
    """Compare a continuous outcome across clusters.

    Test selection:
      - all groups normal (Shapiro-Wilk) and variances equal (Bartlett):
        one-way ANOVA
      - normal but unequal variances: Welch's ANOVA
      - not normal: Kruskal-Wallis

    Effect size is partial eta-squared (pingouin's 'np2') for the ANOVA
    variants and epsilon-squared for Kruskal-Wallis.

    Bug fixed: the original return dict referenced `H_stat` and
    `epsilon_squared`, which were only assigned in the Kruskal-Wallis branch,
    so both ANOVA branches raised NameError. Each branch now sets its own
    statistic and effect size. A stray `pd.set_option` display side effect
    and an unused Levene's test call were also removed.

    Returns
    -------
    dict
        Keys: outcome_label, test_type, p_value, test_statistic (F or H),
        dof, effect_size, corrected_p_value (None until FDR correction).
    """
    # One array of outcome values per cluster, for the variance test.
    cluster_data = [group[outcome_column] for _, group in df.groupby(cluster_column)]

    # Shapiro-Wilk normality per cluster (pingouin returns one row per group).
    normality_results = pg.normality(df, dv=outcome_column, group=cluster_column)

    if normality_results['normal'].all():
        # Bartlett's test for equal variances (appropriate for normal data).
        _, bartlett_p = bartlett(*cluster_data)

        if bartlett_p < 0.05:
            # Unequal variances: Welch's ANOVA.
            # NOTE(review): column names ('ddof1', 'F', 'np2', 'p-unc') follow
            # pingouin's documented welch_anova output — confirm against the
            # installed pingouin version.
            test_type = "Welchs ANOVA"
            res = pg.welch_anova(dv=outcome_column, between=cluster_column, data=df)
            p_value = res['p-unc'].iloc[0]
            dof = res['ddof1'].iloc[0]
            statistic = res['F'].iloc[0]
            effect_size = res['np2'].iloc[0]
        else:
            # Equal variances: classic one-way ANOVA.
            test_type = "ANOVA"
            res = pg.anova(data=df, dv=outcome_column, between=cluster_column)
            p_value = res['p-unc'].iloc[0]
            dof = res['DF'].iloc[0]
            statistic = res['F'].iloc[0]
            effect_size = res['np2'].iloc[0]
    else:
        # Not normally distributed: rank-based Kruskal-Wallis test.
        test_type = 'Kruskal-Wallis'
        res = pg.kruskal(data=df, dv=outcome_column, between=cluster_column)
        p_value = res['p-unc'].iloc[0]
        dof = res['ddof1'].iloc[0]
        statistic = res['H'].iloc[0]
        # Epsilon-squared effect size, floored at 0 for small H.
        N = len(df)
        effect_size = max((statistic - dof + 1) / (N - dof), 0)

    return {
        'outcome_label': outcome_label,
        'test_type': test_type,
        'p_value': p_value,
        'test_statistic': statistic,
        'dof': dof,
        'effect_size': effect_size,
        'corrected_p_value': None  # Placeholder, filled in after FDR_bh
    }
+            
+# Post hoc tests for variables significant after FDR_bh
+
def posthoc_categorical(df, cluster_column, outcome_column, outcome_label):
    """Run pairwise chi-square tests between every pair of clusters.

    For each unordered cluster pair, builds the 2-group contingency table of
    `outcome_column` and runs a chi-square test of independence. Returns one
    dict per pair with the uncorrected p-value; FDR correction happens in the
    caller.
    """
    results = []
    cluster_ids = sorted(df[cluster_column].unique())
    for first, second in combinations(cluster_ids, 2):
        pair_rows = df[df[cluster_column].isin([first, second])]
        table = pd.crosstab(pair_rows[cluster_column], pair_rows[outcome_column])
        _, p, _, _ = stats.chi2_contingency(table)
        results.append({
            'outcome_label': outcome_label,
            'cluster1': first,
            'cluster2': second,
            'test_type': 'Chi-squared',
            'p_value': p
        })
    return results
+
def posthoc_continuous(df, cluster_column, outcome_column, outcome_label):
    """Run Dunn's test for all pairwise cluster comparisons of one outcome.

    scikit-posthocs returns a symmetric matrix of pairwise p-values indexed
    by group label; this walks its upper triangle and flattens it into one
    dict per cluster pair. The p-values are uncorrected here and are
    FDR-adjusted by the caller.
    """
    dunn_matrix = sp.posthoc_dunn(df, val_col=outcome_column, group_col=cluster_column)
    cluster_ids = sorted(df[cluster_column].unique())

    results = []
    for i, first in enumerate(cluster_ids):
        for j in range(i + 1, len(cluster_ids)):
            results.append({
                'outcome_label': outcome_label,
                'cluster1': first,
                'cluster2': cluster_ids[j],
                'test_type': 'Dunns',
                'p_value': dunn_matrix.iloc[i, j]
            })
    return results
+
+   
+
+
+    
+    
\ No newline at end of file
-- 
GitLab