#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 18:52:23 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This function performs statistical comparisons across clusters derived from prior analysis.
It evaluates differences between clusters using both categorical and continuous variables,
applies appropriate statistical tests, and adjusts p-values for multiple testing.
Overview:
Comparison of Categorical Variables:
- Chi-square or Fisher's Exact Test
- Effect size calculations using Cramér's V
- Post-hoc chi-square comparisons for significant results
Comparison of Continuous Variables:
- Tests for normality using Shapiro-Wilk
- Bartlett's test or Levene's test for homogeneity of variances
- Welch's ANOVA, One-way ANOVA, or Kruskal-Wallis depending on normality and variance equality
- Effect size calculations
- Dunn's post-hoc test for significant results
Multiple Testing Correction:
- False Discovery Rate (FDR) adjustment using the Benjamini-Hochberg method
Output:
- P-values and corrected p-values for all variables
- Summary of significant variables
- Post-hoc comparisons
"""
import pandas as pd
import pingouin as pg
from scipy import stats
from itertools import combinations
from scipy.stats import bartlett, levene
import scikit_posthocs as sp
from statsmodels.stats.multitest import multipletests
def compare_clusters(data_complete, questionnaireClusters):
    """Compare all study variables across clusters and FDR-correct the results.

    Runs a categorical test (chi-square/Fisher) or a continuous test
    (ANOVA/Welch/Kruskal-Wallis) per variable, applies Benjamini-Hochberg
    correction across all omnibus tests, and runs pairwise post-hoc tests
    (with their own FDR correction) for variables that remain significant.

    Parameters
    ----------
    data_complete : pandas.DataFrame
        Full dataset containing a 'clusters' column plus raw variables.
    questionnaireClusters : pandas.DataFrame
        Questionnaire subscale scores, also containing a 'clusters' column.

    Returns
    -------
    tuple
        (p_values, posthoc_p_values, categorical_variables, continuous_variables)
        where the first two are lists of result dicts with both raw and
        FDR-corrected p-values.
    """
    # Categorical variables: (column name, display label, source DataFrame)
    categorical_variables = [
        ('PTSD_t0_DSMIV', 'Probable PTSD diagnosis', data_complete),
        ('q0002', 'Sex (male)', data_complete),
        ('q0006', 'Children', data_complete),
        ('civil_status', 'Civil status (single)', data_complete),
        ('Psychoanaleptica', 'Psychoanaleptica', data_complete),
        ('Psycholeptica', 'Psycholeptica', data_complete),
        ('q0033_0001', 'Suicidal history', data_complete),
        ('binge', 'Excessive alcohol intake', data_complete),
        ('ADHD_total_GROUP_t0', 'Probable childhood ADHD', data_complete),
        ('drugs', 'Current drug usage', data_complete),
        ('Military_trauma', 'Exposed to war', data_complete),
        ('Unemployed', 'Unemployed', data_complete)
    ]
    # Continuous variables: (column name, display label, source DataFrame)
    continuous_variables = [
        ('q0003_0001', 'Age', data_complete),
        ('q0008_0001', 'Self-rated health', data_complete),
        ('PCL_t0', 'PCL total score', data_complete),
        ('Intrusion_t0', 'PCL Intrusion', data_complete),
        ('PCL Avoidance', 'PCL Avoidance', questionnaireClusters),
        ('PCL Numbing', 'PCL Numbing', questionnaireClusters),
        ('PCL Hyperarousal', 'PCL Hyperarousal', questionnaireClusters),
        ('DASS Anxiety', 'DASS Anxiety', questionnaireClusters),
        ('DASS Depression', 'DASS Depression', questionnaireClusters),
        ('DASS Stress', 'DASS Stress', questionnaireClusters),
        ('Traume_t0_total', 'Total traumas', data_complete),
        ('Antal_unikke_traumer_t0', 'Total unique traumas', data_complete)
    ]
    # Omnibus tests: collect one result dict per variable, plus the info
    # needed to rerun the variable in the post-hoc stage.
    p_values = []
    significant_tests = []
    for outcome_column, outcome_label, df in categorical_variables:
        p_values.append(compare_categorical_clusters(df, 'clusters', outcome_column, outcome_label))
        significant_tests.append((outcome_column, outcome_label, df, 'categorical'))
    for outcome_column, outcome_label, df in continuous_variables:
        p_values.append(compare_continuous_clusters(df, 'clusters', outcome_column, outcome_label))
        significant_tests.append((outcome_column, outcome_label, df, 'continuous'))
    # FDR (Benjamini-Hochberg) correction across all omnibus p-values.
    original_p_values = [result['p_value'] for result in p_values]
    reject, corrected_p_values, _, _ = multipletests(original_p_values, alpha=0.05, method='fdr_bh')
    for result, corr_p in zip(p_values, corrected_p_values):
        result['corrected_p_value'] = corr_p
    # Only variables that survive FDR correction go on to post-hoc testing.
    significant_tests_filtered = [
        test_info
        for result, test_info in zip(p_values, significant_tests)
        if result['corrected_p_value'] < 0.05
    ]
    # Pairwise post-hoc tests for each significant variable.
    posthoc_p_values = []
    for outcome_column, outcome_label, df, var_type in significant_tests_filtered:
        if var_type == 'categorical':
            posthoc_result = posthoc_categorical(df, 'clusters', outcome_column, outcome_label)
        else:
            posthoc_result = posthoc_continuous(df, 'clusters', outcome_column, outcome_label)
        posthoc_p_values.extend(posthoc_result)
    # Guard against an empty list: multipletests raises when given no
    # p-values, which happens if nothing survived the omnibus correction.
    if posthoc_p_values:
        posthoc_raw = [result['p_value'] for result in posthoc_p_values]
        reject, posthoc_pvals_corrected, _, _ = multipletests(posthoc_raw, alpha=0.05, method='fdr_bh')
        for result, corr_p in zip(posthoc_p_values, posthoc_pvals_corrected):
            result['corrected_p_value'] = corr_p
    return p_values, posthoc_p_values, categorical_variables, continuous_variables
def compare_categorical_clusters(df, cluster_column, outcome_column, outcome_label):
    """Test whether a categorical outcome differs across clusters.

    Builds a cluster-by-outcome contingency table and runs a chi-square test
    of independence. When any expected cell count is below 5 and the table is
    2x2, Fisher's exact test p-value is reported instead (the original code
    only relabeled the test but still reported the chi-square p-value). For
    larger sparse tables Fisher's exact is not available in scipy, so the
    chi-square result is kept but the label flags the low expected counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``cluster_column`` and ``outcome_column``.
    cluster_column : str
        Column holding cluster assignments.
    outcome_column : str
        Categorical outcome column to test.
    outcome_label : str
        Human-readable label stored in the result.

    Returns
    -------
    dict
        Keys: outcome_label, test_type, p_value, test_statistic, dof,
        effect_size (Cramér's V), corrected_p_value (filled later).
    """
    contingency_table = pd.crosstab(df[cluster_column], df[outcome_column])
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))).
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1
    cramers_v = (chi2_stat / (n * min_dim)) ** 0.5 if min_dim > 0 and n > 0 else 0.0
    if (expected >= 5).all():
        test_type = 'Chi-Square'
    elif contingency_table.shape == (2, 2):
        # Low expected counts in a 2x2 table: use Fisher's exact p-value.
        test_type = 'Fishers Exact Test'
        _, p_value = stats.fisher_exact(contingency_table)
    else:
        # scipy's fisher_exact only handles 2x2 tables; keep the chi-square
        # p-value but retain the label so sparse tables remain visible.
        test_type = 'Fishers Exact Test'
    return {
        'outcome_label': outcome_label,
        'test_type': test_type,
        'p_value': p_value,
        'test_statistic': chi2_stat,
        'dof': dof,
        'effect_size': cramers_v,
        'corrected_p_value': None  # Placeholder, will be filled in after FDR_bh
    }
def compare_continuous_clusters(df, cluster_column, outcome_column, outcome_label):
    """Test whether a continuous outcome differs across clusters.

    Chooses the omnibus test from the data: Shapiro-Wilk normality per
    cluster, then Bartlett's test for equal variances when normal. Normal +
    equal variances -> one-way ANOVA; normal + unequal -> Welch's ANOVA;
    non-normal -> Kruskal-Wallis.

    Bug fixed: the original referenced ``H_stat`` and ``epsilon_squared`` in
    the return dict even though they were only defined in the Kruskal-Wallis
    branch, so any normally distributed variable raised a NameError. Each
    branch now sets its own test statistic (F or H) and effect size (partial
    eta-squared for the ANOVAs, epsilon-squared for Kruskal-Wallis). An
    unused Levene's test call and a stray global ``pd.set_option`` display
    side effect were also removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``cluster_column`` and ``outcome_column``.
    cluster_column : str
        Column holding cluster assignments.
    outcome_column : str
        Continuous outcome column to test.
    outcome_label : str
        Human-readable label stored in the result.

    Returns
    -------
    dict
        Keys: outcome_label, test_type, p_value, test_statistic, dof,
        effect_size, corrected_p_value (filled later).
    """
    clusters = df[cluster_column].unique()
    cluster_data = [df[df[cluster_column] == cluster][outcome_column] for cluster in clusters]
    # Shapiro-Wilk normality per cluster (pingouin returns one row per group).
    normality_results = pg.normality(df, dv=outcome_column, group=cluster_column)
    if normality_results['normal'].all():
        # Bartlett's test for homogeneity of variances (valid under normality).
        bartlett_stat, bartlett_p = bartlett(*cluster_data)
        if bartlett_p < 0.05:
            # Unequal variances: Welch's ANOVA.
            test_type = "Welchs ANOVA"
            welch_anova_results = pg.welch_anova(dv=outcome_column, between=cluster_column, data=df)
            p_value = welch_anova_results['p-unc'][0]
            # NOTE(review): pingouin reports numerator df as 'ddof1' — the
            # original 'dof' key does not exist in current pingouin output.
            dof = welch_anova_results['ddof1'][0]
            test_statistic = welch_anova_results['F'][0]
            effect_size = welch_anova_results['np2'][0]  # partial eta-squared
        else:
            # Equal variances: classic one-way ANOVA.
            test_type = "ANOVA"
            anova = pg.anova(data=df, dv=outcome_column, between=cluster_column)
            p_value = anova['p-unc'][0]
            dof = anova['ddof1'][0]
            test_statistic = anova['F'][0]
            effect_size = anova['np2'][0]  # partial eta-squared
    else:
        # Not normally distributed: Kruskal-Wallis rank test.
        test_type = 'Kruskal-Wallis'
        kruskal = pg.kruskal(data=df, dv=outcome_column, between=cluster_column)
        p_value = kruskal['p-unc'].values[0]
        dof = kruskal['ddof1'][0]
        test_statistic = kruskal['H'][0]
        # Epsilon-squared effect size, floored at 0.
        N = len(df)
        effect_size = max((test_statistic - dof + 1) / (N - dof), 0)
    return {
        'outcome_label': outcome_label,
        'test_type': test_type,
        'p_value': p_value,
        'test_statistic': test_statistic,
        'dof': dof,
        'effect_size': effect_size,
        'corrected_p_value': None  # Placeholder, will be filled in after FDR_bh
    }
# Post hoc tests for variables significant after FDR_bh
def posthoc_categorical(df, cluster_column, outcome_column, outcome_label):
    """Pairwise chi-square tests between every pair of clusters.

    For each pair of cluster labels, restricts the data to those two
    clusters, builds a 2-way contingency table against the outcome, and runs
    a chi-square test of independence.

    Returns a list of dicts, one per cluster pair, each holding the outcome
    label, the two cluster labels, the test name, and the raw p-value
    (FDR correction is applied later by the caller).
    """
    results = []
    for pair in combinations(sorted(df[cluster_column].unique()), 2):
        pair_rows = df[df[cluster_column].isin(pair)]
        table = pd.crosstab(pair_rows[cluster_column], pair_rows[outcome_column])
        _, pairwise_p, _, _ = stats.chi2_contingency(table)
        results.append({
            'outcome_label': outcome_label,
            'cluster1': pair[0],
            'cluster2': pair[1],
            'test_type': 'Chi-squared',
            'p_value': pairwise_p
        })
    return results
def posthoc_continuous(df, cluster_column, outcome_column, outcome_label):
    """Pairwise Dunn's tests between every pair of clusters.

    Runs Dunn's post-hoc test (scikit-posthocs) on the full data, then
    extracts one p-value per cluster pair from the resulting matrix.

    Robustness fix: the p-value matrix is indexed by cluster *label*
    (``.loc``) instead of by position (``.iloc``). The original assumed the
    matrix row/column order matched ``sorted(clusters)``; label-based lookup
    cannot be misaligned regardless of how scikit-posthocs orders the groups.

    Returns a list of dicts, one per cluster pair, each holding the outcome
    label, the two cluster labels, the test name, and the raw p-value
    (FDR correction is applied later by the caller).
    """
    dunn = sp.posthoc_dunn(df, val_col=outcome_column, group_col=cluster_column)
    clusters = sorted(df[cluster_column].unique())
    posthoc_p_values = []
    for cluster1, cluster2 in combinations(clusters, 2):
        # Label-based lookup into the symmetric Dunn p-value matrix.
        p_value = dunn.loc[cluster1, cluster2]
        posthoc_p_values.append({
            'outcome_label': outcome_label,
            'cluster1': cluster1,
            'cluster2': cluster2,
            'test_type': 'Dunns',
            'p_value': p_value
        })
    return posthoc_p_values