Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 17:39:05 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This script calculates descriptive statistics for variables across the entire dataset
and by cluster. Outputs are formatted for LaTeX tables, including median and
interquartile range (IQR) for continuous variables and frequencies/percentages for
categorical variables.
"""
import pandas as pd
def total_descriptives(
    data_complete,
    questionnaireClusters,
    categorical_variables,
    continuous_variables,
    binary_variables,
    sorter,
):
    """Summarise every variable over the whole dataset as LaTeX-ready strings.

    Parameters
    ----------
    data_complete, questionnaireClusters
        Not read by this function; kept so the signature matches
        ``cluster_descriptives`` and existing callers keep working.
    categorical_variables, continuous_variables
        Iterables of ``(column_name, label, dataframe)`` triples. The
        statistic is computed on ``dataframe[column_name]`` and stored
        under ``label``.
    binary_variables
        Container of column names. A categorical variable listed here is
        reported as the count and percentage of rows equal to ``1``.
    sorter
        Sequence of labels giving the row order of the result. Labels with
        no computed value appear with a missing value; computed labels not
        in ``sorter`` are dropped (``DataFrame.reindex`` semantics).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Variable'`` and ``'Median(IQR)/N(%)'``.
    """
    descriptives = {}

    # Categorical variables: N (%) for binary ones, median (IQR) otherwise.
    for var, label, df in categorical_variables:
        if var in binary_variables:
            cell = _total_count_pct(df[var])
            # None means the column had no non-missing observations; the
            # label is then left out and shows up as missing after reindex.
            if cell is not None:
                descriptives[label] = cell
        else:
            descriptives[label] = _total_median_iqr(df[var])

    # Continuous variables: median (IQR).
    for var, label, df in continuous_variables:
        descriptives[label] = _total_median_iqr(df[var])

    # Convert to a DataFrame and put the rows in the requested order.
    descriptives_total = pd.DataFrame(
        list(descriptives.items()),
        columns=['Variable', 'Median(IQR)/N(%)'],
    )
    descriptives_total = descriptives_total.set_index('Variable')
    descriptives_total = descriptives_total.reindex(sorter)
    descriptives_total = descriptives_total.reset_index()
    return descriptives_total


def _total_median_iqr(series):
    """Format *series* as ``median (q25–q75)`` with one decimal place each."""
    median = series.median()
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    return f"{median:.1f} ({q25:.1f}–{q75:.1f})"


def _total_count_pct(series):
    """Format the share of rows equal to 1 as ``N (P%)`` with a LaTeX-escaped percent.

    Returns ``None`` when *series* has no non-missing values, because a
    percentage is undefined in that case.
    """
    counts = series.value_counts()  # missing values are excluded by default
    n_valid = counts.sum()
    if n_valid == 0:
        return None
    # A variable that is observed but never equal to 1 is a genuine zero and
    # must be reported as such rather than silently dropped from the table.
    n_positive = counts[1] if 1 in counts else 0
    pct = n_positive / n_valid * 100
    return f"{n_positive} ({pct:.1f}\\%)"
def cluster_descriptives(
    data_complete,
    questionnaireClusters,
    categorical_variables,
    continuous_variables,
    cluster_column,
    binary_variables,
    sorter,
):
    """Summarise every variable per cluster as LaTeX-ready strings.

    Parameters
    ----------
    data_complete, questionnaireClusters
        DataFrames that both contain ``cluster_column``; each is grouped by
        that column.
    categorical_variables, continuous_variables
        Iterables of ``(column_name, label, dataframe)`` triples. When
        ``dataframe`` is the very same object as ``data_complete`` the
        grouping of ``data_complete`` is used; any other object selects the
        grouping of ``questionnaireClusters``.
    cluster_column
        Name of the column holding the cluster assignment.
    binary_variables
        Container of column names. A categorical variable listed here is
        reported as the count and percentage of rows equal to ``1``.
    sorter
        Sequence of labels giving the row order of the result
        (``DataFrame.reindex`` semantics).

    Returns
    -------
    pandas.DataFrame
        One row per label in ``sorter`` and one ``'Cluster <id>'`` column per
        cluster that produced a value.
    """
    # Group each source once; every variable below reuses these groupings.
    grouped_complete = data_complete.groupby(cluster_column)
    grouped_questionnaire = questionnaireClusters.groupby(cluster_column)

    def groups_for(df):
        # Identity, not equality: the triples carry a reference to the frame.
        return grouped_complete if df is data_complete else grouped_questionnaire

    descriptives = {}

    # Categorical variables: N (%) for binary ones, mean (SD) otherwise.
    # NOTE(review): total_descriptives reports median (IQR) for non-binary
    # categorical variables while this function reports mean (SD). Kept as
    # is; confirm which statistic is intended before changing it.
    for var, label, df in categorical_variables:
        cells = descriptives[label] = {}
        is_binary = var in binary_variables
        for cluster, cluster_data in groups_for(df):
            if var not in cluster_data.columns:
                continue
            if is_binary:
                cell = _cluster_count_pct(cluster_data[var])
                # None means no non-missing observations in this cluster;
                # the cell is then left empty.
                if cell is None:
                    continue
            else:
                mean = cluster_data[var].mean()
                std = cluster_data[var].std()
                cell = f"{mean:.1f} ({std:.1f})"
            cells[f'Cluster {cluster}'] = cell

    # Continuous variables: median with 25th and 75th percentile.
    for var, label, df in continuous_variables:
        cells = descriptives[label] = {}
        for cluster, cluster_data in groups_for(df):
            if var not in cluster_data.columns:
                continue
            median = cluster_data[var].median()
            q25 = cluster_data[var].quantile(0.25)
            q75 = cluster_data[var].quantile(0.75)
            cells[f'Cluster {cluster}'] = f"{median:.1f} ({q25:.1f}–{q75:.1f})"

    # Labels become rows after the transpose; then apply the requested order.
    descriptives_cluster = pd.DataFrame(descriptives).T
    descriptives_cluster = descriptives_cluster.reindex(sorter)
    return descriptives_cluster


def _cluster_count_pct(series):
    """Format the share of rows equal to 1 as ``N (P%)`` with a LaTeX-escaped percent.

    Returns ``None`` when *series* has no non-missing values, because a
    percentage is undefined in that case.
    """
    counts = series.value_counts()  # missing values are excluded by default
    n_valid = counts.sum()
    if n_valid == 0:
        return None
    # A cluster with observations but no positive case is a genuine zero and
    # must show up in the table instead of leaving the cell missing.
    n_positive = counts[1] if 1 in counts else 0
    pct = n_positive / n_valid * 100
    return f"{n_positive} ({pct:.1f}\\%)"