From c1d8771d4214d1ba7435cec7881d2b7d8d08eef7 Mon Sep 17 00:00:00 2001
From: mcoth <mcoth@dtu.dk>
Date: Sun, 17 Nov 2024 16:49:32 +0100
Subject: [PATCH] Upload New File

---
 Descriptives.py | 119 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 Descriptives.py

diff --git a/Descriptives.py b/Descriptives.py
new file mode 100644
index 0000000..22a5733
--- /dev/null
+++ b/Descriptives.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep  1 17:39:05 2024
+
+@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
+
+This script calculates descriptive statistics for variables across the entire dataset 
+and by cluster. Outputs are formatted for LaTeX tables, including median and 
+interquartile range (IQR) for continuous variables and frequencies/percentages for 
+categorical variables.
+
+"""
+import pandas as pd
+
+def total_descriptives(data_complete, questionnaireClusters, categorical_variables, continuous_variables, binary_variables, sorter):
+    """Summarise each variable over the whole sample as LaTeX-ready strings.
+
+    Categorical variables listed in ``binary_variables`` are reported as
+    ``N (%)`` of rows coded 1; everything else as ``median (Q1–Q3)``. A binary
+    variable with no rows coded 1 yields ``0 (0.0\\%)`` rather than no entry.
+
+    Args:
+        data_complete, questionnaireClusters: not read by this function;
+            retained so existing call sites keep working.
+        categorical_variables, continuous_variables: iterables of
+            ``(column, label, dataframe)`` tuples.
+        binary_variables: container of column names reported as N (%).
+        sorter: labels giving the row order of the returned table.
+
+    Returns:
+        DataFrame with columns 'Variable' and 'Median(IQR)/N(%)'.
+    """
+    def median_iqr(series):
+        # One-decimal "median (Q1–Q3)" layout shared by all non-binary variables.
+        q25, q75 = series.quantile(0.25), series.quantile(0.75)
+        return f"{series.median():.1f} ({q25:.1f}–{q75:.1f})"
+
+    descriptives = {}
+    for var, label, df in categorical_variables:
+        if var not in binary_variables:
+            descriptives[label] = median_iqr(df[var])
+            continue
+        count = df[var].value_counts()  # NaN rows are excluded from N and %
+        cnt, total = count.get(1, 0), count.sum()
+        descriptives[label] = f"{cnt} ({(cnt / total * 100 if total else 0.0):.1f}\\%)"
+    for var, label, df in continuous_variables:
+        descriptives[label] = median_iqr(df[var])
+
+    table = pd.DataFrame(descriptives.items(), columns=['Variable', 'Median(IQR)/N(%)'])
+    return table.set_index('Variable').reindex(sorter).reset_index()
+
+
+def cluster_descriptives(data_complete, questionnaireClusters, categorical_variables, continuous_variables, cluster_column, binary_variables, sorter):
+    """Summarise each variable within every cluster as LaTeX-ready strings.
+
+    Categorical variables listed in ``binary_variables`` are reported as
+    ``N (%)`` of rows coded 1; everything else as ``median (Q1–Q3)``.
+
+    Notes:
+      * non-binary categorical variables use median (IQR), the same statistic
+        the whole-sample table reports for them (previously mean (SD) here);
+      * a cluster with no rows coded 1 gets ``0 (0.0\\%)`` instead of no entry.
+
+    Args:
+        data_complete, questionnaireClusters: DataFrames that both contain
+            ``cluster_column``.
+        categorical_variables, continuous_variables: iterables of
+            ``(column, label, dataframe)`` tuples. ``dataframe`` is compared by
+            identity with ``data_complete``; anything else is looked up in
+            ``questionnaireClusters``.
+        cluster_column: name of the column the frames are grouped by.
+        binary_variables: container of column names reported as N (%).
+        sorter: labels giving the row order of the returned table.
+
+    Returns:
+        DataFrame indexed by variable label with one 'Cluster <id>' column per
+        cluster; a label whose column is absent gets no cluster entries.
+    """
+    def median_iqr(series):
+        q25, q75 = series.quantile(0.25), series.quantile(0.75)
+        return f"{series.median():.1f} ({q25:.1f}–{q75:.1f})"
+
+    def n_percent(series):
+        count = series.value_counts()  # NaN rows are excluded from N and %
+        cnt, total = count.get(1, 0), count.sum()
+        return f"{cnt} ({(cnt / total * 100 if total else 0.0):.1f}\\%)"
+
+    # Split each frame once instead of re-grouping for every variable.
+    groups_complete = list(data_complete.groupby(cluster_column))
+    groups_questionnaire = list(questionnaireClusters.groupby(cluster_column))
+
+    descriptives = {}
+    for variables, binary in ((categorical_variables, binary_variables), (continuous_variables, ())):
+        for var, label, df in variables:
+            groups = groups_complete if df is data_complete else groups_questionnaire
+            summarise = n_percent if var in binary else median_iqr
+            descriptives[label] = {
+                f'Cluster {cluster}': summarise(cluster_data[var])
+                for cluster, cluster_data in groups
+                if var in cluster_data.columns
+            }
+
+    return pd.DataFrame(descriptives).T.reindex(sorter)  # one row per label, in sorter order
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+ 
\ No newline at end of file
-- 
GitLab