#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Provenance: contents of Descriptives.py as created by GitLab patch
# c1d8771d4214d1ba7435cec7881d2b7d8d08eef7 ("Upload New File"),
# mcoth <mcoth@dtu.dk>, Sun, 17 Nov 2024 16:49:32 +0100.
"""
Created on Sun Sep 1 17:39:05 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This script calculates descriptive statistics for variables across the entire
dataset and by cluster. Outputs are formatted for LaTeX tables, including
median and interquartile range (IQR) for continuous (and non-binary
categorical) variables and frequencies/percentages for binary categorical
variables.

"""
import pandas as pd


def _format_median_iqr(series):
    """Return ``"median (q25–q75)"`` for *series*, one decimal place each."""
    median = series.median()
    q25 = series.quantile(0.25)
    q75 = series.quantile(0.75)
    return f"{median:.1f} ({q25:.1f}–{q75:.1f})"


def _format_count_percent(series):
    """Return ``"N (P\\%)"`` for the rows of *series* equal to 1.

    The percentage is taken over the non-missing values only, because
    ``value_counts`` drops NaN by default. When no row equals 1 the result is
    ``"0 (0.0\\%)"`` rather than a missing table cell.
    """
    counts = series.value_counts()
    n_valid = counts.sum()
    n_positive = counts[1] if 1 in counts else 0
    # Guard the all-missing / empty case so we never divide by zero.
    percent = n_positive / n_valid * 100 if n_valid else 0.0
    # The percent sign is escaped because the string is pasted into LaTeX.
    return f"{n_positive} ({percent:.1f}\\%)"


def _format_categorical(series, is_binary):
    """Format a categorical variable: N (%) if binary, else median (IQR)."""
    if is_binary:
        return _format_count_percent(series)
    return _format_median_iqr(series)


def total_descriptives(data_complete, questionnaireClusters, categorical_variables, continuous_variables, binary_variables, sorter):
    """Compute descriptive statistics over the whole dataset.

    Parameters
    ----------
    data_complete, questionnaireClusters : pandas.DataFrame
        Not read by this function; kept so the signature parallels
        ``cluster_descriptives``.
    categorical_variables, continuous_variables : iterable of (var, label, df)
        ``var`` is the column name, ``label`` the row name in the output and
        ``df`` the DataFrame that holds the column.
    binary_variables : container of str
        Column names that are reported as ``N (%)`` of rows equal to 1.
    sorter : sequence of str
        Labels in the desired output order. Labels that were computed but are
        absent from ``sorter`` are dropped; labels in ``sorter`` that were not
        computed appear with a missing value.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Variable'`` and ``'Median(IQR)/N(%)'``, one row per
        entry of ``sorter``. Values are strings intended for LaTeX.
    """
    descriptives = {}

    # Categorical variables: N (%) when binary, median (IQR) otherwise.
    for var, label, df in categorical_variables:
        descriptives[label] = _format_categorical(df[var], var in binary_variables)

    # Continuous variables: median (IQR).
    for var, label, df in continuous_variables:
        descriptives[label] = _format_median_iqr(df[var])

    descriptives_total = pd.DataFrame(
        list(descriptives.items()),
        columns=['Variable', 'Median(IQR)/N(%)'],
    )
    # Round-trip through the index so reindex can impose the sorter order.
    descriptives_total = descriptives_total.set_index('Variable')
    descriptives_total = descriptives_total.reindex(sorter)
    descriptives_total = descriptives_total.reset_index()

    return descriptives_total


def cluster_descriptives(data_complete, questionnaireClusters, categorical_variables, continuous_variables, cluster_column, binary_variables, sorter):
    """Compute descriptive statistics separately for each cluster.

    Parameters
    ----------
    data_complete, questionnaireClusters : pandas.DataFrame
        Both are grouped by ``cluster_column``, so both must contain it.
    categorical_variables, continuous_variables : iterable of (var, label, df)
        ``var`` is the column name and ``label`` the row name in the output.
        ``df`` selects the source: if it *is* ``data_complete`` (identity
        check) the grouped ``data_complete`` is used, otherwise the grouped
        ``questionnaireClusters``.
    cluster_column : str
        Name of the column holding the cluster assignment.
    binary_variables : container of str
        Column names that are reported as ``N (%)`` of rows equal to 1.
    sorter : sequence of str
        Labels in the desired row order (same semantics as ``reindex``).

    Returns
    -------
    pandas.DataFrame
        Rows indexed by label (ordered by ``sorter``), one column per cluster
        named ``'Cluster <value>'``. Values are strings intended for LaTeX and
        use the same statistics as ``total_descriptives``.
    """
    grouped_data_complete = data_complete.groupby(cluster_column)
    grouped_data_questionnaire = questionnaireClusters.groupby(cluster_column)

    def per_cluster(var, df, formatter):
        """Apply *formatter* to column *var* within every cluster."""
        grouped_data = grouped_data_complete if df is data_complete else grouped_data_questionnaire
        return {
            f'Cluster {cluster}': formatter(cluster_data[var])
            for cluster, cluster_data in grouped_data
            if var in cluster_data.columns
        }

    descriptives = {}

    # Categorical variables: N (%) when binary, median (IQR) otherwise.
    for var, label, df in categorical_variables:
        is_binary = var in binary_variables
        descriptives[label] = per_cluster(
            var, df, lambda series, is_binary=is_binary: _format_categorical(series, is_binary)
        )

    # Continuous variables: median (IQR).
    for var, label, df in continuous_variables:
        descriptives[label] = per_cluster(var, df, _format_median_iqr)

    # Dict-of-dicts gives labels as columns; transpose so labels become rows.
    descriptives_cluster = pd.DataFrame(descriptives).T
    descriptives_cluster = descriptives_cluster.reindex(sorter)

    return descriptives_cluster