From ee497bbffa45ce5dbfb26bef6d4a1f6b5eafdb56 Mon Sep 17 00:00:00 2001
From: mcoth <mcoth@dtu.dk>
Date: Sun, 17 Nov 2024 16:47:04 +0100
Subject: [PATCH] Upload New File

---
 Sort_data.py | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 Sort_data.py

diff --git a/Sort_data.py b/Sort_data.py
new file mode 100644
index 0000000..53f30af
--- /dev/null
+++ b/Sort_data.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep  1 14:13:55 2024
+
+@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
+
+This script preprocesses questionnaire data by organizing questionnaire items
+into subscales, computing derived variables, and standardizing the data.
+
+
+"""
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+
+
+# Define sorting and organizing of data
+def sort_data(data_complete):
+    # DataFrame with only the 59 DASS & PCL items
+    data = data_complete.loc[:, 'q0010_0001': 'q0014_0007']
+
+    # Make new column names so easier to understand
+    def prepend(list, str):
+        str += '{0}'
+        list = [str.format(i) for i in list]
+        return(list)
+ 
+ 
+    #Make DASS1-DASS42 list
+    list = [*range(1,43)]
+    str = 'DASS'
+    DASScolumns = (prepend(list, str))
+
+    #Make PCL1-PCL17 list
+    list = [*range(1,18)]
+    str = 'PCL'
+    PCLcolumns = (prepend(list, str))
+
+    #Make PCA1-PCA59 list to use later
+    list = [*range(1,60)]
+    str = 'PCA'
+    PCAcolumns = (prepend(list, str))
+
+    # Add column names
+    columnNames = DASScolumns + PCLcolumns
+    data.columns = columnNames
+
+    # Make df with only DASS
+    DASS = data.loc[:,'DASS1':'DASS42']
+    
+    # Make df with only PCL
+    PCL = data.loc[:,'PCL1':'PCL17']
+
+    # Questionnaire subscales
+    questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0', 'Avoidance_t0', 
+                                       'Hyperarousal_t0', 'DASS_A_t0',
+                                       'DASS_D_t0', 'DASS_S_t0']].copy()
+    # Rename columns 
+    questionnaireClusters.columns=['PCL total','PCL Intrusion', 'PCL Avoidance', 'PCL Hyperarousal',
+                                   'DASS Anxiety', 'DASS Depression', 'DASS Stress']
+    
+    # Separate PCL Avoidance and Numbing
+    Avoidance = data_complete.loc[:,['q0013_0006', 'q0013_0007']]
+    Avoidance = Avoidance.sum(axis=1)
+    Numb = data_complete.loc[:,['q0013_0008','q0013_0009', 'q0013_0010','q0014_0001', 'q0014_0002']]     
+    Numb = Numb.sum(axis=1)   
+
+    # Add to questionnaire subscales df
+    questionnaireClusters['PCL Avoidance'] = Avoidance
+    questionnaireClusters['PCL Numbing'] = Numb
+
+    questionnaireClusters = questionnaireClusters[['PCL total','PCL Intrusion', 
+                                                   'PCL Avoidance','PCL Numbing',
+                                                   'PCL Hyperarousal',
+                                                   'DASS Anxiety', 'DASS Depression',
+                                                   'DASS Stress',]]
+    # Standardize questionnaires
+    questionnaireClusters_std = StandardScaler().fit_transform(questionnaireClusters)
+    questionnaireClusters_std = pd.DataFrame(questionnaireClusters_std)
+    questionnaireClusters_std.columns = questionnaireClusters.columns
+    
+    # Standardize items to a scale of 0 to 1
+    std_data = data.copy()
+    std_PCL = std_data.loc[:,'PCL1':'PCL17'] - 1
+    std_PCL = std_PCL.div(4)
+    std_DASS = DASS.div(3)
+    std_data = pd.concat([std_DASS, std_PCL], axis=1)
+
+    # Adjust weights since DASS has 42 items and PCL has 17
+    weightDASS = 50/42
+    weightPCL = 50/17
+    std_data = pd.concat([std_data.loc[:,'DASS1':'DASS42'].mul(weightDASS), 
+                        std_data.loc[:,'PCL1':'PCL17'].mul(weightPCL)], axis=1)
+    
+    
+    # Transform a few variables for comparison of clusters
+    
+    # Transform marital status into binary variable
+    data_complete['civil_status'] = data_complete['q0004'].apply(lambda x: 0 if x in [2, 3] else 1)
+    
+    # Flip 0 & 1 in self-rated health for better interpretability
+    data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(lambda x: 0 if x in [1] else 1)
+    
+    # Replace NAs with 0 for units per week to those who answered q0020_0001 as "Never drink alcohol"
+    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
+    
+    # Age as numeric
+    data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'], errors='coerce')
+    
+    # Extract military trauma
+    data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(lambda x: 1 if x > 0 else 0)
+
+    # Calculate unemployment (study/work active vs not)
+    data_complete['Unemployed'] = data_complete[['q0005_0010','q0005_0011','q0005_0012','q0005_0013','q0005_0014','q0005_0015']].fillna(0).sum(axis=1)
+    data_complete['Unemployed'] = data_complete['Unemployed'].apply(lambda x: 1 if x > 0 else 0)
+    
+    # Drugs
+    data_complete['hash'] = data_complete['q0024_0001'].apply(lambda x: 1 if x > 1 else 0)
+    data_complete['drugs'] = data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
+    data_complete['drugs'] = data_complete['drugs'].apply(lambda x: 1 if x > 0 else 0)
+    
+    # Alcohol - convert to units per week
+    alc = data_complete.loc[:,['q0020_0001', 'q0021_0001']]
+    alc = alc.dropna()
+    alc.columns=['AlcFreq', 'Units']
+    # Pairs to compare in all following barplots
+    pairs=[(0, 1), (0, 2), (1, 2)] 
+
+    #
+    # Mapping drinks per occasion to average values
+    drinks_map = {
+        1: 1.5,
+        2: 3.5,
+        3: 5.5,
+        4: 8.0,
+        5: 10.0
+    }
+
+    # Mapping drinking frequency to weekly frequency
+    frequency_map = {
+        1: 0,
+        2: 0.25,
+        3: 0.75,
+        4: 2.5,
+        5: 4.5
+    }
+    #
+    # Apply the mappings
+    data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
+    data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
+
+    # Calculate the mean number of standard drinks per week
+    data_complete['units_per_week'] = data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
+
+    data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)
+    
+    # Excessive drinking
+    data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 2 else 0)
+    data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 1 else 0)
+    
+    data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 14 else 0)
+    data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 7 else 0)
+    
+    # Excessive drinking by sex
+    data_complete['binge'] = (
+        (((data_complete['alcohol_over_5'] == 1) | (data_complete['alcohol_over_14'] == 1)) & (data_complete['q0002'] == 1)) |
+        (((data_complete['alcohol_over_4'] == 1) | (data_complete['alcohol_over_7'] == 1)) & (data_complete['q0002'] == 2))
+    ).astype(int)
+
+    
+    return data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete
+
+
-- 
GitLab