diff --git a/Sort_data.py b/Sort_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..53f30afa856dc68d475872c67efff173a007fb19
--- /dev/null
+++ b/Sort_data.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Sep 1 14:13:55 2024
+
+@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
+
+This script preprocesses questionnaire data by organizing questionnaire items
+into subscales, computing derived variables, and standardizing the data.
+"""
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+
+
+# Sort and organize the questionnaire data
+def sort_data(data_complete):
+    # DataFrame with only the 59 DASS & PCL items
+    data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()
+
+    # Helper to build readable column names, e.g. DASS1-DASS42
+    def prepend(numbers, prefix):
+        return [f'{prefix}{i}' for i in numbers]
+
+    # Make DASS1-DASS42 list
+    DASScolumns = prepend(range(1, 43), 'DASS')
+
+    # Make PCL1-PCL17 list
+    PCLcolumns = prepend(range(1, 18), 'PCL')
+
+    # Make PCA1-PCA59 list to use later
+    PCAcolumns = prepend(range(1, 60), 'PCA')
+
+    # Add column names
+    columnNames = DASScolumns + PCLcolumns
+    data.columns = columnNames
+
+    # DataFrame with only DASS items
+    DASS = data.loc[:, 'DASS1':'DASS42']
+
+    # DataFrame with only PCL items
+    PCL = data.loc[:, 'PCL1':'PCL17']
+
+    # Questionnaire subscales
+    questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
+                                           'Hyperarousal_t0', 'DASS_A_t0',
+                                           'DASS_D_t0', 'DASS_S_t0']].copy()
+    # Rename columns
+    questionnaireClusters.columns = ['PCL total', 'PCL Intrusion', 'PCL Avoidance', 'PCL Hyperarousal',
+                                     'DASS Anxiety', 'DASS Depression', 'DASS Stress']
+
+    # Separate PCL Avoidance and Numbing
+    Avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
+    Numb = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
+                                 'q0014_0001', 'q0014_0002']].sum(axis=1)
+
+    # Add to questionnaire subscales df
+    questionnaireClusters['PCL Avoidance'] = Avoidance
+    questionnaireClusters['PCL Numbing'] = Numb
+
+    questionnaireClusters = questionnaireClusters[['PCL total', 'PCL Intrusion',
+                                                   'PCL Avoidance', 'PCL Numbing',
+                                                   'PCL Hyperarousal',
+                                                   'DASS Anxiety', 'DASS Depression',
+                                                   'DASS Stress']]
+    # Standardize questionnaires (zero mean, unit variance)
+    questionnaireClusters_std = StandardScaler().fit_transform(questionnaireClusters)
+    questionnaireClusters_std = pd.DataFrame(questionnaireClusters_std,
+                                             columns=questionnaireClusters.columns)
+
+    # Rescale items to a 0-1 range
+    std_data = data.copy()
+    std_PCL = (std_data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
+    std_DASS = DASS.div(3)
+    std_data = pd.concat([std_DASS, std_PCL], axis=1)
+
+    # Adjust weights since DASS has 42 items and PCL has 17
+    weightDASS = 50 / 42
+    weightPCL = 50 / 17
+    std_data = pd.concat([std_data.loc[:, 'DASS1':'DASS42'].mul(weightDASS),
+                          std_data.loc[:, 'PCL1':'PCL17'].mul(weightPCL)], axis=1)
+
+    # Transform a few variables for comparison of clusters
+
+    # Transform marital status into a binary variable
+    data_complete['civil_status'] = data_complete['q0004'].apply(lambda x: 0 if x in [2, 3] else 1)
+
+    # Flip 0 & 1 in self-rated health for better interpretability
+    data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(lambda x: 0 if x == 1 else 1)
+
+    # Replace NAs with 0 for units per week for those who answered q0020_0001 as "Never drink alcohol"
+    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
+
+    # Age as numeric
+    data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'], errors='coerce')
+
+    # Extract military trauma
+    data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(lambda x: 1 if x > 0 else 0)
+
+    # Calculate unemployment (study/work active vs. not)
+    data_complete['Unemployed'] = data_complete[['q0005_0010', 'q0005_0011', 'q0005_0012',
+                                                 'q0005_0013', 'q0005_0014', 'q0005_0015']].fillna(0).sum(axis=1)
+    data_complete['Unemployed'] = data_complete['Unemployed'].apply(lambda x: 1 if x > 0 else 0)
+
+    # Drugs
+    data_complete['hash'] = data_complete['q0024_0001'].apply(lambda x: 1 if x > 1 else 0)
+    data_complete['drugs'] = data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
+    data_complete['drugs'] = data_complete['drugs'].apply(lambda x: 1 if x > 0 else 0)
+
+    # Alcohol - convert to units per week
+    alc = data_complete.loc[:, ['q0020_0001', 'q0021_0001']].dropna()
+    alc.columns = ['AlcFreq', 'Units']
+    # Pairs to compare in all following barplots
+    pairs = [(0, 1), (0, 2), (1, 2)]
+
+    # Map drinks per occasion to average values
+    drinks_map = {
+        1: 1.5,
+        2: 3.5,
+        3: 5.5,
+        4: 8.0,
+        5: 10.0
+    }
+
+    # Map drinking frequency to weekly frequency
+    frequency_map = {
+        1: 0,
+        2: 0.25,
+        3: 0.75,
+        4: 2.5,
+        5: 4.5
+    }
+
+    # Apply the mappings
+    data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
+    data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
+
+    # Calculate the mean number of standard drinks per week
+    data_complete['units_per_week'] = data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
+    data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)
+
+    # Excessive drinking per occasion
+    data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 2 else 0)
+    data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 1 else 0)
+
+    # Excessive drinking per week
+    data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 14 else 0)
+    data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 7 else 0)
+
+    # Excessive drinking by sex
+    data_complete['binge'] = (
+        (((data_complete['alcohol_over_5'] == 1) | (data_complete['alcohol_over_14'] == 1)) & (data_complete['q0002'] == 1)) |
+        (((data_complete['alcohol_over_4'] == 1) | (data_complete['alcohol_over_7'] == 1)) & (data_complete['q0002'] == 2))
+    ).astype(int)
+
+    return data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete
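For context, here is a minimal usage sketch of how the new function might be called from a driver script. It is not part of the diff; the file name 'data_complete.csv' and the read_csv call are illustrative assumptions, since this change does not show how the questionnaire data is actually loaded.

    # Minimal usage sketch (assumed input file; adjust to the real data source)
    import pandas as pd
    from Sort_data import sort_data

    data_complete = pd.read_csv('data_complete.csv')  # hypothetical input file
    (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
     std_data, columnNames, PCAcolumns, data_complete) = sort_data(data_complete)

    # std_data holds the 59 rescaled and weighted DASS/PCL items
    print(std_data.shape)
    print(questionnaireClusters_std.head())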