#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 14:13:55 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This script preprocesses questionnaire data by organizing questionnaire items
into subscales, calculating derived variables, and standardizing the data.
Computation of variables.
"""

import numpy as np
import pandas as pd

# sklearn is only needed for StandardScaler elsewhere; the z-scoring below is
# done directly in pandas (numerically identical to StandardScaler). The import
# is kept, but guarded, so the rest of the file keeps working either way.
try:
    from sklearn.preprocessing import StandardScaler  # noqa: F401
except ImportError:  # pragma: no cover - optional dependency
    StandardScaler = None


def _item_labels(prefix, count):
    """Return labels ``[prefix+'1', ..., prefix+str(count)]`` (e.g. DASS1..DASS42)."""
    return [f'{prefix}{i}' for i in range(1, count + 1)]


# Define sorting and organizing of data
def sort_data(data_complete):
    """Organize questionnaire items, build subscale scores, derive covariates.

    Parameters
    ----------
    data_complete : pd.DataFrame
        Raw questionnaire export. Must contain the 59 DASS/PCL item columns
        ('q0010_0001' .. 'q0014_0007', in that column order), the precomputed
        subscale totals ('PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
        'Hyperarousal_t0', 'DASS_A_t0', 'DASS_D_t0', 'DASS_S_t0') and the
        demographic/behavioral items referenced below.
        NOTE: this frame is modified in place (derived columns are added) and
        also returned.

    Returns
    -------
    tuple
        (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
         std_data, columnNames, PCAcolumns, data_complete)
    """
    # --- Item-level data --------------------------------------------------
    # The 59 questionnaire items: 42 DASS items followed by 17 PCL items.
    # .copy() so relabeling the columns cannot warn about / touch the parent.
    data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()

    DASScolumns = _item_labels('DASS', 42)
    PCLcolumns = _item_labels('PCL', 17)
    PCAcolumns = _item_labels('PCA', 59)  # returned for later PCA output naming

    columnNames = DASScolumns + PCLcolumns
    data.columns = columnNames

    DASS = data.loc[:, 'DASS1':'DASS42']  # df with only DASS items
    PCL = data.loc[:, 'PCL1':'PCL17']     # df with only PCL items

    # --- Questionnaire subscale totals ------------------------------------
    questionnaireClusters = data_complete[
        ['PCL_t0', 'Intrusion_t0', 'Avoidance_t0', 'Hyperarousal_t0',
         'DASS_A_t0', 'DASS_D_t0', 'DASS_S_t0']].copy()
    questionnaireClusters.columns = [
        'PCL total', 'PCL Intrusion', 'PCL Avoidance', 'PCL Hyperarousal',
        'DASS Anxiety', 'DASS Depression', 'DASS Stress']

    # Separate PCL Avoidance (items 6-7) from Numbing (items 8-12).
    # 'PCL Avoidance' is deliberately overwritten with the narrower 2-item sum.
    questionnaireClusters['PCL Avoidance'] = data_complete.loc[
        :, ['q0013_0006', 'q0013_0007']].sum(axis=1)
    questionnaireClusters['PCL Numbing'] = data_complete.loc[
        :, ['q0013_0008', 'q0013_0009', 'q0013_0010',
            'q0014_0001', 'q0014_0002']].sum(axis=1)
    questionnaireClusters = questionnaireClusters[
        ['PCL total', 'PCL Intrusion', 'PCL Avoidance', 'PCL Numbing',
         'PCL Hyperarousal', 'DASS Anxiety', 'DASS Depression', 'DASS Stress']]

    # --- Standardize subscales (z-scores) ---------------------------------
    # Population std (ddof=0), exactly what sklearn's StandardScaler computes;
    # zero-variance columns are centered only (scale forced to 1, as sklearn
    # does). Doing this in pandas keeps the original row index, avoiding the
    # silent misalignment a positional re-index could cause.
    scale = questionnaireClusters.std(ddof=0)
    scale = scale.mask(scale == 0.0, 1.0)
    questionnaireClusters_std = (
        questionnaireClusters - questionnaireClusters.mean()) / scale

    # --- Standardize items to 0-1, then weight questionnaires equally -----
    # DASS items are scored 0-3, PCL items 1-5 -> both rescaled onto [0, 1].
    std_DASS = DASS.div(3)
    std_PCL = PCL.sub(1).div(4)
    # Adjust weights since DASS has 42 items and PCL has 17.
    weightDASS = 50 / 42
    weightPCL = 50 / 17
    std_data = pd.concat(
        [std_DASS.mul(weightDASS), std_PCL.mul(weightPCL)], axis=1)

    # --- Derived variables for comparison of clusters (added in place) ----
    # Marital status -> binary: codes 2 and 3 collapse to 0, everything else
    # (including missing) to 1. (Meaning of the codes not visible here.)
    data_complete['civil_status'] = (
        ~data_complete['q0004'].isin([2, 3])).astype(int)

    # Flip 0 & 1 in self-rated health for better interpretability
    # (1 -> 0, anything else including missing -> 1).
    data_complete['Selv_vur_helbred'] = (
        data_complete['Selv_vur_helbred'] != 1).astype(int)

    # Respondents who answered q0020_0001 "Never drink alcohol" left units
    # per week blank -> treat as 0.
    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)

    # Age as numeric (non-numeric answers become NaN).
    data_complete['q0003_0001'] = pd.to_numeric(
        data_complete['q0003_0001'], errors='coerce')

    # Any military trauma reported (Traume_t0_4 > 0; NaN counts as no).
    data_complete['Military_trauma'] = (
        data_complete['Traume_t0_4'] > 0).astype(int)

    # Unemployment: any of the not-study/work-active items ticked.
    unemployment_items = ['q0005_0010', 'q0005_0011', 'q0005_0012',
                          'q0005_0013', 'q0005_0014', 'q0005_0015']
    data_complete['Unemployed'] = (
        data_complete[unemployment_items].fillna(0).sum(axis=1) > 0).astype(int)

    # Drugs: cannabis beyond the lowest category (q0024_0001 > 1, presumably
    # "never" -- TODO confirm coding) or any other drug use (q0026_0001).
    data_complete['hash'] = (data_complete['q0024_0001'] > 1).astype(int)
    data_complete['drugs'] = (
        data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1) > 0
    ).astype(int)

    # --- Alcohol: convert category answers to units per week --------------
    # Category midpoints for drinks per occasion (q0021_0001).
    drinks_map = {1: 1.5, 2: 3.5, 3: 5.5, 4: 8.0, 5: 10.0}
    # Drinking frequency (q0020_0001) -> occasions per week.
    frequency_map = {1: 0, 2: 0.25, 3: 0.75, 4: 2.5, 5: 4.5}
    data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
    data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
    # Mean number of standard drinks per week; unmapped categories -> 0.
    data_complete['units_per_week'] = (
        data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
    ).fillna(0)

    # Excessive-drinking indicators (cutoffs combined per sex below).
    data_complete['alcohol_over_5'] = (data_complete['q0021_0001'] > 2).astype(int)
    data_complete['alcohol_over_4'] = (data_complete['q0021_0001'] > 1).astype(int)
    data_complete['alcohol_over_14'] = (data_complete['units_per_week'] > 14).astype(int)
    data_complete['alcohol_over_7'] = (data_complete['units_per_week'] > 7).astype(int)

    # Excessive drinking by sex (q0002: 1 presumably male, 2 female --
    # TODO confirm coding against the questionnaire).
    is_male = data_complete['q0002'] == 1
    is_female = data_complete['q0002'] == 2
    over_male = ((data_complete['alcohol_over_5'] == 1)
                 | (data_complete['alcohol_over_14'] == 1))
    over_female = ((data_complete['alcohol_over_4'] == 1)
                   | (data_complete['alcohol_over_7'] == 1))
    data_complete['binge'] = (
        (over_male & is_male) | (over_female & is_female)).astype(int)

    return (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
            std_data, columnNames, PCAcolumns, data_complete)