#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 14:13:55 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This script preprocesses questionnaire data by organizing questionnaire items
into subscales, calculating derived variables, and standardizing the data.
Computation of variables.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Define sorting and organizing of data
def sort_data(data_complete):
    # DataFrame with only the 59 DASS & PCL items
    data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()

    # Helper to build readable column names (avoids shadowing the built-ins list/str)
    def prepend(numbers, prefix):
        return [f'{prefix}{i}' for i in numbers]

    # Make DASS1-DASS42 list
    DASScolumns = prepend(range(1, 43), 'DASS')
    # Make PCL1-PCL17 list
    PCLcolumns = prepend(range(1, 18), 'PCL')
    # Make PCA1-PCA59 list to use later
    PCAcolumns = prepend(range(1, 60), 'PCA')

    # Add column names
    columnNames = DASScolumns + PCLcolumns
    data.columns = columnNames

    # Make df with only DASS
    DASS = data.loc[:, 'DASS1':'DASS42']
    # Make df with only PCL
    PCL = data.loc[:, 'PCL1':'PCL17']
    # Questionnaire subscales
    questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
                                           'Hyperarousal_t0', 'DASS_A_t0',
                                           'DASS_D_t0', 'DASS_S_t0']].copy()
    # Rename columns
    questionnaireClusters.columns = ['PCL total', 'PCL Intrusion', 'PCL Avoidance', 'PCL Hyperarousal',
                                     'DASS Anxiety', 'DASS Depression', 'DASS Stress']

    # Separate PCL Avoidance and Numbing
    Avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
    Numb = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
                                 'q0014_0001', 'q0014_0002']].sum(axis=1)

    # Add to questionnaire subscales df
    questionnaireClusters['PCL Avoidance'] = Avoidance
    questionnaireClusters['PCL Numbing'] = Numb
    questionnaireClusters = questionnaireClusters[['PCL total', 'PCL Intrusion',
                                                   'PCL Avoidance', 'PCL Numbing',
                                                   'PCL Hyperarousal',
                                                   'DASS Anxiety', 'DASS Depression',
                                                   'DASS Stress']]

    # Standardize questionnaires
    questionnaireClusters_std = StandardScaler().fit_transform(questionnaireClusters)
    questionnaireClusters_std = pd.DataFrame(questionnaireClusters_std,
                                             columns=questionnaireClusters.columns)

    # Standardize items to a scale of 0 to 1
    std_data = data.copy()
    std_PCL = (std_data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
    std_DASS = DASS.div(3)
    std_data = pd.concat([std_DASS, std_PCL], axis=1)

    # Adjust weights since DASS has 42 items and PCL has 17
    weightDASS = 50 / 42
    weightPCL = 50 / 17
    std_data = pd.concat([std_data.loc[:, 'DASS1':'DASS42'].mul(weightDASS),
                          std_data.loc[:, 'PCL1':'PCL17'].mul(weightPCL)], axis=1)
    # Transform a few variables for comparison of clusters
    # Transform marital status into a binary variable
    data_complete['civil_status'] = data_complete['q0004'].apply(lambda x: 0 if x in [2, 3] else 1)
    # Flip 0 & 1 in self-rated health for better interpretability
    data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(lambda x: 0 if x == 1 else 1)
    # Replace NAs in units per week with 0 for those who answered q0020_0001 as "Never drink alcohol"
    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
    # Age as numeric
    data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'], errors='coerce')
    # Extract military trauma
    data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(lambda x: 1 if x > 0 else 0)

    # Calculate unemployment (study/work active vs. not)
    data_complete['Unemployed'] = data_complete[['q0005_0010', 'q0005_0011', 'q0005_0012',
                                                 'q0005_0013', 'q0005_0014', 'q0005_0015']].fillna(0).sum(axis=1)
    data_complete['Unemployed'] = data_complete['Unemployed'].apply(lambda x: 1 if x > 0 else 0)

    # Drugs
    data_complete['hash'] = data_complete['q0024_0001'].apply(lambda x: 1 if x > 1 else 0)
    data_complete['drugs'] = data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
    data_complete['drugs'] = data_complete['drugs'].apply(lambda x: 1 if x > 0 else 0)

    # Alcohol - convert to units per week
    alc = data_complete.loc[:, ['q0020_0001', 'q0021_0001']].dropna()
    alc.columns = ['AlcFreq', 'Units']
    # Pairs to compare in all following barplots
    pairs = [(0, 1), (0, 2), (1, 2)]

    # Mapping drinks per occasion to average values
    drinks_map = {
        1: 1.5,
        2: 3.5,
        3: 5.5,
        4: 8.0,
        5: 10.0
    }
    # Mapping drinking frequency to occasions per week
    frequency_map = {
        1: 0,
        2: 0.25,
        3: 0.75,
        4: 2.5,
        5: 4.5
    }
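    # Worked example (derived from the two maps above): a respondent with
    # q0021_0001 == 2 (3.5 drinks per occasion) and q0020_0001 == 4 (2.5 occasions
    # per week) would get units_per_week = 3.5 * 2.5 = 8.75 below.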
    # Apply the mappings
    data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
    data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
    # Calculate the mean number of standard drinks per week
    data_complete['units_per_week'] = data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
    data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)

    # Excessive drinking
    data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 2 else 0)
    data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 1 else 0)
    data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 14 else 0)
    data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 7 else 0)

    # Excessive drinking by sex
    data_complete['binge'] = (
        (((data_complete['alcohol_over_5'] == 1) | (data_complete['alcohol_over_14'] == 1)) & (data_complete['q0002'] == 1)) |
        (((data_complete['alcohol_over_4'] == 1) | (data_complete['alcohol_over_7'] == 1)) & (data_complete['q0002'] == 2))
    ).astype(int)

    return data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete
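

# Minimal usage sketch. Assumptions: the file name 'questionnaire_data.csv' and the
# read_csv call are placeholders for however the complete dataset is actually loaded;
# only sort_data() above is defined in this script.
if __name__ == "__main__":
    # Load the complete questionnaire dataset (hypothetical path)
    data_complete = pd.read_csv('questionnaire_data.csv')

    # Run the preprocessing and unpack the returned objects
    (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
     std_data, columnNames, PCAcolumns, data_complete) = sort_data(data_complete)

    # Quick sanity checks on the preprocessed output
    print(std_data.shape)                      # expected: (n_participants, 59)
    print(questionnaireClusters_std.head())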