Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Symptom clustering analysis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
mcoth
Symptom clustering analysis
Commits
ee497bbf
Commit
ee497bbf
authored
4 months ago
by
mcoth
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
e057b366
No related branches found
No related tags found
No related merge requests found
Pipeline
#38563
passed with warnings
4 months ago
Stage: test
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Sort_data.py
+175
-0
175 additions, 0 deletions
Sort_data.py
with
175 additions
and
0 deletions
Sort_data.py
0 → 100644
+
175
−
0
View file @
ee497bbf
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 14:13:55 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This script preprocesses questionnaire data by organizing questionnaire items
into subscales, computing derived variables, and standardizing the data.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Define sorting and organizing of data
def sort_data(data_complete):
    """Organize raw questionnaire items into labelled subscales and derived variables.

    Parameters
    ----------
    data_complete : pandas.DataFrame
        Full questionnaire export. Must contain the 59 consecutive DASS/PCL
        item columns 'q0010_0001' through 'q0014_0007', the precomputed
        subscale totals ('PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
        'Hyperarousal_t0', 'DASS_A_t0', 'DASS_D_t0', 'DASS_S_t0'), and the
        demographic/lifestyle items referenced below.

    Returns
    -------
    tuple
        (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
         std_data, columnNames, PCAcolumns, data_complete)

    Notes
    -----
    ``data_complete`` is also modified in place: the derived columns
    ('civil_status', 'Military_trauma', 'Unemployed', 'drugs',
    'units_per_week', 'binge', ...) are added to it.
    """
    # The 59 DASS & PCL items sit in one consecutive label range.
    # .copy() makes `data` an independent frame so renaming its columns
    # below cannot trigger chained-assignment ambiguity.
    data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()

    def _numbered(prefix, count):
        # Build readable labels ['<prefix>1', ..., '<prefix><count>'].
        # (Replaces the old `prepend(list, str)` helper, which shadowed
        # the `list` and `str` builtins.)
        return [f'{prefix}{i}' for i in range(1, count + 1)]

    DASScolumns = _numbered('DASS', 42)   # DASS1..DASS42
    PCLcolumns = _numbered('PCL', 17)     # PCL1..PCL17
    PCAcolumns = _numbered('PCA', 59)     # PCA1..PCA59, for later PCA output

    # Replace the raw q-codes with the human-readable item names.
    columnNames = DASScolumns + PCLcolumns
    data.columns = columnNames

    # Questionnaire-specific item frames.
    DASS = data.loc[:, 'DASS1':'DASS42']
    PCL = data.loc[:, 'PCL1':'PCL17']

    # Precomputed questionnaire subscale totals, renamed for presentation.
    questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0',
                                           'Avoidance_t0', 'Hyperarousal_t0',
                                           'DASS_A_t0', 'DASS_D_t0',
                                           'DASS_S_t0']].copy()
    questionnaireClusters.columns = ['PCL total', 'PCL Intrusion',
                                     'PCL Avoidance', 'PCL Hyperarousal',
                                     'DASS Anxiety', 'DASS Depression',
                                     'DASS Stress']

    # Split PCL Avoidance into effortful avoidance (2 items) and
    # emotional numbing (5 items); overwrite/extend the subscale frame.
    Avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
    Numb = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
                                 'q0014_0001', 'q0014_0002']].sum(axis=1)
    questionnaireClusters['PCL Avoidance'] = Avoidance
    questionnaireClusters['PCL Numbing'] = Numb
    questionnaireClusters = questionnaireClusters[['PCL total',
                                                   'PCL Intrusion',
                                                   'PCL Avoidance',
                                                   'PCL Numbing',
                                                   'PCL Hyperarousal',
                                                   'DASS Anxiety',
                                                   'DASS Depression',
                                                   'DASS Stress']]

    # Z-standardize the subscales, keeping the column labels.
    questionnaireClusters_std = StandardScaler().fit_transform(questionnaireClusters)
    questionnaireClusters_std = pd.DataFrame(questionnaireClusters_std)
    questionnaireClusters_std.columns = questionnaireClusters.columns

    # Rescale items to [0, 1]: PCL items are coded 1-5, DASS items 0-3.
    std_data = data.copy()
    std_PCL = (std_data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
    std_DASS = DASS.div(3)
    std_data = pd.concat([std_DASS, std_PCL], axis=1)

    # Re-weight so each questionnaire contributes equally overall,
    # since DASS has 42 items and PCL only 17.
    weightDASS = 50 / 42
    weightPCL = 50 / 17
    std_data = pd.concat([std_data.loc[:, 'DASS1':'DASS42'].mul(weightDASS),
                          std_data.loc[:, 'PCL1':'PCL17'].mul(weightPCL)],
                         axis=1)

    # --- Derived variables for cluster comparison (added in place) ---

    # Marital status as binary: codes 2 and 3 -> 0, everything else -> 1.
    data_complete['civil_status'] = (
        (~data_complete['q0004'].isin([2, 3])).astype(int)
    )

    # Flip 0/1 in self-rated health for better interpretability
    # (code 1 -> 0, everything else -> 1).
    data_complete['Selv_vur_helbred'] = (
        (~data_complete['Selv_vur_helbred'].isin([1])).astype(int)
    )

    # Units per week is NA for respondents who answered q0020_0001 as
    # "Never drink alcohol"; treat those as 0 units.
    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)

    # Age as numeric; unparseable entries become NaN.
    data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'],
                                                errors='coerce')

    # Any military trauma exposure (count > 0) as a binary flag.
    data_complete['Military_trauma'] = (
        data_complete['Traume_t0_4'].gt(0).astype(int)
    )

    # Unemployment: any of the not-study/work-active status boxes ticked.
    data_complete['Unemployed'] = data_complete[['q0005_0010', 'q0005_0011',
                                                 'q0005_0012', 'q0005_0013',
                                                 'q0005_0014', 'q0005_0015'
                                                 ]].fillna(0).sum(axis=1)
    data_complete['Unemployed'] = data_complete['Unemployed'].gt(0).astype(int)

    # Drug use: cannabis (q0024_0001 > 1) or other drugs (q0026_0001).
    data_complete['hash'] = data_complete['q0024_0001'].gt(1).astype(int)
    data_complete['drugs'] = (
        data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
    )
    data_complete['drugs'] = data_complete['drugs'].gt(0).astype(int)

    # Alcohol: convert coded frequency/amount answers to units per week.
    # Mapping of drinks-per-occasion codes to average drink counts.
    drinks_map = {1: 1.5, 2: 3.5, 3: 5.5, 4: 8.0, 5: 10.0}
    # Mapping of drinking-frequency codes to occasions per week.
    frequency_map = {1: 0, 2: 0.25, 3: 0.75, 4: 2.5, 5: 4.5}

    data_complete['drinks_per_occasion'] = (
        data_complete['q0021_0001'].map(drinks_map)
    )
    data_complete['frequency_per_week'] = (
        data_complete['q0020_0001'].map(frequency_map)
    )

    # Mean number of standard drinks per week; unmapped codes (e.g. the
    # filled-in 0 for non-drinkers) produce NaN, which becomes 0.
    data_complete['units_per_week'] = (
        data_complete['drinks_per_occasion']
        * data_complete['frequency_per_week']
    )
    data_complete['units_per_week'] = (
        data_complete['units_per_week'].fillna(0)
    )

    # Excessive-drinking flags at several thresholds.
    data_complete['alcohol_over_5'] = (
        data_complete['q0021_0001'].gt(2).astype(int)
    )
    data_complete['alcohol_over_4'] = (
        data_complete['q0021_0001'].gt(1).astype(int)
    )
    data_complete['alcohol_over_14'] = (
        data_complete['units_per_week'].gt(14).astype(int)
    )
    data_complete['alcohol_over_7'] = (
        data_complete['units_per_week'].gt(7).astype(int)
    )

    # Sex-specific binge flag. Assumes q0002 codes sex with 1 getting the
    # 5-drinks/14-units thresholds and 2 the 4-drinks/7-units thresholds
    # — TODO confirm the coding against the questionnaire codebook.
    group1 = data_complete['q0002'] == 1
    group2 = data_complete['q0002'] == 2
    heavy1 = (data_complete['alcohol_over_5'] == 1) | \
             (data_complete['alcohol_over_14'] == 1)
    heavy2 = (data_complete['alcohol_over_4'] == 1) | \
             (data_complete['alcohol_over_7'] == 1)
    data_complete['binge'] = (
        (heavy1 & group1) | (heavy2 & group2)
    ).astype(int)

    return (data, DASS, PCL, questionnaireClusters,
            questionnaireClusters_std, std_data, columnNames, PCAcolumns,
            data_complete)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment