Skip to content
Snippets Groups Projects
Commit ee497bbf authored by mcoth's avatar mcoth
Browse files

Upload New File

parent e057b366
No related branches found
No related tags found
No related merge requests found
Pipeline #38563 passed with warnings
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 14:13:55 2024
@author: Maya Coulson Theodorsen (mcoth@dtu.dk)
This script preprocesses questionnaire data by organizing questionnaire items
into subscales, calculating derived variables, and standardizing the data.
Computation of variables.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Define sorting and organizing of data
def sort_data(data_complete):
    """Preprocess questionnaire data: rename items, build subscales,
    standardize, and compute derived variables.

    Parameters
    ----------
    data_complete : pd.DataFrame
        Raw questionnaire export. Must contain the 59 DASS & PCL item
        columns as a contiguous, ordered block 'q0010_0001'..'q0014_0007',
        the baseline subscale totals (PCL_t0, Intrusion_t0, ...), and the
        demographic / lifestyle items referenced below.
        NOTE: this frame is modified in place (derived columns are added).

    Returns
    -------
    tuple
        (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
         std_data, columnNames, PCAcolumns, data_complete)
    """
    # --- Rename the 59 DASS & PCL items so they are easier to read ------
    # Label slice relies on the item columns being contiguous and ordered.
    # .copy() so renaming columns cannot trigger SettingWithCopy issues.
    data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()

    def _numbered(prefix, count):
        # e.g. ('DASS', 3) -> ['DASS1', 'DASS2', 'DASS3']
        return [f'{prefix}{i}' for i in range(1, count + 1)]

    DASScolumns = _numbered('DASS', 42)
    PCLcolumns = _numbered('PCL', 17)
    PCAcolumns = _numbered('PCA', 59)  # returned for later PCA naming

    columnNames = DASScolumns + PCLcolumns
    data.columns = columnNames

    # Per-questionnaire views
    DASS = data.loc[:, 'DASS1':'DASS42']
    PCL = data.loc[:, 'PCL1':'PCL17']

    # --- Questionnaire subscales ----------------------------------------
    questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0',
                                           'Avoidance_t0', 'Hyperarousal_t0',
                                           'DASS_A_t0', 'DASS_D_t0',
                                           'DASS_S_t0']].copy()
    questionnaireClusters.columns = ['PCL total', 'PCL Intrusion',
                                     'PCL Avoidance', 'PCL Hyperarousal',
                                     'DASS Anxiety', 'DASS Depression',
                                     'DASS Stress']

    # Split the PCL avoidance cluster into Avoidance vs Numbing items.
    avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
    numbing = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
                                    'q0014_0001', 'q0014_0002']].sum(axis=1)
    questionnaireClusters['PCL Avoidance'] = avoidance
    questionnaireClusters['PCL Numbing'] = numbing
    questionnaireClusters = questionnaireClusters[['PCL total',
                                                   'PCL Intrusion',
                                                   'PCL Avoidance',
                                                   'PCL Numbing',
                                                   'PCL Hyperarousal',
                                                   'DASS Anxiety',
                                                   'DASS Depression',
                                                   'DASS Stress']]

    # --- Standardize subscales (z-scores) --------------------------------
    # Exact equivalent of sklearn's StandardScaler: population std
    # (ddof=0); zero-variance columns are divided by 1 instead of 0.
    # Built from values so the result has a fresh RangeIndex, as the
    # original pd.DataFrame(ndarray) construction produced.
    means = questionnaireClusters.mean()
    scales = questionnaireClusters.std(ddof=0)
    scales[scales == 0.0] = 1.0
    questionnaireClusters_std = pd.DataFrame(
        ((questionnaireClusters - means) / scales).to_numpy(),
        columns=questionnaireClusters.columns)

    # --- Rescale items to [0, 1] ------------------------------------------
    # PCL items are scored 1-5 (shift to 0-4, divide by 4);
    # DASS items are scored 0-3 (divide by 3).
    std_PCL = (data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
    std_DASS = DASS.div(3)
    # Weight so each questionnaire contributes equally overall
    # (DASS has 42 items, PCL has 17).
    std_data = pd.concat([std_DASS.mul(50 / 42), std_PCL.mul(50 / 17)],
                         axis=1)

    # --- Derived variables (added to data_complete in place) --------------
    # Marital status -> binary: 2/3 -> 0, everything else -> 1.
    # NOTE(review): NaN falls into the 'else 1' branch — confirm intended.
    data_complete['civil_status'] = data_complete['q0004'].apply(
        lambda x: 0 if x in [2, 3] else 1)
    # Flip 0 & 1 in self-rated health for better interpretability.
    data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(
        lambda x: 0 if x in [1] else 1)
    # Units per week is NaN for respondents who answered q0020_0001 as
    # "Never drink alcohol" -> treat as 0.
    data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
    # Age as numeric (unparseable entries become NaN).
    data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'],
                                                errors='coerce')
    # Any military trauma reported at baseline.
    data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(
        lambda x: 1 if x > 0 else 0)
    # Unemployed = any of these status boxes ticked (study/work active vs not).
    unemployment_items = ['q0005_0010', 'q0005_0011', 'q0005_0012',
                          'q0005_0013', 'q0005_0014', 'q0005_0015']
    data_complete['Unemployed'] = (data_complete[unemployment_items]
                                   .fillna(0).sum(axis=1)
                                   .apply(lambda x: 1 if x > 0 else 0))
    # Drugs: q0024_0001 > 1 flagged as cannabis use, combined with other
    # drug use (q0026_0001). Answer coding assumed — verify against key.
    data_complete['hash'] = data_complete['q0024_0001'].apply(
        lambda x: 1 if x > 1 else 0)
    data_complete['drugs'] = (data_complete[['q0026_0001', 'hash']]
                              .fillna(0).sum(axis=1)
                              .apply(lambda x: 1 if x > 0 else 0))

    # --- Alcohol: convert answers to standard units per week --------------
    # Midpoints of the 'drinks per occasion' answer categories.
    drinks_map = {1: 1.5, 2: 3.5, 3: 5.5, 4: 8.0, 5: 10.0}
    # Drinking-frequency categories expressed as occasions per week.
    frequency_map = {1: 0, 2: 0.25, 3: 0.75, 4: 2.5, 5: 4.5}
    data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
    data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
    # Mean number of standard drinks per week; unmapped answers -> 0.
    data_complete['units_per_week'] = (data_complete['drinks_per_occasion']
                                       * data_complete['frequency_per_week'])
    data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)

    # Excessive-drinking indicators.
    data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(
        lambda x: 1 if x > 2 else 0)
    data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(
        lambda x: 1 if x > 1 else 0)
    data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(
        lambda x: 1 if x > 14 else 0)
    data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(
        lambda x: 1 if x > 7 else 0)
    # Binge drinking with sex-specific thresholds (q0002 coding:
    # presumably 1 = male, 2 = female — TODO confirm against the key).
    data_complete['binge'] = (
        (((data_complete['alcohol_over_5'] == 1) |
          (data_complete['alcohol_over_14'] == 1))
         & (data_complete['q0002'] == 1)) |
        (((data_complete['alcohol_over_4'] == 1) |
          (data_complete['alcohol_over_7'] == 1))
         & (data_complete['q0002'] == 2))
    ).astype(int)

    return (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
            std_data, columnNames, PCAcolumns, data_complete)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment