    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Sep  1 14:13:55 2024
    
    @author: Maya Coulson Theodorsen (mcoth@dtu.dk)
    
    This script preprocesses questionnaire data by organizing questionnaire items 
    into subscales, calculating derived variables, and standardizing the data.
    Computation of variables. 
    
    
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    
    # Sort and organize the questionnaire data and compute derived variables
    def sort_data(data_complete):
        # DataFrame with only the 59 DASS & PCL items
        # (explicit copy, so renaming columns below cannot trigger pandas copy warnings)
        data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()
    
        # Helper to build readable column names,
        # e.g. prepend(range(1, 3), 'DASS') -> ['DASS1', 'DASS2']
        def prepend(numbers, prefix):
            return [f'{prefix}{i}' for i in numbers]

        # Make DASS1-DASS42 list
        DASScolumns = prepend(range(1, 43), 'DASS')

        # Make PCL1-PCL17 list
        PCLcolumns = prepend(range(1, 18), 'PCL')

        # Make PCA1-PCA59 list to use later
        PCAcolumns = prepend(range(1, 60), 'PCA')
    
        # Add column names
        columnNames = DASScolumns + PCLcolumns
        data.columns = columnNames
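        # Optional sanity check (added for illustration): 42 DASS + 17 PCL = 59 items
        assert len(columnNames) == 59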
    
        # Make df with only DASS
        DASS = data.loc[:,'DASS1':'DASS42']
        
        # Make df with only PCL
        PCL = data.loc[:,'PCL1':'PCL17']
    
        # Questionnaire subscales
        questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
                                               'Hyperarousal_t0', 'DASS_A_t0',
                                               'DASS_D_t0', 'DASS_S_t0']].copy()
        # Rename columns for readability
        questionnaireClusters.columns = ['PCL total', 'PCL Intrusion', 'PCL Avoidance',
                                         'PCL Hyperarousal', 'DASS Anxiety',
                                         'DASS Depression', 'DASS Stress']
        
        # Split the PCL avoidance cluster into effortful avoidance (PCL items 6-7)
        # and emotional numbing (PCL items 8-12)
        Avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
        Numb = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
                                     'q0014_0001', 'q0014_0002']].sum(axis=1)
    
        # Add to questionnaire subscales df
        questionnaireClusters['PCL Avoidance'] = Avoidance
        questionnaireClusters['PCL Numbing'] = Numb
    
        # Reorder columns so the PCL subscales come first
        questionnaireClusters = questionnaireClusters[['PCL total', 'PCL Intrusion',
                                                       'PCL Avoidance', 'PCL Numbing',
                                                       'PCL Hyperarousal',
                                                       'DASS Anxiety', 'DASS Depression',
                                                       'DASS Stress']]
        # Standardize subscales to z-scores (mean 0, SD 1), keeping the original index
        questionnaireClusters_std = pd.DataFrame(
            StandardScaler().fit_transform(questionnaireClusters),
            columns=questionnaireClusters.columns,
            index=questionnaireClusters.index,
        )
        
        # Rescale items to 0-1: PCL items are scored 1-5 (shift to 0-4, then divide
        # by 4), DASS items are scored 0-3 (divide by 3)
        std_PCL = (data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
        std_DASS = DASS.div(3)
        std_data = pd.concat([std_DASS, std_PCL], axis=1)
    
        # Reweight items so each questionnaire contributes equally overall:
        # DASS has 42 items and PCL has 17, so scale each questionnaire's items
        # to a maximum total of 50 (100 combined)
        weightDASS = 50 / 42
        weightPCL = 50 / 17
        std_data = pd.concat([std_data.loc[:, 'DASS1':'DASS42'].mul(weightDASS),
                              std_data.loc[:, 'PCL1':'PCL17'].mul(weightPCL)], axis=1)
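
        # Quick sanity check (illustrative addition, not part of the original
        # pipeline): with items rescaled to [0, 1], the theoretical maximum
        # total score is 42 * weightDASS + 17 * weightPCL == 100
        assert np.isclose(42 * weightDASS + 17 * weightPCL, 100.0)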
        
        
        # Transform a few variables for comparison of clusters
        
        # Recode marital status (q0004) into a binary variable
        data_complete['civil_status'] = data_complete['q0004'].apply(lambda x: 0 if x in [2, 3] else 1)
        
        # Flip 0 & 1 in self-rated health for better interpretability
        data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(lambda x: 0 if x == 1 else 1)
        
        # Replace NAs in q0021_0001 with 0 for respondents who answered
        # q0020_0001 as "Never drink alcohol"
        data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
        
        # Age as numeric
        data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'], errors='coerce')
        
        # Extract military trauma
        data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(lambda x: 1 if x > 0 else 0)
    
        # Calculate unemployment (study/work active vs not)
        data_complete['Unemployed'] = data_complete[['q0005_0010','q0005_0011','q0005_0012','q0005_0013','q0005_0014','q0005_0015']].fillna(0).sum(axis=1)
        data_complete['Unemployed'] = data_complete['Unemployed'].apply(lambda x: 1 if x > 0 else 0)
        
        # Drug use: cannabis (q0024_0001 above the lowest category) and/or other drugs (q0026_0001)
        data_complete['hash'] = data_complete['q0024_0001'].apply(lambda x: 1 if x > 1 else 0)
        data_complete['drugs'] = data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
        data_complete['drugs'] = data_complete['drugs'].apply(lambda x: 1 if x > 0 else 0)
        
        # Alcohol - convert to units per week
        alc = data_complete.loc[:,['q0020_0001', 'q0021_0001']]
        alc = alc.dropna()
        alc.columns=['AlcFreq', 'Units']
        # Pairs to compare in barplots (not used within this function;
        # presumably consumed by downstream plotting scripts)
        pairs = [(0, 1), (0, 2), (1, 2)]
    
        # Map drinks-per-occasion response categories (q0021_0001) to approximate
        # drink counts (presumably midpoints of AUDIT-style ranges:
        # 1-2, 3-4, 5-6, 7-9, 10+)
        drinks_map = {
            1: 1.5,
            2: 3.5,
            3: 5.5,
            4: 8.0,
            5: 10.0
        }
    
        # Map drinking-frequency response categories (q0020_0001) to approximate
        # occasions per week (category 1 = "Never drink alcohol")
        frequency_map = {
            1: 0,
            2: 0.25,
            3: 0.75,
            4: 2.5,
            5: 4.5
        }

        # Apply the mappings
        data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
        data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
    
        # Estimate the average number of standard drinks (units) per week
        data_complete['units_per_week'] = data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
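
        # Worked example (illustrative): frequency category 4 (~2.5 occasions/week)
        # and drinks category 2 (~3.5 drinks/occasion) give 2.5 * 3.5 = 8.75 units/week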
    
        data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)
        
        # Excessive drinking per occasion, from the q0021_0001 response category
        # (> 2 corresponds to roughly 5+ drinks, > 1 to roughly 4+ drinks)
        data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 2 else 0)
        data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 1 else 0)

        # Excessive weekly drinking: more than 14 or more than 7 units per week
        data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 14 else 0)
        data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 7 else 0)
        
        # Binge drinking flag by sex (q0002): code 1 uses the 5+/occasion or
        # >14 units/week thresholds, code 2 the 4+/occasion or >7 units/week
        # thresholds (presumably male and female, matching standard guidelines)
        data_complete['binge'] = (
            (((data_complete['alcohol_over_5'] == 1) | (data_complete['alcohol_over_14'] == 1)) & (data_complete['q0002'] == 1)) |
            (((data_complete['alcohol_over_4'] == 1) | (data_complete['alcohol_over_7'] == 1)) & (data_complete['q0002'] == 2))
        ).astype(int)
    
        
        return data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete
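

    # Example usage (a minimal sketch, not part of the original script; the CSV
    # filename is hypothetical, as the real data source is not shown here):
    if __name__ == '__main__':
        data_complete = pd.read_csv('questionnaire_data.csv')  # hypothetical filename
        (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
         std_data, columnNames, PCAcolumns, data_complete) = sort_data(data_complete)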