    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Sep  1 14:13:55 2024
    
    @author: Maya Coulson Theodorsen (mcoth@dtu.dk)
    
    This script preprocesses questionnaire data by organizing questionnaire items 
    into subscales, calculating derived variables, and standardizing the data.
    Computation of variables. 
    
    
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    
    
    # Sort and organize the questionnaire data and compute derived variables
    def sort_data(data_complete):
        # DataFrame with only the 59 DASS & PCL items
        # (explicit copy, so renaming columns below cannot trigger pandas copy warnings)
        data = data_complete.loc[:, 'q0010_0001':'q0014_0007'].copy()
    
        # Helper to build readable column names,
        # e.g. prepend(range(1, 3), 'DASS') -> ['DASS1', 'DASS2']
        def prepend(numbers, prefix):
            return [f'{prefix}{i}' for i in numbers]

        # Make DASS1-DASS42 list
        DASScolumns = prepend(range(1, 43), 'DASS')

        # Make PCL1-PCL17 list
        PCLcolumns = prepend(range(1, 18), 'PCL')

        # Make PCA1-PCA59 list to use later
        PCAcolumns = prepend(range(1, 60), 'PCA')
    
        # Add column names
        columnNames = DASScolumns + PCLcolumns
        data.columns = columnNames
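        # Optional sanity check (added for illustration): 42 DASS + 17 PCL = 59 items
        assert len(columnNames) == 59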
    
        # Make df with only DASS
        DASS = data.loc[:,'DASS1':'DASS42']
        
        # Make df with only PCL
        PCL = data.loc[:,'PCL1':'PCL17']
    
        # Questionnaire subscales
        questionnaireClusters = data_complete[['PCL_t0', 'Intrusion_t0', 'Avoidance_t0',
                                               'Hyperarousal_t0', 'DASS_A_t0',
                                               'DASS_D_t0', 'DASS_S_t0']].copy()
        # Rename columns for readability
        questionnaireClusters.columns = ['PCL total', 'PCL Intrusion', 'PCL Avoidance',
                                         'PCL Hyperarousal', 'DASS Anxiety',
                                         'DASS Depression', 'DASS Stress']
        
        # Split the PCL avoidance cluster into effortful avoidance (PCL items 6-7)
        # and emotional numbing (PCL items 8-12)
        Avoidance = data_complete.loc[:, ['q0013_0006', 'q0013_0007']].sum(axis=1)
        Numb = data_complete.loc[:, ['q0013_0008', 'q0013_0009', 'q0013_0010',
                                     'q0014_0001', 'q0014_0002']].sum(axis=1)
    
        # Add to questionnaire subscales df
        questionnaireClusters['PCL Avoidance'] = Avoidance
        questionnaireClusters['PCL Numbing'] = Numb
    
        # Reorder columns so the PCL subscales come first
        questionnaireClusters = questionnaireClusters[['PCL total', 'PCL Intrusion',
                                                       'PCL Avoidance', 'PCL Numbing',
                                                       'PCL Hyperarousal',
                                                       'DASS Anxiety', 'DASS Depression',
                                                       'DASS Stress']]
        # Standardize subscales to z-scores (mean 0, SD 1), keeping the original index
        questionnaireClusters_std = pd.DataFrame(
            StandardScaler().fit_transform(questionnaireClusters),
            columns=questionnaireClusters.columns,
            index=questionnaireClusters.index,
        )
        
        # Rescale items to 0-1: PCL items are scored 1-5 (shift to 0-4, then divide
        # by 4), DASS items are scored 0-3 (divide by 3)
        std_PCL = (data.loc[:, 'PCL1':'PCL17'] - 1).div(4)
        std_DASS = DASS.div(3)
        std_data = pd.concat([std_DASS, std_PCL], axis=1)
    
        # Reweight items so each questionnaire contributes equally overall:
        # DASS has 42 items and PCL has 17, so scale each questionnaire's items
        # to a maximum total of 50 (100 combined)
        weightDASS = 50 / 42
        weightPCL = 50 / 17
        std_data = pd.concat([std_data.loc[:, 'DASS1':'DASS42'].mul(weightDASS),
                              std_data.loc[:, 'PCL1':'PCL17'].mul(weightPCL)], axis=1)
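
        # Quick sanity check (illustrative addition, not part of the original
        # pipeline): with items rescaled to [0, 1], the theoretical maximum
        # total score is 42 * weightDASS + 17 * weightPCL == 100
        assert np.isclose(42 * weightDASS + 17 * weightPCL, 100.0)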
        
        
        # Transform a few variables for comparison of clusters
        
        # Recode marital status (q0004) into a binary variable
        data_complete['civil_status'] = data_complete['q0004'].apply(lambda x: 0 if x in [2, 3] else 1)
        
        # Flip 0 & 1 in self-rated health for better interpretability
        data_complete['Selv_vur_helbred'] = data_complete['Selv_vur_helbred'].apply(lambda x: 0 if x == 1 else 1)
        
        # Replace NAs in q0021_0001 with 0 for respondents who answered
        # q0020_0001 as "Never drink alcohol"
        data_complete['q0021_0001'] = data_complete['q0021_0001'].fillna(0)
        
        # Age as numeric
        data_complete['q0003_0001'] = pd.to_numeric(data_complete['q0003_0001'], errors='coerce')
        
        # Extract military trauma
        data_complete['Military_trauma'] = data_complete['Traume_t0_4'].apply(lambda x: 1 if x > 0 else 0)
    
        # Calculate unemployment (study/work active vs not)
        data_complete['Unemployed'] = data_complete[['q0005_0010','q0005_0011','q0005_0012','q0005_0013','q0005_0014','q0005_0015']].fillna(0).sum(axis=1)
        data_complete['Unemployed'] = data_complete['Unemployed'].apply(lambda x: 1 if x > 0 else 0)
        
        # Drug use: cannabis (q0024_0001 above the lowest category) and/or other drugs (q0026_0001)
        data_complete['hash'] = data_complete['q0024_0001'].apply(lambda x: 1 if x > 1 else 0)
        data_complete['drugs'] = data_complete[['q0026_0001', 'hash']].fillna(0).sum(axis=1)
        data_complete['drugs'] = data_complete['drugs'].apply(lambda x: 1 if x > 0 else 0)
        
        # Alcohol - convert to units per week
        alc = data_complete.loc[:,['q0020_0001', 'q0021_0001']]
        alc = alc.dropna()
        alc.columns=['AlcFreq', 'Units']
        # Pairs to compare in barplots (not used within this function;
        # presumably consumed by downstream plotting scripts)
        pairs = [(0, 1), (0, 2), (1, 2)]
    
        # Map drinks-per-occasion response categories (q0021_0001) to approximate
        # drink counts (presumably midpoints of AUDIT-style ranges:
        # 1-2, 3-4, 5-6, 7-9, 10+)
        drinks_map = {
            1: 1.5,
            2: 3.5,
            3: 5.5,
            4: 8.0,
            5: 10.0
        }
    
        # Map drinking-frequency response categories (q0020_0001) to approximate
        # occasions per week (category 1 = "Never drink alcohol")
        frequency_map = {
            1: 0,
            2: 0.25,
            3: 0.75,
            4: 2.5,
            5: 4.5
        }

        # Apply the mappings
        data_complete['drinks_per_occasion'] = data_complete['q0021_0001'].map(drinks_map)
        data_complete['frequency_per_week'] = data_complete['q0020_0001'].map(frequency_map)
    
        # Estimate the average number of standard drinks (units) per week
        data_complete['units_per_week'] = data_complete['drinks_per_occasion'] * data_complete['frequency_per_week']
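
        # Worked example (illustrative): frequency category 4 (~2.5 occasions/week)
        # and drinks category 2 (~3.5 drinks/occasion) give 2.5 * 3.5 = 8.75 units/week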
    
        data_complete['units_per_week'] = data_complete['units_per_week'].fillna(0)
        
        # Excessive drinking per occasion, from the q0021_0001 response category
        # (> 2 corresponds to roughly 5+ drinks, > 1 to roughly 4+ drinks)
        data_complete['alcohol_over_5'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 2 else 0)
        data_complete['alcohol_over_4'] = data_complete['q0021_0001'].apply(lambda x: 1 if x > 1 else 0)

        # Excessive weekly drinking: more than 14 or more than 7 units per week
        data_complete['alcohol_over_14'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 14 else 0)
        data_complete['alcohol_over_7'] = data_complete['units_per_week'].apply(lambda x: 1 if x > 7 else 0)
        
        # Binge drinking flag by sex (q0002): code 1 uses the 5+/occasion or
        # >14 units/week thresholds, code 2 the 4+/occasion or >7 units/week
        # thresholds (presumably male and female, matching standard guidelines)
        data_complete['binge'] = (
            (((data_complete['alcohol_over_5'] == 1) | (data_complete['alcohol_over_14'] == 1)) & (data_complete['q0002'] == 1)) |
            (((data_complete['alcohol_over_4'] == 1) | (data_complete['alcohol_over_7'] == 1)) & (data_complete['q0002'] == 2))
        ).astype(int)
    
        
        return data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete
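

    # Example usage (a minimal sketch, not part of the original script; the CSV
    # filename is hypothetical, as the real data source is not shown here):
    if __name__ == '__main__':
        data_complete = pd.read_csv('questionnaire_data.csv')  # hypothetical filename
        (data, DASS, PCL, questionnaireClusters, questionnaireClusters_std,
         std_data, columnNames, PCAcolumns, data_complete) = sort_data(data_complete)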