Skip to content
Snippets Groups Projects
Main_script.py 3.6 KiB
Newer Older
  • Learn to ignore specific revisions
  • mcoth's avatar
    mcoth committed
    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    """
    Created on Sun Sep  1 13:50:11 2024
    
    @author: Maya Coulson Theodorsen (mcoth@dtu.dk)
    
    This is the main script used for the analysis. Running this file alone is
    sufficient, as long as the custom function modules imported below are
    available on the import path.
    
    """
    
    import os
    # Switch the working directory to a hard-coded, machine-specific location.
    os.chdir('/Volumes/T7/')
    import sys
    # Extend the module search path with the same directory before the custom
    # imports that follow (presumably where those modules live -- confirm).
    sys.path.append('/Volumes/T7')  # Path
    
    # Import custom functions
    from Import_data import load_data
    from Sort_data import sort_data
    from Perform_pca import perform_pca
    from Perform_clustering import perform_clustering
    from Compare_clusters import compare_clusters
    from Descriptives import total_descriptives, cluster_descriptives
    
    #Import all necessary packages
    import numpy as np
    import pandas as pd
    
    # Plotting
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # PCA
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    from sklearn import metrics
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
    
    #Clustering
    from scipy.cluster.hierarchy import dendrogram, linkage
    from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
    
    # Statistical tests
    import pingouin as pg
    import scipy.stats as stats
    import statsmodels.api as sm
    import scikit_posthocs as sp
    from sklearn.preprocessing import StandardScaler
    from scipy.stats import bartlett, levene, chi2_contingency
    from pingouin import normality, kruskal, homoscedasticity
    from itertools import combinations
    from statsmodels.stats.multitest import multipletests
    
    # Turn off warnings
    # NOTE: the "ignore" action is installed without a category or module
    # filter, so it applies to every warning raised after this point.
    import warnings
    warnings.filterwarnings("ignore")
    
    #%% Load the raw data set via the Import_data module
    data_complete = load_data("/Volumes/T7/data6_9_2023.csv")
    # Label-based column slice from 'q0010_0001' through 'q0014_0007'.
    # NOTE(review): `data` is rebound by the sort_data unpacking in the next
    # cell, so this slice is only visible when cells are run one at a time.
    data = data_complete.loc[:, 'q0010_0001': 'q0014_0007']
    #%% Sort/prepare the data (sort_data returns nine objects, unpacked here)
    (
        data,
        DASS,
        PCL,
        questionnaireClusters,
        questionnaireClusters_std,
        std_data,
        columnNames,
        PCAcolumns,
        data_complete,
    ) = sort_data(data_complete)

    #%% Principal component analysis
    pca, loadings, principleComponents = perform_pca(
        std_data,
        PCAcolumns,
        columnNames,
    )

    #%% Clustering
    PC234, LABELS, clusterNames = perform_clustering(
        std_data,
        principleComponents,
        data_complete,
        questionnaireClusters,
        questionnaireClusters_std,
    )

    #%% Compare the clusters across all variables
    (
        p_values,
        posthoc_p_values,
        categorical_variables,
        continuous_variables,
    ) = compare_clusters(data_complete, questionnaireClusters)
    # Display floats with ten decimals (global pandas display option).
    pd.options.display.float_format = '{:.10f}'.format
    # Wrap both p-value results in DataFrames.
    p_values = pd.DataFrame(p_values)
    posthoc_p_values = pd.DataFrame(posthoc_p_values)
    
    #%% Descriptive statistics: whole sample and per cluster
    # Name of the column passed to cluster_descriptives as the cluster key.
    cluster_column = 'clusters'

    # Row labels, in the order handed to the descriptives functions.
    sorter = [
        'Sex (male)',
        'Age',
        'Civil status (single)',
        'Children',
        'Unemployed',
        'Self-rated health',
        'Psychoanaleptica',
        'Psycholeptica',
        'Excessive alcohol intake',
        'Current drug usage',
        'Suicidal history',
        'Probable childhood ADHD',
        'Exposed to war',
        'combat',
        'PCL Intrusion',
        'PCL Avoidance',
        'PCL Numbing',
        'PCL Hyperarousal',
        'DASS Anxiety',
        'DASS Depression',
        'DASS Stress',
        'PCL total score',
        'Probable PTSD diagnosis',
        'Total traumas',
        'Total unique traumas',
    ]

    # Column names passed to the descriptives functions as binary variables.
    binary_variables = [
        'PTSD_t0_DSMIV',
        'q0002',
        'q0006',
        'civil_status',
        'Psychoanaleptica',
        'Psycholeptica',
        'binge',
        'q0033_0001',
        'ADHD_total_GROUP_t0',
        'drugs',
        'Military_trauma',
        'combat',
        'Unemployed',
    ]

    # Descriptives for the total sample.
    descriptives_total = total_descriptives(
        data_complete,
        questionnaireClusters,
        categorical_variables,
        continuous_variables,
        binary_variables,
        sorter,
    )

    # Descriptives split by cluster_column.
    descriptives_cluster = cluster_descriptives(
        data_complete,
        questionnaireClusters,
        categorical_variables,
        continuous_variables,
        cluster_column,
        binary_variables,
        sorter,
    )