diff --git a/Main_script.py b/Main_script.py new file mode 100644 index 0000000000000000000000000000000000000000..7062de7476eb78ff5a18a1685729c5a085b052c3 --- /dev/null +++ b/Main_script.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Sep 1 13:50:11 2024 + +@author: Maya Coulson Theodorsen (mcoth@dtu.dk) + +This is the main script used for the analysis. Runninng this file alone is +sufficient, as long as the custom functions below are also imported. + +""" + +import os +os.chdir('/Volumes/T7/') +import sys +sys.path.append('/Volumes/T7') # Path + +# Import custom functions +from Import_data import load_data +from Sort_data import sort_data +from Perform_pca import perform_pca +from Perform_clustering import perform_clustering +from Compare_clusters import compare_clusters +from Descriptives import total_descriptives, cluster_descriptives + +#Import all necessary packages +import numpy as np +import pandas as pd + +# Plotting +import matplotlib.pyplot as plt +import seaborn as sns + +# PCA +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +from sklearn import metrics +from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo + +#Clustering +from scipy.cluster.hierarchy import dendrogram, linkage +from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer + +# Statisticl tests +import pingouin as pg +import scipy.stats as stats +import statsmodels.api as sm +import scikit_posthocs as sp +from sklearn.preprocessing import StandardScaler +from scipy.stats import bartlett, levene, chi2_contingency +from pingouin import normality, kruskal, homoscedasticity +from itertools import combinations +from statsmodels.stats.multitest import multipletests + +# Turn off warnings +import warnings +warnings.filterwarnings("ignore") + +#%% Import data using my Import_data function file +data_complete = load_data("/Volumes/T7/data6_9_2023.csv") +data = data_complete.loc[:, 'q0010_0001': 'q0014_0007'] +#%% Call the sort_data function +data, DASS, PCL, questionnaireClusters, questionnaireClusters_std, std_data, columnNames, PCAcolumns, data_complete = sort_data(data_complete) + +#%% Call the perform_pca function +pca, loadings, principleComponents = perform_pca(std_data, PCAcolumns, columnNames) + +#%% Call the perform_clustering function +PC234, LABELS, clusterNames = perform_clustering(std_data, principleComponents, data_complete, questionnaireClusters, questionnaireClusters_std) + +#%% Call the function to compare clusters across all variables +p_values, posthoc_p_values, categorical_variables, continuous_variables = compare_clusters(data_complete, questionnaireClusters) +pd.options.display.float_format = '{:.10f}'.format +p_values = pd.DataFrame(p_values) +posthoc_p_values = pd.DataFrame(posthoc_p_values) + +#%% Descriptive stats for total N and each k +cluster_column = 'clusters' +sorter = ['Sex (male)', 'Age', 'Civil status (single)', 'Children', 'Unemployed', + 'Self-rated health', 'Psychoanaleptica', 'Psycholeptica', 'Excessive alcohol intake', + 'Current drug usage', 'Suicidal history', 'Probable childhood ADHD', 'Exposed to war', 'combat', + 'PCL Intrusion', 'PCL Avoidance', 'PCL Numbing', 'PCL Hyperarousal', 'DASS Anxiety', + 'DASS Depression', 'DASS Stress', 'PCL total score', 'Probable PTSD diagnosis','Total traumas', + 'Total unique traumas'] + +binary_variables = ['PTSD_t0_DSMIV','q0002', 'q0006', 'civil_status', 'Psychoanaleptica', 'Psycholeptica', 'binge','q0033_0001', 'ADHD_total_GROUP_t0', 'drugs', 'Military_trauma', 'combat','Unemployed'] + +descriptives_total = total_descriptives(data_complete, questionnaireClusters,categorical_variables, continuous_variables, binary_variables, sorter) + +descriptives_cluster = cluster_descriptives(data_complete, questionnaireClusters,categorical_variables, continuous_variables, cluster_column, binary_variables, sorter)