#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 1 13:50:11 2024

@author: Maya Coulson Theodorsen (mcoth@dtu.dk)

This is the main script used for the analysis. Running this file alone is
sufficient, as long as the custom function modules imported below
(Import_data, Sort_data, Perform_pca, Perform_clustering, Compare_clusters,
Descriptives) are importable from BASE_DIR.

The script is organised in `#%%` cells that are meant to be run top to bottom:
load data -> sort data -> PCA -> clustering -> cluster comparison ->
descriptive statistics.
"""

import os
import sys

# Single place to change when the data volume is mounted elsewhere.
BASE_DIR = '/Volumes/T7'

os.chdir(BASE_DIR)
# The custom modules are imported from BASE_DIR, so it has to be on sys.path
# *before* the project imports below.
sys.path.append(BASE_DIR)  # Path

# Import custom functions
from Import_data import load_data
from Sort_data import sort_data
from Perform_pca import perform_pca
from Perform_clustering import perform_clustering
from Compare_clusters import compare_clusters
from Descriptives import total_descriptives, cluster_descriptives

# Import all necessary packages
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# PCA
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

# Clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Statistical tests
import pingouin as pg
import scipy.stats as stats
import statsmodels.api as sm
import scikit_posthocs as sp
from sklearn.preprocessing import StandardScaler
from scipy.stats import bartlett, levene, chi2_contingency
from pingouin import normality, kruskal, homoscedasticity
from itertools import combinations
from statsmodels.stats.multitest import multipletests

# Turn off warnings
# NOTE: this silences *all* warnings for the rest of the session; it runs
# after the imports above, so import-time warnings are still shown.
import warnings
warnings.filterwarnings("ignore")

#%% Import data using my Import_data function file
data_complete = load_data(os.path.join(BASE_DIR, "data6_9_2023.csv"))

# Column slice from 'q0010_0001' through 'q0014_0007' (label-based, inclusive).
# NOTE: `data` is rebound by sort_data() in the next cell, so this slice is
# only available for inspection between the two cells.
data = data_complete.loc[:, 'q0010_0001': 'q0014_0007']

#%% Call the sort_data function
# sort_data returns nine objects; see Sort_data.sort_data for their contents.
# `data` and `data_complete` are both replaced by the returned versions.
(
    data,
    DASS,
    PCL,
    questionnaireClusters,
    questionnaireClusters_std,
    std_data,
    columnNames,
    PCAcolumns,
    data_complete,
) = sort_data(data_complete)

#%% Call the perform_pca function
pca, loadings, principleComponents = perform_pca(std_data, PCAcolumns, columnNames)

#%% Call the perform_clustering function
PC234, LABELS, clusterNames = perform_clustering(
    std_data,
    principleComponents,
    data_complete,
    questionnaireClusters,
    questionnaireClusters_std,
)

#%% Call the function to compare clusters across all variables
p_values, posthoc_p_values, categorical_variables, continuous_variables = compare_clusters(
    data_complete,
    questionnaireClusters,
)

# Display floats with 10 decimals so that very small p-values are not
# rendered as 0 when the tables are printed.
pd.options.display.float_format = '{:.10f}'.format

p_values = pd.DataFrame(p_values)
posthoc_p_values = pd.DataFrame(posthoc_p_values)

#%% Descriptive stats for total N and each k
# Name of the cluster-assignment column passed to cluster_descriptives.
cluster_column = 'clusters'

# Row labels handed to the descriptives functions as `sorter`
# (presumably the display order of the output table - confirm in Descriptives).
sorter = [
    'Sex (male)', 'Age', 'Civil status (single)', 'Children', 'Unemployed',
    'Self-rated health', 'Psychoanaleptica', 'Psycholeptica',
    'Excessive alcohol intake', 'Current drug usage', 'Suicidal history',
    'Probable childhood ADHD', 'Exposed to war', 'combat',
    'PCL Intrusion', 'PCL Avoidance', 'PCL Numbing', 'PCL Hyperarousal',
    'DASS Anxiety', 'DASS Depression', 'DASS Stress',
    'PCL total score', 'Probable PTSD diagnosis',
    'Total traumas', 'Total unique traumas',
]

# Column names passed to the descriptives functions as the binary variables.
binary_variables = [
    'PTSD_t0_DSMIV', 'q0002', 'q0006', 'civil_status',
    'Psychoanaleptica', 'Psycholeptica', 'binge', 'q0033_0001',
    'ADHD_total_GROUP_t0', 'drugs', 'Military_trauma', 'combat', 'Unemployed',
]

descriptives_total = total_descriptives(
    data_complete,
    questionnaireClusters,
    categorical_variables,
    continuous_variables,
    binary_variables,
    sorter,
)

descriptives_cluster = cluster_descriptives(
    data_complete,
    questionnaireClusters,
    categorical_variables,
    continuous_variables,
    cluster_column,
    binary_variables,
    sorter,
)