# -*- coding: utf-8 -*-
"""
Updated Oct 18 2022
@author: Qianliang Li (glia@dtu.dk)
This script contains the code for the machine learning analysis
It should be run after Preprocessing.py and FeatureEstimation.py
"""
# Set working directory
import os
wkdir = "/home/glia/EEG/Final_scripts/"
os.chdir(wkdir)
# Load all libraries from the Preamble
from Preamble import *
# Load the questionnaire data
final_qdf = pd.read_csv("final_qdf.csv", sep=",", na_values=' ')
# Define cases as >= 44 total PCL
# Type: numpy array with subject id
cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44])
n_groups = 2
Groups = ["CTRL", "PTSD"]
# Make function to load EEG features
def load_features_df():
# Load all features
power_df = pd.read_pickle(Feature_savepath+"Power_df.pkl")
fTBR_data_df = pd.read_pickle(Feature_savepath+"fTBR_df.pkl")
asymmetry_df = pd.read_pickle(Feature_savepath+"asymmetry_df.pkl")
PAF_data_df = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_df.pkl")
PAF_data_df_global = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_global_df.pkl")
OOF_data_df = pd.read_pickle(Feature_savepath+"OOF_data_FOOOF_df.pkl")
con_data_df = pd.read_pickle(Feature_savepath+"con_data_source_drop_interpol_df.pkl")
pec_data_df = pd.read_pickle(Feature_savepath+"pec_data_drop_interpol_ch_df.pkl")
microstate_transition_data_df = pd.read_pickle(Feature_savepath+"microstate_transition_data_df.pkl")
microstate_time_df = pd.read_pickle(Feature_savepath+"microstate_time_df.pkl")
microstate_entropy_df = pd.read_pickle(Feature_savepath+"microstate_entropy_df.pkl")
GC_data_df = pd.read_pickle(Feature_savepath+"GC_data_source_drop_interpol_df.pkl")
H_data_df = pd.read_pickle(Feature_savepath+"H_data_df.pkl")
H_data_df_global = pd.read_pickle(Feature_savepath+"H_data_global_df.pkl")
# List of features
EEG_features_name_list = [['Power'],
['Frontal Theta Beta Ratio',
'Asymmetry'],
['Peak Alpha Frequency',
'Global Peak Alpha Frequency'],
["1/f exponent"],
['imcoh'],
['wpli'],
['Power Envelope Correlation'],
['Microstate Transition',
'Microstate Ratio Time',
'Microstate Entropy'],
["Granger Causality"],
['DFA Exponent',
'Global DFA Exponent']]
# Arrange them to fit one 2D dataframe
# Make function to add measurement column for indexing
def add_measurement_column(df, measurement = "Text"):
dummy_variable = [measurement]*df.shape[0]
df.insert(1, "Measurement", dummy_variable)
return df
# Make function to convert column tuple to string
def convertTupleHeader(header):
header = list(header)
str = "_".join(header)
return str
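# Illustrative example (hypothetical column tuple from the asymmetry pivot):
# convertTupleHeader(("Asymmetry", "Eyes Closed", "alpha", "Frontal"))
# -> "Asymmetry_Eyes Closed_alpha_Frontal"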
# Prepare overall dataframe
EEG_features_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"])
# Add power spectral densities
power_df = add_measurement_column(power_df, "Power")
temp_df = power_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Quant_status", "Eye_status",
"Freq_band", "Channel"], dropna=False,
values="PSD").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add frontal theta/beta ratio
# fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio")
temp_df = fTBR_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="TBR").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add power asymmetry
asymmetry_df = add_measurement_column(asymmetry_df, "Asymmetry")
temp_df = asymmetry_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band", "ROI"], dropna=False,
values="Asymmetry_score").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add peak alpha frequency
PAF_data_df = add_measurement_column(PAF_data_df, "Peak Alpha Frequency")
temp_df = PAF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("Peak Alpha Frequency",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("Peak Alpha Frequency",ee)] = temp.to_numpy()
# If there are still NaNs, the values are interpolated across channels and conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
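# A minimal sketch of the transpose-fillna trick used above (illustrative values):
# df = pd.DataFrame({"ch1": [8.0, np.nan], "ch2": [10.0, 11.0]})
# df.T.fillna(df.mean(axis=1)).T # fills row-wise: the NaN in ch1 becomes 11.0, that subject's row mean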
# Add 1/f exponent
OOF_data_df = add_measurement_column(OOF_data_df, "1/f exponent")
temp_df = OOF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("1/f exponent",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("1/f exponent",ee)] = temp.to_numpy()
# If there are still NaNs, the values are interpolated across channels and conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add global peak alpha frequency
#PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists
temp_df = PAF_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated across eye conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add connectivity measurements
#con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists
temp_df = con_data_df.pivot_table(index="Subject_ID",columns=["Con_measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
# Drop coh and plv, which are more susceptible to volume conduction
temp_df = temp_df.drop("coh",axis=1)
temp_df = temp_df.drop("plv",axis=1)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add orthogonalized power envelope correlations
pec_data_df = add_measurement_column(pec_data_df, "Power Envelope Correlation")
temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate transition probabilities
microstate_transition_data_df = add_measurement_column(microstate_transition_data_df, "Microstate Transition")
temp_df = microstate_transition_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Transition"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate time covered
microstate_time_df = add_measurement_column(microstate_time_df, "Microstate Ratio Time")
# Convert microstate to str before using it as column
microstate_time_df = microstate_time_df.astype({"Microstate": str})
temp_df = microstate_time_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Microstate"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate entropy
#microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists
microstate_entropy_df["Measurement"] = ["Microstate Entropy"]*microstate_entropy_df.shape[0]
temp_df = microstate_entropy_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Granger causality (fixed so it is always chx --> chy)
GC_data_df = add_measurement_column(GC_data_df, "Granger Causality")
temp_df = GC_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
# Check if any GC was not calculated properly
assert temp_df.shape[1] == (31*31-31)*5*2 # expected number of features with 31 sources
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add DFA exponent
H_data_df = add_measurement_column(H_data_df, "DFA Exponent")
temp_df = H_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Global Hurst exponent
H_data_df_global = H_data_df_global.drop("Measurement",axis=1)
H_data_df_global = add_measurement_column(H_data_df_global, "Global DFA Exponent") # re-added after dropping the old column
temp_df = H_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
return EEG_features_df, EEG_features_name_list
# %% ML prediction of PTSD status using all features
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
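# Sketch of the two-layer CV structure used below:
# for each of the 10 repetitions:
#     outer 10-fold split -> outer-train / outer-test
#         inner 10-fold split on outer-train -> feature selection + hyperparameter tuning
#     refit the best model on the full outer-train set and evaluate once on outer-test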
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into the train/test sets I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
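# e.g. allUnique(["01", "12", "01"]) -> False; the idiom works because set.add() returns None,
# so "i in seen or seen.add(i)" is truthy only when i was already seen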
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
for rep in range(n_repetitions):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
model_par0 = []
final_models0 = []
final_features0 = []
final_y_preds0 = []
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
n_feat_mRMR2 = [30,40,50,60]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
max_mRMR_features2 = n_feat_mRMR2[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
mRMR_features = []
feat_counter = []
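# mRMR (minimum Redundancy Maximum Relevance) greedily selects features with high relevance
# to the target and low redundancy with already selected features; in the mutual-information
# difference variant the score is I(f; y) - mean over s in S of I(f; s)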
# Perform mRMR for each feature type
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # flatten the list
for e in range(n_eye_status):
ee = eye_status[e]
temp_ee = "{}_{}".format(temp_feat,ee)
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Index for eye status
temp_X_train = temp_X_train.loc[:,temp_X_train.columns.str.contains(ee)]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# Do not use mRMR if there are fewer than max_mRMR_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=-1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# Save features selected
mRMR_features.append(list(filter_features))
# Check that no features were lost while filtering out wrongly chosen ones
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
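# Equivalent, more direct construction (assuming numpy): C_parameter_SVM = np.logspace(-3, 1, 9)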
kernels = ["linear"]
# rbf overfits more easily, whereas linear empirically works better on high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# For LogReg the L2 regularization strength is the inverse of C
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels),max_mRMR_features2))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg),max_mRMR_features2))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_import_rf.fill(np.nan)
for n1 in range(len(n_feat_mRMR1)):
# Get the features from first mRMR
mRMR_features1 = [item for sublist in mRMR_features for item in sublist[0:n_feat_mRMR1[n1]]]
# print("{} features are left after first mRMR filtering".format(len(mRMR_features1)))
X_train_mRMR = X_train[mRMR_features1].copy()
# Second round of mRMR on the reduced list
filter_feat_selector = mRMR_feature_select(X_train_mRMR.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features2,
K_MAX=1000,n_jobs=-1,verbose=False)
for n2 in range(len(n_feat_mRMR2)):
# Get selected features from mRMR2
filter_features = X_train_mRMR.columns[filter_feat_selector[0:n_feat_mRMR2[n2]]]
X_train_mRMR2 = X_train_mRMR[filter_features].copy()
# SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select = 1
# Using a normal for loop as RFECV has in-built multiprocessing
for C in range(len(C_parameter_SVM)):
for K in range(len(kernels)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
# Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(X_train_mRMR2,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
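# Note: grid_scores_ assumes an older scikit-learn; it was deprecated in 1.0 and removed in 1.2,
# where rfecv.cv_results_["mean_test_score"] is the replacement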
# Save results for hyperparameter optimization
val_err_svm[n1,n2,C,K] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
val_feat_svm[n1,n2,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
# In-built CV
k_features = np.arange(1,n_feat_mRMR2[n2]+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = -1)
sfs = sfs.fit(X_train_mRMR2, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# Rounding to smooth the scores and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,n2,C] = err
# Save the features chosen by SFS based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,n2,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR2.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = X_train_mRMR2.iloc[train_index_rf]; X_test2 = X_train_mRMR2.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=-1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2, RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,n2,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,n2,:,:,0:n_feat_mRMR2[n2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
val_feat_rf[n1,n2,:,:,0:len(rf_feat_idx)] = rf_feat_idx
print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100))
# Choose the optimal parameters
### SVM
n1, n2, C, K = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
K_temp = int(K[i2])
temp_feats = val_feat_svm[n1_temp,n2_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,n2_temp,C_temp,K_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, n2, C, K = [int(n1[rfe_feat_min][C_min]), int(n2[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])]
else:
n1, n2, C, K = [int(n1[int(rfe_feat_min)]), int(n2[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])]
else:
n1, n2, C, K = [int(n1), int(n2), int(C), int(K)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_SVM[C]
K_chosen = kernels[K]
# Save the best validation error
val_error = val_err_svm[n1, n2, C, K]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,n2,C,K][~np.isnan(val_feat_svm[n1,n2,C,K])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat,K_chosen]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, n2, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,n2_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,n2_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, n2, C = [int(n1[sfs_feat_min][C_min]), int(n2[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, n2, C = [int(n1[int(sfs_feat_min)]), int(n2[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, n2, C = [int(n1), int(n2), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_LogReg[C]
# Save the best validation error
val_error = val_err_logreg[n1, n2, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,n2,C][~np.isnan(val_feat_logreg[n1,n2,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, n2, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
print("Multiple RF runs with same number number of depths")
print("Choosing run with lowest trees to alleviate overfitting")
t_min = np.argmin(t[d_min]) # argmin just takes the first
# If there are multiple with the same parameters and validation error, it is most likely the same run
n1, n2, t, d = [int(n1[d_min][t_min]), int(n2[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, n2, t, d = [int(n1[int(d_min)]), int(n2[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, n2, t, d = [int(n1), int(n2), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, n2, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,n2,t,d][~np.isnan(val_feat_rf[n1,n2,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,n2,t,d][~np.isnan(val_feat_import_rf[n1,n2,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=-1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,mRMR_chosen2,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
final_models0.append([SVM_model,LogReg_model,RF_model])
final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
# Move counter
Outer_counter += 1
print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
# Save results from all outer folds
model_par.append(model_par0)
final_models.append(final_models0)
final_features.append(final_features0)
final_y_preds.append(final_y_preds0)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
# Run with a variable number of features in the first and second mRMR
with open(Model_savepath+"Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl", "wb") as filehandle:
pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# %% ML prediction of PTSD status using each feature separately
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
# With concurrent processes for faster computation
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into the train/test sets I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
# accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3))
Ind_feat_clf = [0]*n_repetitions
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
def Each_feat_10rep_10x10CV(rep):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
Ind_feat_clf0 = []
accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
feat_counter = []
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00 = []
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # flatten the list
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))] # "_" suffix added to match the check above
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# If there are no features selected, then skip the rest of the loop
if temp_X_train.shape[1] == 0:
continue
# Do not use mRMR if there are fewer than max_mRMR_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
# rbf kernel overfits more easily, whereas linear empirically works better on high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# For LogReg the L2 regularization strength is the inverse of C
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
min_features_to_select = 1
for n1 in range(len(n_feat_mRMR1)):
mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
# Use the selected filter features
temp_X_train_mRMR = temp_X_train[mRMR_features1]
# SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for C in range(len(C_parameter_SVM)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
# Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
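# grid_scores_ again assumes scikit-learn < 1.2 (see the note in the previous section)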
# Save results for hyperparameter optimization
val_err_svm[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
# In-built CV
k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
k_features = np.arange(1,k_max+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = 1)
sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# Rounding to smooth the scores and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by SFS based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2,RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
# Print progress
current_progress = (n1+1)/(len(n_feat_mRMR1))*100
print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
# Choose the optimal parameters
### SVM
n1, C = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min])]
else:
n1, C = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
# Save the best validation error
val_error = val_err_svm[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
# Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
print("Multiple RF runs with same number number of depths")
print("Choosing run with lowest trees to alleviate overfitting")
t_min = np.argmin(t[d_min]) # argmin just takes the first
# If there are multiple with the same parameters and validation error, it is most likely the same run
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
final_models00 = [SVM_model,LogReg_model,RF_model]
final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
# Save the results for the specific feature
res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
Ind_feat_clf00.append(res)
print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
# Check that no features were lost while filtering out wrongly chosen ones
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Save results over outer folds
Ind_feat_clf0.append(Ind_feat_clf00)
print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
# Move counter
Outer_counter += 1
out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
return rep, out
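# rep is returned alongside the results so each repetition can be stored at its own index,
# even though ProcessPoolExecutor.map already yields results in submission order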
Ind_feat_clf = [0]*n_repetitions
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments
Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
# Save results to file and overwrite when new results arrive
with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle:
pickle.dump(Ind_feat_clf, filehandle)
# %% Sparse clustering of all EEG features to look for subtypes
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only use PTSD patient group
EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"]==1,:]
Subject_info_cols = ["Subject_ID","Group_status"]
# Standardize the values
X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1))
standardizer = preprocessing.StandardScaler().fit(X)
X = standardizer.transform(X)
# Use gridsearch and permutations to estimate gap statistic and use it to
# determine number of clusters and sparsity s
# I will use 100 permutations and test 2 to 6 clusters, as in Zhang et al. (2020)
max_clusters = 6
n_sparsity_feat = 20
perm_res = []
Timestamps = []
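# The gap statistic compares within-cluster dispersion W_k against its expectation under a
# permutation null: Gap(k) = E*[log(W_k)] - log(W_k) (Tibshirani et al., 2001); larger is better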
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
Timestamps.append(c_time1)
for k in range(1,max_clusters):
# Cannot permute with 1 cluster
n_clusters = k+1
perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True,
nvals=n_sparsity_feat, nperms=100)
perm_res.append(perm)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
Timestamps.append(c_time2)
# Save the results
with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file:
pickle.dump(perm_res, file)
# # Load
# with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file:
# perm_res = pickle.load(file)
# Convert results to array
perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4))
for i in range(len(perm_res)):
_, gaps, sdgaps, wbounds, _ = perm_res[i].values()
for i2 in range(n_sparsity_feat):
perm_res_arr[n_sparsity_feat*i+i2,0] = i+2 # number of clusters k
perm_res_arr[n_sparsity_feat*i+i2,1] = gaps[i2] # gap statistic
perm_res_arr[n_sparsity_feat*i+i2,2] = sdgaps[i2] # gap statistic std
perm_res_arr[n_sparsity_feat*i+i2,3] = wbounds[i2] # sparsity feature s
# For each sparsity s, determine the best k using the one-standard-error criterion,
# meaning that for a fixed s the number of clusters is chosen as the smallest k
# that fulfills Gap(k) >= Gap(k+1) - std(k+1)
def one_standard_deviation_search(gaps, std):
best_gaps = np.argmax(gaps)
current_gaps = gaps[best_gaps]
current_std = std[best_gaps]
current_gaps_idx = best_gaps
while (gaps[current_gaps_idx-1] >= current_gaps - current_std):
if current_gaps_idx == 0:
break
else:
current_gaps_idx -= 1
current_gaps = gaps[current_gaps_idx]
current_std = std[current_gaps_idx]
out = current_gaps, current_std, current_gaps_idx
return out
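# Illustrative example (hypothetical values):
# one_standard_deviation_search([0.5, 0.95, 1.0, 0.9], [0.1, 0.1, 0.1, 0.1])
# walks back from the argmax at index 2 to index 1 (0.95 >= 1.0 - 0.1) and returns (0.95, 0.1, 1)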
best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:,3])
plt.figure(figsize=(12,12))
for i2 in range(n_sparsity_feat):
current_s = all_s[i2]
gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1]
std = perm_res_arr[perm_res_arr[:,3] == current_s,2]
_, _, idx = one_standard_deviation_search(gaps, std)
# Save to array
best_ks[i2,0] = current_s
best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx]
# Plot gap
plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"),
gaps, yerr=std, capsize=5, label = np.round(current_s,3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")
best_k = int(scipy.stats.mode(best_ks[:,1])[0])
# Determine s using fixed k as lowest s within 1 std of max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1])
best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx]
one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std
best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]])
# Perform clustering with k clusters
sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0]
# # Save the results
# with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file:
# pickle.dump(sparcl, file)
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
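# sparcl is a dict whose relevant entries here are "ws" (feature weights; zero weight means
# the feature was not selected) and "cs" (cluster assignments), both used below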
# %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2
# Using the sparse kmeans selected features, eyes closed only, as they were primarily chosen
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df()
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
# Use concatenated features
EEG_features_name_list = [['Power'],
['Frontal Theta Beta Ratio',
'Asymmetry'],
['Peak Alpha Frequency',
'Global Peak Alpha Frequency'],
["1/f exponent"],
['imcoh'],
['wpli'],
['Power Envelope Correlation'],
['Microstate Transition',
'Microstate Ratio Time',
'Microstate Entropy'],
["Granger Causality"],
['DFA Exponent',
'Global DFA Exponent']]
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only keep PTSD subtype 1 by dropping subtype 2
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1)
# To ensure proper stratification into train/test sets, I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
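# Resulting encoding (Subject_IDs 1xxxxx, 2xxxxx, 3xxxxx for the three studies):
# study 1: CTRL=0, PTSD=1 | study 2: CTRL=2, PTSD=3 | study 3: CTRL=4, PTSD=5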
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]
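# Helper to verify all test splits are unique; relies on set.add returning None,
# so "i in seen or seen.add(i)" is only True when i was already seen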
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
for rep in range(n_repetitions):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
model_par0 = []
final_models0 = []
final_features0 = []
final_y_preds0 = []
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [30,40,50,60]
    # Max number of features to request from mRMR
max_mRMR_features = n_feat_mRMR1[-1]
# mRMR on the all sparse kmeans selected features
filter_feat_selector = mRMR_feature_select(X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=-1,verbose=False)
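        # mRMR_feature_select (from the Preamble) is assumed to return column indices
        # ranked by minimum-redundancy-maximum-relevance, so slicing the first n keeps the top n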
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# SVM with L2-norm (it is by default squared)
        # Prepare hyperparameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
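        # The C grid spans 1e-3 to 10 in half-decade steps: [0.001, ~0.00316, 0.01, ..., 10]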
        kernels = ["linear"] # rbf raises an error with RFECV, as RFE needs the coef_ of a linear kernel
        # rbf also overfits more easily, whereas linear empirically works better in high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
        # Prepare hyperparameters
trees = np.array([10, 100, 500, 1000])
        depth = np.linspace(1,2,2).astype(int) # using more depth leads to a lot of overfitting; cast to int as sklearn expects an integer max_depth
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
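        # The val_* arrays are indexed [mRMR size, hyperparameter(s)]; the feature-index
        # slots are NaN-padded up to max_mRMR_features so shorter subsets also fit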
for n1 in range(len(n_feat_mRMR1)):
# Get selected features from mRMR
filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]]
X_train_mRMR = X_train[filter_features].copy()
            # SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select = 1
# Using normal for loop as RFECV has inbuilt multiprocessing
for C in range(len(C_parameter_SVM)):
for K in range(len(kernels)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,C,K] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
val_feat_svm[n1,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
            # In-built CV
k_features = np.arange(1,n_feat_mRMR1[n1]+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = -1)
sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
                # Rounding to 2 decimals to smooth the score curve and disregard marginal improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = X_train_mRMR.iloc[train_index_rf]; X_test2 = X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=-1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2, RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:n_feat_mRMR1[n1]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100))
# Choose the optimal parameters
### SVM
n1, C, K = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
K_temp = int(K[i2])
temp_feats = val_feat_svm[n1_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,C_temp,K_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C, K = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])]
else:
n1, C, K = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])]
else:
n1, C, K = [int(n1), int(C), int(K)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
K_chosen = kernels[K]
        # Save the best validation error
val_error = val_err_svm[n1, C, K]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C,K][~np.isnan(val_feat_svm[n1,C,K])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat,K_chosen]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
        # Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
                print("Multiple RF runs with the same depth")
                print("Choosing run with fewest trees to alleviate overfitting")
                t_min = np.argmin(t[d_min]) # argmin takes the first occurrence
                # Runs with identical parameters and validation error are most likely duplicates
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=-1, random_state=None)
model = model.fit(x_train_mRMR_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
final_models0.append([SVM_model,LogReg_model,RF_model])
final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
# Move counter
Outer_counter += 1
print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
# Save results from all outer folds
model_par.append(model_par0)
final_models.append(final_models0)
final_features.append(final_features0)
final_y_preds.append(final_y_preds0)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
# Run with variable feat in first and second mRMR
with open(Model_savepath+"Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle:
pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2
# Using each feature type (of the sparse kmeans eyes-closed chosen ones)
# For Subtype 1
# 10 repetitions of two-layer CV with 10-fold outer and 10-fold inner splits
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df() # removed coh and plv and renamed Hurst in v4
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only keep PTSD subtype 1 by dropping subtype 2
PTSD_idx = np.array([i in cases for i in Subject_id])
CTRL_idx = np.array([i not in cases for i in Subject_id])
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1)
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
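# Wrap one repetition in a function so the repetitions can be run in separate processes;
# it returns (rep, results) so outputs can be stored by repetition index (see the executor below)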
def Each_feat_10rep_10x10CV(rep):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
Ind_feat_clf0 = []
accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
        # Max number of features to request from mRMR
max_mRMR_features = n_feat_mRMR1[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
feat_counter = []
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00 = []
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
            other_feats = [item for sublist in other_feats for item in sublist] # flatten the list of lists
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
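            # The trailing "_" restricts matches to the exact feature type, e.g. "Power_"
            # does not match columns of "Power Envelope Correlation"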
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# If there are no features selected, then skip the rest of the loop
if temp_X_train.shape[1] == 0:
continue
            # Skip mRMR if there are no more than max_mRMR_features features to begin with
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# SVM with L2-norm (it is by default squared)
            # Prepare hyperparameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
            # the rbf kernel overfits more easily, whereas linear empirically works better in high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
            # Prepare hyperparameters
trees = np.array([10, 100, 500, 1000])
            depth = np.linspace(1,2,2).astype(int) # using more depth leads to a lot of overfitting; cast to int as sklearn expects an integer max_depth
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
min_features_to_select = 1
for n1 in range(len(n_feat_mRMR1)):
mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
# Use the selected filter features
temp_X_train_mRMR = temp_X_train[mRMR_features1]
                # SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for C in range(len(C_parameter_SVM)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
                # In-built CV
k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
k_features = np.arange(1,k_max+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = 1)
sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
                    # Rounding to 2 decimals to smooth the score curve and disregard marginal improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2,RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
# Print progress
current_progress = (n1+1)/(len(n_feat_mRMR1))*100
print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
# Choose the optimal parameters
### SVM
n1, C = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min])]
else:
n1, C = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
# Save the best validation error
val_error = val_err_svm[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
            # Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
                    print("Multiple RF runs with the same depth")
                    print("Choosing run with fewest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min]) # argmin takes the first occurrence
                    # Runs with identical parameters and validation error are most likely duplicates
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
final_models00 = [SVM_model,LogReg_model,RF_model]
final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
# Save the results for the specific feature
res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
Ind_feat_clf00.append(res)
print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
        # Check that no features were lost during the wrongly-chosen-feature filtering
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Save results over outer folds
Ind_feat_clf0.append(Ind_feat_clf00)
print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
# Move counter
Outer_counter += 1
out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
return rep, out
Ind_feat_clf = [0]*n_repetitions
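# Run the repetitions in parallel, one process per repetition; executor.map yields
# the (rep, out) tuples, and results are stored by the returned rep index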
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments
Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
# Save results to file and overwrite when new results arrive
with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "wb") as filehandle:
pickle.dump(Ind_feat_clf, filehandle)
# %% For subtype 2
# The two previous sections are rerun, but for subtype 2
# The only modification is the indexing of the subjects
# Only keep PTSD subtype 2 by dropping subtype 1
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype1)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype2 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])