diff --git a/MachineLearning.py b/MachineLearning.py new file mode 100644 index 0000000000000000000000000000000000000000..f5cfe590f3e49f5789b3c7cae0a8c78f13048e36 --- /dev/null +++ b/MachineLearning.py @@ -0,0 +1,2423 @@ +# -*- coding: utf-8 -*- +""" +Updated Oct 18 2022 + +@author: Qianliang Li (glia@dtu.dk) + +This script contains the code for the machine learning analysis + +It should be run after Preprocessing.py and FeatureEstimation.py +""" + +# Set working directory +import os +wkdir = "/home/glia/EEG/Final_scripts/" +os.chdir(wkdir) + +# Load all libraries from the Preamble +from Preamble import * + +# Load the questionnaire data +final_qdf = pd.read_csv("final_qdf.csv", sep=",", na_values=' ') + +# Define cases as >= 44 total PCL +# Type: numpy array with subject id +cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44]) +n_groups = 2 +Groups = ["CTRL", "PTSD"] + +# Make function to load EEG features +def load_features_df(): + # Load all features + power_df = pd.read_pickle(Feature_savepath+"Power_df.pkl") + fTBR_data_df = pd.read_pickle(Feature_savepath+"fTBR_df.pkl") + asymmetry_df = pd.read_pickle(Feature_savepath+"asymmetry_df.pkl") + PAF_data_df = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_df.pkl") + PAF_data_df_global = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_global_df.pkl") + OOF_data_df = pd.read_pickle(Feature_savepath+"OOF_data_FOOOF_df.pkl") + con_data_df = pd.read_pickle(Feature_savepath+"con_data_source_drop_interpol_df.pkl") + pec_data_df = pd.read_pickle(Feature_savepath+"pec_data_drop_interpol_ch_df.pkl") + microstate_transition_data_df = pd.read_pickle(Feature_savepath+"microstate_transition_data_df.pkl") + microstate_time_df = pd.read_pickle(Feature_savepath+"microstate_time_df.pkl") + microstate_entropy_df = pd.read_pickle(Feature_savepath+"microstate_entropy_df.pkl") + GC_data_df = pd.read_pickle(Feature_savepath+"GC_data_source_drop_interpol_df.pkl") + H_data_df = pd.read_pickle(Feature_savepath+"H_data_df.pkl") + H_data_df_global = pd.read_pickle(Feature_savepath+"H_data_global_df.pkl") + + # List of features + EEG_features_name_list = [['Power'], + ['Frontal Theta Beta Ratio', + 'Asymmetry'], + ['Peak Alpha Frequency', + 'Global Peak Alpha Frequency'], + ["1/f exponent"], + ['imcoh'], + ['wpli'], + ['Power Envelope Correlation'], + ['Microstate Transition', + 'Microstate Ratio Time', + 'Microstate Entropy'], + ["Granger Causality"], + ['DFA Exponent', + 'Global DFA Exponent']] + + # Arrange them to fit one 2D dataframe + # Make function to add measurement column for indexing + def add_measurement_column(df, measurement = "Text"): + dummy_variable = [measurement]*df.shape[0] + df.insert(1, "Measurement", dummy_variable) + return df + # Make function to convert column tuple to string + def convertTupleHeader(header): + header = list(header) + str = "_".join(header) + return str + + # Prepare overall dataframe + EEG_features_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"]) + + # Add power spectral densities + power_df = add_measurement_column(power_df, "Power") + temp_df = power_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Quant_status", "Eye_status", + + "Freq_band", "Channel"], dropna=False, + values="PSD").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add frontal theta/beta ratio + # fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio") + temp_df = fTBR_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status"], dropna=False, + values="TBR").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add power asymmetry + asymmetry_df = add_measurement_column(asymmetry_df, "Asymmetry") + temp_df = asymmetry_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Freq_band", "ROI"], dropna=False, + values="Asymmetry_score").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add peak alpha frequency + PAF_data_df = add_measurement_column(PAF_data_df, "Peak Alpha Frequency") + temp_df = PAF_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Channel"], dropna=False, + values="Value").reset_index(drop=True) + # NaN values are interpolated with means across channels for each condition + eye_status = list(final_epochs[0].event_id.keys()) + for ee in eye_status: + temp = temp_df.loc[:,("Peak Alpha Frequency",ee)] # get data + temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0) + temp_df.loc[:,("Peak Alpha Frequency",ee)] = temp.to_numpy() + # If there are still NaN the values are interpolated across channels and condition + temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T + + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add 1/f exponent + OOF_data_df = add_measurement_column(OOF_data_df, "1/f exponent") + temp_df = OOF_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Channel"], dropna=False, + values="Value").reset_index(drop=True) + # NaN values are interpolated with means across channels for each condition + eye_status = list(final_epochs[0].event_id.keys()) + for ee in eye_status: + temp = temp_df.loc[:,("1/f exponent",ee)] # get data + temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0) + temp_df.loc[:,("1/f exponent",ee)] = temp.to_numpy() + # If there are still NaN the values are interpolated across channels and condition + temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T + + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add global peak alpha frequency + #PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists + temp_df = PAF_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status"], dropna=False, + values="Value").reset_index(drop=True) + # NaN values are interpolated across eye condition + temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T + + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add connectivity measurements + #con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists + temp_df = con_data_df.pivot_table(index="Subject_ID",columns=["Con_measurement", + "Eye_status", "chx", "chy", "Freq_band"], dropna=True, + values="Value").reset_index(drop=True) + # Drop coh and plv, which are more susceptible to volume conduction + temp_df = temp_df.drop("coh",axis=1) + temp_df = temp_df.drop("plv",axis=1) + + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add orthogonalized power enveloped correlations + pec_data_df = add_measurement_column(pec_data_df, "Power Envelope Correlation") + temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "chx", "chy", "Freq_band"], dropna=True, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add microstate transition probabilities + microstate_transition_data_df = add_measurement_column(microstate_transition_data_df, "Microstate Transition") + temp_df = microstate_transition_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Transition"], dropna=False, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add microstate time covered + microstate_time_df = add_measurement_column(microstate_time_df, "Microstate Ratio Time") + # Convert microstate to str before using it as column + microstate_time_df = microstate_time_df.astype({"Microstate": str}) + + temp_df = microstate_time_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Microstate"], dropna=False, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add microstate entropy + #microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists + microstate_entropy_df["Measurement"] = ["Microstate Entropy"]*microstate_entropy_df.shape[0] + temp_df = microstate_entropy_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status"], dropna=False, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add Granger Causality # fixed so it is always chx --> chy + GC_data_df = add_measurement_column(GC_data_df, "Granger Causality") + temp_df = GC_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "chx", "chy", "Freq_band"], dropna=True, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + # Check if any GC was not calculated properly + assert temp_df.shape[1] == (31*31-31)*5*2 # expected number of features with 31 sources + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add DFA exponent + H_data_df = add_measurement_column(H_data_df, "DFA Exponent") + temp_df = H_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Channel", "Freq_band"], dropna=False, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + # Add Global Hurst exponent + H_data_df_global = H_data_df_global.drop("Measurement",axis=1) + H_data_df_global = add_measurement_column(H_data_df_global, "Global DFA Exponent") # already exists + temp_df = H_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "Freq_band"], dropna=False, + values="Value").reset_index(drop=True) + temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + + EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) + + return EEG_features_df, EEG_features_name_list + +# %% ML predction of PTSD status using all features +# mRMR -> mRMR -> SVM/RF/LogReg with L2 +# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV +EEG_features_df, EEG_features_name_list = load_features_df() + +# Add group status +Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 +EEG_features_df.insert(1, "Group_status", Group_status) + +# Subject info columns +Subject_info_cols = list(EEG_features_df.columns[0:2]) +n_subject_info_cols = len(Subject_info_cols) +n_discrete_cols = 2 + +# To ensure proper stratification into train/test set I will stratify using group status and study status +# A variable that encodes for this is created +n_studies = 3 +study_group_status = EEG_features_df["Group_status"].copy() +for i in range(n_studies): + # Get study index + study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) + # Assign label + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD + +# Target variable +Target = ["Group_status"] +Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) +Target_col_idx = np.where(Target_col == True)[0] + +# Make 3 models and save them to use enseemble in the end +CLF_models = ["SVM", "LogReg", "RF"] +n_models = len(CLF_models) + +# Repeat the classification to see if I just got a lucky seed +n_repetitions = 10 +k_out = 10 +accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) +model_par = [] +final_models = [] +final_features = [] +final_y_preds = [] +np.random.seed(42) + +# Prepare the splits beforehand to make sure the repetitions are not the same +Rep_Outer_CV = [] +Rep_Outer_CV_test = [] # a list with only the test indices +for rep in range(n_repetitions): + skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Outer_CV = [] + Outer_CV_test = [] + for train_index, test_index in skf.split(EEG_features_df,study_group_status): + Outer_CV.append([train_index,test_index]) + Outer_CV_test.append(test_index) + Rep_Outer_CV.append(Outer_CV) + Rep_Outer_CV_test.append(Outer_CV_test) +# Check none of the repetitions are the same by using only the test sets +# The list is first flattened +Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] +# All elements are converted to strings +# This makes it easier to look for uniques, and the indices are already in numerical order +Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] + +def allUnique(x): + seen = set() + return not any(i in seen or seen.add(i) for i in x) +assert allUnique(Rep_Outer_CV_test_flat_str) + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +for rep in range(n_repetitions): + # The outer fold CV has already been saved as lists + Outer_CV = Rep_Outer_CV[rep] + # Pre-allocate memory + model_par0 = [] + final_models0 = [] + final_features0 = [] + final_y_preds0 = [] + Outer_counter = 0 + for train_index, test_index in Outer_CV: + test_df = EEG_features_df.iloc[test_index] + train_df = EEG_features_df.iloc[train_index] + + # Training data will be standardized + standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) + + train_df_standard = train_df.copy() + train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) + # Test data will also be standardized but using mean and std from training data + test_df_standard = test_df.copy() + test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) + + # Get the training data + X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) + y_train = train_df_standard[Target] + # Get test data + X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) + y_test = test_df_standard[Target] + + # Prepare initial filtering of feature types to alleviate imbalance in number of features + # Use a variable that is optimized using inner CV + n_feat_mRMR1 = [20,30,40,50] + n_feat_mRMR2 = [30,40,50,60] + # Max features from mRMR for each eye status + max_mRMR_features = n_feat_mRMR1[-1] + max_mRMR_features2 = n_feat_mRMR2[-1] + + eye_status = ["Eyes Closed", "Eyes Open"] + n_eye_status = len(eye_status) + + mRMR_features = [] + feat_counter = [] + # Perform mRMR for each feature type + for fset in range(len(EEG_features_name_list)): + temp_feat = EEG_features_name_list[fset] + other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] + other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten + for e in range(n_eye_status): + ee = eye_status[e] + temp_ee = "{}_{}".format(temp_feat,ee) + # Retrieve the dataset for each feature + col_idx = np.zeros(len(X_train.columns), dtype=bool) + for fsub in range(len(temp_feat)): + temp_feat0 = temp_feat[fsub] + col_idx0 = X_train.columns.str.contains(temp_feat0+"_") + col_idx = np.logical_or(col_idx,col_idx0) # append all trues + temp_X_train = X_train.loc[:,col_idx] + # Check if any of the other features were wrongly chosen + for fcheck in range(len(other_feats)): + if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): + temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))] + if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again + temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] + # Index for eye status + temp_X_train = temp_X_train.loc[:,temp_X_train.columns.str.contains(ee)] + # Save number of original features fed into mRMR + feat_counter.append([temp_feat,temp_X_train.shape[1]]) + + # Do not use mRMR if there are fewer than max_mRMR_features + if temp_X_train.shape[1] <= max_mRMR_features: + filter_features = temp_X_train.columns + else: + # mRMR + filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), + num_features_to_select=max_mRMR_features, + K_MAX=1000,n_jobs=-1,verbose=False) + # Get selected features + filter_features = temp_X_train.columns[filter_feat_selector] + + # Save features selected + mRMR_features.append(list(filter_features)) + + # Check that no features were excluded when checking for wrongly chosen + total_features = 0 + for feat_c in range(len(feat_counter)): + total_features += feat_counter[feat_c][1] + assert total_features == X_train.shape[1] + + # Part 2 with second mRMR and classifiers in loop + k_fold = 10 + skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Inner_CV = [] + for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): + Inner_CV.append([train_index2,test_index2]) + + # SVM with L2-norm (it is by default squared) + # Prepare hyper parameters + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) + kernels = ["linear"] + # rbf overfit easier, whereas linear empirically works better in high D data + + # Sequential Feature Selection for Logistic Regression + # In-built model selection CV + # The L2 is the inverse of C for LogReg + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) + + # Random forest classifier + # Prepare hyper parameters + trees = np.array([10, 100, 500, 1000]) + depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting + + # Prepare arrays for validation errors + val_err_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels))) + val_feat_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels),max_mRMR_features2)) + val_feat_svm.fill(np.nan) + + val_err_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg))) + val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg),max_mRMR_features2)) + val_feat_logreg.fill(np.nan) + + val_err_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth))) + val_feat_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2)) + val_feat_rf.fill(np.nan) + val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2)) + val_feat_import_rf.fill(np.nan) + + for n1 in range(len(n_feat_mRMR1)): + # Get the features from first mRMR + mRMR_features1 = [item for sublist in mRMR_features for item in sublist[0:n_feat_mRMR1[n1]]] + # print("{} features are left after first mRMR filtering".format(len(mRMR_features1))) + + X_train_mRMR = X_train[mRMR_features1].copy() + + # Second round of mRMR on the reduced list + filter_feat_selector = mRMR_feature_select(X_train_mRMR.to_numpy(),y_train.to_numpy(), + num_features_to_select=max_mRMR_features2, + K_MAX=1000,n_jobs=-1,verbose=False) + for n2 in range(len(n_feat_mRMR2)): + # Get selected features from mRMR2 + filter_features = X_train_mRMR.columns[filter_feat_selector[0:n_feat_mRMR2[n2]]] + + X_train_mRMR2 = X_train_mRMR[filter_features].copy() + + # SVM with recursive feature elemination + # Stratified CV to find best regularization strength and number of features + # CV is built-in RFECV + min_features_to_select = 1 + # Using normal for loop as RFECV has inbuilt multiprocessing + for C in range(len(C_parameter_SVM)): + for K in range(len(kernels)): + # Define the model + svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000) + # Perform recurive feature elimination with in-built CV + rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy", + cv=Inner_CV, + min_features_to_select=min_features_to_select) + rfecv.fit(X_train_mRMR2,y_train.to_numpy().ravel()) + # Get CV score + err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] + # Save results for hyperparameter optimization + val_err_svm[n1,n2,C,K] = err + # Save the features chosen by RFE based on index from pre mRMR + rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0] + val_feat_svm[n1,n2,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx + + # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)]))) + + # Logistic regression with sequential forward selection + # In-bult CV + k_features = np.arange(1,n_feat_mRMR2[n2]+1,1) # try up to all features + inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) + for C in range(len(C_parameter_LogReg)): + LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], + max_iter = 50000) + sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), + forward = True, scoring = "balanced_accuracy", + verbose = 0, floating = False, + cv = Inner_CV, n_jobs = -1) + + sfs = sfs.fit(X_train_mRMR2, y_train.to_numpy().ravel()) + # Save CV scores for each SFS step + for feat in range(len(k_features)): + inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] + + # Find the best number of features + # I am rounding to make it more smooth and disregard small improvements + K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] + if len(K) > 1: + K = K[np.where(K == np.min(K))[0]] + err = inner_cv_scores[C,K] + # Save validation error and features + val_err_logreg[n1,n2,C] = err + # Save the features chosen by RFE based on index from pre mRMR + sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature + val_feat_logreg[n1,n2,C,0:len(sfs_feat_idx)] = sfs_feat_idx + + # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) + + # Random forest + model_test_err = np.zeros((k_fold,len(trees),len(depth))) + RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR2.shape[1])) + + counter = 0 + for cv in range(k_fold): + # Retrieve CV indices + train_index_rf = Inner_CV[cv][0] + test_index_rf = Inner_CV[cv][1] + # Retrieve datasets + X_train2 = X_train_mRMR2.iloc[train_index_rf]; X_test2 = X_train_mRMR2.iloc[test_index_rf] + y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] + + for t in range(len(trees)): + for d in range(len(depth)): + RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], + n_jobs=-1, random_state=None) + RF.fit(X_train2.to_numpy(),y_train2) + # Validation error + RF_y = RF.predict(X_test2.to_numpy()) + err = balanced_accuracy_score(y_test2, RF_y) + # Save to array + model_test_err[cv,t,d] = err + # Save feature importance array + RF_feat_import[cv,t,d] = RF.feature_importances_ + counter += 1 + # print("Finished RF run {} out of {}".format(counter, k_fold)) + # Average the errors over the CV folds + model_test_err_mean = np.mean(model_test_err, axis=0) + val_err_rf[n1,n2,:,:] = model_test_err_mean + # Average the feature importances over the CV folds + RF_feat_import = np.mean(RF_feat_import, axis=0) + val_feat_import_rf[n1,n2,:,:,0:n_feat_mRMR2[n2]] = RF_feat_import + # Save the features used by the RF based on index from pre mRMR + rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0] + val_feat_rf[n1,n2,:,:,0:len(rf_feat_idx)] = rf_feat_idx + + print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100)) + + # Choose the optimal parameters + ### SVM + n1, n2, C, K = np.where(val_err_svm==np.max(val_err_svm)) + + if len(C) > 1: + print("There are multiple SVM runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + rfe_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + n2_temp = int(n2[i2]) + C_temp = int(C[i2]) + K_temp = int(K[i2]) + temp_feats = val_feat_svm[n1_temp,n2_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,n2_temp,C_temp,K_temp])].astype(int) + rfe_feat_len.append(len(temp_feats)) + + rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] + if len(rfe_feat_min) > 1: + print("Multiple SVM runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[rfe_feat_min]) + n1, n2, C, K = [int(n1[rfe_feat_min][C_min]), int(n2[rfe_feat_min][C_min]), + int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])] + else: + n1, n2, C, K = [int(n1[int(rfe_feat_min)]), int(n2[int(rfe_feat_min)]), + int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])] + else: + n1, n2, C, K = [int(n1), int(n2), int(C), int(K)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + mRMR_chosen2 = n_feat_mRMR2[n2] + C_chosen = C_parameter_SVM[C] + K_chosen = kernels[K] + + # Save the best validation erro + val_error = val_err_svm[n1, n2, C, K] + + # Get the subsets chosen + chosen_feats = val_feat_svm[n1,n2,C,K][~np.isnan(val_feat_svm[n1,n2,C,K])].astype(int) + rfe_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rfe_features) + x_train_mRMR2_rfe = X_train[rfe_features] + + # Fit on all training data + model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000) + model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rfe) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rfe_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error] + SVM_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat,K_chosen] + SVM_model = model + SVM_y_pred = [[model_train_y],[y_pred]] + + ### LogReg with SFS + n1, n2, C = np.where(val_err_logreg==np.max(val_err_logreg)) + if len(C) > 1: + print("There are multiple LogReg runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + sfs_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + n2_temp = int(n2[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_logreg[n1_temp,n2_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,n2_temp,C_temp])].astype(int) + sfs_feat_len.append(len(temp_feats)) + + sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] + if len(sfs_feat_min) > 1: + print("Multiple LogReg runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[sfs_feat_min]) + n1, n2, C = [int(n1[sfs_feat_min][C_min]), int(n2[sfs_feat_min][C_min]), + int(C[sfs_feat_min][C_min])] + else: + n1, n2, C = [int(n1[int(sfs_feat_min)]), int(n2[int(sfs_feat_min)]), + int(C[int(sfs_feat_min)])] + else: + n1, n2, C = [int(n1), int(n2), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + mRMR_chosen2 = n_feat_mRMR2[n2] + C_chosen = C_parameter_LogReg[C] + + # Save the best validation erro + val_error = val_err_logreg[n1, n2, C] + + # Get the subsets chosen + chosen_feats = val_feat_logreg[n1,n2,C][~np.isnan(val_feat_logreg[n1,n2,C])].astype(int) + sfs_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(sfs_features) + x_train_mRMR2_sfs = X_train[sfs_features] + + # Fit on all training data + model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) + model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_sfs) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[sfs_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error] + LogReg_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat] + LogReg_model = model + LogReg_y_pred = [[model_train_y],[y_pred]] + + ### RF + n1, n2, t, d = np.where(val_err_rf==np.max(val_err_rf)) + + if len(d) > 1: + print("There are multiple RF runs with the same validation error") + print("Choosing run with lowest depth to alleviate overfitting") + d_min = np.where(d==np.min(d))[0] + + if len(d_min) > 1: + print("Multiple RF runs with same number number of depths") + print("Choosing run with lowest trees to alleviate overfitting") + t_min = np.argmin(t[d_min]) # argmin just takes the first + # If there are multiple with same parameters and validation error it is most likely the same + n1, n2, t, d = [int(n1[d_min][t_min]), int(n2[d_min][t_min]), + int(t[d_min][t_min]), int(d[d_min][t_min])] + else: + n1, n2, t, d = [int(n1[int(d_min)]), int(n2[int(d_min)]), + int(t[int(d_min)]), int(d[int(d_min)])] + else: + n1, n2, t, d = [int(n1), int(n2), int(t), int(d)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + mRMR_chosen2 = n_feat_mRMR2[n2] + t_chosen = trees[t] + d_chosen = depth[d] + + # Save the best validation erro + val_error = val_err_rf[n1, n2, t, d] + + # Get the chosen features and feature importances + chosen_feats = val_feat_rf[n1,n2,t,d][~np.isnan(val_feat_rf[n1,n2,t,d])].astype(int) + rf_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rf_features) + x_train_mRMR2_rf = X_train[rf_features] + + rf_feat_importances = val_feat_import_rf[n1,n2,t,d][~np.isnan(val_feat_import_rf[n1,n2,t,d])] + + # Fit on all training data + model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, + n_jobs=-1, random_state=None) + model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rf) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rf_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error] + RF_model_par = [mRMR_chosen1,mRMR_chosen2,t_chosen,d_chosen] + RF_model = model + RF_y_pred = [[model_train_y],[y_pred]] + + # Save the rest + model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par]) + final_models0.append([SVM_model,LogReg_model,RF_model]) + final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]]) + final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred]) + # Move counter + Outer_counter += 1 + print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1)) + # Save results from all outer folds + model_par.append(model_par0) + final_models.append(final_models0) + final_features.append(final_features0) + final_y_preds.append(final_y_preds0) + # Save results to file + Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds] + # Run with variable feat in first and second mRMR + with open(Model_savepath+"Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl", "wb") as filehandle: + pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle) + + # Get current time + c_time2 = time.localtime() + c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) + print("Started", c_time1, "\nCurrent Time",c_time2) + + # Print total progress + print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) + +# %% ML predction of PTSD status using each feature separately +# mRMR -> mRMR -> SVM/RF/LogReg with L2 +# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV +# With concurrent processes for faster computation +EEG_features_df, EEG_features_name_list = load_features_df() + +# Add group status +Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 +EEG_features_df.insert(1, "Group_status", Group_status) + +# Subject info columns +Subject_info_cols = list(EEG_features_df.columns[0:2]) +n_subject_info_cols = len(Subject_info_cols) +n_discrete_cols = 2 + +# To ensure proper stratification into train/test set I will stratify using group status and study status +# A variable that encodes for this is created +n_studies = 3 +study_group_status = EEG_features_df["Group_status"].copy() +for i in range(n_studies): + # Get study index + study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) + # Assign label + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD + +# Target variable +Target = ["Group_status"] +Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) +Target_col_idx = np.where(Target_col == True)[0] + +# Make 3 models and save them to use enseemble in the end +CLF_models = ["SVM", "LogReg", "RF"] +n_models = len(CLF_models) + +# Repeat the classification to see if I just got a lucky seed +n_repetitions = 10 +k_out = 10 +# accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3)) +Ind_feat_clf = [0]*n_repetitions +np.random.seed(42) + +# Prepare the splits beforehand to make sure the repetitions are not the same +Rep_Outer_CV = [] +Rep_Outer_CV_test = [] # a list with only the test indices +for rep in range(n_repetitions): + skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Outer_CV = [] + Outer_CV_test = [] + for train_index, test_index in skf.split(EEG_features_df,study_group_status): + Outer_CV.append([train_index,test_index]) + Outer_CV_test.append(test_index) + Rep_Outer_CV.append(Outer_CV) + Rep_Outer_CV_test.append(Outer_CV_test) +# Check none of the repetitions are the same by using only the test sets +# The list is first flattened +Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] +# All elements are converted to strings +# This makes it easier to look for uniques, and the indices are already in numerical order +Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] + +def allUnique(x): + seen = set() + return not any(i in seen or seen.add(i) for i in x) +assert allUnique(Rep_Outer_CV_test_flat_str) + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +def Each_feat_10rep_10x10CV(rep): + # The outer fold CV has already been saved as lists + Outer_CV = Rep_Outer_CV[rep] + # Pre-allocate memory + Ind_feat_clf0 = [] + accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3)) + Outer_counter = 0 + for train_index, test_index in Outer_CV: + test_df = EEG_features_df.iloc[test_index] + train_df = EEG_features_df.iloc[train_index] + + # Training data will be standardized + standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) + + train_df_standard = train_df.copy() + train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) + # Test data will also be standardized but using mean and std from training data + test_df_standard = test_df.copy() + test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) + + # Get the training data + X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) + y_train = train_df_standard[Target] + # Get test data + X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) + y_test = test_df_standard[Target] + + # Part 2 with second mRMR and classifiers in loop + k_fold = 10 + skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Inner_CV = [] + for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): + Inner_CV.append([train_index2,test_index2]) + + # Prepare initial filtering of feature types to alleviate imbalance in number of features + # Use a variable that is optimized using inner CV + n_feat_mRMR1 = [20,30,40,50] + + # Max features from mRMR for each eye status + max_mRMR_features = n_feat_mRMR1[-1] + + eye_status = ["Eyes Closed", "Eyes Open"] + n_eye_status = len(eye_status) + + feat_counter = [] + # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF + Ind_feat_clf00 = [] + for fset in range(len(EEG_features_name_list)): + temp_feat = EEG_features_name_list[fset] + other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] + other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten + # Retrieve the dataset for each feature + col_idx = np.zeros(len(X_train.columns), dtype=bool) + for fsub in range(len(temp_feat)): + temp_feat0 = temp_feat[fsub] + col_idx0 = X_train.columns.str.contains(temp_feat0+"_") + col_idx = np.logical_or(col_idx,col_idx0) # append all trues + temp_X_train = X_train.loc[:,col_idx] + # Check if any of the other features were wrongly chosen + for fcheck in range(len(other_feats)): + if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): + temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))] + if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again + temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] + + # Save number of original features fed into mRMR + feat_counter.append([temp_feat,temp_X_train.shape[1]]) + # If there are no features selected, then skip the rest of the loop + if temp_X_train.shape[1] == 0: + continue + + # Do not use mRMR if there are fewer than n_features + if temp_X_train.shape[1] <= max_mRMR_features: + filter_features = temp_X_train.columns + else: + # mRMR + filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), + num_features_to_select=max_mRMR_features, + K_MAX=1000,n_jobs=1,verbose=False) + # Get selected features + filter_features = temp_X_train.columns[filter_feat_selector] + + # SVM with L2-norm (it is by default squared) + # Prepare hyper parameters + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) + # rbf kernel overfit easier, whereas linear empirically works better in high D data + + # Sequential Feature Selection for Logistic Regression + # In-built model selection CV + # The L2 is the inverse of C for LogReg + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) + + # Random forest classifier + # Prepare hyper parameters + trees = np.array([10, 100, 500, 1000]) + depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting + + # Prepare arrays for validation errors + val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM))) + val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features)) + val_feat_svm.fill(np.nan) + + val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) + val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) + val_feat_logreg.fill(np.nan) + + val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) + val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_rf.fill(np.nan) + val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_import_rf.fill(np.nan) + + min_features_to_select = 1 + + for n1 in range(len(n_feat_mRMR1)): + mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]] + # Use the selected filter features + temp_X_train_mRMR = temp_X_train[mRMR_features1] + + # SVM with recursive feature elemination + # Stratified CV to find best regularization strength and number of features + # CV is built-in RFECV + for C in range(len(C_parameter_SVM)): + # Define the model + svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000) + # Perform recurive feature elimination with in-built CV + rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy", + cv=Inner_CV, + min_features_to_select=min_features_to_select) + rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel()) + # Get CV score + err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] + # Save results for hyperparameter optimization + val_err_svm[n1,C] = err + # Save the features chosen by RFE based on index from pre mRMR + rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0] + val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx + # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM))) + + # Logistic regression with sequential forward selection + # In-bult CV + k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]]) + k_features = np.arange(1,k_max+1,1) # try up to all features + inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) + for C in range(len(C_parameter_LogReg)): + LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], + max_iter = 50000) + sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), + forward = True, scoring = "balanced_accuracy", + verbose = 0, floating = False, + cv = Inner_CV, n_jobs = 1) + + sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel()) + # Save CV scores for each SFS step + for feat in range(len(k_features)): + inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] + + # Find the best number of features + # I am rounding to make it more smooth and disregard small improvements + K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] + if len(K) > 1: + K = K[np.where(K == np.min(K))[0]] + err = inner_cv_scores[C,K] + # Save validation error and features + val_err_logreg[n1,C] = err + # Save the features chosen by RFE based on index from pre mRMR + sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature + val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx + + # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) + + # Random forest + model_test_err = np.zeros((k_fold,len(trees),len(depth))) + RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1])) + + counter = 0 + for cv in range(k_fold): + # Retrieve CV indices + train_index_rf = Inner_CV[cv][0] + test_index_rf = Inner_CV[cv][1] + # Retrieve datasets + X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf] + y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] + + for t in range(len(trees)): + for d in range(len(depth)): + RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], + n_jobs=1, random_state=None) + RF.fit(X_train2.to_numpy(),y_train2) + # Validation error + RF_y = RF.predict(X_test2.to_numpy()) + err = balanced_accuracy_score(y_test2,RF_y) + # Save to array + model_test_err[cv,t,d] = err + # Save feature importance array + RF_feat_import[cv,t,d] = RF.feature_importances_ + counter += 1 + # print("Finished RF run {} out of {}".format(counter, k_fold)) + + # Average the errors over the CV folds + model_test_err_mean = np.mean(model_test_err, axis=0) + val_err_rf[n1,:,:] = model_test_err_mean + # Average the feature importances over the CV folds + RF_feat_import = np.mean(RF_feat_import, axis=0) + val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import + # Save the features used by the RF based on index from pre mRMR + rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0] + val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx + + # Print progress + current_progress = (n1+1)/(len(n_feat_mRMR1))*100 + print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0])) + + # Choose the optimal parameters + ### SVM + n1, C = np.where(val_err_svm==np.max(val_err_svm)) + + if len(C) > 1: + print("There are multiple SVM runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + rfe_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int) + rfe_feat_len.append(len(temp_feats)) + + rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] + if len(rfe_feat_min) > 1: + print("Multiple SVM runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[rfe_feat_min]) + n1, C = [int(n1[rfe_feat_min][C_min]), + int(C[rfe_feat_min][C_min])] + else: + n1, C = [int(n1[int(rfe_feat_min)]), + int(C[int(rfe_feat_min)])] + else: + n1, C = [int(n1), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_SVM[C] + + # Save the best validation error + val_error = val_err_svm[n1, C] + + # Get the subsets chosen + chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int) + rfe_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rfe_features) + x_train_mRMR2_rfe = X_train[rfe_features] + + # Fit on all training data + model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000) + model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rfe) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rfe_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error] + SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat] + SVM_model = model + SVM_y_pred = [[model_train_y],[y_pred]] + + ### LogReg with SFS + n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) + if len(C) > 1: + print("There are multiple LogReg runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + sfs_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) + sfs_feat_len.append(len(temp_feats)) + + sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] + if len(sfs_feat_min) > 1: + print("Multiple LogReg runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[sfs_feat_min]) + n1, C = [int(n1[sfs_feat_min][C_min]), + int(C[sfs_feat_min][C_min])] + else: + n1, C = [int(n1[int(sfs_feat_min)]), + int(C[int(sfs_feat_min)])] + else: + n1, C = [int(n1), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_LogReg[C] + + # Save the best validation erro + val_error = val_err_logreg[n1, C] + + # Get the subsets chosen + chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) + sfs_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(sfs_features) + x_train_mRMR2_sfs = X_train[sfs_features] + + # Fit on all training data + model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) + model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_sfs) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[sfs_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error] + LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] + LogReg_model = model + LogReg_y_pred = [[model_train_y],[y_pred]] + + ### RF + n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) + + if len(d) > 1: + print("There are multiple RF runs with the same validation error") + print("Choosing run with lowest depth to alleviate overfitting") + d_min = np.where(d==np.min(d))[0] + + if len(d_min) > 1: + print("Multiple RF runs with same number number of depths") + print("Choosing run with lowest trees to alleviate overfitting") + t_min = np.argmin(t[d_min]) # argmin just takes the first + # If there are multiple with same parameters and validation error it is most likely the same + n1, t, d = [int(n1[d_min][t_min]), + int(t[d_min][t_min]), int(d[d_min][t_min])] + else: + n1, t, d = [int(n1[int(d_min)]), + int(t[int(d_min)]), int(d[int(d_min)])] + else: + n1, t, d = [int(n1), int(t), int(d)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + t_chosen = trees[t] + d_chosen = depth[d] + + # Save the best validation error + val_error = val_err_rf[n1, t, d] + + # Get the chosen features and feature importances + chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) + rf_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rf_features) + x_train_mRMR2_rf = X_train[rf_features] + + rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] + + # Fit on all training data + model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, + n_jobs=1, random_state=None) + model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rf) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rf_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error] + RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] + RF_model = model + RF_y_pred = [[model_train_y],[y_pred]] + + # Save the rest + model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par] + final_models00 = [SVM_model,LogReg_model,RF_model] + final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]] + final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred] + data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_] + + # Save the results for the specific feature + res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00] + Ind_feat_clf00.append(res) + print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat)) + + # Check that no features were excluded when checking for wrongly chosen + total_features = 0 + for feat_c in range(len(feat_counter)): + total_features += feat_counter[feat_c][1] + assert total_features == X_train.shape[1] + # Save results over outer folds + Ind_feat_clf0.append(Ind_feat_clf00) + + print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1)) + # Move counter + Outer_counter += 1 + + out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable] + # Print total progress + print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) + + # Get current time + c_time2 = time.localtime() + c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) + print("Started", c_time1, "\nCurrent Time",c_time2) + + return rep, out + +Ind_feat_clf = [0]*n_repetitions +with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor: + for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments + Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable] + # Save results to file and overwrite when new results arrive + with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle: + pickle.dump(Ind_feat_clf, filehandle) + +# %% Sparse clustering of all EEG features to look for subtypes +EEG_features_df, EEG_features_name_list = load_features_df() + +# Add group status +Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 +EEG_features_df.insert(1, "Group_status", Group_status) + +# Only use PTSD patient group +EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"]==1,:] + +Subject_info_cols = ["Subject_ID","Group_status"] + +# Standardize the values +X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1)) +standardizer = preprocessing.StandardScaler().fit(X) +X = standardizer.transform(X) + +# Use gridsearch and permutations to estimate gap statistic and use it to +# determine number of clusters and sparsity s +# I will use 100 permutations and test 2 to 6 clusters as Zhang 2020 +max_clusters = 6 +n_sparsity_feat = 20 +perm_res = [] + +Timestamps = [] +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) +Timestamps.append(c_time1) +for k in range(1,max_clusters): + # Cannot permute with 1 cluster + n_clusters = k+1 + perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True, + nvals=n_sparsity_feat, nperms=100) + perm_res.append(perm) + # Get current time + c_time2 = time.localtime() + c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) + print("Started", c_time1, "\nCurrent Time",c_time2) + Timestamps.append(c_time2) + +# Save the results +with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file: + pickle.dump(perm_res, file) + +# # Load +# with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file: +# perm_res = pickle.load(file) + +# Convert results to array +perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4)) +for i in range(len(perm_res)): + _, gaps, sdgaps, wbounds, _ = perm_res[i].values() + for i2 in range(n_sparsity_feat): + perm_res_arr[n_sparsity_feat*i+i2,0] = i+2 # cluster size + perm_res_arr[n_sparsity_feat*i+i2,1] = gaps[i2] # gap statistic + perm_res_arr[n_sparsity_feat*i+i2,2] = sdgaps[i2] # gap statistic std + perm_res_arr[n_sparsity_feat*i+i2,3] = wbounds[i2] # sparsity feature s + +# For each sparsity s, determine best k using one-standard-error criterion +# Meaning the cluster and sparsity is chosen for the smallest value of k for a fixed s +# that fulfill Gap(k) >= Gap(k+1)-std(k+1) +def one_standard_deviation_search(gaps, std): + best_gaps = np.argmax(gaps) + current_gaps = gaps[best_gaps] + current_std = std[best_gaps] + current_gaps_idx = best_gaps + while (gaps[current_gaps_idx-1] >= current_gaps - current_std): + if current_gaps_idx == 0: + break + else: + current_gaps_idx -= 1 + current_gaps = gaps[current_gaps_idx] + current_std = std[current_gaps_idx] + out = current_gaps, current_std, current_gaps_idx + return out + +best_ks = np.zeros((n_sparsity_feat, 2)) +all_s = np.unique(perm_res_arr[:,3]) +plt.figure(figsize=(12,12)) +for i2 in range(n_sparsity_feat): + current_s = all_s[i2] + gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1] + std = perm_res_arr[perm_res_arr[:,3] == current_s,2] + _, _, idx = one_standard_deviation_search(gaps, std) + # Save to array + best_ks[i2,0] = current_s + best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx] + # Plot gap + plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"), + gaps, yerr=std, capsize=5, label = np.round(current_s,3)) +plt.title("Gap statistic for different fixed s") +plt.legend(loc=1) +plt.xlabel("Number of clusters") +plt.ylabel("Gap statistic") + +best_k = int(scipy.stats.mode(best_ks[:,1])[0]) + +# Determine s using fixed k as lowest s within 1 std of max gap statistic +# According to Witten & Tibshirani, 2010 +best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1]) +best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx] +best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx] +one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std + +best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]]) + +# Perform clustering with k clusters +sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0] + +# # Save the results +# with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file: +# pickle.dump(sparcl, file) + +with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: + sparcl = pickle.load(file) + +# %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2 +# Using sparse kmeans selected features, only eyes closed as they were primarily chosen +# Prediction of subtype 1 +EEG_features_df, EEG_features_name_list = load_features_df() + +with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: + sparcl = pickle.load(file) + +# Use concatenated features +EEG_features_name_list = [['Power'], + ['Frontal Theta Beta Ratio', + 'Asymmetry'], + ['Peak Alpha Frequency', + 'Global Peak Alpha Frequency'], + ["1/f exponent"], + ['imcoh'], + ['wpli'], + ['Power Envelope Correlation'], + ['Microstate Transition', + 'Microstate Ratio Time', + 'Microstate Entropy'], + ["Granger Causality"], + ['DFA Exponent', + 'Global DFA Exponent']] + +# Add group status +Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 +EEG_features_df.insert(1, "Group_status", Group_status) + +# Only keep PTSD subtype 1 by dropping subtype 2 +subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] +subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] +EEG_features_df = EEG_features_df.set_index("Subject_ID") +EEG_features_df = EEG_features_df.drop(subtype2) +EEG_features_df = EEG_features_df.reset_index() +# Check it was dropped correctly +assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]) + +# Subject info columns +Subject_info_cols = list(EEG_features_df.columns[0:2]) +n_subject_info_cols = len(Subject_info_cols) +n_discrete_cols = 2 + +# Get features from sparse kmeans +nonzero_idx = sparcl["ws"].nonzero() +sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx] +sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO +# Only use eyes closed (2483 features) +sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")] +EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1) + +# To ensure proper stratification into train/test set I will stratify using group status and study status +# A variable that encodes for this is created +n_studies = 3 +study_group_status = EEG_features_df["Group_status"].copy() +for i in range(n_studies): + # Get study index + study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) + # Assign label + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD + +# Target variable +Target = ["Group_status"] +Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) +Target_col_idx = np.where(Target_col == True)[0] + +# Make 3 models and save them to use enseemble in the end +CLF_models = ["SVM", "LogReg", "RF"] +n_models = len(CLF_models) + +# Repeat the classification to see if I just got a lucky seed +n_repetitions = 10 +k_out = 10 +accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) +model_par = [] +final_models = [] +final_features = [] +final_y_preds = [] +np.random.seed(42) + +# Prepare the splits beforehand to make sure the repetitions are not the same +Rep_Outer_CV = [] +Rep_Outer_CV_test = [] # a list with only the test indices +for rep in range(n_repetitions): + skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Outer_CV = [] + Outer_CV_test = [] + for train_index, test_index in skf.split(EEG_features_df,study_group_status): + Outer_CV.append([train_index,test_index]) + Outer_CV_test.append(test_index) + Rep_Outer_CV.append(Outer_CV) + Rep_Outer_CV_test.append(Outer_CV_test) +# Check none of the repetitions are the same by using only the test sets +# The list is first flattened +Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] +# All elements are converted to strings +# This makes it easier to look for uniques, and the indices are already in numerical order +Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] + +def allUnique(x): + seen = set() + return not any(i in seen or seen.add(i) for i in x) +assert allUnique(Rep_Outer_CV_test_flat_str) + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +for rep in range(n_repetitions): + # The outer fold CV has already been saved as lists + Outer_CV = Rep_Outer_CV[rep] + # Pre-allocate memory + model_par0 = [] + final_models0 = [] + final_features0 = [] + final_y_preds0 = [] + Outer_counter = 0 + for train_index, test_index in Outer_CV: + test_df = EEG_features_df.iloc[test_index] + train_df = EEG_features_df.iloc[train_index] + + # Training data will be standardized + standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) + + train_df_standard = train_df.copy() + train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) + # Test data will also be standardized but using mean and std from training data + test_df_standard = test_df.copy() + test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) + + # Get the training data + X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) + y_train = train_df_standard[Target] + # Get test data + X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) + y_test = test_df_standard[Target] + + # Prepare initial filtering of feature types to alleviate imbalance in number of features + # Use a variable that is optimized using inner CV + n_feat_mRMR1 = [30,40,50,60] + # Max features from mRMR for each eye status + max_mRMR_features = n_feat_mRMR1[-1] + + # mRMR on the all sparse kmeans selected features + filter_feat_selector = mRMR_feature_select(X_train.to_numpy(),y_train.to_numpy(), + num_features_to_select=max_mRMR_features, + K_MAX=1000,n_jobs=-1,verbose=False) + + # Part 2 with second mRMR and classifiers in loop + k_fold = 10 + skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Inner_CV = [] + for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): + Inner_CV.append([train_index2,test_index2]) + + # SVM with L2-norm (it is by default squared) + # Prepare hyper parameters + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) + kernels = ["linear"] # error when using rbf and RFECV + # rbf also overfit easier, whereas linear empirically works better in high D data + + # Sequential Feature Selection for Logistic Regression + # In-built model selection CV + # The L2 is the inverse of C for LogReg + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) + + # Random forest classifier + # Prepare hyper parameters + trees = np.array([10, 100, 500, 1000]) + depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting + + # Prepare arrays for validation errors + val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels))) + val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels),max_mRMR_features)) + val_feat_svm.fill(np.nan) + + val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) + val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) + val_feat_logreg.fill(np.nan) + + val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) + val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_rf.fill(np.nan) + val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_import_rf.fill(np.nan) + + for n1 in range(len(n_feat_mRMR1)): + # Get selected features from mRMR + filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]] + + X_train_mRMR = X_train[filter_features].copy() + + # SVM with recursive feature elemination + # Stratified CV to find best regularization strength and number of features + # CV is built-in RFECV + min_features_to_select = 1 + # Using normal for loop as RFECV has inbuilt multiprocessing + for C in range(len(C_parameter_SVM)): + for K in range(len(kernels)): + # Define the model + svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000) + # Perform recurive feature elimination with in-built CV + rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy", + cv=Inner_CV, + min_features_to_select=min_features_to_select) + rfecv.fit(X_train_mRMR,y_train.to_numpy().ravel()) + # Get CV score + err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] + # Save results for hyperparameter optimization + val_err_svm[n1,C,K] = err + # Save the features chosen by RFE based on index from pre mRMR + rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0] + val_feat_svm[n1,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx + + # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)]))) + + # Logistic regression with sequential forward selection + # In-bult CV + k_features = np.arange(1,n_feat_mRMR1[n1]+1,1) # try up to all features + inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) + for C in range(len(C_parameter_LogReg)): + LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], + max_iter = 50000) + sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), + forward = True, scoring = "balanced_accuracy", + verbose = 0, floating = False, + cv = Inner_CV, n_jobs = -1) + + sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel()) + # Save CV scores for each SFS step + for feat in range(len(k_features)): + inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] + + # Find the best number of features + # I am rounding to make it more smooth and disregard small improvements + K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] + if len(K) > 1: + K = K[np.where(K == np.min(K))[0]] + err = inner_cv_scores[C,K] + # Save validation error and features + val_err_logreg[n1,C] = err + # Save the features chosen by RFE based on index from pre mRMR + sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature + val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx + + # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) + + # Random forest + model_test_err = np.zeros((k_fold,len(trees),len(depth))) + RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR.shape[1])) + + counter = 0 + for cv in range(k_fold): + # Retrieve CV indices + train_index_rf = Inner_CV[cv][0] + test_index_rf = Inner_CV[cv][1] + # Retrieve datasets + X_train2 = X_train_mRMR.iloc[train_index_rf]; X_test2 = X_train_mRMR.iloc[test_index_rf] + y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] + + for t in range(len(trees)): + for d in range(len(depth)): + RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], + n_jobs=-1, random_state=None) + RF.fit(X_train2.to_numpy(),y_train2) + # Validation error + RF_y = RF.predict(X_test2.to_numpy()) + err = balanced_accuracy_score(y_test2, RF_y) + # Save to array + model_test_err[cv,t,d] = err + # Save feature importance array + RF_feat_import[cv,t,d] = RF.feature_importances_ + counter += 1 + # print("Finished RF run {} out of {}".format(counter, k_fold)) + # Average the errors over the CV folds + model_test_err_mean = np.mean(model_test_err, axis=0) + val_err_rf[n1,:,:] = model_test_err_mean + # Average the feature importances over the CV folds + RF_feat_import = np.mean(RF_feat_import, axis=0) + val_feat_import_rf[n1,:,:,0:n_feat_mRMR1[n1]] = RF_feat_import + # Save the features used by the RF based on index from pre mRMR + rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0] + val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx + + print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100)) + + # Choose the optimal parameters + ### SVM + n1, C, K = np.where(val_err_svm==np.max(val_err_svm)) + + if len(C) > 1: + print("There are multiple SVM runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + rfe_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + K_temp = int(K[i2]) + temp_feats = val_feat_svm[n1_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,C_temp,K_temp])].astype(int) + rfe_feat_len.append(len(temp_feats)) + + rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] + if len(rfe_feat_min) > 1: + print("Multiple SVM runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[rfe_feat_min]) + n1, C, K = [int(n1[rfe_feat_min][C_min]), + int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])] + else: + n1, C, K = [int(n1[int(rfe_feat_min)]), + int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])] + else: + n1, C, K = [int(n1), int(C), int(K)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_SVM[C] + K_chosen = kernels[K] + + # Save the best validation erro + val_error = val_err_svm[n1, C, K] + + # Get the subsets chosen + chosen_feats = val_feat_svm[n1,C,K][~np.isnan(val_feat_svm[n1,C,K])].astype(int) + rfe_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rfe_features) + x_train_mRMR_rfe = X_train[rfe_features] + + # Fit on all training data + model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000) + model = model.fit(x_train_mRMR_rfe,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR_rfe) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rfe_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error] + SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat,K_chosen] + SVM_model = model + SVM_y_pred = [[model_train_y],[y_pred]] + + ### LogReg with SFS + n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) + if len(C) > 1: + print("There are multiple LogReg runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + sfs_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) + sfs_feat_len.append(len(temp_feats)) + + sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] + if len(sfs_feat_min) > 1: + print("Multiple LogReg runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[sfs_feat_min]) + n1, C = [int(n1[sfs_feat_min][C_min]), + int(C[sfs_feat_min][C_min])] + else: + n1, C = [int(n1[int(sfs_feat_min)]), + int(C[int(sfs_feat_min)])] + else: + n1, C = [int(n1), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_LogReg[C] + + # Save the best validation erro + val_error = val_err_logreg[n1, C] + + # Get the subsets chosen + chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) + sfs_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(sfs_features) + x_train_mRMR_sfs = X_train[sfs_features] + + # Fit on all training data + model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) + model = model.fit(x_train_mRMR_sfs,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR_sfs) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[sfs_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error] + LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] + LogReg_model = model + LogReg_y_pred = [[model_train_y],[y_pred]] + + ### RF + n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) + + if len(d) > 1: + print("There are multiple RF runs with the same validation error") + print("Choosing run with lowest depth to alleviate overfitting") + d_min = np.where(d==np.min(d))[0] + + if len(d_min) > 1: + print("Multiple RF runs with same number number of depths") + print("Choosing run with lowest trees to alleviate overfitting") + t_min = np.argmin(t[d_min]) # argmin just takes the first + # If there are multiple with same parameters and validation error it is most likely the same + n1, t, d = [int(n1[d_min][t_min]), + int(t[d_min][t_min]), int(d[d_min][t_min])] + else: + n1, t, d = [int(n1[int(d_min)]), + int(t[int(d_min)]), int(d[int(d_min)])] + else: + n1, t, d = [int(n1), int(t), int(d)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + t_chosen = trees[t] + d_chosen = depth[d] + + # Save the best validation error + val_error = val_err_rf[n1, t, d] + + # Get the chosen features and feature importances + chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) + rf_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rf_features) + x_train_mRMR_rf = X_train[rf_features] + + rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] + + # Fit on all training data + model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, + n_jobs=-1, random_state=None) + model = model.fit(x_train_mRMR_rf,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR_rf) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rf_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error] + RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] + RF_model = model + RF_y_pred = [[model_train_y],[y_pred]] + + # Save the rest + model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par]) + final_models0.append([SVM_model,LogReg_model,RF_model]) + final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]]) + final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred]) + # Move counter + Outer_counter += 1 + print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1)) + # Save results from all outer folds + model_par.append(model_par0) + final_models.append(final_models0) + final_features.append(final_features0) + final_y_preds.append(final_y_preds0) + # Save results to file + Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds] + # Run with variable feat in first and second mRMR + with open(Model_savepath+"Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle: + pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle) + + # Get current time + c_time2 = time.localtime() + c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) + print("Started", c_time1, "\nCurrent Time",c_time2) + + # Print total progress + print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) + + +# %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2 +# Using each feature (of the sparse Kmeans EC chosen ones) +# For Subtype 1 +# 10 repetitions of 10 fold outer and 20 fold inner two-layer CV +# Prediction of subtype 1 +EEG_features_df, EEG_features_name_list = load_features_df() # removed coh and plv and renamed Hurst in v4 + +with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: + sparcl = pickle.load(file) + +# Add group status +Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 +EEG_features_df.insert(1, "Group_status", Group_status) + +# Only keep PTSD subtype 1 by dropping subtype 2 +PTSD_idx = np.array([i in cases for i in Subject_id]) +CTRL_idx = np.array([not i in cases for i in Subject_id]) +subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] +subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] +EEG_features_df = EEG_features_df.set_index("Subject_ID") +EEG_features_df = EEG_features_df.drop(subtype2) +EEG_features_df = EEG_features_df.reset_index() +# Check it was dropped correctly +assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]) + +# Subject info columns +Subject_info_cols = list(EEG_features_df.columns[0:2]) +n_subject_info_cols = len(Subject_info_cols) +n_discrete_cols = 2 + +# Get features from sparse kmeans +nonzero_idx = sparcl["ws"].nonzero() +sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx] +sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO +# Only use eyes closed (2483 features) +sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")] +EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1) + +# To ensure proper stratification into train/test set I will stratify using group status and study status +# A variable that encodes for this is created +n_studies = 3 +study_group_status = EEG_features_df["Group_status"].copy() +for i in range(n_studies): + # Get study index + study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) + # Assign label + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL + study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD + +# Target variable +Target = ["Group_status"] +Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) +Target_col_idx = np.where(Target_col == True)[0] + +# Make 3 models and save them to use enseemble in the end +CLF_models = ["SVM", "LogReg", "RF"] +n_models = len(CLF_models) + +# Repeat the classification to see if I just got a lucky seed +n_repetitions = 10 +k_out = 10 +accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) +model_par = [] +final_models = [] +final_features = [] +final_y_preds = [] +np.random.seed(42) + +# Prepare the splits beforehand to make sure the repetitions are not the same +Rep_Outer_CV = [] +Rep_Outer_CV_test = [] # a list with only the test indices +for rep in range(n_repetitions): + skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Outer_CV = [] + Outer_CV_test = [] + for train_index, test_index in skf.split(EEG_features_df,study_group_status): + Outer_CV.append([train_index,test_index]) + Outer_CV_test.append(test_index) + Rep_Outer_CV.append(Outer_CV) + Rep_Outer_CV_test.append(Outer_CV_test) +# Check none of the repetitions are the same by using only the test sets +# The list is first flattened +Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] +# All elements are converted to strings +# This makes it easier to look for uniques, and the indices are already in numerical order +Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] + +def allUnique(x): + seen = set() + return not any(i in seen or seen.add(i) for i in x) +assert allUnique(Rep_Outer_CV_test_flat_str) + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +def Each_feat_10rep_10x10CV(rep): + # The outer fold CV has already been saved as lists + Outer_CV = Rep_Outer_CV[rep] + # Pre-allocate memory + Ind_feat_clf0 = [] + accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3)) + Outer_counter = 0 + for train_index, test_index in Outer_CV: + test_df = EEG_features_df.iloc[test_index] + train_df = EEG_features_df.iloc[train_index] + + # Training data will be standardized + standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) + + train_df_standard = train_df.copy() + train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) + # Test data will also be standardized but using mean and std from training data + test_df_standard = test_df.copy() + test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) + + # Get the training data + X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) + y_train = train_df_standard[Target] + # Get test data + X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) + y_test = test_df_standard[Target] + + # Part 2 with second mRMR and classifiers in loop + k_fold = 10 + skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects + # I am also using converting it to an iterable list instad of the generator to promote reuse + Inner_CV = [] + for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): + Inner_CV.append([train_index2,test_index2]) + + # Prepare initial filtering of feature types to alleviate imbalance in number of features + # Use a variable that is optimized using inner CV + n_feat_mRMR1 = [20,30,40,50] + + # Max features from mRMR for each eye status + max_mRMR_features = n_feat_mRMR1[-1] + + eye_status = ["Eyes Closed", "Eyes Open"] + n_eye_status = len(eye_status) + + feat_counter = [] + # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF + Ind_feat_clf00 = [] + for fset in range(len(EEG_features_name_list)): + temp_feat = EEG_features_name_list[fset] + other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] + other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten + # Retrieve the dataset for each feature + col_idx = np.zeros(len(X_train.columns), dtype=bool) + for fsub in range(len(temp_feat)): + temp_feat0 = temp_feat[fsub] + col_idx0 = X_train.columns.str.contains(temp_feat0+"_") + col_idx = np.logical_or(col_idx,col_idx0) # append all trues + temp_X_train = X_train.loc[:,col_idx] + # Check if any of the other features were wrongly chosen + for fcheck in range(len(other_feats)): + if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): + temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))] + if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again + temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] + + # Save number of original features fed into mRMR + feat_counter.append([temp_feat,temp_X_train.shape[1]]) + # If there are no features selected, then skip the rest of the loop + if temp_X_train.shape[1] == 0: + continue + + # Do not use mRMR if there are fewer than n_features + if temp_X_train.shape[1] <= max_mRMR_features: + filter_features = temp_X_train.columns + else: + # mRMR + filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), + num_features_to_select=max_mRMR_features, + K_MAX=1000,n_jobs=1,verbose=False) + # Get selected features + filter_features = temp_X_train.columns[filter_feat_selector] + + # SVM with L2-norm (it is by default squared) + # Prepare hyper parameters + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) + # rbf kernel overfit easier, whereas linear empirically works better in high D data + + # Sequential Feature Selection for Logistic Regression + # In-built model selection CV + # The L2 is the inverse of C for LogReg + exponent = np.linspace(-3,1,9) + exponent = np.round(exponent,5) # sometimes linspace are not so exact + C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) + + # Random forest classifier + # Prepare hyper parameters + trees = np.array([10, 100, 500, 1000]) + depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting + + # Prepare arrays for validation errors + val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM))) + val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features)) + val_feat_svm.fill(np.nan) + + val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) + val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) + val_feat_logreg.fill(np.nan) + + val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) + val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_rf.fill(np.nan) + val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) + val_feat_import_rf.fill(np.nan) + + min_features_to_select = 1 + + for n1 in range(len(n_feat_mRMR1)): + mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]] + # Use the selected filter features + temp_X_train_mRMR = temp_X_train[mRMR_features1] + + # SVM with recursive feature elemination + # Stratified CV to find best regularization strength and number of features + # CV is built-in RFECV + for C in range(len(C_parameter_SVM)): + # Define the model + svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000) + # Perform recurive feature elimination with in-built CV + rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy", + cv=Inner_CV, + min_features_to_select=min_features_to_select) + rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel()) + # Get CV score + err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] + # Save results for hyperparameter optimization + val_err_svm[n1,C] = err + # Save the features chosen by RFE based on index from pre mRMR + rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0] + val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx + # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM))) + + # Logistic regression with sequential forward selection + # In-bult CV + k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]]) + k_features = np.arange(1,k_max+1,1) # try up to all features + inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) + for C in range(len(C_parameter_LogReg)): + LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], + max_iter = 50000) + sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), + forward = True, scoring = "balanced_accuracy", + verbose = 0, floating = False, + cv = Inner_CV, n_jobs = 1) + + sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel()) + # Save CV scores for each SFS step + for feat in range(len(k_features)): + inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] + + # Find the best number of features + # I am rounding to make it more smooth and disregard small improvements + K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] + if len(K) > 1: + K = K[np.where(K == np.min(K))[0]] + err = inner_cv_scores[C,K] + # Save validation error and features + val_err_logreg[n1,C] = err + # Save the features chosen by RFE based on index from pre mRMR + sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature + val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx + + # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) + + # Random forest + model_test_err = np.zeros((k_fold,len(trees),len(depth))) + RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1])) + + counter = 0 + for cv in range(k_fold): + # Retrieve CV indices + train_index_rf = Inner_CV[cv][0] + test_index_rf = Inner_CV[cv][1] + # Retrieve datasets + X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf] + y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] + + for t in range(len(trees)): + for d in range(len(depth)): + RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], + n_jobs=1, random_state=None) + RF.fit(X_train2.to_numpy(),y_train2) + # Validation error + RF_y = RF.predict(X_test2.to_numpy()) + err = balanced_accuracy_score(y_test2,RF_y) + # Save to array + model_test_err[cv,t,d] = err + # Save feature importance array + RF_feat_import[cv,t,d] = RF.feature_importances_ + counter += 1 + # print("Finished RF run {} out of {}".format(counter, k_fold)) + + # Average the errors over the CV folds + model_test_err_mean = np.mean(model_test_err, axis=0) + val_err_rf[n1,:,:] = model_test_err_mean + # Average the feature importances over the CV folds + RF_feat_import = np.mean(RF_feat_import, axis=0) + val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import + # Save the features used by the RF based on index from pre mRMR + rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0] + val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx + + # Print progress + current_progress = (n1+1)/(len(n_feat_mRMR1))*100 + print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0])) + + # Choose the optimal parameters + ### SVM + n1, C = np.where(val_err_svm==np.max(val_err_svm)) + + if len(C) > 1: + print("There are multiple SVM runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + rfe_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int) + rfe_feat_len.append(len(temp_feats)) + + rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] + if len(rfe_feat_min) > 1: + print("Multiple SVM runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[rfe_feat_min]) + n1, C = [int(n1[rfe_feat_min][C_min]), + int(C[rfe_feat_min][C_min])] + else: + n1, C = [int(n1[int(rfe_feat_min)]), + int(C[int(rfe_feat_min)])] + else: + n1, C = [int(n1), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_SVM[C] + + # Save the best validation error + val_error = val_err_svm[n1, C] + + # Get the subsets chosen + chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int) + rfe_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rfe_features) + x_train_mRMR2_rfe = X_train[rfe_features] + + # Fit on all training data + model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000) + model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rfe) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rfe_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error] + SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat] + SVM_model = model + SVM_y_pred = [[model_train_y],[y_pred]] + + ### LogReg with SFS + n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) + if len(C) > 1: + print("There are multiple LogReg runs with the same validation error") + print("Choosing run with fewest features to alleviate overfitting") + sfs_feat_len = [] + for i2 in range(len(C)): + n1_temp = int(n1[i2]) + C_temp = int(C[i2]) + temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) + sfs_feat_len.append(len(temp_feats)) + + sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] + if len(sfs_feat_min) > 1: + print("Multiple LogReg runs with same number of fewest features") + print("Choosing run with lowest C (highest regularization) to alleviate overfitting") + C_min = np.argmin(C[sfs_feat_min]) + n1, C = [int(n1[sfs_feat_min][C_min]), + int(C[sfs_feat_min][C_min])] + else: + n1, C = [int(n1[int(sfs_feat_min)]), + int(C[int(sfs_feat_min)])] + else: + n1, C = [int(n1), int(C)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + C_chosen = C_parameter_LogReg[C] + + # Save the best validation erro + val_error = val_err_logreg[n1, C] + + # Get the subsets chosen + chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) + sfs_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(sfs_features) + x_train_mRMR2_sfs = X_train[sfs_features] + + # Fit on all training data + model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) + model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_sfs) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[sfs_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error] + LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] + LogReg_model = model + LogReg_y_pred = [[model_train_y],[y_pred]] + + ### RF + n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) + + if len(d) > 1: + print("There are multiple RF runs with the same validation error") + print("Choosing run with lowest depth to alleviate overfitting") + d_min = np.where(d==np.min(d))[0] + + if len(d_min) > 1: + print("Multiple RF runs with same number number of depths") + print("Choosing run with lowest trees to alleviate overfitting") + t_min = np.argmin(t[d_min]) # argmin just takes the first + # If there are multiple with same parameters and validation error it is most likely the same + n1, t, d = [int(n1[d_min][t_min]), + int(t[d_min][t_min]), int(d[d_min][t_min])] + else: + n1, t, d = [int(n1[int(d_min)]), + int(t[int(d_min)]), int(d[int(d_min)])] + else: + n1, t, d = [int(n1), int(t), int(d)] + + mRMR_chosen1 = n_feat_mRMR1[n1] + t_chosen = trees[t] + d_chosen = depth[d] + + # Save the best validation error + val_error = val_err_rf[n1, t, d] + + # Get the chosen features and feature importances + chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) + rf_features = list(X_train.columns[chosen_feats]) + n_final_feat = len(rf_features) + x_train_mRMR2_rf = X_train[rf_features] + + rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] + + # Fit on all training data + model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, + n_jobs=1, random_state=None) + model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) + # Get training output + model_train_y = model.predict(x_train_mRMR2_rf) + # Get training error + train_error = balanced_accuracy_score(y_train, model_train_y) + + # Get prediction of test data + y_pred = model.predict(X_test[rf_features]) + # Use model to predict class on test data + test_error = balanced_accuracy_score(y_test, y_pred) + + # Save or prepare to save + accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error] + RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] + RF_model = model + RF_y_pred = [[model_train_y],[y_pred]] + + # Save the rest + model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par] + final_models00 = [SVM_model,LogReg_model,RF_model] + final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]] + final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred] + data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_] + + # Save the results for the specific feature + res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00] + Ind_feat_clf00.append(res) + print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat)) + + # Check that no features were excluded when checking for wrongly chosen + total_features = 0 + for feat_c in range(len(feat_counter)): + total_features += feat_counter[feat_c][1] + assert total_features == X_train.shape[1] + # Save results over outer folds + Ind_feat_clf0.append(Ind_feat_clf00) + + print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1)) + # Move counter + Outer_counter += 1 + + out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable] + # Print total progress + print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) + + # Get current time + c_time2 = time.localtime() + c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) + print("Started", c_time1, "\nCurrent Time",c_time2) + + return rep, out + +Ind_feat_clf = [0]*n_repetitions +with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor: + for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments + Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable] + # Save results to file and overwrite when new results arrive + with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "wb") as filehandle: + pickle.dump(Ind_feat_clf, filehandle) + +# %% For subtype 2 +# The two previous sections are rerun but for subtype 2 +# The only modification was indexing the subjects + +# Only keep PTSD subtype 2 by dropping subtype 1 +subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] +subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] +EEG_features_df = EEG_features_df.set_index("Subject_ID") +EEG_features_df = EEG_features_df.drop(subtype1) +EEG_features_df = EEG_features_df.reset_index() +# Check it was dropped correctly +assert all(subtype2 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]) +