# -*- coding: utf-8 -*- """ Updated Oct 18 2022 @author: Qianliang Li (glia@dtu.dk) This script contains the code for the machine learning analysis It should be run after Preprocessing.py and FeatureEstimation.py """ # Set working directory import os wkdir = "/home/glia/EEG/Final_scripts/" os.chdir(wkdir) # Load all libraries from the Preamble from Preamble import * # Load the questionnaire data final_qdf = pd.read_csv("final_qdf.csv", sep=",", na_values=' ') # Define cases as >= 44 total PCL # Type: numpy array with subject id cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44]) n_groups = 2 Groups = ["CTRL", "PTSD"] # Make function to load EEG features def load_features_df(): # Load all features power_df = pd.read_pickle(Feature_savepath+"Power_df.pkl") fTBR_data_df = pd.read_pickle(Feature_savepath+"fTBR_df.pkl") asymmetry_df = pd.read_pickle(Feature_savepath+"asymmetry_df.pkl") PAF_data_df = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_df.pkl") PAF_data_df_global = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_global_df.pkl") OOF_data_df = pd.read_pickle(Feature_savepath+"OOF_data_FOOOF_df.pkl") con_data_df = pd.read_pickle(Feature_savepath+"con_data_source_drop_interpol_df.pkl") pec_data_df = pd.read_pickle(Feature_savepath+"pec_data_drop_interpol_ch_df.pkl") microstate_transition_data_df = pd.read_pickle(Feature_savepath+"microstate_transition_data_df.pkl") microstate_time_df = pd.read_pickle(Feature_savepath+"microstate_time_df.pkl") microstate_entropy_df = pd.read_pickle(Feature_savepath+"microstate_entropy_df.pkl") GC_data_df = pd.read_pickle(Feature_savepath+"GC_data_source_drop_interpol_df.pkl") H_data_df = pd.read_pickle(Feature_savepath+"H_data_df.pkl") H_data_df_global = pd.read_pickle(Feature_savepath+"H_data_global_df.pkl") # List of features EEG_features_name_list = [['Power'], ['Frontal Theta Beta Ratio', 'Asymmetry'], ['Peak Alpha Frequency', 'Global Peak Alpha Frequency'], ["1/f exponent"], ['imcoh'], ['wpli'], ['Power Envelope Correlation'], ['Microstate Transition', 'Microstate Ratio Time', 'Microstate Entropy'], ["Granger Causality"], ['DFA Exponent', 'Global DFA Exponent']] # Arrange them to fit one 2D dataframe # Make function to add measurement column for indexing def add_measurement_column(df, measurement = "Text"): dummy_variable = [measurement]*df.shape[0] df.insert(1, "Measurement", dummy_variable) return df # Make function to convert column tuple to string def convertTupleHeader(header): header = list(header) str = "_".join(header) return str # Prepare overall dataframe EEG_features_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"]) # Add power spectral densities power_df = add_measurement_column(power_df, "Power") temp_df = power_df.pivot_table(index="Subject_ID",columns=["Measurement", "Quant_status", "Eye_status", "Freq_band", "Channel"], dropna=False, values="PSD").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add frontal theta/beta ratio # fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio") temp_df = fTBR_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status"], dropna=False, values="TBR").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add power asymmetry asymmetry_df = add_measurement_column(asymmetry_df, "Asymmetry") temp_df = asymmetry_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Freq_band", "ROI"], dropna=False, values="Asymmetry_score").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add peak alpha frequency PAF_data_df = add_measurement_column(PAF_data_df, "Peak Alpha Frequency") temp_df = PAF_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Channel"], dropna=False, values="Value").reset_index(drop=True) # NaN values are interpolated with means across channels for each condition eye_status = list(final_epochs[0].event_id.keys()) for ee in eye_status: temp = temp_df.loc[:,("Peak Alpha Frequency",ee)] # get data temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0) temp_df.loc[:,("Peak Alpha Frequency",ee)] = temp.to_numpy() # If there are still NaN the values are interpolated across channels and condition temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add 1/f exponent OOF_data_df = add_measurement_column(OOF_data_df, "1/f exponent") temp_df = OOF_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Channel"], dropna=False, values="Value").reset_index(drop=True) # NaN values are interpolated with means across channels for each condition eye_status = list(final_epochs[0].event_id.keys()) for ee in eye_status: temp = temp_df.loc[:,("1/f exponent",ee)] # get data temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0) temp_df.loc[:,("1/f exponent",ee)] = temp.to_numpy() # If there are still NaN the values are interpolated across channels and condition temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add global peak alpha frequency #PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists temp_df = PAF_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status"], dropna=False, values="Value").reset_index(drop=True) # NaN values are interpolated across eye condition temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add connectivity measurements #con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists temp_df = con_data_df.pivot_table(index="Subject_ID",columns=["Con_measurement", "Eye_status", "chx", "chy", "Freq_band"], dropna=True, values="Value").reset_index(drop=True) # Drop coh and plv, which are more susceptible to volume conduction temp_df = temp_df.drop("coh",axis=1) temp_df = temp_df.drop("plv",axis=1) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add orthogonalized power enveloped correlations pec_data_df = add_measurement_column(pec_data_df, "Power Envelope Correlation") temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "chx", "chy", "Freq_band"], dropna=True, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add microstate transition probabilities microstate_transition_data_df = add_measurement_column(microstate_transition_data_df, "Microstate Transition") temp_df = microstate_transition_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Transition"], dropna=False, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add microstate time covered microstate_time_df = add_measurement_column(microstate_time_df, "Microstate Ratio Time") # Convert microstate to str before using it as column microstate_time_df = microstate_time_df.astype({"Microstate": str}) temp_df = microstate_time_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Microstate"], dropna=False, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add microstate entropy #microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists microstate_entropy_df["Measurement"] = ["Microstate Entropy"]*microstate_entropy_df.shape[0] temp_df = microstate_entropy_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status"], dropna=False, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add Granger Causality # fixed so it is always chx --> chy GC_data_df = add_measurement_column(GC_data_df, "Granger Causality") temp_df = GC_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "chx", "chy", "Freq_band"], dropna=True, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] # Check if any GC was not calculated properly assert temp_df.shape[1] == (31*31-31)*5*2 # expected number of features with 31 sources EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add DFA exponent H_data_df = add_measurement_column(H_data_df, "DFA Exponent") temp_df = H_data_df.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Channel", "Freq_band"], dropna=False, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) # Add Global Hurst exponent H_data_df_global = H_data_df_global.drop("Measurement",axis=1) H_data_df_global = add_measurement_column(H_data_df_global, "Global DFA Exponent") # already exists temp_df = H_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement", "Eye_status", "Freq_band"], dropna=False, values="Value").reset_index(drop=True) temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1) return EEG_features_df, EEG_features_name_list # %% ML predction of PTSD status using all features # mRMR -> mRMR -> SVM/RF/LogReg with L2 # 10 repetitions of 10 fold outer and 10 fold inner two-layer CV EEG_features_df, EEG_features_name_list = load_features_df() # Add group status Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 EEG_features_df.insert(1, "Group_status", Group_status) # Subject info columns Subject_info_cols = list(EEG_features_df.columns[0:2]) n_subject_info_cols = len(Subject_info_cols) n_discrete_cols = 2 # To ensure proper stratification into train/test set I will stratify using group status and study status # A variable that encodes for this is created n_studies = 3 study_group_status = EEG_features_df["Group_status"].copy() for i in range(n_studies): # Get study index study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) # Assign label study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD # Target variable Target = ["Group_status"] Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) Target_col_idx = np.where(Target_col == True)[0] # Make 3 models and save them to use enseemble in the end CLF_models = ["SVM", "LogReg", "RF"] n_models = len(CLF_models) # Repeat the classification to see if I just got a lucky seed n_repetitions = 10 k_out = 10 accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) model_par = [] final_models = [] final_features = [] final_y_preds = [] np.random.seed(42) # Prepare the splits beforehand to make sure the repetitions are not the same Rep_Outer_CV = [] Rep_Outer_CV_test = [] # a list with only the test indices for rep in range(n_repetitions): skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Outer_CV = [] Outer_CV_test = [] for train_index, test_index in skf.split(EEG_features_df,study_group_status): Outer_CV.append([train_index,test_index]) Outer_CV_test.append(test_index) Rep_Outer_CV.append(Outer_CV) Rep_Outer_CV_test.append(Outer_CV_test) # Check none of the repetitions are the same by using only the test sets # The list is first flattened Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] # All elements are converted to strings # This makes it easier to look for uniques, and the indices are already in numerical order Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] def allUnique(x): seen = set() return not any(i in seen or seen.add(i) for i in x) assert allUnique(Rep_Outer_CV_test_flat_str) # Get current time c_time1 = time.localtime() c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) print(c_time1) for rep in range(n_repetitions): # The outer fold CV has already been saved as lists Outer_CV = Rep_Outer_CV[rep] # Pre-allocate memory model_par0 = [] final_models0 = [] final_features0 = [] final_y_preds0 = [] Outer_counter = 0 for train_index, test_index in Outer_CV: test_df = EEG_features_df.iloc[test_index] train_df = EEG_features_df.iloc[train_index] # Training data will be standardized standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) train_df_standard = train_df.copy() train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) # Test data will also be standardized but using mean and std from training data test_df_standard = test_df.copy() test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) # Get the training data X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) y_train = train_df_standard[Target] # Get test data X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) y_test = test_df_standard[Target] # Prepare initial filtering of feature types to alleviate imbalance in number of features # Use a variable that is optimized using inner CV n_feat_mRMR1 = [20,30,40,50] n_feat_mRMR2 = [30,40,50,60] # Max features from mRMR for each eye status max_mRMR_features = n_feat_mRMR1[-1] max_mRMR_features2 = n_feat_mRMR2[-1] eye_status = ["Eyes Closed", "Eyes Open"] n_eye_status = len(eye_status) mRMR_features = [] feat_counter = [] # Perform mRMR for each feature type for fset in range(len(EEG_features_name_list)): temp_feat = EEG_features_name_list[fset] other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten for e in range(n_eye_status): ee = eye_status[e] temp_ee = "{}_{}".format(temp_feat,ee) # Retrieve the dataset for each feature col_idx = np.zeros(len(X_train.columns), dtype=bool) for fsub in range(len(temp_feat)): temp_feat0 = temp_feat[fsub] col_idx0 = X_train.columns.str.contains(temp_feat0+"_") col_idx = np.logical_or(col_idx,col_idx0) # append all trues temp_X_train = X_train.loc[:,col_idx] # Check if any of the other features were wrongly chosen for fcheck in range(len(other_feats)): if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))] if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] # Index for eye status temp_X_train = temp_X_train.loc[:,temp_X_train.columns.str.contains(ee)] # Save number of original features fed into mRMR feat_counter.append([temp_feat,temp_X_train.shape[1]]) # Do not use mRMR if there are fewer than max_mRMR_features if temp_X_train.shape[1] <= max_mRMR_features: filter_features = temp_X_train.columns else: # mRMR filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), num_features_to_select=max_mRMR_features, K_MAX=1000,n_jobs=-1,verbose=False) # Get selected features filter_features = temp_X_train.columns[filter_feat_selector] # Save features selected mRMR_features.append(list(filter_features)) # Check that no features were excluded when checking for wrongly chosen total_features = 0 for feat_c in range(len(feat_counter)): total_features += feat_counter[feat_c][1] assert total_features == X_train.shape[1] # Part 2 with second mRMR and classifiers in loop k_fold = 10 skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Inner_CV = [] for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): Inner_CV.append([train_index2,test_index2]) # SVM with L2-norm (it is by default squared) # Prepare hyper parameters exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) kernels = ["linear"] # rbf overfit easier, whereas linear empirically works better in high D data # Sequential Feature Selection for Logistic Regression # In-built model selection CV # The L2 is the inverse of C for LogReg exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) # Random forest classifier # Prepare hyper parameters trees = np.array([10, 100, 500, 1000]) depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting # Prepare arrays for validation errors val_err_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels))) val_feat_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels),max_mRMR_features2)) val_feat_svm.fill(np.nan) val_err_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg))) val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg),max_mRMR_features2)) val_feat_logreg.fill(np.nan) val_err_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth))) val_feat_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2)) val_feat_rf.fill(np.nan) val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2)) val_feat_import_rf.fill(np.nan) for n1 in range(len(n_feat_mRMR1)): # Get the features from first mRMR mRMR_features1 = [item for sublist in mRMR_features for item in sublist[0:n_feat_mRMR1[n1]]] # print("{} features are left after first mRMR filtering".format(len(mRMR_features1))) X_train_mRMR = X_train[mRMR_features1].copy() # Second round of mRMR on the reduced list filter_feat_selector = mRMR_feature_select(X_train_mRMR.to_numpy(),y_train.to_numpy(), num_features_to_select=max_mRMR_features2, K_MAX=1000,n_jobs=-1,verbose=False) for n2 in range(len(n_feat_mRMR2)): # Get selected features from mRMR2 filter_features = X_train_mRMR.columns[filter_feat_selector[0:n_feat_mRMR2[n2]]] X_train_mRMR2 = X_train_mRMR[filter_features].copy() # SVM with recursive feature elemination # Stratified CV to find best regularization strength and number of features # CV is built-in RFECV min_features_to_select = 1 # Using normal for loop as RFECV has inbuilt multiprocessing for C in range(len(C_parameter_SVM)): for K in range(len(kernels)): # Define the model svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000) # Perform recurive feature elimination with in-built CV rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy", cv=Inner_CV, min_features_to_select=min_features_to_select) rfecv.fit(X_train_mRMR2,y_train.to_numpy().ravel()) # Get CV score err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] # Save results for hyperparameter optimization val_err_svm[n1,n2,C,K] = err # Save the features chosen by RFE based on index from pre mRMR rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0] val_feat_svm[n1,n2,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)]))) # Logistic regression with sequential forward selection # In-bult CV k_features = np.arange(1,n_feat_mRMR2[n2]+1,1) # try up to all features inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) for C in range(len(C_parameter_LogReg)): LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter = 50000) sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), forward = True, scoring = "balanced_accuracy", verbose = 0, floating = False, cv = Inner_CV, n_jobs = -1) sfs = sfs.fit(X_train_mRMR2, y_train.to_numpy().ravel()) # Save CV scores for each SFS step for feat in range(len(k_features)): inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] # Find the best number of features # I am rounding to make it more smooth and disregard small improvements K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] if len(K) > 1: K = K[np.where(K == np.min(K))[0]] err = inner_cv_scores[C,K] # Save validation error and features val_err_logreg[n1,n2,C] = err # Save the features chosen by RFE based on index from pre mRMR sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature val_feat_logreg[n1,n2,C,0:len(sfs_feat_idx)] = sfs_feat_idx # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) # Random forest model_test_err = np.zeros((k_fold,len(trees),len(depth))) RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR2.shape[1])) counter = 0 for cv in range(k_fold): # Retrieve CV indices train_index_rf = Inner_CV[cv][0] test_index_rf = Inner_CV[cv][1] # Retrieve datasets X_train2 = X_train_mRMR2.iloc[train_index_rf]; X_test2 = X_train_mRMR2.iloc[test_index_rf] y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] for t in range(len(trees)): for d in range(len(depth)): RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], n_jobs=-1, random_state=None) RF.fit(X_train2.to_numpy(),y_train2) # Validation error RF_y = RF.predict(X_test2.to_numpy()) err = balanced_accuracy_score(y_test2, RF_y) # Save to array model_test_err[cv,t,d] = err # Save feature importance array RF_feat_import[cv,t,d] = RF.feature_importances_ counter += 1 # print("Finished RF run {} out of {}".format(counter, k_fold)) # Average the errors over the CV folds model_test_err_mean = np.mean(model_test_err, axis=0) val_err_rf[n1,n2,:,:] = model_test_err_mean # Average the feature importances over the CV folds RF_feat_import = np.mean(RF_feat_import, axis=0) val_feat_import_rf[n1,n2,:,:,0:n_feat_mRMR2[n2]] = RF_feat_import # Save the features used by the RF based on index from pre mRMR rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0] val_feat_rf[n1,n2,:,:,0:len(rf_feat_idx)] = rf_feat_idx print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100)) # Choose the optimal parameters ### SVM n1, n2, C, K = np.where(val_err_svm==np.max(val_err_svm)) if len(C) > 1: print("There are multiple SVM runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") rfe_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) n2_temp = int(n2[i2]) C_temp = int(C[i2]) K_temp = int(K[i2]) temp_feats = val_feat_svm[n1_temp,n2_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,n2_temp,C_temp,K_temp])].astype(int) rfe_feat_len.append(len(temp_feats)) rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] if len(rfe_feat_min) > 1: print("Multiple SVM runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[rfe_feat_min]) n1, n2, C, K = [int(n1[rfe_feat_min][C_min]), int(n2[rfe_feat_min][C_min]), int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])] else: n1, n2, C, K = [int(n1[int(rfe_feat_min)]), int(n2[int(rfe_feat_min)]), int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])] else: n1, n2, C, K = [int(n1), int(n2), int(C), int(K)] mRMR_chosen1 = n_feat_mRMR1[n1] mRMR_chosen2 = n_feat_mRMR2[n2] C_chosen = C_parameter_SVM[C] K_chosen = kernels[K] # Save the best validation erro val_error = val_err_svm[n1, n2, C, K] # Get the subsets chosen chosen_feats = val_feat_svm[n1,n2,C,K][~np.isnan(val_feat_svm[n1,n2,C,K])].astype(int) rfe_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rfe_features) x_train_mRMR2_rfe = X_train[rfe_features] # Fit on all training data model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000) model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rfe) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rfe_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error] SVM_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat,K_chosen] SVM_model = model SVM_y_pred = [[model_train_y],[y_pred]] ### LogReg with SFS n1, n2, C = np.where(val_err_logreg==np.max(val_err_logreg)) if len(C) > 1: print("There are multiple LogReg runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") sfs_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) n2_temp = int(n2[i2]) C_temp = int(C[i2]) temp_feats = val_feat_logreg[n1_temp,n2_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,n2_temp,C_temp])].astype(int) sfs_feat_len.append(len(temp_feats)) sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] if len(sfs_feat_min) > 1: print("Multiple LogReg runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[sfs_feat_min]) n1, n2, C = [int(n1[sfs_feat_min][C_min]), int(n2[sfs_feat_min][C_min]), int(C[sfs_feat_min][C_min])] else: n1, n2, C = [int(n1[int(sfs_feat_min)]), int(n2[int(sfs_feat_min)]), int(C[int(sfs_feat_min)])] else: n1, n2, C = [int(n1), int(n2), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] mRMR_chosen2 = n_feat_mRMR2[n2] C_chosen = C_parameter_LogReg[C] # Save the best validation erro val_error = val_err_logreg[n1, n2, C] # Get the subsets chosen chosen_feats = val_feat_logreg[n1,n2,C][~np.isnan(val_feat_logreg[n1,n2,C])].astype(int) sfs_features = list(X_train.columns[chosen_feats]) n_final_feat = len(sfs_features) x_train_mRMR2_sfs = X_train[sfs_features] # Fit on all training data model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_sfs) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[sfs_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error] LogReg_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat] LogReg_model = model LogReg_y_pred = [[model_train_y],[y_pred]] ### RF n1, n2, t, d = np.where(val_err_rf==np.max(val_err_rf)) if len(d) > 1: print("There are multiple RF runs with the same validation error") print("Choosing run with lowest depth to alleviate overfitting") d_min = np.where(d==np.min(d))[0] if len(d_min) > 1: print("Multiple RF runs with same number number of depths") print("Choosing run with lowest trees to alleviate overfitting") t_min = np.argmin(t[d_min]) # argmin just takes the first # If there are multiple with same parameters and validation error it is most likely the same n1, n2, t, d = [int(n1[d_min][t_min]), int(n2[d_min][t_min]), int(t[d_min][t_min]), int(d[d_min][t_min])] else: n1, n2, t, d = [int(n1[int(d_min)]), int(n2[int(d_min)]), int(t[int(d_min)]), int(d[int(d_min)])] else: n1, n2, t, d = [int(n1), int(n2), int(t), int(d)] mRMR_chosen1 = n_feat_mRMR1[n1] mRMR_chosen2 = n_feat_mRMR2[n2] t_chosen = trees[t] d_chosen = depth[d] # Save the best validation erro val_error = val_err_rf[n1, n2, t, d] # Get the chosen features and feature importances chosen_feats = val_feat_rf[n1,n2,t,d][~np.isnan(val_feat_rf[n1,n2,t,d])].astype(int) rf_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rf_features) x_train_mRMR2_rf = X_train[rf_features] rf_feat_importances = val_feat_import_rf[n1,n2,t,d][~np.isnan(val_feat_import_rf[n1,n2,t,d])] # Fit on all training data model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, n_jobs=-1, random_state=None) model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rf) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rf_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error] RF_model_par = [mRMR_chosen1,mRMR_chosen2,t_chosen,d_chosen] RF_model = model RF_y_pred = [[model_train_y],[y_pred]] # Save the rest model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par]) final_models0.append([SVM_model,LogReg_model,RF_model]) final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]]) final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred]) # Move counter Outer_counter += 1 print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1)) # Save results from all outer folds model_par.append(model_par0) final_models.append(final_models0) final_features.append(final_features0) final_y_preds.append(final_y_preds0) # Save results to file Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds] # Run with variable feat in first and second mRMR with open(Model_savepath+"Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl", "wb") as filehandle: pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle) # Get current time c_time2 = time.localtime() c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) print("Started", c_time1, "\nCurrent Time",c_time2) # Print total progress print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) # %% ML predction of PTSD status using each feature separately # mRMR -> mRMR -> SVM/RF/LogReg with L2 # 10 repetitions of 10 fold outer and 10 fold inner two-layer CV # With concurrent processes for faster computation EEG_features_df, EEG_features_name_list = load_features_df() # Add group status Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 EEG_features_df.insert(1, "Group_status", Group_status) # Subject info columns Subject_info_cols = list(EEG_features_df.columns[0:2]) n_subject_info_cols = len(Subject_info_cols) n_discrete_cols = 2 # To ensure proper stratification into train/test set I will stratify using group status and study status # A variable that encodes for this is created n_studies = 3 study_group_status = EEG_features_df["Group_status"].copy() for i in range(n_studies): # Get study index study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) # Assign label study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD # Target variable Target = ["Group_status"] Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) Target_col_idx = np.where(Target_col == True)[0] # Make 3 models and save them to use enseemble in the end CLF_models = ["SVM", "LogReg", "RF"] n_models = len(CLF_models) # Repeat the classification to see if I just got a lucky seed n_repetitions = 10 k_out = 10 # accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3)) Ind_feat_clf = [0]*n_repetitions np.random.seed(42) # Prepare the splits beforehand to make sure the repetitions are not the same Rep_Outer_CV = [] Rep_Outer_CV_test = [] # a list with only the test indices for rep in range(n_repetitions): skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Outer_CV = [] Outer_CV_test = [] for train_index, test_index in skf.split(EEG_features_df,study_group_status): Outer_CV.append([train_index,test_index]) Outer_CV_test.append(test_index) Rep_Outer_CV.append(Outer_CV) Rep_Outer_CV_test.append(Outer_CV_test) # Check none of the repetitions are the same by using only the test sets # The list is first flattened Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] # All elements are converted to strings # This makes it easier to look for uniques, and the indices are already in numerical order Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] def allUnique(x): seen = set() return not any(i in seen or seen.add(i) for i in x) assert allUnique(Rep_Outer_CV_test_flat_str) # Get current time c_time1 = time.localtime() c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) print(c_time1) def Each_feat_10rep_10x10CV(rep): # The outer fold CV has already been saved as lists Outer_CV = Rep_Outer_CV[rep] # Pre-allocate memory Ind_feat_clf0 = [] accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3)) Outer_counter = 0 for train_index, test_index in Outer_CV: test_df = EEG_features_df.iloc[test_index] train_df = EEG_features_df.iloc[train_index] # Training data will be standardized standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) train_df_standard = train_df.copy() train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) # Test data will also be standardized but using mean and std from training data test_df_standard = test_df.copy() test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) # Get the training data X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) y_train = train_df_standard[Target] # Get test data X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) y_test = test_df_standard[Target] # Part 2 with second mRMR and classifiers in loop k_fold = 10 skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Inner_CV = [] for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): Inner_CV.append([train_index2,test_index2]) # Prepare initial filtering of feature types to alleviate imbalance in number of features # Use a variable that is optimized using inner CV n_feat_mRMR1 = [20,30,40,50] # Max features from mRMR for each eye status max_mRMR_features = n_feat_mRMR1[-1] eye_status = ["Eyes Closed", "Eyes Open"] n_eye_status = len(eye_status) feat_counter = [] # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF Ind_feat_clf00 = [] for fset in range(len(EEG_features_name_list)): temp_feat = EEG_features_name_list[fset] other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten # Retrieve the dataset for each feature col_idx = np.zeros(len(X_train.columns), dtype=bool) for fsub in range(len(temp_feat)): temp_feat0 = temp_feat[fsub] col_idx0 = X_train.columns.str.contains(temp_feat0+"_") col_idx = np.logical_or(col_idx,col_idx0) # append all trues temp_X_train = X_train.loc[:,col_idx] # Check if any of the other features were wrongly chosen for fcheck in range(len(other_feats)): if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))] if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] # Save number of original features fed into mRMR feat_counter.append([temp_feat,temp_X_train.shape[1]]) # If there are no features selected, then skip the rest of the loop if temp_X_train.shape[1] == 0: continue # Do not use mRMR if there are fewer than n_features if temp_X_train.shape[1] <= max_mRMR_features: filter_features = temp_X_train.columns else: # mRMR filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), num_features_to_select=max_mRMR_features, K_MAX=1000,n_jobs=1,verbose=False) # Get selected features filter_features = temp_X_train.columns[filter_feat_selector] # SVM with L2-norm (it is by default squared) # Prepare hyper parameters exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) # rbf kernel overfit easier, whereas linear empirically works better in high D data # Sequential Feature Selection for Logistic Regression # In-built model selection CV # The L2 is the inverse of C for LogReg exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) # Random forest classifier # Prepare hyper parameters trees = np.array([10, 100, 500, 1000]) depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting # Prepare arrays for validation errors val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM))) val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features)) val_feat_svm.fill(np.nan) val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) val_feat_logreg.fill(np.nan) val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_rf.fill(np.nan) val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_import_rf.fill(np.nan) min_features_to_select = 1 for n1 in range(len(n_feat_mRMR1)): mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]] # Use the selected filter features temp_X_train_mRMR = temp_X_train[mRMR_features1] # SVM with recursive feature elemination # Stratified CV to find best regularization strength and number of features # CV is built-in RFECV for C in range(len(C_parameter_SVM)): # Define the model svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000) # Perform recurive feature elimination with in-built CV rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy", cv=Inner_CV, min_features_to_select=min_features_to_select) rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel()) # Get CV score err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] # Save results for hyperparameter optimization val_err_svm[n1,C] = err # Save the features chosen by RFE based on index from pre mRMR rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0] val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM))) # Logistic regression with sequential forward selection # In-bult CV k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]]) k_features = np.arange(1,k_max+1,1) # try up to all features inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) for C in range(len(C_parameter_LogReg)): LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter = 50000) sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), forward = True, scoring = "balanced_accuracy", verbose = 0, floating = False, cv = Inner_CV, n_jobs = 1) sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel()) # Save CV scores for each SFS step for feat in range(len(k_features)): inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] # Find the best number of features # I am rounding to make it more smooth and disregard small improvements K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] if len(K) > 1: K = K[np.where(K == np.min(K))[0]] err = inner_cv_scores[C,K] # Save validation error and features val_err_logreg[n1,C] = err # Save the features chosen by RFE based on index from pre mRMR sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) # Random forest model_test_err = np.zeros((k_fold,len(trees),len(depth))) RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1])) counter = 0 for cv in range(k_fold): # Retrieve CV indices train_index_rf = Inner_CV[cv][0] test_index_rf = Inner_CV[cv][1] # Retrieve datasets X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf] y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] for t in range(len(trees)): for d in range(len(depth)): RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], n_jobs=1, random_state=None) RF.fit(X_train2.to_numpy(),y_train2) # Validation error RF_y = RF.predict(X_test2.to_numpy()) err = balanced_accuracy_score(y_test2,RF_y) # Save to array model_test_err[cv,t,d] = err # Save feature importance array RF_feat_import[cv,t,d] = RF.feature_importances_ counter += 1 # print("Finished RF run {} out of {}".format(counter, k_fold)) # Average the errors over the CV folds model_test_err_mean = np.mean(model_test_err, axis=0) val_err_rf[n1,:,:] = model_test_err_mean # Average the feature importances over the CV folds RF_feat_import = np.mean(RF_feat_import, axis=0) val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import # Save the features used by the RF based on index from pre mRMR rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0] val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx # Print progress current_progress = (n1+1)/(len(n_feat_mRMR1))*100 print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0])) # Choose the optimal parameters ### SVM n1, C = np.where(val_err_svm==np.max(val_err_svm)) if len(C) > 1: print("There are multiple SVM runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") rfe_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int) rfe_feat_len.append(len(temp_feats)) rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] if len(rfe_feat_min) > 1: print("Multiple SVM runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[rfe_feat_min]) n1, C = [int(n1[rfe_feat_min][C_min]), int(C[rfe_feat_min][C_min])] else: n1, C = [int(n1[int(rfe_feat_min)]), int(C[int(rfe_feat_min)])] else: n1, C = [int(n1), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_SVM[C] # Save the best validation error val_error = val_err_svm[n1, C] # Get the subsets chosen chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int) rfe_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rfe_features) x_train_mRMR2_rfe = X_train[rfe_features] # Fit on all training data model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000) model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rfe) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rfe_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error] SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat] SVM_model = model SVM_y_pred = [[model_train_y],[y_pred]] ### LogReg with SFS n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) if len(C) > 1: print("There are multiple LogReg runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") sfs_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) sfs_feat_len.append(len(temp_feats)) sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] if len(sfs_feat_min) > 1: print("Multiple LogReg runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[sfs_feat_min]) n1, C = [int(n1[sfs_feat_min][C_min]), int(C[sfs_feat_min][C_min])] else: n1, C = [int(n1[int(sfs_feat_min)]), int(C[int(sfs_feat_min)])] else: n1, C = [int(n1), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_LogReg[C] # Save the best validation erro val_error = val_err_logreg[n1, C] # Get the subsets chosen chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) sfs_features = list(X_train.columns[chosen_feats]) n_final_feat = len(sfs_features) x_train_mRMR2_sfs = X_train[sfs_features] # Fit on all training data model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_sfs) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[sfs_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error] LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] LogReg_model = model LogReg_y_pred = [[model_train_y],[y_pred]] ### RF n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) if len(d) > 1: print("There are multiple RF runs with the same validation error") print("Choosing run with lowest depth to alleviate overfitting") d_min = np.where(d==np.min(d))[0] if len(d_min) > 1: print("Multiple RF runs with same number number of depths") print("Choosing run with lowest trees to alleviate overfitting") t_min = np.argmin(t[d_min]) # argmin just takes the first # If there are multiple with same parameters and validation error it is most likely the same n1, t, d = [int(n1[d_min][t_min]), int(t[d_min][t_min]), int(d[d_min][t_min])] else: n1, t, d = [int(n1[int(d_min)]), int(t[int(d_min)]), int(d[int(d_min)])] else: n1, t, d = [int(n1), int(t), int(d)] mRMR_chosen1 = n_feat_mRMR1[n1] t_chosen = trees[t] d_chosen = depth[d] # Save the best validation error val_error = val_err_rf[n1, t, d] # Get the chosen features and feature importances chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) rf_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rf_features) x_train_mRMR2_rf = X_train[rf_features] rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] # Fit on all training data model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, n_jobs=1, random_state=None) model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rf) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rf_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error] RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] RF_model = model RF_y_pred = [[model_train_y],[y_pred]] # Save the rest model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par] final_models00 = [SVM_model,LogReg_model,RF_model] final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]] final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred] data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_] # Save the results for the specific feature res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00] Ind_feat_clf00.append(res) print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat)) # Check that no features were excluded when checking for wrongly chosen total_features = 0 for feat_c in range(len(feat_counter)): total_features += feat_counter[feat_c][1] assert total_features == X_train.shape[1] # Save results over outer folds Ind_feat_clf0.append(Ind_feat_clf00) print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1)) # Move counter Outer_counter += 1 out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable] # Print total progress print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) # Get current time c_time2 = time.localtime() c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) print("Started", c_time1, "\nCurrent Time",c_time2) return rep, out Ind_feat_clf = [0]*n_repetitions with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor: for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable] # Save results to file and overwrite when new results arrive with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle: pickle.dump(Ind_feat_clf, filehandle) # %% Sparse clustering of all EEG features to look for subtypes EEG_features_df, EEG_features_name_list = load_features_df() # Add group status Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 EEG_features_df.insert(1, "Group_status", Group_status) # Only use PTSD patient group EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"]==1,:] Subject_info_cols = ["Subject_ID","Group_status"] # Standardize the values X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1)) standardizer = preprocessing.StandardScaler().fit(X) X = standardizer.transform(X) # Use gridsearch and permutations to estimate gap statistic and use it to # determine number of clusters and sparsity s # I will use 100 permutations and test 2 to 6 clusters as Zhang 2020 max_clusters = 6 n_sparsity_feat = 20 perm_res = [] Timestamps = [] # Get current time c_time1 = time.localtime() c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) print(c_time1) Timestamps.append(c_time1) for k in range(1,max_clusters): # Cannot permute with 1 cluster n_clusters = k+1 perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True, nvals=n_sparsity_feat, nperms=100) perm_res.append(perm) # Get current time c_time2 = time.localtime() c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) print("Started", c_time1, "\nCurrent Time",c_time2) Timestamps.append(c_time2) # Save the results with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file: pickle.dump(perm_res, file) # # Load # with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file: # perm_res = pickle.load(file) # Convert results to array perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4)) for i in range(len(perm_res)): _, gaps, sdgaps, wbounds, _ = perm_res[i].values() for i2 in range(n_sparsity_feat): perm_res_arr[n_sparsity_feat*i+i2,0] = i+2 # cluster size perm_res_arr[n_sparsity_feat*i+i2,1] = gaps[i2] # gap statistic perm_res_arr[n_sparsity_feat*i+i2,2] = sdgaps[i2] # gap statistic std perm_res_arr[n_sparsity_feat*i+i2,3] = wbounds[i2] # sparsity feature s # For each sparsity s, determine best k using one-standard-error criterion # Meaning the cluster and sparsity is chosen for the smallest value of k for a fixed s # that fulfill Gap(k) >= Gap(k+1)-std(k+1) def one_standard_deviation_search(gaps, std): best_gaps = np.argmax(gaps) current_gaps = gaps[best_gaps] current_std = std[best_gaps] current_gaps_idx = best_gaps while (gaps[current_gaps_idx-1] >= current_gaps - current_std): if current_gaps_idx == 0: break else: current_gaps_idx -= 1 current_gaps = gaps[current_gaps_idx] current_std = std[current_gaps_idx] out = current_gaps, current_std, current_gaps_idx return out best_ks = np.zeros((n_sparsity_feat, 2)) all_s = np.unique(perm_res_arr[:,3]) plt.figure(figsize=(12,12)) for i2 in range(n_sparsity_feat): current_s = all_s[i2] gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1] std = perm_res_arr[perm_res_arr[:,3] == current_s,2] _, _, idx = one_standard_deviation_search(gaps, std) # Save to array best_ks[i2,0] = current_s best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx] # Plot gap plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"), gaps, yerr=std, capsize=5, label = np.round(current_s,3)) plt.title("Gap statistic for different fixed s") plt.legend(loc=1) plt.xlabel("Number of clusters") plt.ylabel("Gap statistic") best_k = int(scipy.stats.mode(best_ks[:,1])[0]) # Determine s using fixed k as lowest s within 1 std of max gap statistic # According to Witten & Tibshirani, 2010 best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1]) best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx] best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx] one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]]) # Perform clustering with k clusters sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0] # # Save the results # with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file: # pickle.dump(sparcl, file) with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: sparcl = pickle.load(file) # %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2 # Using sparse kmeans selected features, only eyes closed as they were primarily chosen # Prediction of subtype 1 EEG_features_df, EEG_features_name_list = load_features_df() with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: sparcl = pickle.load(file) # Use concatenated features EEG_features_name_list = [['Power'], ['Frontal Theta Beta Ratio', 'Asymmetry'], ['Peak Alpha Frequency', 'Global Peak Alpha Frequency'], ["1/f exponent"], ['imcoh'], ['wpli'], ['Power Envelope Correlation'], ['Microstate Transition', 'Microstate Ratio Time', 'Microstate Entropy'], ["Granger Causality"], ['DFA Exponent', 'Global DFA Exponent']] # Add group status Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 EEG_features_df.insert(1, "Group_status", Group_status) # Only keep PTSD subtype 1 by dropping subtype 2 subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] EEG_features_df = EEG_features_df.set_index("Subject_ID") EEG_features_df = EEG_features_df.drop(subtype2) EEG_features_df = EEG_features_df.reset_index() # Check it was dropped correctly assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]) # Subject info columns Subject_info_cols = list(EEG_features_df.columns[0:2]) n_subject_info_cols = len(Subject_info_cols) n_discrete_cols = 2 # Get features from sparse kmeans nonzero_idx = sparcl["ws"].nonzero() sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx] sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO # Only use eyes closed (2483 features) sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")] EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1) # To ensure proper stratification into train/test set I will stratify using group status and study status # A variable that encodes for this is created n_studies = 3 study_group_status = EEG_features_df["Group_status"].copy() for i in range(n_studies): # Get study index study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) # Assign label study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD # Target variable Target = ["Group_status"] Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) Target_col_idx = np.where(Target_col == True)[0] # Make 3 models and save them to use enseemble in the end CLF_models = ["SVM", "LogReg", "RF"] n_models = len(CLF_models) # Repeat the classification to see if I just got a lucky seed n_repetitions = 10 k_out = 10 accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) model_par = [] final_models = [] final_features = [] final_y_preds = [] np.random.seed(42) # Prepare the splits beforehand to make sure the repetitions are not the same Rep_Outer_CV = [] Rep_Outer_CV_test = [] # a list with only the test indices for rep in range(n_repetitions): skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Outer_CV = [] Outer_CV_test = [] for train_index, test_index in skf.split(EEG_features_df,study_group_status): Outer_CV.append([train_index,test_index]) Outer_CV_test.append(test_index) Rep_Outer_CV.append(Outer_CV) Rep_Outer_CV_test.append(Outer_CV_test) # Check none of the repetitions are the same by using only the test sets # The list is first flattened Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] # All elements are converted to strings # This makes it easier to look for uniques, and the indices are already in numerical order Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] def allUnique(x): seen = set() return not any(i in seen or seen.add(i) for i in x) assert allUnique(Rep_Outer_CV_test_flat_str) # Get current time c_time1 = time.localtime() c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) print(c_time1) for rep in range(n_repetitions): # The outer fold CV has already been saved as lists Outer_CV = Rep_Outer_CV[rep] # Pre-allocate memory model_par0 = [] final_models0 = [] final_features0 = [] final_y_preds0 = [] Outer_counter = 0 for train_index, test_index in Outer_CV: test_df = EEG_features_df.iloc[test_index] train_df = EEG_features_df.iloc[train_index] # Training data will be standardized standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) train_df_standard = train_df.copy() train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) # Test data will also be standardized but using mean and std from training data test_df_standard = test_df.copy() test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) # Get the training data X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) y_train = train_df_standard[Target] # Get test data X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) y_test = test_df_standard[Target] # Prepare initial filtering of feature types to alleviate imbalance in number of features # Use a variable that is optimized using inner CV n_feat_mRMR1 = [30,40,50,60] # Max features from mRMR for each eye status max_mRMR_features = n_feat_mRMR1[-1] # mRMR on the all sparse kmeans selected features filter_feat_selector = mRMR_feature_select(X_train.to_numpy(),y_train.to_numpy(), num_features_to_select=max_mRMR_features, K_MAX=1000,n_jobs=-1,verbose=False) # Part 2 with second mRMR and classifiers in loop k_fold = 10 skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Inner_CV = [] for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): Inner_CV.append([train_index2,test_index2]) # SVM with L2-norm (it is by default squared) # Prepare hyper parameters exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) kernels = ["linear"] # error when using rbf and RFECV # rbf also overfit easier, whereas linear empirically works better in high D data # Sequential Feature Selection for Logistic Regression # In-built model selection CV # The L2 is the inverse of C for LogReg exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) # Random forest classifier # Prepare hyper parameters trees = np.array([10, 100, 500, 1000]) depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting # Prepare arrays for validation errors val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels))) val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels),max_mRMR_features)) val_feat_svm.fill(np.nan) val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) val_feat_logreg.fill(np.nan) val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_rf.fill(np.nan) val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_import_rf.fill(np.nan) for n1 in range(len(n_feat_mRMR1)): # Get selected features from mRMR filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]] X_train_mRMR = X_train[filter_features].copy() # SVM with recursive feature elemination # Stratified CV to find best regularization strength and number of features # CV is built-in RFECV min_features_to_select = 1 # Using normal for loop as RFECV has inbuilt multiprocessing for C in range(len(C_parameter_SVM)): for K in range(len(kernels)): # Define the model svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000) # Perform recurive feature elimination with in-built CV rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy", cv=Inner_CV, min_features_to_select=min_features_to_select) rfecv.fit(X_train_mRMR,y_train.to_numpy().ravel()) # Get CV score err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] # Save results for hyperparameter optimization val_err_svm[n1,C,K] = err # Save the features chosen by RFE based on index from pre mRMR rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0] val_feat_svm[n1,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)]))) # Logistic regression with sequential forward selection # In-bult CV k_features = np.arange(1,n_feat_mRMR1[n1]+1,1) # try up to all features inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) for C in range(len(C_parameter_LogReg)): LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter = 50000) sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), forward = True, scoring = "balanced_accuracy", verbose = 0, floating = False, cv = Inner_CV, n_jobs = -1) sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel()) # Save CV scores for each SFS step for feat in range(len(k_features)): inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] # Find the best number of features # I am rounding to make it more smooth and disregard small improvements K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] if len(K) > 1: K = K[np.where(K == np.min(K))[0]] err = inner_cv_scores[C,K] # Save validation error and features val_err_logreg[n1,C] = err # Save the features chosen by RFE based on index from pre mRMR sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) # Random forest model_test_err = np.zeros((k_fold,len(trees),len(depth))) RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR.shape[1])) counter = 0 for cv in range(k_fold): # Retrieve CV indices train_index_rf = Inner_CV[cv][0] test_index_rf = Inner_CV[cv][1] # Retrieve datasets X_train2 = X_train_mRMR.iloc[train_index_rf]; X_test2 = X_train_mRMR.iloc[test_index_rf] y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] for t in range(len(trees)): for d in range(len(depth)): RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], n_jobs=-1, random_state=None) RF.fit(X_train2.to_numpy(),y_train2) # Validation error RF_y = RF.predict(X_test2.to_numpy()) err = balanced_accuracy_score(y_test2, RF_y) # Save to array model_test_err[cv,t,d] = err # Save feature importance array RF_feat_import[cv,t,d] = RF.feature_importances_ counter += 1 # print("Finished RF run {} out of {}".format(counter, k_fold)) # Average the errors over the CV folds model_test_err_mean = np.mean(model_test_err, axis=0) val_err_rf[n1,:,:] = model_test_err_mean # Average the feature importances over the CV folds RF_feat_import = np.mean(RF_feat_import, axis=0) val_feat_import_rf[n1,:,:,0:n_feat_mRMR1[n1]] = RF_feat_import # Save the features used by the RF based on index from pre mRMR rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0] val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100)) # Choose the optimal parameters ### SVM n1, C, K = np.where(val_err_svm==np.max(val_err_svm)) if len(C) > 1: print("There are multiple SVM runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") rfe_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) K_temp = int(K[i2]) temp_feats = val_feat_svm[n1_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,C_temp,K_temp])].astype(int) rfe_feat_len.append(len(temp_feats)) rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] if len(rfe_feat_min) > 1: print("Multiple SVM runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[rfe_feat_min]) n1, C, K = [int(n1[rfe_feat_min][C_min]), int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])] else: n1, C, K = [int(n1[int(rfe_feat_min)]), int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])] else: n1, C, K = [int(n1), int(C), int(K)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_SVM[C] K_chosen = kernels[K] # Save the best validation erro val_error = val_err_svm[n1, C, K] # Get the subsets chosen chosen_feats = val_feat_svm[n1,C,K][~np.isnan(val_feat_svm[n1,C,K])].astype(int) rfe_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rfe_features) x_train_mRMR_rfe = X_train[rfe_features] # Fit on all training data model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000) model = model.fit(x_train_mRMR_rfe,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR_rfe) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rfe_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error] SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat,K_chosen] SVM_model = model SVM_y_pred = [[model_train_y],[y_pred]] ### LogReg with SFS n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) if len(C) > 1: print("There are multiple LogReg runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") sfs_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) sfs_feat_len.append(len(temp_feats)) sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] if len(sfs_feat_min) > 1: print("Multiple LogReg runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[sfs_feat_min]) n1, C = [int(n1[sfs_feat_min][C_min]), int(C[sfs_feat_min][C_min])] else: n1, C = [int(n1[int(sfs_feat_min)]), int(C[int(sfs_feat_min)])] else: n1, C = [int(n1), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_LogReg[C] # Save the best validation erro val_error = val_err_logreg[n1, C] # Get the subsets chosen chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) sfs_features = list(X_train.columns[chosen_feats]) n_final_feat = len(sfs_features) x_train_mRMR_sfs = X_train[sfs_features] # Fit on all training data model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) model = model.fit(x_train_mRMR_sfs,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR_sfs) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[sfs_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error] LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] LogReg_model = model LogReg_y_pred = [[model_train_y],[y_pred]] ### RF n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) if len(d) > 1: print("There are multiple RF runs with the same validation error") print("Choosing run with lowest depth to alleviate overfitting") d_min = np.where(d==np.min(d))[0] if len(d_min) > 1: print("Multiple RF runs with same number number of depths") print("Choosing run with lowest trees to alleviate overfitting") t_min = np.argmin(t[d_min]) # argmin just takes the first # If there are multiple with same parameters and validation error it is most likely the same n1, t, d = [int(n1[d_min][t_min]), int(t[d_min][t_min]), int(d[d_min][t_min])] else: n1, t, d = [int(n1[int(d_min)]), int(t[int(d_min)]), int(d[int(d_min)])] else: n1, t, d = [int(n1), int(t), int(d)] mRMR_chosen1 = n_feat_mRMR1[n1] t_chosen = trees[t] d_chosen = depth[d] # Save the best validation error val_error = val_err_rf[n1, t, d] # Get the chosen features and feature importances chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) rf_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rf_features) x_train_mRMR_rf = X_train[rf_features] rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] # Fit on all training data model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, n_jobs=-1, random_state=None) model = model.fit(x_train_mRMR_rf,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR_rf) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rf_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error] RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] RF_model = model RF_y_pred = [[model_train_y],[y_pred]] # Save the rest model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par]) final_models0.append([SVM_model,LogReg_model,RF_model]) final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]]) final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred]) # Move counter Outer_counter += 1 print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1)) # Save results from all outer folds model_par.append(model_par0) final_models.append(final_models0) final_features.append(final_features0) final_y_preds.append(final_y_preds0) # Save results to file Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds] # Run with variable feat in first and second mRMR with open(Model_savepath+"Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle: pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle) # Get current time c_time2 = time.localtime() c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) print("Started", c_time1, "\nCurrent Time",c_time2) # Print total progress print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) # %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2 # Using each feature (of the sparse Kmeans EC chosen ones) # For Subtype 1 # 10 repetitions of 10 fold outer and 20 fold inner two-layer CV # Prediction of subtype 1 EEG_features_df, EEG_features_name_list = load_features_df() # removed coh and plv and renamed Hurst in v4 with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file: sparcl = pickle.load(file) # Add group status Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0 Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1 EEG_features_df.insert(1, "Group_status", Group_status) # Only keep PTSD subtype 1 by dropping subtype 2 PTSD_idx = np.array([i in cases for i in Subject_id]) CTRL_idx = np.array([not i in cases for i in Subject_id]) subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] EEG_features_df = EEG_features_df.set_index("Subject_ID") EEG_features_df = EEG_features_df.drop(subtype2) EEG_features_df = EEG_features_df.reset_index() # Check it was dropped correctly assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]) # Subject info columns Subject_info_cols = list(EEG_features_df.columns[0:2]) n_subject_info_cols = len(Subject_info_cols) n_discrete_cols = 2 # Get features from sparse kmeans nonzero_idx = sparcl["ws"].nonzero() sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx] sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO # Only use eyes closed (2483 features) sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")] EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1) # To ensure proper stratification into train/test set I will stratify using group status and study status # A variable that encodes for this is created n_studies = 3 study_group_status = EEG_features_df["Group_status"].copy() for i in range(n_studies): # Get study index study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000) # Assign label study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD # Target variable Target = ["Group_status"] Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target) Target_col_idx = np.where(Target_col == True)[0] # Make 3 models and save them to use enseemble in the end CLF_models = ["SVM", "LogReg", "RF"] n_models = len(CLF_models) # Repeat the classification to see if I just got a lucky seed n_repetitions = 10 k_out = 10 accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3)) model_par = [] final_models = [] final_features = [] final_y_preds = [] np.random.seed(42) # Prepare the splits beforehand to make sure the repetitions are not the same Rep_Outer_CV = [] Rep_Outer_CV_test = [] # a list with only the test indices for rep in range(n_repetitions): skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Outer_CV = [] Outer_CV_test = [] for train_index, test_index in skf.split(EEG_features_df,study_group_status): Outer_CV.append([train_index,test_index]) Outer_CV_test.append(test_index) Rep_Outer_CV.append(Outer_CV) Rep_Outer_CV_test.append(Outer_CV_test) # Check none of the repetitions are the same by using only the test sets # The list is first flattened Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist] # All elements are converted to strings # This makes it easier to look for uniques, and the indices are already in numerical order Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat] def allUnique(x): seen = set() return not any(i in seen or seen.add(i) for i in x) assert allUnique(Rep_Outer_CV_test_flat_str) # Get current time c_time1 = time.localtime() c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) print(c_time1) def Each_feat_10rep_10x10CV(rep): # The outer fold CV has already been saved as lists Outer_CV = Rep_Outer_CV[rep] # Pre-allocate memory Ind_feat_clf0 = [] accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3)) Outer_counter = 0 for train_index, test_index in Outer_CV: test_df = EEG_features_df.iloc[test_index] train_df = EEG_features_df.iloc[train_index] # Training data will be standardized standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:]) train_df_standard = train_df.copy() train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:]) # Test data will also be standardized but using mean and std from training data test_df_standard = test_df.copy() test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:]) # Get the training data X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1) y_train = train_df_standard[Target] # Get test data X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1) y_test = test_df_standard[Target] # Part 2 with second mRMR and classifiers in loop k_fold = 10 skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects # I am also using converting it to an iterable list instad of the generator to promote reuse Inner_CV = [] for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]): Inner_CV.append([train_index2,test_index2]) # Prepare initial filtering of feature types to alleviate imbalance in number of features # Use a variable that is optimized using inner CV n_feat_mRMR1 = [20,30,40,50] # Max features from mRMR for each eye status max_mRMR_features = n_feat_mRMR1[-1] eye_status = ["Eyes Closed", "Eyes Open"] n_eye_status = len(eye_status) feat_counter = [] # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF Ind_feat_clf00 = [] for fset in range(len(EEG_features_name_list)): temp_feat = EEG_features_name_list[fset] other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:] other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten # Retrieve the dataset for each feature col_idx = np.zeros(len(X_train.columns), dtype=bool) for fsub in range(len(temp_feat)): temp_feat0 = temp_feat[fsub] col_idx0 = X_train.columns.str.contains(temp_feat0+"_") col_idx = np.logical_or(col_idx,col_idx0) # append all trues temp_X_train = X_train.loc[:,col_idx] # Check if any of the other features were wrongly chosen for fcheck in range(len(other_feats)): if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True): temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))] if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")] # Save number of original features fed into mRMR feat_counter.append([temp_feat,temp_X_train.shape[1]]) # If there are no features selected, then skip the rest of the loop if temp_X_train.shape[1] == 0: continue # Do not use mRMR if there are fewer than n_features if temp_X_train.shape[1] <= max_mRMR_features: filter_features = temp_X_train.columns else: # mRMR filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(), num_features_to_select=max_mRMR_features, K_MAX=1000,n_jobs=1,verbose=False) # Get selected features filter_features = temp_X_train.columns[filter_feat_selector] # SVM with L2-norm (it is by default squared) # Prepare hyper parameters exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent) # rbf kernel overfit easier, whereas linear empirically works better in high D data # Sequential Feature Selection for Logistic Regression # In-built model selection CV # The L2 is the inverse of C for LogReg exponent = np.linspace(-3,1,9) exponent = np.round(exponent,5) # sometimes linspace are not so exact C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent) # Random forest classifier # Prepare hyper parameters trees = np.array([10, 100, 500, 1000]) depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting # Prepare arrays for validation errors val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM))) val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features)) val_feat_svm.fill(np.nan) val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg))) val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features)) val_feat_logreg.fill(np.nan) val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth))) val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_rf.fill(np.nan) val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features)) val_feat_import_rf.fill(np.nan) min_features_to_select = 1 for n1 in range(len(n_feat_mRMR1)): mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]] # Use the selected filter features temp_X_train_mRMR = temp_X_train[mRMR_features1] # SVM with recursive feature elemination # Stratified CV to find best regularization strength and number of features # CV is built-in RFECV for C in range(len(C_parameter_SVM)): # Define the model svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000) # Perform recurive feature elimination with in-built CV rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy", cv=Inner_CV, min_features_to_select=min_features_to_select) rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel()) # Get CV score err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select] # Save results for hyperparameter optimization val_err_svm[n1,C] = err # Save the features chosen by RFE based on index from pre mRMR rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0] val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM))) # Logistic regression with sequential forward selection # In-bult CV k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]]) k_features = np.arange(1,k_max+1,1) # try up to all features inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features))) for C in range(len(C_parameter_LogReg)): LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter = 50000) sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]), forward = True, scoring = "balanced_accuracy", verbose = 0, floating = False, cv = Inner_CV, n_jobs = 1) sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel()) # Save CV scores for each SFS step for feat in range(len(k_features)): inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"] # Find the best number of features # I am rounding to make it more smooth and disregard small improvements K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0] if len(K) > 1: K = K[np.where(K == np.min(K))[0]] err = inner_cv_scores[C,K] # Save validation error and features val_err_logreg[n1,C] = err # Save the features chosen by RFE based on index from pre mRMR sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg))) # Random forest model_test_err = np.zeros((k_fold,len(trees),len(depth))) RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1])) counter = 0 for cv in range(k_fold): # Retrieve CV indices train_index_rf = Inner_CV[cv][0] test_index_rf = Inner_CV[cv][1] # Retrieve datasets X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf] y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf] for t in range(len(trees)): for d in range(len(depth)): RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d], n_jobs=1, random_state=None) RF.fit(X_train2.to_numpy(),y_train2) # Validation error RF_y = RF.predict(X_test2.to_numpy()) err = balanced_accuracy_score(y_test2,RF_y) # Save to array model_test_err[cv,t,d] = err # Save feature importance array RF_feat_import[cv,t,d] = RF.feature_importances_ counter += 1 # print("Finished RF run {} out of {}".format(counter, k_fold)) # Average the errors over the CV folds model_test_err_mean = np.mean(model_test_err, axis=0) val_err_rf[n1,:,:] = model_test_err_mean # Average the feature importances over the CV folds RF_feat_import = np.mean(RF_feat_import, axis=0) val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import # Save the features used by the RF based on index from pre mRMR rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0] val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx # Print progress current_progress = (n1+1)/(len(n_feat_mRMR1))*100 print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0])) # Choose the optimal parameters ### SVM n1, C = np.where(val_err_svm==np.max(val_err_svm)) if len(C) > 1: print("There are multiple SVM runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") rfe_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int) rfe_feat_len.append(len(temp_feats)) rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0] if len(rfe_feat_min) > 1: print("Multiple SVM runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[rfe_feat_min]) n1, C = [int(n1[rfe_feat_min][C_min]), int(C[rfe_feat_min][C_min])] else: n1, C = [int(n1[int(rfe_feat_min)]), int(C[int(rfe_feat_min)])] else: n1, C = [int(n1), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_SVM[C] # Save the best validation error val_error = val_err_svm[n1, C] # Get the subsets chosen chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int) rfe_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rfe_features) x_train_mRMR2_rfe = X_train[rfe_features] # Fit on all training data model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000) model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rfe) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rfe_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error] SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat] SVM_model = model SVM_y_pred = [[model_train_y],[y_pred]] ### LogReg with SFS n1, C = np.where(val_err_logreg==np.max(val_err_logreg)) if len(C) > 1: print("There are multiple LogReg runs with the same validation error") print("Choosing run with fewest features to alleviate overfitting") sfs_feat_len = [] for i2 in range(len(C)): n1_temp = int(n1[i2]) C_temp = int(C[i2]) temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int) sfs_feat_len.append(len(temp_feats)) sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0] if len(sfs_feat_min) > 1: print("Multiple LogReg runs with same number of fewest features") print("Choosing run with lowest C (highest regularization) to alleviate overfitting") C_min = np.argmin(C[sfs_feat_min]) n1, C = [int(n1[sfs_feat_min][C_min]), int(C[sfs_feat_min][C_min])] else: n1, C = [int(n1[int(sfs_feat_min)]), int(C[int(sfs_feat_min)])] else: n1, C = [int(n1), int(C)] mRMR_chosen1 = n_feat_mRMR1[n1] C_chosen = C_parameter_LogReg[C] # Save the best validation erro val_error = val_err_logreg[n1, C] # Get the subsets chosen chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int) sfs_features = list(X_train.columns[chosen_feats]) n_final_feat = len(sfs_features) x_train_mRMR2_sfs = X_train[sfs_features] # Fit on all training data model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000) model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_sfs) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[sfs_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error] LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat] LogReg_model = model LogReg_y_pred = [[model_train_y],[y_pred]] ### RF n1, t, d = np.where(val_err_rf==np.max(val_err_rf)) if len(d) > 1: print("There are multiple RF runs with the same validation error") print("Choosing run with lowest depth to alleviate overfitting") d_min = np.where(d==np.min(d))[0] if len(d_min) > 1: print("Multiple RF runs with same number number of depths") print("Choosing run with lowest trees to alleviate overfitting") t_min = np.argmin(t[d_min]) # argmin just takes the first # If there are multiple with same parameters and validation error it is most likely the same n1, t, d = [int(n1[d_min][t_min]), int(t[d_min][t_min]), int(d[d_min][t_min])] else: n1, t, d = [int(n1[int(d_min)]), int(t[int(d_min)]), int(d[int(d_min)])] else: n1, t, d = [int(n1), int(t), int(d)] mRMR_chosen1 = n_feat_mRMR1[n1] t_chosen = trees[t] d_chosen = depth[d] # Save the best validation error val_error = val_err_rf[n1, t, d] # Get the chosen features and feature importances chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int) rf_features = list(X_train.columns[chosen_feats]) n_final_feat = len(rf_features) x_train_mRMR2_rf = X_train[rf_features] rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])] # Fit on all training data model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen, n_jobs=1, random_state=None) model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel()) # Get training output model_train_y = model.predict(x_train_mRMR2_rf) # Get training error train_error = balanced_accuracy_score(y_train, model_train_y) # Get prediction of test data y_pred = model.predict(X_test[rf_features]) # Use model to predict class on test data test_error = balanced_accuracy_score(y_test, y_pred) # Save or prepare to save accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error] RF_model_par = [mRMR_chosen1,t_chosen,d_chosen] RF_model = model RF_y_pred = [[model_train_y],[y_pred]] # Save the rest model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par] final_models00 = [SVM_model,LogReg_model,RF_model] final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]] final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred] data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_] # Save the results for the specific feature res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00] Ind_feat_clf00.append(res) print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat)) # Check that no features were excluded when checking for wrongly chosen total_features = 0 for feat_c in range(len(feat_counter)): total_features += feat_counter[feat_c][1] assert total_features == X_train.shape[1] # Save results over outer folds Ind_feat_clf0.append(Ind_feat_clf00) print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1)) # Move counter Outer_counter += 1 out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable] # Print total progress print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions)) # Get current time c_time2 = time.localtime() c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) print("Started", c_time1, "\nCurrent Time",c_time2) return rep, out Ind_feat_clf = [0]*n_repetitions with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor: for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable] # Save results to file and overwrite when new results arrive with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "wb") as filehandle: pickle.dump(Ind_feat_clf, filehandle) # %% For subtype 2 # The two previous sections are rerun but for subtype 2 # The only modification was indexing the subjects # Only keep PTSD subtype 2 by dropping subtype 1 subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0] subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1] EEG_features_df = EEG_features_df.set_index("Subject_ID") EEG_features_df = EEG_features_df.drop(subtype1) EEG_features_df = EEG_features_df.reset_index() # Check it was dropped correctly assert all(subtype2 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])