MachineLearning.py

        X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_train = train_df_standard[Target]
        # Get test data
        X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_test = test_df_standard[Target]
        
        # Part 2 with second mRMR and classifiers in loop
        k_fold = 10
        skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
        Inner_CV = []
        for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
            Inner_CV.append([train_index2,test_index2])
        
        # Prepare initial filtering of feature types to alleviate imbalance in number of features
        # Use a variable that is optimized using inner CV
        n_feat_mRMR1 = [20,30,40,50]
        
        # Max features from mRMR for each eye status
        max_mRMR_features = n_feat_mRMR1[-1]
        
        eye_status = ["Eyes Closed", "Eyes Open"]
        n_eye_status = len(eye_status)
        
        feat_counter = []
        # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
        Ind_feat_clf00 = []
        for fset in range(len(EEG_features_name_list)):
            temp_feat = EEG_features_name_list[fset]
            other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
            other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten
            # Retrieve the dataset for each feature
            col_idx = np.zeros(len(X_train.columns), dtype=bool)
            for fsub in range(len(temp_feat)):
                temp_feat0 = temp_feat[fsub]
                col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
                col_idx = np.logical_or(col_idx,col_idx0) # append all trues
            temp_X_train = X_train.loc[:,col_idx]
            # Check if any of the other features were wrongly chosen
            for fcheck in range(len(other_feats)):
                if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
                    temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))]
            if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
                temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
            
            # Save number of original features fed into mRMR
            feat_counter.append([temp_feat,temp_X_train.shape[1]])
            # If there are no features selected, then skip the rest of the loop
            if temp_X_train.shape[1] == 0:
                continue
            
            # Do not use mRMR if there are fewer than n_features
            if temp_X_train.shape[1] <= max_mRMR_features:
                filter_features = temp_X_train.columns
            else:
                # mRMR
                filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
                                                            num_features_to_select=max_mRMR_features,
                                                            K_MAX=1000,n_jobs=1,verbose=False)
                # Get selected features
                filter_features = temp_X_train.columns[filter_feat_selector]
            
            # SVM with L2-norm (it is by default squared)
            # Prepare hyper parameters
            exponent = np.linspace(-3,1,9)
            exponent = np.round(exponent,5) # sometimes linspace are not so exact
            C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
            # rbf kernel overfit easier, whereas linear empirically works better in high D data
            
            # Sequential Feature Selection for Logistic Regression
            # In-built model selection CV
            # The L2 is the inverse of C for LogReg
            exponent = np.linspace(-3,1,9)
            exponent = np.round(exponent,5) # sometimes linspace are not so exact
            C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
            
            # Random forest classifier
            # Prepare hyper parameters
            trees = np.array([10, 100, 500, 1000])
            depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
            
            # Prepare arrays for validation errors
            val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
            val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
            val_feat_svm.fill(np.nan)
            
            val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
            val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
            val_feat_logreg.fill(np.nan)
            
            val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
            val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
            val_feat_rf.fill(np.nan)
            val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
            val_feat_import_rf.fill(np.nan)
            
            min_features_to_select = 1
            
            for n1 in range(len(n_feat_mRMR1)):
                mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
                # Use the selected filter features
                temp_X_train_mRMR = temp_X_train[mRMR_features1]
                
                # SVM with recursive feature elemination 
                # Stratified CV to find best regularization strength and number of features
                # CV is built-in RFECV
                for C in range(len(C_parameter_SVM)):
                    # Define the model
                    svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
                    # Perform recurive feature elimination with in-built CV
                    rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
                                  cv=Inner_CV,
                                  min_features_to_select=min_features_to_select)
                    rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
                    # Get CV score
                    err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
                    # Save results for hyperparameter optimization
                    val_err_svm[n1,C] = err
                    # Save the features chosen by RFE based on index from pre mRMR
                    rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
                    val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
                    # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
                
                # Logistic regression with sequential forward selection
                # In-bult CV
                k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
                k_features = np.arange(1,k_max+1,1) # try up to all features
                inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
                for C in range(len(C_parameter_LogReg)):
                    LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
                                                max_iter = 50000)
                    sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
                              forward = True, scoring = "balanced_accuracy",
                              verbose = 0, floating = False,
                              cv = Inner_CV, n_jobs = 1)
                
                    sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
                    # Save CV scores for each SFS step
                    for feat in range(len(k_features)):
                        inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
                    
                    # Find the best number of features
                    # I am rounding to make it more smooth and disregard small improvements
                    K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
                    if len(K) > 1:
                        K = K[np.where(K == np.min(K))[0]]
                    err = inner_cv_scores[C,K]
                    # Save validation error and features
                    val_err_logreg[n1,C] = err
                    # Save the features chosen by SFS based on index from pre mRMR
                    sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
                    val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
                    
                    # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
                
                # Random forest                
                model_test_err = np.zeros((k_fold,len(trees),len(depth)))
                RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
                
                counter = 0
                for cv in range(k_fold):
                    # Retrieve CV indices
                    train_index_rf = Inner_CV[cv][0]
                    test_index_rf = Inner_CV[cv][1]
                    # Retrieve datasets
                    X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
                    y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
                    
                    for t in range(len(trees)):
                        for d in range(len(depth)):
                            RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
                                                        n_jobs=1, random_state=None)
                            RF.fit(X_train2.to_numpy(),y_train2)
                            # Validation error
                            RF_y = RF.predict(X_test2.to_numpy())
                            err = balanced_accuracy_score(y_test2,RF_y)
                            # Save to array
                            model_test_err[cv,t,d] = err
                            # Save feature importance array
                            RF_feat_import[cv,t,d] = RF.feature_importances_
                    counter += 1
                    # print("Finished RF run {} out of {}".format(counter, k_fold))
                
                # Average the errors over the CV folds
                model_test_err_mean = np.mean(model_test_err, axis=0)
                val_err_rf[n1,:,:] = model_test_err_mean
                # Average the feature importances over the CV folds
                RF_feat_import = np.mean(RF_feat_import, axis=0)
                val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
                # Save the features used by the RF based on index from pre mRMR
                rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
                val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
                
                # Print progress
                current_progress = (n1+1)/(len(n_feat_mRMR1))*100
                print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
                
            # Choose the optimal parameters
            ### SVM
            n1, C = np.where(val_err_svm==np.max(val_err_svm))
            
            if len(C) > 1:
                print("There are multiple SVM runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                rfe_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
                    rfe_feat_len.append(len(temp_feats))
                
                rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
                if len(rfe_feat_min) > 1:
                    print("Multiple SVM runs with same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[rfe_feat_min])
                    n1, C = [int(n1[rfe_feat_min][C_min]),
                                    int(C[rfe_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(rfe_feat_min)]),
                                  int(C[int(rfe_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_SVM[C]
            
            # Save the best validation error
            val_error = val_err_svm[n1, C]
            
            # Get the subsets chosen
            chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
            rfe_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rfe_features)
            x_train_mRMR2_rfe = X_train[rfe_features]
            
            # Fit on all training data
            model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
            model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rfe)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[rfe_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
            SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
            SVM_model = model
            SVM_y_pred = [[model_train_y],[y_pred]]
            
            ### LogReg with SFS
            n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
            if len(C) > 1:
                print("There are multiple LogReg runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                sfs_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
                    sfs_feat_len.append(len(temp_feats))
                
                sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
                if len(sfs_feat_min) > 1:
                    print("Multiple LogReg runs with same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[sfs_feat_min])
                    n1, C = [int(n1[sfs_feat_min][C_min]),
                                    int(C[sfs_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(sfs_feat_min)]),
                                  int(C[int(sfs_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_LogReg[C]
            
            # Save the best validation error
            val_error = val_err_logreg[n1, C]
            
            # Get the subsets chosen
            chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
            sfs_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(sfs_features)
            x_train_mRMR2_sfs = X_train[sfs_features]
            
            # Fit on all training data
            model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
            model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_sfs)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[sfs_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
            LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
            LogReg_model = model
            LogReg_y_pred = [[model_train_y],[y_pred]]
            
            ### RF
            n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
            
            if len(d) > 1:
                print("There are multiple RF runs with the same validation error")
                print("Choosing run with lowest depth to alleviate overfitting")
                d_min = np.where(d==np.min(d))[0]
                
                if len(d_min) > 1:
                    print("Multiple RF runs with same number number of depths")
                    print("Choosing run with lowest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min]) # argmin just takes the first
                    # If there are multiple with same parameters and validation error it is most likely the same
                    n1, t, d = [int(n1[d_min][t_min]),
                                    int(t[d_min][t_min]), int(d[d_min][t_min])]
                else:
                    n1, t, d = [int(n1[int(d_min)]),
                                  int(t[int(d_min)]), int(d[int(d_min)])]
            else:
                n1, t, d = [int(n1), int(t), int(d)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            t_chosen = trees[t]
            d_chosen = depth[d]
            
            # Save the best validation error
            val_error = val_err_rf[n1, t, d]
            
            # Get the chosen features and feature importances
            chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
            rf_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rf_features)
            x_train_mRMR2_rf = X_train[rf_features]
            
            rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
            
            # Fit on all training data
            model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
                                                            n_jobs=1, random_state=None)
            model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rf)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[rf_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
            RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
            RF_model = model
            RF_y_pred = [[model_train_y],[y_pred]]
            
            # Save the rest
            model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
            final_models00 = [SVM_model,LogReg_model,RF_model]
            final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
            final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
            data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
            
            # Save the results for the specific feature
            res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
            Ind_feat_clf00.append(res)
            print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
        
        # Check that no features were excluded when checking for wrongly chosen
        total_features = 0
        for feat_c in range(len(feat_counter)):
            total_features += feat_counter[feat_c][1]
        assert total_features == X_train.shape[1]
        # Save results over outer folds
        Ind_feat_clf0.append(Ind_feat_clf00)
        
        print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
        # Move counter
        Outer_counter += 1
    
    out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
    
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time",c_time2)
    
    return rep, out

# Run all repetitions in a pool of worker processes and collect their output.
# Ind_feat_clf has one slot per repetition; each slot is filled with the
# "out" list returned by the worker for that repetition:
# [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
# NOTE(review): no `if __name__ == "__main__":` guard is visible around this
# pool; start methods that re-import the main module need one - confirm.
Ind_feat_clf = [0]*n_repetitions
_subtype1_results_file = Model_savepath+"Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl"
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    # The worker returns (rep, out), so each result is stored by its own index
    _rep_results = executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)) # Function and arguments
    for rep, result in _rep_results:
        Ind_feat_clf[rep] = result
        # Rewrite the whole results file every time a repetition finishes,
        # so the file on disk always holds everything collected so far
        with open(_subtype1_results_file, "wb") as filehandle:
            pickle.dump(Ind_feat_clf, filehandle)

# %% For subtype 2
# The two previous sections are rerun but for subtype 2
# The only modification was indexing the subjects

# Only keep PTSD subtype 2 by dropping subtype 1
# Subject IDs selected by PTSD_idx, split by the cluster label in sparcl["cs"]
# (label 0 -> subtype 1, label 1 -> subtype 2)
_ptsd_subject_ids = np.array(Subject_id)[PTSD_idx]
subtype1 = _ptsd_subject_ids[sparcl["cs"]==0]
subtype2 = _ptsd_subject_ids[sparcl["cs"]==1]
# Drop all subtype 1 subjects: index by subject ID, remove those rows,
# then restore Subject_ID as an ordinary column
EEG_features_df = (
    EEG_features_df
    .set_index("Subject_ID")
    .drop(subtype1)
    .reset_index()
)
# Check it was dropped correctly: the subjects left with Group_status == 1
# (presumably the PTSD group - verify) must match subtype 2 element by element
_remaining_group1_ids = EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]
assert all(subtype2 == _remaining_group1_ids)