MachineLearning.py

                    if len(K) > 1:
                        K = K[np.where(K == np.min(K))[0]]
                    err = inner_cv_scores[C,K]
                    # Save validation error and features
                    val_err_logreg[n1,C] = err
                    # Save the features chosen by SFS based on index from pre mRMR
                    sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
                    val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
                    
                    # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
                
                # Random forest                
                model_test_err = np.zeros((k_fold,len(trees),len(depth)))
                RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
                
                counter = 0
                for cv in range(k_fold):
                    # Retrieve CV indices
                    train_index_rf = Inner_CV[cv][0]
                    test_index_rf = Inner_CV[cv][1]
                    # Retrieve datasets
                    X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
                    y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
                    
                    for t in range(len(trees)):
                        for d in range(len(depth)):
                            RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
                                                        n_jobs=1, random_state=None)
                            RF.fit(X_train2.to_numpy(),y_train2)
                            # Validation error
                            RF_y = RF.predict(X_test2.to_numpy())
                            err = balanced_accuracy_score(y_test2,RF_y)
                            # Save to array
                            model_test_err[cv,t,d] = err
                            # Save feature importance array
                            RF_feat_import[cv,t,d] = RF.feature_importances_
                    counter += 1
                    # print("Finished RF run {} out of {}".format(counter, k_fold))
                
                # Average the errors over the CV folds
                model_test_err_mean = np.mean(model_test_err, axis=0)
                val_err_rf[n1,:,:] = model_test_err_mean
                # Average the feature importances over the CV folds
                RF_feat_import = np.mean(RF_feat_import, axis=0)
                val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
                # Save the features used by the RF based on index from pre mRMR
                rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
                val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
                
                # Print progress
                current_progress = (n1+1)/(len(n_feat_mRMR1))*100
                print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
                
            # Choose the optimal parameters
            ### SVM
            n1, C = np.where(val_err_svm==np.max(val_err_svm))
            
            if len(C) > 1:
                print("There are multiple SVM runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                rfe_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
                    rfe_feat_len.append(len(temp_feats))
                
                rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
                if len(rfe_feat_min) > 1:
                    print("Multiple SVM runs with same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[rfe_feat_min])
                    n1, C = [int(n1[rfe_feat_min][C_min]),
                                    int(C[rfe_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(rfe_feat_min)]),
                                  int(C[int(rfe_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_SVM[C]
            
            # Save the best validation error
            val_error = val_err_svm[n1, C]
            
            # Get the subsets chosen
            chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
            rfe_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rfe_features)
            x_train_mRMR2_rfe = X_train[rfe_features]
            
            # Fit on all training data
            model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
            model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rfe)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[rfe_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
            SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
            SVM_model = model
            SVM_y_pred = [[model_train_y],[y_pred]]
            
            ### LogReg with SFS
            n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
            if len(C) > 1:
                print("There are multiple LogReg runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                sfs_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
                    sfs_feat_len.append(len(temp_feats))
                
                sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
                if len(sfs_feat_min) > 1:
                    print("Multiple LogReg runs with same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[sfs_feat_min])
                    n1, C = [int(n1[sfs_feat_min][C_min]),
                                    int(C[sfs_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(sfs_feat_min)]),
                                  int(C[int(sfs_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_LogReg[C]
            
            # Save the best validation error
            val_error = val_err_logreg[n1, C]
            
            # Get the subsets chosen
            chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
            sfs_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(sfs_features)
            x_train_mRMR2_sfs = X_train[sfs_features]
            
            # Fit on all training data
            model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
            model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_sfs)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[sfs_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
            LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
            LogReg_model = model
            LogReg_y_pred = [[model_train_y],[y_pred]]
            
            ### RF
            n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
            
            if len(d) > 1:
                print("There are multiple RF runs with the same validation error")
                print("Choosing run with lowest depth to alleviate overfitting")
                d_min = np.where(d==np.min(d))[0]
                
                if len(d_min) > 1:
                    print("Multiple RF runs with same number number of depths")
                    print("Choosing run with lowest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min]) # argmin just takes the first
                    # If there are multiple with same parameters and validation error it is most likely the same
                    n1, t, d = [int(n1[d_min][t_min]),
                                    int(t[d_min][t_min]), int(d[d_min][t_min])]
                else:
                    n1, t, d = [int(n1[int(d_min)]),
                                  int(t[int(d_min)]), int(d[int(d_min)])]
            else:
                n1, t, d = [int(n1), int(t), int(d)]
            
            mRMR_chosen1 = n_feat_mRMR1[n1]
            t_chosen = trees[t]
            d_chosen = depth[d]
            
            # Save the best validation error
            val_error = val_err_rf[n1, t, d]
            
            # Get the chosen features and feature importances
            chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
            rf_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rf_features)
            x_train_mRMR2_rf = X_train[rf_features]
            
            rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
            
            # Fit on all training data
            model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
                                                            n_jobs=1, random_state=None)
            model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rf)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            
            # Get prediction of test data
            y_pred = model.predict(X_test[rf_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            
            # Save or prepare to save
            accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
            RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
            RF_model = model
            RF_y_pred = [[model_train_y],[y_pred]]
            
            # Save the rest
            model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
            final_models00 = [SVM_model,LogReg_model,RF_model]
            final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
            final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
            data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
            
            # Save the results for the specific feature
            res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
            Ind_feat_clf00.append(res)
            print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
        
        # Check that no features were excluded when checking for wrongly chosen
        total_features = 0
        for feat_c in range(len(feat_counter)):
            total_features += feat_counter[feat_c][1]
        assert total_features == X_train.shape[1]
        # Save results over outer folds
        Ind_feat_clf0.append(Ind_feat_clf00)
        
        print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
        # Move counter
        Outer_counter += 1
    
    out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
    
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time",c_time2)
    
    return rep, out

# Run all repetitions in parallel worker processes and checkpoint the collected
# results to disk every time a repetition is handed back.
# NOTE(review): the pool is created at module level and no
# `if __name__ == "__main__":` guard is visible in this part of the file -
# confirm the multiprocessing start method in use tolerates that.
Ind_feat_clf = [0]*n_repetitions # one slot per repetition; 0 is the placeholder until its result arrives
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    # Each yielded item is unpacked as (repetition index, result of that repetition)
    for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments
        Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
        # Save results to file and overwrite when new results arrive
        # (the whole list is dumped each time, including slots that still hold the placeholder)
        with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle:
            pickle.dump(Ind_feat_clf, filehandle)

# %% Sparse clustering of all EEG features to look for subtypes
# Builds the standardized feature matrix X (PTSD subjects only) used by the
# sparse k-means steps below.
EEG_features_df, EEG_features_name_list = load_features_df()

# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
# Boolean mask over the rows: True where the subject ID is listed in `cases`
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status) # becomes the second column

# Only use PTSD patient group
EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"]==1,:]

# Non-feature columns that must not enter the clustering
Subject_info_cols = ["Subject_ID","Group_status"]

# Standardize the values
# (the scaler is fitted on, and applied to, the same PTSD-only matrix)
X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1))
standardizer = preprocessing.StandardScaler().fit(X)
X = standardizer.transform(X)

# Use gridsearch and permutations to estimate gap statistic and use it to 
# determine number of clusters and sparsity s
# I will use 100 permutations and test 2 to 6 clusters as Zhang 2020
max_clusters = 6
n_sparsity_feat = 20 # number of candidate sparsity values, passed as nvals below
perm_res = [] # one permutation result per tested number of clusters

# Wall-clock log: the start time followed by one entry per finished cluster count
Timestamps = []
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
Timestamps.append(c_time1)
for k in range(1,max_clusters):
    # Cannot permute with 1 cluster
    n_clusters = k+1 # so the tested values are 2, ..., max_clusters
    perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True,
                                             nvals=n_sparsity_feat, nperms=100)
    perm_res.append(perm)
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time",c_time2)
    Timestamps.append(c_time2)

# Save the results
with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file:
    pickle.dump(perm_res, file)

# # Load
# with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file:
#     perm_res = pickle.load(file)

# Convert results to array
# One row per (number of clusters, sparsity value) combination with the columns
# [cluster size, gap statistic, gap statistic std, sparsity s]
perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4))
for i in range(len(perm_res)):
    # Relies on the order of the five values stored in each permutation result;
    # the first and the last one are not used here
    _, gaps, sdgaps, wbounds, _ = perm_res[i].values()
    for i2 in range(n_sparsity_feat):
        # Entry i of perm_res was computed with i+2 clusters (the first tested value is 2)
        perm_res_arr[n_sparsity_feat*i+i2,0] = i+2 # cluster size
        perm_res_arr[n_sparsity_feat*i+i2,1] = gaps[i2] # gap statistic
        perm_res_arr[n_sparsity_feat*i+i2,2] = sdgaps[i2] # gap statistic std
        perm_res_arr[n_sparsity_feat*i+i2,3] = wbounds[i2] # sparsity feature s

# For each sparsity s, determine best k using one-standard-error criterion
# Meaning the cluster and sparsity is chosen for the smallest value of k for a fixed s
# that fulfill Gap(k) >= Gap(k+1)-std(k+1)
def one_standard_deviation_search(gaps, std):
    """Pick the earliest entry that is still within one std of the best gap.

    Starting at the position of the maximum gap statistic, the search walks
    towards index 0. It moves one position down as long as the previous gap is
    at least the current gap minus the current standard deviation; after each
    move the comparison uses the gap and std of the new position.

    Parameters
    ----------
    gaps : sequence of float
        Gap statistics, indexable by integer position.
    std : sequence of float
        Standard deviations matching ``gaps`` position by position.

    Returns
    -------
    tuple
        ``(gap, std, index)`` of the selected position.
    """
    current_gaps_idx = np.argmax(gaps)
    current_gaps = gaps[current_gaps_idx]
    current_std = std[current_gaps_idx]
    # Check the lower bound BEFORE indexing: the previous form evaluated
    # gaps[-1] when already at position 0, which wraps around on arrays and
    # fails on containers that do not support negative positions.
    while current_gaps_idx > 0 and gaps[current_gaps_idx-1] >= current_gaps - current_std:
        current_gaps_idx -= 1
        current_gaps = gaps[current_gaps_idx]
        current_std = std[current_gaps_idx]
    return current_gaps, current_std, current_gaps_idx

# For every sparsity value s: find the number of clusters picked by the
# one-standard-deviation search and draw the gap curve for that s
best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:,3])
plt.figure(figsize=(12,12))
for i2 in range(n_sparsity_feat):
    current_s = all_s[i2]
    # Rows of the results table that belong to this sparsity value
    s_rows = perm_res_arr[perm_res_arr[:,3] == current_s]
    cluster_sizes = s_rows[:,0]
    gaps = s_rows[:,1]
    std = s_rows[:,2]
    _, _, idx = one_standard_deviation_search(gaps, std)
    # Store [s, chosen number of clusters]
    best_ks[i2,0] = current_s
    best_ks[i2,1] = cluster_sizes[idx]
    # Gap statistic with error bars against the number of clusters
    plt.errorbar(cluster_sizes.astype("int"), gaps,
                 yerr=std, capsize=5, label = np.round(current_s,3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")

# Use the number of clusters that was selected most often across the sparsity values.
# Depending on the SciPy version, scipy.stats.mode hands back the mode either as a
# length-1 array or as a scalar; np.ravel(...)[0] extracts the value in both cases.
best_k = int(np.ravel(scipy.stats.mode(best_ks[:,1])[0])[0])

# Determine s using fixed k as lowest s within 1 std of max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1])
best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx]
# Boolean mask over the rows of best_k: gap within one std of the best gap
one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std

# First qualifying row is taken as the "lowest" s
# (assumes the rows for a fixed k are ordered by increasing s - TODO confirm)
best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]])

# Perform clustering with k clusters
sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0]

# # Save the results
# with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file:
#     pickle.dump(sparcl, file)

# NOTE: this replaces the clustering fitted just above with the one stored on disk
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
    sparcl = pickle.load(file)

# %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2
# Using sparse kmeans selected features, only eyes closed as they were primarily chosen
# Prediction of subtype 1
# This part of the cell reloads the features and the stored clustering, removes
# the subtype 2 patients and keeps only the eyes-closed features selected by
# sparse k-means.
EEG_features_df, EEG_features_name_list = load_features_df()

with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
    sparcl = pickle.load(file)

# Use concatenated features
# (overrides the name list returned by load_features_df above)
EEG_features_name_list = [['Power'],
                          ['Frontal Theta Beta Ratio',
                          'Asymmetry'],
                          ['Peak Alpha Frequency',
                          'Global Peak Alpha Frequency'],
                          ["1/f exponent"],
                          ['imcoh'],
                          ['wpli'],
                          ['Power Envelope Correlation'],
                          ['Microstate Transition',
                          'Microstate Ratio Time',
                          'Microstate Entropy'],
                          ["Granger Causality"],
                          ['DFA Exponent',
                          'Global DFA Exponent']]

# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)

# Only keep PTSD subtype 1 by dropping subtype 2
# Cluster label 0 is treated as subtype 1 and label 1 as subtype 2
# (assumes sparcl["cs"] is ordered like the subjects selected by PTSD_idx - TODO confirm)
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
# Drop by subject ID: temporarily use Subject_ID as index, then restore it as a column
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])

# Subject info columns
# After the reset_index above these are Subject_ID and Group_status
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2

# Get features from sparse kmeans
# Keep the feature columns whose entry in sparcl["ws"] is nonzero
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
# NOTE: the value of the next expression is not stored; it is only shown when run interactively
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1)

# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
# Start from a copy of the group labels; entries are overwritten per study below
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
    # Get study index
    # Study i covers subject IDs in [(i+1)*100000, (i+2)*100000)
    study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
    # Assign label
    # Even labels are controls, odd labels are patients: 0/1, 2/3, 4/5
    study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
    study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD

# Target variable
Target = ["Group_status"]
# The mask is computed on the columns from position 1 onward, so the index
# below is relative to the frame without its first column
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]

# Make 3 models and save them to use ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)

# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
# Last axis has 3 entries per (repetition, outer fold, model)
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)

# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = [] # per repetition: list of [train_index, test_index] pairs
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
    # A different, fixed seed per repetition gives a different but reproducible shuffle
    skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
    Outer_CV = []
    Outer_CV_test = []
    # Stratify on the combined study/group label rather than on the group label alone
    for train_index, test_index in skf.split(EEG_features_df,study_group_status):
        Outer_CV.append([train_index,test_index])
        Outer_CV_test.append(test_index)
    Rep_Outer_CV.append(Outer_CV)
    Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened, giving one entry (array of test indices) per outer fold
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# Every fold is turned into one string so that folds can be compared as hashable values
# The indices are joined WITH a separator: without it different folds can produce the
# same string (e.g. [1, 23] and [12, 3] both give "123") and wrongly look like duplicates
Rep_Outer_CV_test_flat_str = [",".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]

def allUnique(x):
    """Return True when no element of the iterable x occurs more than once."""
    encountered = set()
    for element in x:
        # Stop at the first repeated element
        if element in encountered:
            return False
        encountered.add(element)
    return True
# Sanity check: no two outer test folds (over all repetitions) may be identical
assert allUnique(Rep_Outer_CV_test_flat_str)

# Timestamp of the moment the run starts, formatted for printing
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", time.localtime())
print(c_time1)

for rep in range(n_repetitions):
    # The outer fold CV has already been saved as lists
    Outer_CV = Rep_Outer_CV[rep]
    # Pre-allocate memory
    model_par0 = []
    final_models0 = []
    final_features0 = []
    final_y_preds0 = []
    Outer_counter = 0
    for train_index, test_index in Outer_CV:
        test_df = EEG_features_df.iloc[test_index]
        train_df = EEG_features_df.iloc[train_index]
        
        # Training data will be standardized
        standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])

        train_df_standard = train_df.copy()
        train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
        # Test data will also be standardized but using mean and std from training data
        test_df_standard = test_df.copy()
        test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
        
        # Get the training data
        X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_train = train_df_standard[Target]
        # Get test data
        X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_test = test_df_standard[Target]
        
        # Prepare initial filtering of feature types to alleviate imbalance in number of features
        # Use a variable that is optimized using inner CV
        n_feat_mRMR1 = [30,40,50,60]
        # Max features from mRMR for each eye status
        max_mRMR_features = n_feat_mRMR1[-1]
        
        # mRMR on the all sparse kmeans selected features
        filter_feat_selector = mRMR_feature_select(X_train.to_numpy(),y_train.to_numpy(),
                                               num_features_to_select=max_mRMR_features,
                                               K_MAX=1000,n_jobs=-1,verbose=False)
        
        # Part 2 with second mRMR and classifiers in loop
        k_fold = 10
        skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
        Inner_CV = []
        for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
            Inner_CV.append([train_index2,test_index2])
        
        # SVM with L2-norm (it is by default squared)
        # Prepare hyper parameters
        exponent = np.linspace(-3,1,9)
        exponent = np.round(exponent,5) # sometimes linspace are not so exact
        C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
        kernels = ["linear"] # error when using rbf and RFECV
        # rbf also overfit easier, whereas linear empirically works better in high D data
        
        # Sequential Feature Selection for Logistic Regression
        # In-built model selection CV
        # The L2 is the inverse of C for LogReg
        exponent = np.linspace(-3,1,9)
        exponent = np.round(exponent,5) # sometimes linspace are not so exact
        C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
        
        # Random forest classifier
        # Prepare hyper parameters
        trees = np.array([10, 100, 500, 1000])
        depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
        
        # Prepare arrays for validation errors
        val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels)))
        val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels),max_mRMR_features))
        val_feat_svm.fill(np.nan)
        
        val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
        val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
        val_feat_logreg.fill(np.nan)
        
        val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
        val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
        val_feat_rf.fill(np.nan)
        val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
        val_feat_import_rf.fill(np.nan)
        
        for n1 in range(len(n_feat_mRMR1)):
            # Get selected features from mRMR
            filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]]
            
            X_train_mRMR = X_train[filter_features].copy()
    
            # SVM with recursive feature elimination
            # Stratified CV to find best regularization strength and number of features
            # CV is built-in RFECV
            min_features_to_select = 1
            # Using normal for loop as RFECV has inbuilt multiprocessing
            for C in range(len(C_parameter_SVM)):
                for K in range(len(kernels)):
                    # Define the model
                    svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
                    rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
                                  cv=Inner_CV,
                                  min_features_to_select=min_features_to_select)
                    rfecv.fit(X_train_mRMR,y_train.to_numpy().ravel())
                    # Get CV score
                    err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
                    # Save results for hyperparameter optimization
                    val_err_svm[n1,C,K] = err
                    # Save the features chosen by RFE based on index from pre mRMR
                    rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
                    val_feat_svm[n1,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
                    
                    # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
            
            # Logistic regression with sequential forward selection
            # In-built CV
            k_features = np.arange(1,n_feat_mRMR1[n1]+1,1) # try up to all features
            inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
            for C in range(len(C_parameter_LogReg)):
                LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
                                            max_iter = 50000)
                sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
                          forward = True, scoring = "balanced_accuracy",
                          verbose = 0, floating = False,
                          cv = Inner_CV, n_jobs = -1)
            
                sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel())
                # Save CV scores for each SFS step
                for feat in range(len(k_features)):
                    inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
                
                # Find the best number of features
                # I am rounding to make it more smooth and disregard small improvements
                K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
                if len(K) > 1:
                    K = K[np.where(K == np.min(K))[0]]
                err = inner_cv_scores[C,K]
                # Save validation error and features
                val_err_logreg[n1,C] = err
                # Save the features chosen by SFS based on index from pre mRMR
                sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
                val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
                
                # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
            
            # Random forest: evaluate every (number of trees, max depth) pair on each inner fold
            n_tree_opts, n_depth_opts = len(trees), len(depth)
            model_test_err = np.zeros((k_fold, n_tree_opts, n_depth_opts))
            RF_feat_import = np.zeros((k_fold, n_tree_opts, n_depth_opts, X_train_mRMR.shape[1]))
            # Flattened training labels, shared by all folds
            y_train_flat = y_train.to_numpy().ravel()

            counter = 0
            for cv in range(k_fold):
                # Row positions of the fitting and validation part of this inner fold
                fold = Inner_CV[cv]
                fit_rows = fold[0]
                val_rows = fold[1]
                X_fit = X_train_mRMR.iloc[fit_rows].to_numpy()
                X_val = X_train_mRMR.iloc[val_rows].to_numpy()
                y_fit = y_train_flat[fit_rows]
                y_val = y_train_flat[val_rows]

                for t, n_trees in enumerate(trees):
                    for d, max_tree_depth in enumerate(depth):
                        RF = RandomForestClassifier(n_estimators=n_trees, max_features="sqrt",
                                                    max_depth=max_tree_depth,
                                                    n_jobs=-1, random_state=None)
                        RF.fit(X_fit, y_fit)
                        # Balanced accuracy on the validation part of the fold
                        model_test_err[cv,t,d] = balanced_accuracy_score(y_val, RF.predict(X_val))
                        # Feature importances of this fit
                        RF_feat_import[cv,t,d] = RF.feature_importances_
                counter += 1
                # print("Finished RF run {} out of {}".format(counter, k_fold))
            # Average the validation scores over the CV folds
            model_test_err_mean = np.mean(model_test_err, axis=0)
            val_err_rf[n1,:,:] = model_test_err_mean
            # Average the feature importances over the CV folds
            RF_feat_import = np.mean(RF_feat_import, axis=0)
            val_feat_import_rf[n1,:,:,0:n_feat_mRMR1[n1]] = RF_feat_import
            # Save the features used by the RF as column positions in X_train (pre mRMR);
            # the same positions are stored for every (trees, depth) pair
            rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
            val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx

            progress_pct = (n1+1)*1/len(n_feat_mRMR1)*100
            print("Finished {}% of total run".format(progress_pct))
        
        # Choose the optimal parameters
        ### SVM
        # Grid positions (mRMR size, C, kernel) whose inner-CV score equals the
        # maximum. Despite the "err" naming, the maximum is selected, i.e. higher
        # is better.
        tie_n1, tie_C, tie_K = np.where(val_err_svm==np.max(val_err_svm))

        if len(tie_C) > 1:
            print("There are multiple SVM runs with the same validation error")
            print("Choosing run with fewest features to alleviate overfitting")
            # Count the stored (non-NaN) feature indices of every tied run
            rfe_feat_len = []
            for n1_temp, C_temp, K_temp in zip(tie_n1, tie_C, tie_K):
                temp_feats = val_feat_svm[n1_temp,C_temp,K_temp]
                rfe_feat_len.append(len(temp_feats[~np.isnan(temp_feats)]))
            rfe_feat_len = np.array(rfe_feat_len)

            rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
            if len(rfe_feat_min) > 1:
                print("Multiple SVM runs with same number of fewest features")
                print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                # NOTE(review): this compares grid indices, so it assumes
                # C_parameter_SVM is sorted ascending - confirm.
                # argmin returns the first minimum, which settles any remaining tie.
                best = rfe_feat_min[np.argmin(tie_C[rfe_feat_min])]
            else:
                best = rfe_feat_min[0]
        else:
            best = 0
        # Pick single elements explicitly: calling int() on an array with
        # ndim > 0 is deprecated since NumPy 1.25.
        n1, C, K = int(tie_n1[best]), int(tie_C[best]), int(tie_K[best])

        mRMR_chosen1 = n_feat_mRMR1[n1]
        C_chosen = C_parameter_SVM[C]
        K_chosen = kernels[K]

        # Save the best validation score
        val_error = val_err_svm[n1, C, K]

        # Get the chosen feature subset; NaN entries are dropped and the rest
        # are used as column positions in X_train
        stored_feats = val_feat_svm[n1,C,K]
        chosen_feats = stored_feats[~np.isnan(stored_feats)].astype(int)
        rfe_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(rfe_features)
        x_train_mRMR_rfe = X_train[rfe_features]

        # Fit on all training data
        model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
        model = model.fit(x_train_mRMR_rfe,y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_rfe)
        # Balanced accuracy on the training data (stored under the "error" name)
        train_error = balanced_accuracy_score(y_train, model_train_y)

        # Get prediction of test data
        y_pred = model.predict(X_test[rfe_features])
        # Balanced accuracy on the held-out test data
        test_error = balanced_accuracy_score(y_test, y_pred)

        # Save or prepare to save (model slot 0 = SVM)
        accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
        SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat,K_chosen]
        SVM_model = model
        SVM_y_pred = [[model_train_y],[y_pred]]
        
        ### LogReg with SFS
        # Grid positions (mRMR size, C) whose inner-CV score equals the maximum.
        # Despite the "err" naming, the maximum is selected, i.e. higher is better.
        tie_n1, tie_C = np.where(val_err_logreg==np.max(val_err_logreg))
        if len(tie_C) > 1:
            print("There are multiple LogReg runs with the same validation error")
            print("Choosing run with fewest features to alleviate overfitting")
            # Count the stored (non-NaN) feature indices of every tied run
            sfs_feat_len = []
            for n1_temp, C_temp in zip(tie_n1, tie_C):
                temp_feats = val_feat_logreg[n1_temp,C_temp]
                sfs_feat_len.append(len(temp_feats[~np.isnan(temp_feats)]))
            sfs_feat_len = np.array(sfs_feat_len)

            sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
            if len(sfs_feat_min) > 1:
                print("Multiple LogReg runs with same number of fewest features")
                print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                # NOTE(review): this compares grid indices, so it assumes
                # C_parameter_LogReg is sorted ascending - confirm.
                # argmin returns the first minimum, which settles any remaining tie.
                best = sfs_feat_min[np.argmin(tie_C[sfs_feat_min])]
            else:
                best = sfs_feat_min[0]
        else:
            best = 0
        # Pick single elements explicitly: calling int() on an array with
        # ndim > 0 is deprecated since NumPy 1.25.
        n1, C = int(tie_n1[best]), int(tie_C[best])

        mRMR_chosen1 = n_feat_mRMR1[n1]
        C_chosen = C_parameter_LogReg[C]

        # Save the best validation score
        val_error = val_err_logreg[n1, C]

        # Get the chosen feature subset; NaN entries are dropped and the rest
        # are used as column positions in X_train
        stored_feats = val_feat_logreg[n1,C]
        chosen_feats = stored_feats[~np.isnan(stored_feats)].astype(int)
        sfs_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(sfs_features)
        x_train_mRMR_sfs = X_train[sfs_features]

        # Fit on all training data
        model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
        model = model.fit(x_train_mRMR_sfs,y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_sfs)
        # Balanced accuracy on the training data (stored under the "error" name)
        train_error = balanced_accuracy_score(y_train, model_train_y)

        # Get prediction of test data
        y_pred = model.predict(X_test[sfs_features])
        # Balanced accuracy on the held-out test data
        test_error = balanced_accuracy_score(y_test, y_pred)

        # Save or prepare to save (model slot 1 = LogReg)
        accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
        LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
        LogReg_model = model
        LogReg_y_pred = [[model_train_y],[y_pred]]
        
        ### RF
        # Grid positions (mRMR size, trees, depth) whose inner-CV score equals the
        # maximum. Despite the "err" naming, the maximum is selected, i.e. higher
        # is better.
        tie_n1, tie_t, tie_d = np.where(val_err_rf==np.max(val_err_rf))

        if len(tie_d) > 1:
            print("There are multiple RF runs with the same validation error")
            print("Choosing run with lowest depth to alleviate overfitting")
            # NOTE(review): ties are broken on grid indices, so this assumes
            # depth and trees are sorted ascending - confirm.
            d_min = np.where(tie_d==np.min(tie_d))[0]

            if len(d_min) > 1:
                print("Multiple RF runs with same number number of depths")
                print("Choosing run with lowest trees to alleviate overfitting")
                # argmin just takes the first minimum. If several runs share the
                # same parameters and validation score they are most likely the same.
                best = d_min[np.argmin(tie_t[d_min])]
            else:
                best = d_min[0]
        else:
            best = 0
        # Pick single elements explicitly: calling int() on an array with
        # ndim > 0 is deprecated since NumPy 1.25.
        n1, t, d = int(tie_n1[best]), int(tie_t[best]), int(tie_d[best])

        mRMR_chosen1 = n_feat_mRMR1[n1]
        t_chosen = trees[t]
        d_chosen = depth[d]

        # Save the best validation score
        val_error = val_err_rf[n1, t, d]

        # Get the chosen features; NaN entries are dropped and the rest are used
        # as column positions in X_train
        stored_feats = val_feat_rf[n1,t,d]
        chosen_feats = stored_feats[~np.isnan(stored_feats)].astype(int)
        rf_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(rf_features)
        x_train_mRMR_rf = X_train[rf_features]

        # Fold-averaged feature importances stored for the chosen grid position
        stored_importances = val_feat_import_rf[n1,t,d]
        rf_feat_importances = stored_importances[~np.isnan(stored_importances)]

        # Fit on all training data
        model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
                                       n_jobs=-1, random_state=None)
        model = model.fit(x_train_mRMR_rf,y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_rf)
        # Balanced accuracy on the training data (stored under the "error" name)
        train_error = balanced_accuracy_score(y_train, model_train_y)

        # Get prediction of test data
        y_pred = model.predict(X_test[rf_features])
        # Balanced accuracy on the held-out test data
        test_error = balanced_accuracy_score(y_test, y_pred)

        # Save or prepare to save (model slot 2 = RF)
        accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
        RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
        RF_model = model
        RF_y_pred = [[model_train_y],[y_pred]]
        
        # Save the rest
        model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
        final_models0.append([SVM_model,LogReg_model,RF_model])
        final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
        final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
        # Move counter
        Outer_counter += 1
        print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
    # Save results from all outer folds
    model_par.append(model_par0)
    final_models.append(final_models0)
    final_features.append(final_features0)
    final_y_preds.append(final_y_preds0)
    # Save results to file
    Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
    # Run with variable feat in first and second mRMR
    with open(Model_savepath+"Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle:
        pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
    
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time",c_time2)
    
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))


# %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2
# Using each feature (of the sparse Kmeans EC chosen ones)
# For Subtype 1
# 10 repetitions of 10 fold outer and 20 fold inner two-layer CV
# Prediction of subtype 1
# Load the feature table and the list of feature names
# (coh and plv were removed and Hurst was renamed in v4)
EEG_features_df, EEG_features_name_list = load_features_df()

# Load the stored sparse k-means result
sparcl_path = Feature_savepath+"All_feats_sparse_kmeans.pkl"
with open(sparcl_path, "rb") as file:
    sparcl = pickle.load(file)

# Add group status as the second column: CTRL = 0, PTSD = 1
is_case_row = np.array([subject in cases for subject in EEG_features_df["Subject_ID"]])
Group_status = np.array([0]*EEG_features_df.shape[0])
Group_status[is_case_row] = 1
EEG_features_df.insert(1, "Group_status", Group_status)

# Only keep PTSD subtype 1 by dropping subtype 2.
# The cluster labels in sparcl["cs"] are applied to the PTSD subjects only.
PTSD_idx = np.array([subject in cases for subject in Subject_id])
CTRL_idx = np.array([subject not in cases for subject in Subject_id])
PTSD_subject_ids = np.array(Subject_id)[PTSD_idx]
cluster_labels = sparcl["cs"]
subtype1 = PTSD_subject_ids[cluster_labels==0]
subtype2 = PTSD_subject_ids[cluster_labels==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID").drop(subtype2).reset_index()
# Check it was dropped correctly: the remaining PTSD rows must be exactly subtype 1
remaining_cases = EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"]
assert all(subtype1 == remaining_cases)

# The first two columns hold subject info rather than EEG features
n_discrete_cols = 2
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)

# Features with a non-zero weight in the sparse k-means result
nonzero_idx = sparcl["ws"].nonzero()
feature_columns = EEG_features_df.drop(columns=Subject_info_cols).columns
sparcl_features = feature_columns[nonzero_idx]
# Share of "Eyes Open" features among them (less than 3% are EO);
# interactive sanity check, the value is not stored
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features)
# Only use eyes closed (2483 features)
is_eyes_closed = sparcl_features.str.contains("Eyes Closed")
sparcl_features = sparcl_features[is_eyes_closed]
# Rebuild the table as subject info columns followed by the kept features
info_part = EEG_features_df[Subject_info_cols]
feature_part = EEG_features_df[sparcl_features]
EEG_features_df = pd.concat([info_part,feature_part],axis=1)

# Stratification label for the train/test split that encodes both study and
# group: for study i (0-based) CTRL gets 2*i and PTSD gets 2*i+1
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
subject_ids = EEG_features_df["Subject_ID"]
group_labels = EEG_features_df["Group_status"]
for i in range(n_studies):
    # Study i covers the subject IDs in [(i+1)*100000, (i+2)*100000)
    id_floor = (i+1)*100000
    study_idx = (subject_ids>=id_floor)&(subject_ids<id_floor+100000)
    # Group value 0 = CTRL, 1 = PTSD
    for group_value in (0, 1):
        study_group_status[(study_idx)&(group_labels==group_value)] = 2*i+group_value

# Target variable: mask and positions are relative to all columns except the first
Target = ["Group_status"]
Target_col = EEG_features_df.columns[1:].isin(Target)
Target_col_idx = np.where(Target_col)[0]

# Three models are trained and saved so an ensemble can be used in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)

# The classification is repeated to check the result is not due to a lucky seed
n_repetitions = 10
k_out = 10
# One entry per repetition, outer fold and model; the last axis has 3 slots
# (presumably train/validation/test score - confirm against the code that fills it)
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par, final_models, final_features, final_y_preds = [], [], [], []
np.random.seed(42)

# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
    # A different seed per repetition gives different shuffles;
    # using 10% equals around 21 test subjects
    skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123)
    # The generator is converted to lists so the splits can be reused
    Outer_CV = []
    Outer_CV_test = []
    for train_index, test_index in skf.split(EEG_features_df,study_group_status):
        Outer_CV.append([train_index,test_index])
        Outer_CV_test.append(test_index)
    Rep_Outer_CV.append(Outer_CV)
    Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened to one entry per (repetition, fold)
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# Every test fold is converted to one string so duplicates are easy to find.
# The indices are joined with a separator: without it different folds could
# give the same string (e.g. [1, 23] and [12, 3] would both become "123").
Rep_Outer_CV_test_flat_str = [",".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]

def allUnique(x):
    """Return True when no element of the iterable x occurs more than once.

    Elements must be hashable. Iteration stops at the first repeated element.
    """
    observed = set()
    for element in x:
        if element in observed:
            return False
        observed.add(element)
    return True
# Every outer test fold, across all repetitions, has to be different
assert allUnique(Rep_Outer_CV_test_flat_str)

# Remember and show the time at which the run starts
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", time.localtime())
print(c_time1)

def Each_feat_10rep_10x10CV(rep):
    # The outer fold CV has already been saved as lists
    Outer_CV = Rep_Outer_CV[rep]
    # Pre-allocate memory
    Ind_feat_clf0 = []
    accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
    Outer_counter = 0
    for train_index, test_index in Outer_CV:
        test_df = EEG_features_df.iloc[test_index]
        train_df = EEG_features_df.iloc[train_index]
        
        # Training data will be standardized
        standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])

        train_df_standard = train_df.copy()
        train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
        # Test data will also be standardized but using mean and std from training data
        test_df_standard = test_df.copy()
        test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
        
        # Get the training data