# -*- coding: utf-8 -*-
"""
Updated Oct 18 2022
@author: Qianliang Li (glia@dtu.dk)
This script contains the code for the machine learning analysis
It should be run after Preprocessing.py and FeatureEstimation.py
"""
# Set working directory
import os
wkdir = "/home/glia/EEG/Final_scripts/"
os.chdir(wkdir)
# Load all libraries from the Preamble
from Preamble import *
# Load the questionnaire data
final_qdf = pd.read_csv("final_qdf.csv", sep=",", na_values=' ')
# Define cases as >= 44 total PCL
# Type: numpy array with subject id
cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44])
n_groups = 2
Groups = ["CTRL", "PTSD"]
# Make function to load EEG features
def load_features_df():
# Load all features
power_df = pd.read_pickle(Feature_savepath+"Power_df.pkl")
fTBR_data_df = pd.read_pickle(Feature_savepath+"fTBR_df.pkl")
asymmetry_df = pd.read_pickle(Feature_savepath+"asymmetry_df.pkl")
PAF_data_df = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_df.pkl")
PAF_data_df_global = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_global_df.pkl")
OOF_data_df = pd.read_pickle(Feature_savepath+"OOF_data_FOOOF_df.pkl")
con_data_df = pd.read_pickle(Feature_savepath+"con_data_source_drop_interpol_df.pkl")
pec_data_df = pd.read_pickle(Feature_savepath+"pec_data_drop_interpol_ch_df.pkl")
microstate_transition_data_df = pd.read_pickle(Feature_savepath+"microstate_transition_data_df.pkl")
microstate_time_df = pd.read_pickle(Feature_savepath+"microstate_time_df.pkl")
microstate_entropy_df = pd.read_pickle(Feature_savepath+"microstate_entropy_df.pkl")
GC_data_df = pd.read_pickle(Feature_savepath+"GC_data_source_drop_interpol_df.pkl")
H_data_df = pd.read_pickle(Feature_savepath+"H_data_df.pkl")
H_data_df_global = pd.read_pickle(Feature_savepath+"H_data_global_df.pkl")
# List of features
EEG_features_name_list = [['Power'],
['Frontal Theta Beta Ratio',
'Asymmetry'],
['Peak Alpha Frequency',
'Global Peak Alpha Frequency'],
["1/f exponent"],
['imcoh'],
['wpli'],
['Power Envelope Correlation'],
['Microstate Transition',
'Microstate Ratio Time',
'Microstate Entropy'],
["Granger Causality"],
['DFA Exponent',
'Global DFA Exponent']]
# Arrange them to fit one 2D dataframe
# Make function to add measurement column for indexing
def add_measurement_column(df, measurement = "Text"):
dummy_variable = [measurement]*df.shape[0]
df.insert(1, "Measurement", dummy_variable)
return df
# Make function to convert column tuple to string
def convertTupleHeader(header):
header = list(header)
str = "_".join(header)
return str
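# Illustrative example (hypothetical column tuple from the asymmetry pivot):
# convertTupleHeader(("Asymmetry", "Eyes Closed", "alpha", "Frontal"))
# -> "Asymmetry_Eyes Closed_alpha_Frontal"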
# Prepare overall dataframe
EEG_features_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"])
# Add power spectral densities
power_df = add_measurement_column(power_df, "Power")
temp_df = power_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Quant_status", "Eye_status",
"Freq_band", "Channel"], dropna=False,
values="PSD").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add frontal theta/beta ratio
# fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio")
temp_df = fTBR_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="TBR").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add power asymmetry
asymmetry_df = add_measurement_column(asymmetry_df, "Asymmetry")
temp_df = asymmetry_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band", "ROI"], dropna=False,
values="Asymmetry_score").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add peak alpha frequency
PAF_data_df = add_measurement_column(PAF_data_df, "Peak Alpha Frequency")
temp_df = PAF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("Peak Alpha Frequency",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("Peak Alpha Frequency",ee)] = temp.to_numpy()
# If there are still NaNs, the values are interpolated across channels and conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
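# A minimal sketch of the transpose-fillna trick used above (illustrative values):
# df = pd.DataFrame({"ch1": [8.0, np.nan], "ch2": [10.0, 11.0]})
# df.T.fillna(df.mean(axis=1)).T # fills row-wise: the NaN in ch1 becomes 11.0, that subject's row mean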
# Add 1/f exponent
OOF_data_df = add_measurement_column(OOF_data_df, "1/f exponent")
temp_df = OOF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("1/f exponent",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("1/f exponent",ee)] = temp.to_numpy()
# If there are still NaNs, the values are interpolated across channels and conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add global peak alpha frequency
#PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists
temp_df = PAF_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated across eye conditions
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add connectivity measurements
#con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists
temp_df = con_data_df.pivot_table(index="Subject_ID",columns=["Con_measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
# Drop coh and plv, which are more susceptible to volume conduction
temp_df = temp_df.drop("coh",axis=1)
temp_df = temp_df.drop("plv",axis=1)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add orthogonalized power envelope correlations
pec_data_df = add_measurement_column(pec_data_df, "Power Envelope Correlation")
temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate transition probabilities
microstate_transition_data_df = add_measurement_column(microstate_transition_data_df, "Microstate Transition")
temp_df = microstate_transition_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Transition"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate time covered
microstate_time_df = add_measurement_column(microstate_time_df, "Microstate Ratio Time")
# Convert microstate to str before using it as column
microstate_time_df = microstate_time_df.astype({"Microstate": str})
temp_df = microstate_time_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Microstate"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate entropy
#microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists
microstate_entropy_df["Measurement"] = ["Microstate Entropy"]*microstate_entropy_df.shape[0]
temp_df = microstate_entropy_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Granger causality (fixed so it is always chx --> chy)
GC_data_df = add_measurement_column(GC_data_df, "Granger Causality")
temp_df = GC_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
# Check if any GC was not calculated properly
assert temp_df.shape[1] == (31*31-31)*5*2 # expected number of features with 31 sources
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add DFA exponent
H_data_df = add_measurement_column(H_data_df, "DFA Exponent")
temp_df = H_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Global Hurst exponent
H_data_df_global = H_data_df_global.drop("Measurement",axis=1)
H_data_df_global = add_measurement_column(H_data_df_global, "Global DFA Exponent") # re-added after dropping the old column
temp_df = H_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
return EEG_features_df, EEG_features_name_list
# %% ML prediction of PTSD status using all features
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
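# Sketch of the two-layer CV structure used below:
# for each of the 10 repetitions:
#     outer 10-fold split -> outer-train / outer-test
#         inner 10-fold split on outer-train -> feature selection + hyperparameter tuning
#     refit the best model on the full outer-train set and evaluate once on outer-test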
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into the train/test sets I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
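# e.g. allUnique(["01", "12", "01"]) -> False; the idiom works because set.add() returns None,
# so "i in seen or seen.add(i)" is truthy only when i was already seen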
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
for rep in range(n_repetitions):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
model_par0 = []
final_models0 = []
final_features0 = []
final_y_preds0 = []
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
n_feat_mRMR2 = [30,40,50,60]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
max_mRMR_features2 = n_feat_mRMR2[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
mRMR_features = []
feat_counter = []
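# mRMR (minimum Redundancy Maximum Relevance) greedily selects features with high relevance
# to the target and low redundancy with already selected features; in the mutual-information
# difference variant the score is I(f; y) - mean over s in S of I(f; s)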
# Perform mRMR for each feature type
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # flatten the list
for e in range(n_eye_status):
ee = eye_status[e]
temp_ee = "{}_{}".format(temp_feat,ee)
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Index for eye status
temp_X_train = temp_X_train.loc[:,temp_X_train.columns.str.contains(ee)]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# Do not use mRMR if there are fewer than max_mRMR_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=-1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# Save features selected
mRMR_features.append(list(filter_features))
# Check that no features were lost while filtering out wrongly chosen ones
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
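# Equivalent, more direct construction (assuming numpy): C_parameter_SVM = np.logspace(-3, 1, 9)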
kernels = ["linear"]
# rbf overfits more easily, whereas linear empirically works better on high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# For LogReg the L2 regularization strength is the inverse of C
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels),max_mRMR_features2))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg),max_mRMR_features2))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_import_rf.fill(np.nan)
for n1 in range(len(n_feat_mRMR1)):
# Get the features from first mRMR
mRMR_features1 = [item for sublist in mRMR_features for item in sublist[0:n_feat_mRMR1[n1]]]
# print("{} features are left after first mRMR filtering".format(len(mRMR_features1)))
X_train_mRMR = X_train[mRMR_features1].copy()
# Second round of mRMR on the reduced list
filter_feat_selector = mRMR_feature_select(X_train_mRMR.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features2,
K_MAX=1000,n_jobs=-1,verbose=False)
for n2 in range(len(n_feat_mRMR2)):
# Get selected features from mRMR2
filter_features = X_train_mRMR.columns[filter_feat_selector[0:n_feat_mRMR2[n2]]]
X_train_mRMR2 = X_train_mRMR[filter_features].copy()
# SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select = 1
# Using a normal for loop as RFECV has in-built multiprocessing
for C in range(len(C_parameter_SVM)):
for K in range(len(kernels)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
# Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(X_train_mRMR2,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
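# Note: grid_scores_ assumes an older scikit-learn; it was deprecated in 1.0 and removed in 1.2,
# where rfecv.cv_results_["mean_test_score"] is the replacement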
# Save results for hyperparameter optimization
val_err_svm[n1,n2,C,K] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
val_feat_svm[n1,n2,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
# In-built CV
k_features = np.arange(1,n_feat_mRMR2[n2]+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = -1)
sfs = sfs.fit(X_train_mRMR2, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# Rounding to smooth the scores and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,n2,C] = err
# Save the features chosen by SFS based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,n2,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR2.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = X_train_mRMR2.iloc[train_index_rf]; X_test2 = X_train_mRMR2.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=-1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2, RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,n2,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,n2,:,:,0:n_feat_mRMR2[n2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
val_feat_rf[n1,n2,:,:,0:len(rf_feat_idx)] = rf_feat_idx
print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100))
# Choose the optimal parameters
### SVM
n1, n2, C, K = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
K_temp = int(K[i2])
temp_feats = val_feat_svm[n1_temp,n2_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,n2_temp,C_temp,K_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, n2, C, K = [int(n1[rfe_feat_min][C_min]), int(n2[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])]
else:
n1, n2, C, K = [int(n1[int(rfe_feat_min)]), int(n2[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])]
else:
n1, n2, C, K = [int(n1), int(n2), int(C), int(K)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_SVM[C]
K_chosen = kernels[K]
# Save the best validation error
val_error = val_err_svm[n1, n2, C, K]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,n2,C,K][~np.isnan(val_feat_svm[n1,n2,C,K])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat,K_chosen]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, n2, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,n2_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,n2_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, n2, C = [int(n1[sfs_feat_min][C_min]), int(n2[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, n2, C = [int(n1[int(sfs_feat_min)]), int(n2[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, n2, C = [int(n1), int(n2), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_LogReg[C]
# Save the best validation error
val_error = val_err_logreg[n1, n2, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,n2,C][~np.isnan(val_feat_logreg[n1,n2,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, n2, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
print("Multiple RF runs with same number number of depths")
print("Choosing run with lowest trees to alleviate overfitting")
t_min = np.argmin(t[d_min]) # argmin just takes the first
# If there are multiple with the same parameters and validation error, it is most likely the same run
n1, n2, t, d = [int(n1[d_min][t_min]), int(n2[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, n2, t, d = [int(n1[int(d_min)]), int(n2[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, n2, t, d = [int(n1), int(n2), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, n2, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,n2,t,d][~np.isnan(val_feat_rf[n1,n2,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,n2,t,d][~np.isnan(val_feat_import_rf[n1,n2,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=-1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,mRMR_chosen2,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
final_models0.append([SVM_model,LogReg_model,RF_model])
final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
# Move counter
Outer_counter += 1
print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
# Save results from all outer folds
model_par.append(model_par0)
final_models.append(final_models0)
final_features.append(final_features0)
final_y_preds.append(final_y_preds0)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
# Run with a variable number of features in the first and second mRMR
with open(Model_savepath+"Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl", "wb") as filehandle:
pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# %% ML prediction of PTSD status using each feature separately
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
# With concurrent processes for faster computation
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into the train/test sets I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
# accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3))
Ind_feat_clf = [0]*n_repetitions
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
def Each_feat_10rep_10x10CV(rep):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
Ind_feat_clf0 = []
accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# Also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
feat_counter = []
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00 = []
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # flatten the list
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))] # "_" suffix added to match the check above
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# If there are no features selected, then skip the rest of the loop
if temp_X_train.shape[1] == 0:
continue
# Do not use mRMR if there are fewer than max_mRMR_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
# rbf kernel overfits more easily, whereas linear empirically works better on high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# For LogReg the L2 regularization strength is the inverse of C
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # linspace is sometimes not exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
min_features_to_select = 1
for n1 in range(len(n_feat_mRMR1)):
mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
# Use the selected filter features
temp_X_train_mRMR = temp_X_train[mRMR_features1]
# SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for C in range(len(C_parameter_SVM)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
# Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
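# grid_scores_ again assumes scikit-learn < 1.2 (see the note in the previous section)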
# Save results for hyperparameter optimization
val_err_svm[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
# In-built CV
k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
k_features = np.arange(1,k_max+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = 1)
sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# Rounding to smooth the scores and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by SFS based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2,RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
# Print progress
current_progress = (n1+1)/(len(n_feat_mRMR1))*100
print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
# Choose the optimal parameters
### SVM
n1, C = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min])]
else:
n1, C = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
# Save the best validation error
val_error = val_err_svm[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
# Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
print("Multiple RF runs with same number number of depths")
print("Choosing run with lowest trees to alleviate overfitting")
t_min = np.argmin(t[d_min]) # argmin just takes the first
# If there are multiple with the same parameters and validation error, it is most likely the same run
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
final_models00 = [SVM_model,LogReg_model,RF_model]
final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
# Save the results for the specific feature
res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
Ind_feat_clf00.append(res)
print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
# Check that no features were lost while filtering out wrongly chosen ones
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Save results over outer folds
Ind_feat_clf0.append(Ind_feat_clf00)
print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
# Move counter
Outer_counter += 1
out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
return rep, out
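# rep is returned alongside the results so each repetition can be stored at its own index,
# even though ProcessPoolExecutor.map already yields results in submission order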
Ind_feat_clf = [0]*n_repetitions
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments
Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
# Save results to file and overwrite when new results arrive
with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle:
pickle.dump(Ind_feat_clf, filehandle)
# %% Sparse clustering of all EEG features to look for subtypes
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only use PTSD patient group
EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"]==1,:]
Subject_info_cols = ["Subject_ID","Group_status"]
# Standardize the values
X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1))
standardizer = preprocessing.StandardScaler().fit(X)
X = standardizer.transform(X)
# Use gridsearch and permutations to estimate gap statistic and use it to
# determine number of clusters and sparsity s
# I will use 100 permutations and test 2 to 6 clusters, as in Zhang et al. (2020)
max_clusters = 6
n_sparsity_feat = 20
perm_res = []
Timestamps = []
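# The gap statistic compares within-cluster dispersion W_k against its expectation under a
# permutation null: Gap(k) = E*[log(W_k)] - log(W_k) (Tibshirani et al., 2001); larger is better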
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
Timestamps.append(c_time1)
for k in range(1,max_clusters):
# Cannot permute with 1 cluster
n_clusters = k+1
perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True,
nvals=n_sparsity_feat, nperms=100)
perm_res.append(perm)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
Timestamps.append(c_time2)
# Save the results
with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file:
pickle.dump(perm_res, file)
# # Load
# with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file:
# perm_res = pickle.load(file)
# Convert results to array
perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4))
for i in range(len(perm_res)):
_, gaps, sdgaps, wbounds, _ = perm_res[i].values()
for i2 in range(n_sparsity_feat):
perm_res_arr[n_sparsity_feat*i+i2,0] = i+2 # number of clusters k
perm_res_arr[n_sparsity_feat*i+i2,1] = gaps[i2] # gap statistic
perm_res_arr[n_sparsity_feat*i+i2,2] = sdgaps[i2] # gap statistic std
perm_res_arr[n_sparsity_feat*i+i2,3] = wbounds[i2] # sparsity feature s
# For each sparsity s, determine the best k using the one-standard-error criterion,
# meaning that for a fixed s the number of clusters is chosen as the smallest k
# that fulfills Gap(k) >= Gap(k+1) - std(k+1)
def one_standard_deviation_search(gaps, std):
best_gaps = np.argmax(gaps)
current_gaps = gaps[best_gaps]
current_std = std[best_gaps]
current_gaps_idx = best_gaps
while (gaps[current_gaps_idx-1] >= current_gaps - current_std):
if current_gaps_idx == 0:
break
else:
current_gaps_idx -= 1
current_gaps = gaps[current_gaps_idx]
current_std = std[current_gaps_idx]
out = current_gaps, current_std, current_gaps_idx
return out
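# Illustrative example (hypothetical values):
# one_standard_deviation_search([0.5, 0.95, 1.0, 0.9], [0.1, 0.1, 0.1, 0.1])
# walks back from the argmax at index 2 to index 1 (0.95 >= 1.0 - 0.1) and returns (0.95, 0.1, 1)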
best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:,3])
plt.figure(figsize=(12,12))
for i2 in range(n_sparsity_feat):
current_s = all_s[i2]
gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1]
std = perm_res_arr[perm_res_arr[:,3] == current_s,2]
_, _, idx = one_standard_deviation_search(gaps, std)
# Save to array
best_ks[i2,0] = current_s
best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx]
# Plot gap
plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"),
gaps, yerr=std, capsize=5, label = np.round(current_s,3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")
best_k = int(scipy.stats.mode(best_ks[:,1])[0])
# Determine s using fixed k as lowest s within 1 std of max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1])
best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx]
one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std
best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]])
# Perform clustering with k clusters
sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0]
# # Save the results
# with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file:
# pickle.dump(sparcl, file)
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
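# sparcl is a dict whose relevant entries here are "ws" (feature weights; zero weight means
# the feature was not selected) and "cs" (cluster assignments), both used below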
# %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2
# Using the sparse kmeans selected features, eyes closed only, as they were primarily chosen
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df()
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
# Use concatenated features
EEG_features_name_list = [['Power'],
['Frontal Theta Beta Ratio',
'Asymmetry'],
['Peak Alpha Frequency',
'Global Peak Alpha Frequency'],
["1/f exponent"],
['imcoh'],
['wpli'],
['Power Envelope Correlation'],
['Microstate Transition',
'Microstate Ratio Time',
'Microstate Entropy'],
["Granger Causality"],
['DFA Exponent',
'Global DFA Exponent']]
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only keep PTSD subtype 1 by dropping subtype 2
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1)
# To ensure proper stratification into train/test sets, I stratify using both group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
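# Resulting encoding (Subject_IDs 1xxxxx, 2xxxxx, 3xxxxx for the three studies):
# study 1: CTRL=0, PTSD=1 | study 2: CTRL=2, PTSD=3 | study 3: CTRL=4, PTSD=5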
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]
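# Helper to verify all test splits are unique; relies on set.add returning None,
# so "i in seen or seen.add(i)" is only True when i was already seen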
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
for rep in range(n_repetitions):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
model_par0 = []
final_models0 = []
final_features0 = []
final_y_preds0 = []
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [30,40,50,60]
    # Max number of features to request from mRMR
max_mRMR_features = n_feat_mRMR1[-1]
# mRMR on the all sparse kmeans selected features
filter_feat_selector = mRMR_feature_select(X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=-1,verbose=False)
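        # mRMR_feature_select (from the Preamble) is assumed to return column indices
        # ranked by minimum-redundancy-maximum-relevance, so slicing the first n keeps the top n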
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# SVM with L2-norm (it is by default squared)
        # Prepare hyperparameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
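        # The C grid spans 1e-3 to 10 in half-decade steps: [0.001, ~0.00316, 0.01, ..., 10]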
        kernels = ["linear"] # rbf raises an error with RFECV, as RFE needs the coef_ of a linear kernel
        # rbf also overfits more easily, whereas linear empirically works better in high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
        # Prepare hyperparameters
trees = np.array([10, 100, 500, 1000])
        depth = np.linspace(1,2,2).astype(int) # using more depth leads to a lot of overfitting; cast to int as sklearn expects an integer max_depth
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),len(kernels),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
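        # The val_* arrays are indexed [mRMR size, hyperparameter(s)]; the feature-index
        # slots are NaN-padded up to max_mRMR_features so shorter subsets also fit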
for n1 in range(len(n_feat_mRMR1)):
# Get selected features from mRMR
filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]]
X_train_mRMR = X_train[filter_features].copy()
            # SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select = 1
# Using normal for loop as RFECV has inbuilt multiprocessing
for C in range(len(C_parameter_SVM)):
for K in range(len(kernels)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,C,K] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
val_feat_svm[n1,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
            # In-built CV
k_features = np.arange(1,n_feat_mRMR1[n1]+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = -1)
sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
                # Rounding to 2 decimals to smooth the score curve and disregard marginal improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = X_train_mRMR.iloc[train_index_rf]; X_test2 = X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=-1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2, RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:n_feat_mRMR1[n1]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100))
# Choose the optimal parameters
### SVM
n1, C, K = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
K_temp = int(K[i2])
temp_feats = val_feat_svm[n1_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,C_temp,K_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C, K = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])]
else:
n1, C, K = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])]
else:
n1, C, K = [int(n1), int(C), int(K)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
K_chosen = kernels[K]
        # Save the best validation error
val_error = val_err_svm[n1, C, K]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C,K][~np.isnan(val_feat_svm[n1,C,K])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat,K_chosen]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
        # Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
                print("Multiple RF runs with the same depth")
                print("Choosing run with fewest trees to alleviate overfitting")
                t_min = np.argmin(t[d_min]) # argmin takes the first occurrence
                # Runs with identical parameters and validation error are most likely duplicates
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=-1, random_state=None)
model = model.fit(x_train_mRMR_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
final_models0.append([SVM_model,LogReg_model,RF_model])
final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
# Move counter
Outer_counter += 1
print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
# Save results from all outer folds
model_par.append(model_par0)
final_models.append(final_models0)
final_features.append(final_features0)
final_y_preds.append(final_y_preds0)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
# Run with variable feat in first and second mRMR
with open(Model_savepath+"Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle:
pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2
# Using each feature type (of the sparse kmeans eyes-closed chosen ones)
# For Subtype 1
# 10 repetitions of two-layer CV with 10-fold outer and 10-fold inner splits
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df() # removed coh and plv and renamed Hurst in v4
with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "rb") as file:
sparcl = pickle.load(file)
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Only keep PTSD subtype 1 by dropping subtype 2
PTSD_idx = np.array([i in cases for i in Subject_id])
CTRL_idx = np.array([i not in cases for i in Subject_id])
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open"))/len(sparcl_features) # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols],EEG_features_df[sparcl_features]],axis=1)
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
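# Wrap one repetition in a function so the repetitions can be run in separate processes;
# it returns (rep, results) so outputs can be stored by repetition index (see the executor below)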
def Each_feat_10rep_10x10CV(rep):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
Ind_feat_clf0 = []
accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
        # Max number of features to request from mRMR
max_mRMR_features = n_feat_mRMR1[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
feat_counter = []
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00 = []
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
            other_feats = [item for sublist in other_feats for item in sublist] # flatten the list of lists
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
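            # The trailing "_" restricts matches to the exact feature type, e.g. "Power_"
            # does not match columns of "Power Envelope Correlation"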
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# If there are no features selected, then skip the rest of the loop
if temp_X_train.shape[1] == 0:
continue
            # Skip mRMR if there are no more than max_mRMR_features features to begin with
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# SVM with L2-norm (it is by default squared)
            # Prepare hyperparameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
            # the rbf kernel overfits more easily, whereas linear empirically works better in high-dimensional data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
            # Prepare hyperparameters
trees = np.array([10, 100, 500, 1000])
            depth = np.linspace(1,2,2).astype(int) # using more depth leads to a lot of overfitting; cast to int as sklearn expects an integer max_depth
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
min_features_to_select = 1
for n1 in range(len(n_feat_mRMR1)):
mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
# Use the selected filter features
temp_X_train_mRMR = temp_X_train[mRMR_features1]
                # SVM with recursive feature elimination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for C in range(len(C_parameter_SVM)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
                # In-built CV
k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
k_features = np.arange(1,k_max+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = 1)
sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
                    # Rounding to 2 decimals to smooth the score curve and disregard marginal improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),temp_X_train_mRMR.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = temp_X_train_mRMR.iloc[train_index_rf]; X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2,RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,:,:,0:RF_feat_import.shape[2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
val_feat_rf[n1,:,:,0:len(rf_feat_idx)] = rf_feat_idx
# Print progress
current_progress = (n1+1)/(len(n_feat_mRMR1))*100
print("Finished {}% of inner fold optimization for feat: {}".format(current_progress,temp_feat[0]))
# Choose the optimal parameters
### SVM
n1, C = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_svm[n1_temp,C_temp][~np.isnan(val_feat_svm[n1_temp,C_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, C = [int(n1[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min])]
else:
n1, C = [int(n1[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_SVM[C]
# Save the best validation error
val_error = val_err_svm[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,C][~np.isnan(val_feat_svm[n1,C])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, C = [int(n1[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, C = [int(n1[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, C = [int(n1), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
C_chosen = C_parameter_LogReg[C]
            # Save the best validation error
val_error = val_err_logreg[n1, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,C][~np.isnan(val_feat_logreg[n1,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
                    print("Multiple RF runs with the same depth")
                    print("Choosing run with fewest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min]) # argmin takes the first occurrence
                    # Runs with identical parameters and validation error are most likely duplicates
n1, t, d = [int(n1[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, t, d = [int(n1[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, t, d = [int(n1), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation error
val_error = val_err_rf[n1, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,t,d][~np.isnan(val_feat_rf[n1,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,t,d][~np.isnan(val_feat_import_rf[n1,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr0[Outer_counter,fset,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par00 = [SVM_model_par,LogReg_model_par,RF_model_par]
final_models00 = [SVM_model,LogReg_model,RF_model]
final_features00 = [rfe_features, sfs_features, [rf_features,rf_feat_importances]]
final_y_preds00 = [SVM_y_pred,LogReg_y_pred,RF_y_pred]
data_splits00 = [X_train,y_train,X_test,y_test,standardizer.mean_,standardizer.scale_]
# Save the results for the specific feature
res = [temp_feat, model_par00, final_models00, final_features00, final_y_preds00, data_splits00]
Ind_feat_clf00.append(res)
print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter+1,k_out,rep+1,temp_feat))
        # Check that no features were lost during the wrongly-chosen-feature filtering
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Save results over outer folds
Ind_feat_clf0.append(Ind_feat_clf00)
print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter+1,k_out,rep+1))
# Move counter
Outer_counter += 1
out = [accuracy_arr0, Ind_feat_clf0] # [Rep][Outer CV][Feature][Variable]
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
return rep, out
Ind_feat_clf = [0]*n_repetitions
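# Run the repetitions in parallel, one process per repetition; executor.map yields
# the (rep, out) tuples, and results are stored by the returned rep index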
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)): # Function and arguments
Ind_feat_clf[rep] = result # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]
# Save results to file and overwrite when new results arrive
with open(Model_savepath+"Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "wb") as filehandle:
pickle.dump(Ind_feat_clf, filehandle)
# %% For subtype 2
# The two previous sections are rerun, but for subtype 2
# The only modification is the indexing of the subjects
# Only keep PTSD subtype 2 by dropping subtype 1
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"]==1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype1)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype2 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1,"Subject_ID"])