Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
R
Resting-state EEG analysis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Locked files
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Terraform modules
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
s200431
Resting-state EEG analysis
Commits
f8f5f366
Commit
f8f5f366
authored
Oct 24, 2022
by
glia
Browse files
Options
Downloads
Patches
Plain Diff
Machine Learning
parent
c66f2b2b
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
MachineLearning.py
+2423
-0
2423 additions, 0 deletions
MachineLearning.py
with
2423 additions
and
0 deletions
MachineLearning.py
0 → 100644
+
2423
−
0
View file @
f8f5f366
# -*- coding: utf-8 -*-
"""
Updated Oct 18 2022
@author: Qianliang Li (glia@dtu.dk)
This script contains the code for the machine learning analysis
It should be run after Preprocessing.py and FeatureEstimation.py
"""
# Set working directory
import
os
wkdir
=
"
/home/glia/EEG/Final_scripts/
"
os
.
chdir
(
wkdir
)
# Load all libraries from the Preamble
from
Preamble
import
*
# Load the questionnaire data
final_qdf
=
pd
.
read_csv
(
"
final_qdf.csv
"
,
sep
=
"
,
"
,
na_values
=
'
'
)
# Define cases as >= 44 total PCL
# Type: numpy array with subject id
cases
=
np
.
array
(
final_qdf
[
"
Subject_ID
"
][
final_qdf
[
"
PCL_total
"
]
>=
44
])
n_groups
=
2
Groups
=
[
"
CTRL
"
,
"
PTSD
"
]
# Make function to load EEG features
def
load_features_df
():
# Load all features
power_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
Power_df.pkl
"
)
fTBR_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
fTBR_df.pkl
"
)
asymmetry_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
asymmetry_df.pkl
"
)
PAF_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
PAF_data_FOOOF_df.pkl
"
)
PAF_data_df_global
=
pd
.
read_pickle
(
Feature_savepath
+
"
PAF_data_FOOOF_global_df.pkl
"
)
OOF_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
OOF_data_FOOOF_df.pkl
"
)
con_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
con_data_source_drop_interpol_df.pkl
"
)
pec_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
pec_data_drop_interpol_ch_df.pkl
"
)
microstate_transition_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
microstate_transition_data_df.pkl
"
)
microstate_time_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
microstate_time_df.pkl
"
)
microstate_entropy_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
microstate_entropy_df.pkl
"
)
GC_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
GC_data_source_drop_interpol_df.pkl
"
)
H_data_df
=
pd
.
read_pickle
(
Feature_savepath
+
"
H_data_df.pkl
"
)
H_data_df_global
=
pd
.
read_pickle
(
Feature_savepath
+
"
H_data_global_df.pkl
"
)
# List of features
EEG_features_name_list
=
[[
'
Power
'
],
[
'
Frontal Theta Beta Ratio
'
,
'
Asymmetry
'
],
[
'
Peak Alpha Frequency
'
,
'
Global Peak Alpha Frequency
'
],
[
"
1/f exponent
"
],
[
'
imcoh
'
],
[
'
wpli
'
],
[
'
Power Envelope Correlation
'
],
[
'
Microstate Transition
'
,
'
Microstate Ratio Time
'
,
'
Microstate Entropy
'
],
[
"
Granger Causality
"
],
[
'
DFA Exponent
'
,
'
Global DFA Exponent
'
]]
# Arrange them to fit one 2D dataframe
# Make function to add measurement column for indexing
def
add_measurement_column
(
df
,
measurement
=
"
Text
"
):
dummy_variable
=
[
measurement
]
*
df
.
shape
[
0
]
df
.
insert
(
1
,
"
Measurement
"
,
dummy_variable
)
return
df
# Make function to convert column tuple to string
def
convertTupleHeader
(
header
):
header
=
list
(
header
)
str
=
"
_
"
.
join
(
header
)
return
str
# Prepare overall dataframe
EEG_features_df
=
pd
.
DataFrame
(
Subject_id
,
columns
=
[
"
Subject_ID
"
])
# Add power spectral densities
power_df
=
add_measurement_column
(
power_df
,
"
Power
"
)
temp_df
=
power_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Quant_status
"
,
"
Eye_status
"
,
"
Freq_band
"
,
"
Channel
"
],
dropna
=
False
,
values
=
"
PSD
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add frontal theta/beta ratio
# fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio")
temp_df
=
fTBR_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
],
dropna
=
False
,
values
=
"
TBR
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add power asymmetry
asymmetry_df
=
add_measurement_column
(
asymmetry_df
,
"
Asymmetry
"
)
temp_df
=
asymmetry_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Freq_band
"
,
"
ROI
"
],
dropna
=
False
,
values
=
"
Asymmetry_score
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add peak alpha frequency
PAF_data_df
=
add_measurement_column
(
PAF_data_df
,
"
Peak Alpha Frequency
"
)
temp_df
=
PAF_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Channel
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
# NaN values are interpolated with means across channels for each condition
eye_status
=
list
(
final_epochs
[
0
].
event_id
.
keys
())
for
ee
in
eye_status
:
temp
=
temp_df
.
loc
[:,(
"
Peak Alpha Frequency
"
,
ee
)]
# get data
temp
=
temp
.
T
.
fillna
(
temp
.
mean
(
axis
=
1
)).
T
# fill (transpose used because fillna is axis=0)
temp_df
.
loc
[:,(
"
Peak Alpha Frequency
"
,
ee
)]
=
temp
.
to_numpy
()
# If there are still NaN the values are interpolated across channels and condition
temp_df
=
temp_df
.
T
.
fillna
(
temp_df
.
mean
(
axis
=
1
)).
T
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add 1/f exponent
OOF_data_df
=
add_measurement_column
(
OOF_data_df
,
"
1/f exponent
"
)
temp_df
=
OOF_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Channel
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
# NaN values are interpolated with means across channels for each condition
eye_status
=
list
(
final_epochs
[
0
].
event_id
.
keys
())
for
ee
in
eye_status
:
temp
=
temp_df
.
loc
[:,(
"
1/f exponent
"
,
ee
)]
# get data
temp
=
temp
.
T
.
fillna
(
temp
.
mean
(
axis
=
1
)).
T
# fill (transpose used because fillna is axis=0)
temp_df
.
loc
[:,(
"
1/f exponent
"
,
ee
)]
=
temp
.
to_numpy
()
# If there are still NaN the values are interpolated across channels and condition
temp_df
=
temp_df
.
T
.
fillna
(
temp_df
.
mean
(
axis
=
1
)).
T
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add global peak alpha frequency
#PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists
temp_df
=
PAF_data_df_global
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
# NaN values are interpolated across eye condition
temp_df
=
temp_df
.
T
.
fillna
(
temp_df
.
mean
(
axis
=
1
)).
T
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add connectivity measurements
#con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists
temp_df
=
con_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Con_measurement
"
,
"
Eye_status
"
,
"
chx
"
,
"
chy
"
,
"
Freq_band
"
],
dropna
=
True
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
# Drop coh and plv, which are more susceptible to volume conduction
temp_df
=
temp_df
.
drop
(
"
coh
"
,
axis
=
1
)
temp_df
=
temp_df
.
drop
(
"
plv
"
,
axis
=
1
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add orthogonalized power enveloped correlations
pec_data_df
=
add_measurement_column
(
pec_data_df
,
"
Power Envelope Correlation
"
)
temp_df
=
pec_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
chx
"
,
"
chy
"
,
"
Freq_band
"
],
dropna
=
True
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add microstate transition probabilities
microstate_transition_data_df
=
add_measurement_column
(
microstate_transition_data_df
,
"
Microstate Transition
"
)
temp_df
=
microstate_transition_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Transition
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add microstate time covered
microstate_time_df
=
add_measurement_column
(
microstate_time_df
,
"
Microstate Ratio Time
"
)
# Convert microstate to str before using it as column
microstate_time_df
=
microstate_time_df
.
astype
({
"
Microstate
"
:
str
})
temp_df
=
microstate_time_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Microstate
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add microstate entropy
#microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists
microstate_entropy_df
[
"
Measurement
"
]
=
[
"
Microstate Entropy
"
]
*
microstate_entropy_df
.
shape
[
0
]
temp_df
=
microstate_entropy_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add Granger Causality # fixed so it is always chx --> chy
GC_data_df
=
add_measurement_column
(
GC_data_df
,
"
Granger Causality
"
)
temp_df
=
GC_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
chx
"
,
"
chy
"
,
"
Freq_band
"
],
dropna
=
True
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
# Check if any GC was not calculated properly
assert
temp_df
.
shape
[
1
]
==
(
31
*
31
-
31
)
*
5
*
2
# expected number of features with 31 sources
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add DFA exponent
H_data_df
=
add_measurement_column
(
H_data_df
,
"
DFA Exponent
"
)
temp_df
=
H_data_df
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Channel
"
,
"
Freq_band
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
# Add Global Hurst exponent
H_data_df_global
=
H_data_df_global
.
drop
(
"
Measurement
"
,
axis
=
1
)
H_data_df_global
=
add_measurement_column
(
H_data_df_global
,
"
Global DFA Exponent
"
)
# already exists
temp_df
=
H_data_df_global
.
pivot_table
(
index
=
"
Subject_ID
"
,
columns
=
[
"
Measurement
"
,
"
Eye_status
"
,
"
Freq_band
"
],
dropna
=
False
,
values
=
"
Value
"
).
reset_index
(
drop
=
True
)
temp_df
.
columns
=
[
convertTupleHeader
(
temp_df
.
columns
[
i
])
for
i
in
range
(
len
(
temp_df
.
columns
))]
EEG_features_df
=
pd
.
concat
([
EEG_features_df
,
temp_df
],
axis
=
1
)
return
EEG_features_df
,
EEG_features_name_list
# %% ML predction of PTSD status using all features
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
EEG_features_df
,
EEG_features_name_list
=
load_features_df
()
# Add group status
Group_status
=
np
.
array
([
0
]
*
EEG_features_df
.
shape
[
0
])
# CTRL = 0
Group_status
[
np
.
array
([
i
in
cases
for
i
in
EEG_features_df
[
"
Subject_ID
"
]])]
=
1
# PTSD = 1
EEG_features_df
.
insert
(
1
,
"
Group_status
"
,
Group_status
)
# Subject info columns
Subject_info_cols
=
list
(
EEG_features_df
.
columns
[
0
:
2
])
n_subject_info_cols
=
len
(
Subject_info_cols
)
n_discrete_cols
=
2
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies
=
3
study_group_status
=
EEG_features_df
[
"
Group_status
"
].
copy
()
for
i
in
range
(
n_studies
):
# Get study index
study_idx
=
(
EEG_features_df
[
"
Subject_ID
"
]
>=
(
i
+
1
)
*
100000
)
&
(
EEG_features_df
[
"
Subject_ID
"
]
<
(
i
+
2
)
*
100000
)
# Assign label
study_group_status
[(
study_idx
)
&
(
EEG_features_df
[
"
Group_status
"
]
==
0
)]
=
2
*
i
# CTRL
study_group_status
[(
study_idx
)
&
(
EEG_features_df
[
"
Group_status
"
]
==
1
)]
=
2
*
i
+
1
# PTSD
# Target variable
Target
=
[
"
Group_status
"
]
Target_col
=
EEG_features_df
.
iloc
[:,
1
:].
columns
.
isin
(
Target
)
Target_col_idx
=
np
.
where
(
Target_col
==
True
)[
0
]
# Make 3 models and save them to use enseemble in the end
CLF_models
=
[
"
SVM
"
,
"
LogReg
"
,
"
RF
"
]
n_models
=
len
(
CLF_models
)
# Repeat the classification to see if I just got a lucky seed
n_repetitions
=
10
k_out
=
10
accuracy_arr
=
np
.
zeros
((
n_repetitions
,
k_out
,
n_models
,
3
))
model_par
=
[]
final_models
=
[]
final_features
=
[]
final_y_preds
=
[]
np
.
random
.
seed
(
42
)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV
=
[]
Rep_Outer_CV_test
=
[]
# a list with only the test indices
for
rep
in
range
(
n_repetitions
):
skf
=
StratifiedKFold
(
n_splits
=
k_out
,
shuffle
=
True
,
random_state
=
(
rep
+
1
)
*
123
)
# using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Outer_CV
=
[]
Outer_CV_test
=
[]
for
train_index
,
test_index
in
skf
.
split
(
EEG_features_df
,
study_group_status
):
Outer_CV
.
append
([
train_index
,
test_index
])
Outer_CV_test
.
append
(
test_index
)
Rep_Outer_CV
.
append
(
Outer_CV
)
Rep_Outer_CV_test
.
append
(
Outer_CV_test
)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat
=
[
item
for
sublist
in
Rep_Outer_CV_test
for
item
in
sublist
]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str
=
[
""
.
join
([
str
(
x
)
for
x
in
ele
])
for
ele
in
Rep_Outer_CV_test_flat
]
def
allUnique
(
x
):
seen
=
set
()
return
not
any
(
i
in
seen
or
seen
.
add
(
i
)
for
i
in
x
)
assert
allUnique
(
Rep_Outer_CV_test_flat_str
)
# Get current time
c_time1
=
time
.
localtime
()
c_time1
=
time
.
strftime
(
"
%a %d %b %Y %H:%M:%S
"
,
c_time1
)
print
(
c_time1
)
for
rep
in
range
(
n_repetitions
):
# The outer fold CV has already been saved as lists
Outer_CV
=
Rep_Outer_CV
[
rep
]
# Pre-allocate memory
model_par0
=
[]
final_models0
=
[]
final_features0
=
[]
final_y_preds0
=
[]
Outer_counter
=
0
for
train_index
,
test_index
in
Outer_CV
:
test_df
=
EEG_features_df
.
iloc
[
test_index
]
train_df
=
EEG_features_df
.
iloc
[
train_index
]
# Training data will be standardized
standardizer
=
preprocessing
.
StandardScaler
().
fit
(
train_df
.
iloc
[:,
n_discrete_cols
:])
train_df_standard
=
train_df
.
copy
()
train_df_standard
.
iloc
[:,
n_discrete_cols
:]
=
standardizer
.
transform
(
train_df_standard
.
iloc
[:,
n_discrete_cols
:])
# Test data will also be standardized but using mean and std from training data
test_df_standard
=
test_df
.
copy
()
test_df_standard
.
iloc
[:,
n_discrete_cols
:]
=
standardizer
.
transform
(
test_df_standard
.
iloc
[:,
n_discrete_cols
:])
# Get the training data
X_train
=
train_df_standard
.
copy
().
drop
(
Subject_info_cols
,
axis
=
1
)
y_train
=
train_df_standard
[
Target
]
# Get test data
X_test
=
test_df_standard
.
copy
().
drop
(
Subject_info_cols
,
axis
=
1
)
y_test
=
test_df_standard
[
Target
]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1
=
[
20
,
30
,
40
,
50
]
n_feat_mRMR2
=
[
30
,
40
,
50
,
60
]
# Max features from mRMR for each eye status
max_mRMR_features
=
n_feat_mRMR1
[
-
1
]
max_mRMR_features2
=
n_feat_mRMR2
[
-
1
]
eye_status
=
[
"
Eyes Closed
"
,
"
Eyes Open
"
]
n_eye_status
=
len
(
eye_status
)
mRMR_features
=
[]
feat_counter
=
[]
# Perform mRMR for each feature type
for
fset
in
range
(
len
(
EEG_features_name_list
)):
temp_feat
=
EEG_features_name_list
[
fset
]
other_feats
=
EEG_features_name_list
[:
fset
]
+
EEG_features_name_list
[
fset
+
1
:]
other_feats
=
[
item
for
sublist
in
other_feats
for
item
in
sublist
]
# make the list flatten
for
e
in
range
(
n_eye_status
):
ee
=
eye_status
[
e
]
temp_ee
=
"
{}_{}
"
.
format
(
temp_feat
,
ee
)
# Retrieve the dataset for each feature
col_idx
=
np
.
zeros
(
len
(
X_train
.
columns
),
dtype
=
bool
)
for
fsub
in
range
(
len
(
temp_feat
)):
temp_feat0
=
temp_feat
[
fsub
]
col_idx0
=
X_train
.
columns
.
str
.
contains
(
temp_feat0
+
"
_
"
)
col_idx
=
np
.
logical_or
(
col_idx
,
col_idx0
)
# append all trues
temp_X_train
=
X_train
.
loc
[:,
col_idx
]
# Check if any of the other features were wrongly chosen
for
fcheck
in
range
(
len
(
other_feats
)):
if
any
(
temp_X_train
.
columns
.
str
.
contains
(
other_feats
[
fcheck
]
+
"
_
"
)
==
True
):
temp_X_train
=
temp_X_train
.
loc
[:,
np
.
invert
(
temp_X_train
.
columns
.
str
.
contains
(
other_feats
[
fcheck
]
+
"
_
"
))]
if
temp_X_train
.
size
==
0
:
# if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train
=
X_train
.
loc
[:,
X_train
.
columns
.
str
.
contains
(
temp_feat0
+
"
_
"
)]
# Index for eye status
temp_X_train
=
temp_X_train
.
loc
[:,
temp_X_train
.
columns
.
str
.
contains
(
ee
)]
# Save number of original features fed into mRMR
feat_counter
.
append
([
temp_feat
,
temp_X_train
.
shape
[
1
]])
# Do not use mRMR if there are fewer than max_mRMR_features
if
temp_X_train
.
shape
[
1
]
<=
max_mRMR_features
:
filter_features
=
temp_X_train
.
columns
else
:
# mRMR
filter_feat_selector
=
mRMR_feature_select
(
temp_X_train
.
to_numpy
(),
y_train
.
to_numpy
(),
num_features_to_select
=
max_mRMR_features
,
K_MAX
=
1000
,
n_jobs
=-
1
,
verbose
=
False
)
# Get selected features
filter_features
=
temp_X_train
.
columns
[
filter_feat_selector
]
# Save features selected
mRMR_features
.
append
(
list
(
filter_features
))
# Check that no features were excluded when checking for wrongly chosen
total_features
=
0
for
feat_c
in
range
(
len
(
feat_counter
)):
total_features
+=
feat_counter
[
feat_c
][
1
]
assert
total_features
==
X_train
.
shape
[
1
]
# Part 2 with second mRMR and classifiers in loop
k_fold
=
10
skf2
=
StratifiedKFold
(
n_splits
=
k_fold
,
shuffle
=
True
,
random_state
=
(
rep
+
1
)
*
123
)
# using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Inner_CV
=
[]
for
train_index2
,
test_index2
in
skf2
.
split
(
train_df
,
study_group_status
.
iloc
[
train_index
]):
Inner_CV
.
append
([
train_index2
,
test_index2
])
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent
=
np
.
linspace
(
-
3
,
1
,
9
)
exponent
=
np
.
round
(
exponent
,
5
)
# sometimes linspace are not so exact
C_parameter_SVM
=
np
.
power
(
np
.
array
([
10
]
*
len
(
exponent
)),
exponent
)
kernels
=
[
"
linear
"
]
# rbf overfit easier, whereas linear empirically works better in high D data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent
=
np
.
linspace
(
-
3
,
1
,
9
)
exponent
=
np
.
round
(
exponent
,
5
)
# sometimes linspace are not so exact
C_parameter_LogReg
=
np
.
power
(
np
.
array
([
10
]
*
len
(
exponent
)),
exponent
)
# Random forest classifier
# Prepare hyper parameters
trees
=
np
.
array
([
10
,
100
,
500
,
1000
])
depth
=
np
.
linspace
(
1
,
2
,
2
)
# using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
C_parameter_SVM
),
len
(
kernels
)))
val_feat_svm
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
C_parameter_SVM
),
len
(
kernels
),
max_mRMR_features2
))
val_feat_svm
.
fill
(
np
.
nan
)
val_err_logreg
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
C_parameter_LogReg
)))
val_feat_logreg
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
C_parameter_LogReg
),
max_mRMR_features2
))
val_feat_logreg
.
fill
(
np
.
nan
)
val_err_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
trees
),
len
(
depth
)))
val_feat_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
trees
),
len
(
depth
),
max_mRMR_features2
))
val_feat_rf
.
fill
(
np
.
nan
)
val_feat_import_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
n_feat_mRMR2
),
len
(
trees
),
len
(
depth
),
max_mRMR_features2
))
val_feat_import_rf
.
fill
(
np
.
nan
)
for
n1
in
range
(
len
(
n_feat_mRMR1
)):
# Get the features from first mRMR
mRMR_features1
=
[
item
for
sublist
in
mRMR_features
for
item
in
sublist
[
0
:
n_feat_mRMR1
[
n1
]]]
# print("{} features are left after first mRMR filtering".format(len(mRMR_features1)))
X_train_mRMR
=
X_train
[
mRMR_features1
].
copy
()
# Second round of mRMR on the reduced list
filter_feat_selector
=
mRMR_feature_select
(
X_train_mRMR
.
to_numpy
(),
y_train
.
to_numpy
(),
num_features_to_select
=
max_mRMR_features2
,
K_MAX
=
1000
,
n_jobs
=-
1
,
verbose
=
False
)
for
n2
in
range
(
len
(
n_feat_mRMR2
)):
# Get selected features from mRMR2
filter_features
=
X_train_mRMR
.
columns
[
filter_feat_selector
[
0
:
n_feat_mRMR2
[
n2
]]]
X_train_mRMR2
=
X_train_mRMR
[
filter_features
].
copy
()
# SVM with recursive feature elemination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select
=
1
# Using normal for loop as RFECV has inbuilt multiprocessing
for
C
in
range
(
len
(
C_parameter_SVM
)):
for
K
in
range
(
len
(
kernels
)):
# Define the model
svc
=
SVC
(
C
=
C_parameter_SVM
[
C
],
kernel
=
kernels
[
K
],
tol
=
1e-3
,
cache_size
=
4000
)
# Perform recurive feature elimination with in-built CV
rfecv
=
RFECV
(
estimator
=
svc
,
n_jobs
=-
1
,
scoring
=
"
balanced_accuracy
"
,
cv
=
Inner_CV
,
min_features_to_select
=
min_features_to_select
)
rfecv
.
fit
(
X_train_mRMR2
,
y_train
.
to_numpy
().
ravel
())
# Get CV score
err
=
rfecv
.
grid_scores_
[
rfecv
.
n_features_
-
min_features_to_select
]
# Save results for hyperparameter optimization
val_err_svm
[
n1
,
n2
,
C
,
K
]
=
err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
filter_features
[
rfecv
.
support_
]))[
0
]
val_feat_svm
[
n1
,
n2
,
C
,
K
,
0
:
len
(
rfe_feat_idx
)]
=
rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
# In-bult CV
k_features
=
np
.
arange
(
1
,
n_feat_mRMR2
[
n2
]
+
1
,
1
)
# try up to all features
inner_cv_scores
=
np
.
zeros
((
len
(
C_parameter_LogReg
),
len
(
k_features
)))
for
C
in
range
(
len
(
C_parameter_LogReg
)):
LogReg
=
LogisticRegression
(
penalty
=
"
l2
"
,
C
=
C_parameter_LogReg
[
C
],
max_iter
=
50000
)
sfs
=
SFS
(
LogReg
,
k_features
=
(
k_features
[
0
],
k_features
[
-
1
]),
forward
=
True
,
scoring
=
"
balanced_accuracy
"
,
verbose
=
0
,
floating
=
False
,
cv
=
Inner_CV
,
n_jobs
=
-
1
)
sfs
=
sfs
.
fit
(
X_train_mRMR2
,
y_train
.
to_numpy
().
ravel
())
# Save CV scores for each SFS step
for
feat
in
range
(
len
(
k_features
)):
inner_cv_scores
[
C
,
feat
]
=
sfs
.
get_metric_dict
()[
k_features
[
feat
]][
"
avg_score
"
]
# Find the best number of features
# I am rounding to make it more smooth and disregard small improvements
K
=
np
.
where
(
np
.
round
(
inner_cv_scores
[
C
,:],
2
)
==
np
.
max
(
np
.
round
(
inner_cv_scores
[
C
,:],
2
)))[
0
]
if
len
(
K
)
>
1
:
K
=
K
[
np
.
where
(
K
==
np
.
min
(
K
))[
0
]]
err
=
inner_cv_scores
[
C
,
K
]
# Save validation error and features
val_err_logreg
[
n1
,
n2
,
C
]
=
err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
sfs
.
subsets_
[
int
(
K
)
+
1
][
"
feature_names
"
]))[
0
]
# K+1 since it starts with 1 feature
val_feat_logreg
[
n1
,
n2
,
C
,
0
:
len
(
sfs_feat_idx
)]
=
sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err
=
np
.
zeros
((
k_fold
,
len
(
trees
),
len
(
depth
)))
RF_feat_import
=
np
.
zeros
((
k_fold
,
len
(
trees
),
len
(
depth
),
X_train_mRMR2
.
shape
[
1
]))
counter
=
0
for
cv
in
range
(
k_fold
):
# Retrieve CV indices
train_index_rf
=
Inner_CV
[
cv
][
0
]
test_index_rf
=
Inner_CV
[
cv
][
1
]
# Retrieve datasets
X_train2
=
X_train_mRMR2
.
iloc
[
train_index_rf
];
X_test2
=
X_train_mRMR2
.
iloc
[
test_index_rf
]
y_train2
=
y_train
.
to_numpy
().
ravel
()[
train_index_rf
];
y_test2
=
y_train
.
to_numpy
().
ravel
()[
test_index_rf
]
for
t
in
range
(
len
(
trees
)):
for
d
in
range
(
len
(
depth
)):
RF
=
RandomForestClassifier
(
n_estimators
=
trees
[
t
],
max_features
=
"
sqrt
"
,
max_depth
=
depth
[
d
],
n_jobs
=-
1
,
random_state
=
None
)
RF
.
fit
(
X_train2
.
to_numpy
(),
y_train2
)
# Validation error
RF_y
=
RF
.
predict
(
X_test2
.
to_numpy
())
err
=
balanced_accuracy_score
(
y_test2
,
RF_y
)
# Save to array
model_test_err
[
cv
,
t
,
d
]
=
err
# Save feature importance array
RF_feat_import
[
cv
,
t
,
d
]
=
RF
.
feature_importances_
counter
+=
1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean
=
np
.
mean
(
model_test_err
,
axis
=
0
)
val_err_rf
[
n1
,
n2
,:,:]
=
model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import
=
np
.
mean
(
RF_feat_import
,
axis
=
0
)
val_feat_import_rf
[
n1
,
n2
,:,:,
0
:
n_feat_mRMR2
[
n2
]]
=
RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
filter_features
))[
0
]
val_feat_rf
[
n1
,
n2
,:,:,
0
:
len
(
rf_feat_idx
)]
=
rf_feat_idx
print
(
"
Finished {}% of total run
"
.
format
((
n1
+
1
)
*
1
/
len
(
n_feat_mRMR1
)
*
100
))
# Choose the optimal parameters
### SVM
n1
,
n2
,
C
,
K
=
np
.
where
(
val_err_svm
==
np
.
max
(
val_err_svm
))
if
len
(
C
)
>
1
:
print
(
"
There are multiple SVM runs with the same validation error
"
)
print
(
"
Choosing run with fewest features to alleviate overfitting
"
)
rfe_feat_len
=
[]
for
i2
in
range
(
len
(
C
)):
n1_temp
=
int
(
n1
[
i2
])
n2_temp
=
int
(
n2
[
i2
])
C_temp
=
int
(
C
[
i2
])
K_temp
=
int
(
K
[
i2
])
temp_feats
=
val_feat_svm
[
n1_temp
,
n2_temp
,
C_temp
,
K_temp
][
~
np
.
isnan
(
val_feat_svm
[
n1_temp
,
n2_temp
,
C_temp
,
K_temp
])].
astype
(
int
)
rfe_feat_len
.
append
(
len
(
temp_feats
))
rfe_feat_min
=
np
.
where
(
rfe_feat_len
==
np
.
min
(
rfe_feat_len
))[
0
]
if
len
(
rfe_feat_min
)
>
1
:
print
(
"
Multiple SVM runs with same number of fewest features
"
)
print
(
"
Choosing run with lowest C (highest regularization) to alleviate overfitting
"
)
C_min
=
np
.
argmin
(
C
[
rfe_feat_min
])
n1
,
n2
,
C
,
K
=
[
int
(
n1
[
rfe_feat_min
][
C_min
]),
int
(
n2
[
rfe_feat_min
][
C_min
]),
int
(
C
[
rfe_feat_min
][
C_min
]),
int
(
K
[
rfe_feat_min
][
C_min
])]
else
:
n1
,
n2
,
C
,
K
=
[
int
(
n1
[
int
(
rfe_feat_min
)]),
int
(
n2
[
int
(
rfe_feat_min
)]),
int
(
C
[
int
(
rfe_feat_min
)]),
int
(
K
[
int
(
rfe_feat_min
)])]
else
:
n1
,
n2
,
C
,
K
=
[
int
(
n1
),
int
(
n2
),
int
(
C
),
int
(
K
)]
mRMR_chosen1
=
n_feat_mRMR1
[
n1
]
mRMR_chosen2
=
n_feat_mRMR2
[
n2
]
C_chosen
=
C_parameter_SVM
[
C
]
K_chosen
=
kernels
[
K
]
# Save the best validation erro
val_error
=
val_err_svm
[
n1
,
n2
,
C
,
K
]
# Get the subsets chosen
chosen_feats
=
val_feat_svm
[
n1
,
n2
,
C
,
K
][
~
np
.
isnan
(
val_feat_svm
[
n1
,
n2
,
C
,
K
])].
astype
(
int
)
rfe_features
=
list
(
X_train
.
columns
[
chosen_feats
])
n_final_feat
=
len
(
rfe_features
)
x_train_mRMR2_rfe
=
X_train
[
rfe_features
]
# Fit on all training data
model
=
SVC
(
C
=
C_chosen
,
kernel
=
K_chosen
,
tol
=
1e-3
,
cache_size
=
4000
)
model
=
model
.
fit
(
x_train_mRMR2_rfe
,
y_train
.
to_numpy
().
ravel
())
# Get training output
model_train_y
=
model
.
predict
(
x_train_mRMR2_rfe
)
# Get training error
train_error
=
balanced_accuracy_score
(
y_train
,
model_train_y
)
# Get prediction of test data
y_pred
=
model
.
predict
(
X_test
[
rfe_features
])
# Use model to predict class on test data
test_error
=
balanced_accuracy_score
(
y_test
,
y_pred
)
# Save or prepare to save
accuracy_arr
[
rep
,
Outer_counter
,
0
,:]
=
[
train_error
,
val_error
,
test_error
]
SVM_model_par
=
[
mRMR_chosen1
,
mRMR_chosen2
,
C_chosen
,
n_final_feat
,
K_chosen
]
SVM_model
=
model
SVM_y_pred
=
[[
model_train_y
],[
y_pred
]]
### LogReg with SFS
n1
,
n2
,
C
=
np
.
where
(
val_err_logreg
==
np
.
max
(
val_err_logreg
))
if
len
(
C
)
>
1
:
print
(
"
There are multiple LogReg runs with the same validation error
"
)
print
(
"
Choosing run with fewest features to alleviate overfitting
"
)
sfs_feat_len
=
[]
for
i2
in
range
(
len
(
C
)):
n1_temp
=
int
(
n1
[
i2
])
n2_temp
=
int
(
n2
[
i2
])
C_temp
=
int
(
C
[
i2
])
temp_feats
=
val_feat_logreg
[
n1_temp
,
n2_temp
,
C_temp
][
~
np
.
isnan
(
val_feat_logreg
[
n1_temp
,
n2_temp
,
C_temp
])].
astype
(
int
)
sfs_feat_len
.
append
(
len
(
temp_feats
))
sfs_feat_min
=
np
.
where
(
sfs_feat_len
==
np
.
min
(
sfs_feat_len
))[
0
]
if
len
(
sfs_feat_min
)
>
1
:
print
(
"
Multiple LogReg runs with same number of fewest features
"
)
print
(
"
Choosing run with lowest C (highest regularization) to alleviate overfitting
"
)
C_min
=
np
.
argmin
(
C
[
sfs_feat_min
])
n1
,
n2
,
C
=
[
int
(
n1
[
sfs_feat_min
][
C_min
]),
int
(
n2
[
sfs_feat_min
][
C_min
]),
int
(
C
[
sfs_feat_min
][
C_min
])]
else
:
n1
,
n2
,
C
=
[
int
(
n1
[
int
(
sfs_feat_min
)]),
int
(
n2
[
int
(
sfs_feat_min
)]),
int
(
C
[
int
(
sfs_feat_min
)])]
else
:
n1
,
n2
,
C
=
[
int
(
n1
),
int
(
n2
),
int
(
C
)]
mRMR_chosen1
=
n_feat_mRMR1
[
n1
]
mRMR_chosen2
=
n_feat_mRMR2
[
n2
]
C_chosen
=
C_parameter_LogReg
[
C
]
# Save the best validation erro
val_error
=
val_err_logreg
[
n1
,
n2
,
C
]
# Get the subsets chosen
chosen_feats
=
val_feat_logreg
[
n1
,
n2
,
C
][
~
np
.
isnan
(
val_feat_logreg
[
n1
,
n2
,
C
])].
astype
(
int
)
sfs_features
=
list
(
X_train
.
columns
[
chosen_feats
])
n_final_feat
=
len
(
sfs_features
)
x_train_mRMR2_sfs
=
X_train
[
sfs_features
]
# Fit on all training data
model
=
LogisticRegression
(
penalty
=
"
l2
"
,
C
=
C_chosen
,
max_iter
=
50000
)
model
=
model
.
fit
(
x_train_mRMR2_sfs
,
y_train
.
to_numpy
().
ravel
())
# Get training output
model_train_y
=
model
.
predict
(
x_train_mRMR2_sfs
)
# Get training error
train_error
=
balanced_accuracy_score
(
y_train
,
model_train_y
)
# Get prediction of test data
y_pred
=
model
.
predict
(
X_test
[
sfs_features
])
# Use model to predict class on test data
test_error
=
balanced_accuracy_score
(
y_test
,
y_pred
)
# Save or prepare to save
accuracy_arr
[
rep
,
Outer_counter
,
1
,:]
=
[
train_error
,
val_error
,
test_error
]
LogReg_model_par
=
[
mRMR_chosen1
,
mRMR_chosen2
,
C_chosen
,
n_final_feat
]
LogReg_model
=
model
LogReg_y_pred
=
[[
model_train_y
],[
y_pred
]]
### RF
n1
,
n2
,
t
,
d
=
np
.
where
(
val_err_rf
==
np
.
max
(
val_err_rf
))
if
len
(
d
)
>
1
:
print
(
"
There are multiple RF runs with the same validation error
"
)
print
(
"
Choosing run with lowest depth to alleviate overfitting
"
)
d_min
=
np
.
where
(
d
==
np
.
min
(
d
))[
0
]
if
len
(
d_min
)
>
1
:
print
(
"
Multiple RF runs with same number number of depths
"
)
print
(
"
Choosing run with lowest trees to alleviate overfitting
"
)
t_min
=
np
.
argmin
(
t
[
d_min
])
# argmin just takes the first
# If there are multiple with same parameters and validation error it is most likely the same
n1
,
n2
,
t
,
d
=
[
int
(
n1
[
d_min
][
t_min
]),
int
(
n2
[
d_min
][
t_min
]),
int
(
t
[
d_min
][
t_min
]),
int
(
d
[
d_min
][
t_min
])]
else
:
n1
,
n2
,
t
,
d
=
[
int
(
n1
[
int
(
d_min
)]),
int
(
n2
[
int
(
d_min
)]),
int
(
t
[
int
(
d_min
)]),
int
(
d
[
int
(
d_min
)])]
else
:
n1
,
n2
,
t
,
d
=
[
int
(
n1
),
int
(
n2
),
int
(
t
),
int
(
d
)]
mRMR_chosen1
=
n_feat_mRMR1
[
n1
]
mRMR_chosen2
=
n_feat_mRMR2
[
n2
]
t_chosen
=
trees
[
t
]
d_chosen
=
depth
[
d
]
# Save the best validation erro
val_error
=
val_err_rf
[
n1
,
n2
,
t
,
d
]
# Get the chosen features and feature importances
chosen_feats
=
val_feat_rf
[
n1
,
n2
,
t
,
d
][
~
np
.
isnan
(
val_feat_rf
[
n1
,
n2
,
t
,
d
])].
astype
(
int
)
rf_features
=
list
(
X_train
.
columns
[
chosen_feats
])
n_final_feat
=
len
(
rf_features
)
x_train_mRMR2_rf
=
X_train
[
rf_features
]
rf_feat_importances
=
val_feat_import_rf
[
n1
,
n2
,
t
,
d
][
~
np
.
isnan
(
val_feat_import_rf
[
n1
,
n2
,
t
,
d
])]
# Fit on all training data
model
=
RandomForestClassifier
(
n_estimators
=
t_chosen
,
max_features
=
"
sqrt
"
,
max_depth
=
d_chosen
,
n_jobs
=-
1
,
random_state
=
None
)
model
=
model
.
fit
(
x_train_mRMR2_rf
,
y_train
.
to_numpy
().
ravel
())
# Get training output
model_train_y
=
model
.
predict
(
x_train_mRMR2_rf
)
# Get training error
train_error
=
balanced_accuracy_score
(
y_train
,
model_train_y
)
# Get prediction of test data
y_pred
=
model
.
predict
(
X_test
[
rf_features
])
# Use model to predict class on test data
test_error
=
balanced_accuracy_score
(
y_test
,
y_pred
)
# Save or prepare to save
accuracy_arr
[
rep
,
Outer_counter
,
2
,:]
=
[
train_error
,
val_error
,
test_error
]
RF_model_par
=
[
mRMR_chosen1
,
mRMR_chosen2
,
t_chosen
,
d_chosen
]
RF_model
=
model
RF_y_pred
=
[[
model_train_y
],[
y_pred
]]
# Save the rest
model_par0
.
append
([
SVM_model_par
,
LogReg_model_par
,
RF_model_par
])
final_models0
.
append
([
SVM_model
,
LogReg_model
,
RF_model
])
final_features0
.
append
([
rfe_features
,
sfs_features
,
[
rf_features
,
rf_feat_importances
]])
final_y_preds0
.
append
([
SVM_y_pred
,
LogReg_y_pred
,
RF_y_pred
])
# Move counter
Outer_counter
+=
1
print
(
"
Finished outer fold {} out of {} for rep: {}
"
.
format
(
Outer_counter
,
k_out
,
rep
+
1
))
# Save results from all outer folds
model_par
.
append
(
model_par0
)
final_models
.
append
(
final_models0
)
final_features
.
append
(
final_features0
)
final_y_preds
.
append
(
final_y_preds0
)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF
=
[
accuracy_arr
,
model_par
,
final_models
,
final_features
,
final_y_preds
]
# Run with variable feat in first and second mRMR
with
open
(
Model_savepath
+
"
Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl
"
,
"
wb
"
)
as
filehandle
:
pickle
.
dump
(
Rep_mRMR2_SVM_LogReg_RF
,
filehandle
)
# Get current time
c_time2
=
time
.
localtime
()
c_time2
=
time
.
strftime
(
"
%a %d %b %Y %H:%M:%S
"
,
c_time2
)
print
(
"
Started
"
,
c_time1
,
"
\n
Current Time
"
,
c_time2
)
# Print total progress
print
(
"
Finished outer fold repetition {} out of {}
"
.
format
(
rep
+
1
,
n_repetitions
))
# %% ML predction of PTSD status using each feature separately
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
# With concurrent processes for faster computation
EEG_features_df
,
EEG_features_name_list
=
load_features_df
()
# Add group status
Group_status
=
np
.
array
([
0
]
*
EEG_features_df
.
shape
[
0
])
# CTRL = 0
Group_status
[
np
.
array
([
i
in
cases
for
i
in
EEG_features_df
[
"
Subject_ID
"
]])]
=
1
# PTSD = 1
EEG_features_df
.
insert
(
1
,
"
Group_status
"
,
Group_status
)
# Subject info columns
Subject_info_cols
=
list
(
EEG_features_df
.
columns
[
0
:
2
])
n_subject_info_cols
=
len
(
Subject_info_cols
)
n_discrete_cols
=
2
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies
=
3
study_group_status
=
EEG_features_df
[
"
Group_status
"
].
copy
()
for
i
in
range
(
n_studies
):
# Get study index
study_idx
=
(
EEG_features_df
[
"
Subject_ID
"
]
>=
(
i
+
1
)
*
100000
)
&
(
EEG_features_df
[
"
Subject_ID
"
]
<
(
i
+
2
)
*
100000
)
# Assign label
study_group_status
[(
study_idx
)
&
(
EEG_features_df
[
"
Group_status
"
]
==
0
)]
=
2
*
i
# CTRL
study_group_status
[(
study_idx
)
&
(
EEG_features_df
[
"
Group_status
"
]
==
1
)]
=
2
*
i
+
1
# PTSD
# Target variable
Target
=
[
"
Group_status
"
]
Target_col
=
EEG_features_df
.
iloc
[:,
1
:].
columns
.
isin
(
Target
)
Target_col_idx
=
np
.
where
(
Target_col
==
True
)[
0
]
# Make 3 models and save them to use enseemble in the end
CLF_models
=
[
"
SVM
"
,
"
LogReg
"
,
"
RF
"
]
n_models
=
len
(
CLF_models
)
# Repeat the classification to see if I just got a lucky seed
n_repetitions
=
10
k_out
=
10
# accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3))
Ind_feat_clf
=
[
0
]
*
n_repetitions
np
.
random
.
seed
(
42
)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV
=
[]
Rep_Outer_CV_test
=
[]
# a list with only the test indices
for
rep
in
range
(
n_repetitions
):
skf
=
StratifiedKFold
(
n_splits
=
k_out
,
shuffle
=
True
,
random_state
=
(
rep
+
1
)
*
123
)
# using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Outer_CV
=
[]
Outer_CV_test
=
[]
for
train_index
,
test_index
in
skf
.
split
(
EEG_features_df
,
study_group_status
):
Outer_CV
.
append
([
train_index
,
test_index
])
Outer_CV_test
.
append
(
test_index
)
Rep_Outer_CV
.
append
(
Outer_CV
)
Rep_Outer_CV_test
.
append
(
Outer_CV_test
)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat
=
[
item
for
sublist
in
Rep_Outer_CV_test
for
item
in
sublist
]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str
=
[
""
.
join
([
str
(
x
)
for
x
in
ele
])
for
ele
in
Rep_Outer_CV_test_flat
]
def
allUnique
(
x
):
seen
=
set
()
return
not
any
(
i
in
seen
or
seen
.
add
(
i
)
for
i
in
x
)
assert
allUnique
(
Rep_Outer_CV_test_flat_str
)
# Get current time
c_time1
=
time
.
localtime
()
c_time1
=
time
.
strftime
(
"
%a %d %b %Y %H:%M:%S
"
,
c_time1
)
print
(
c_time1
)
def
Each_feat_10rep_10x10CV
(
rep
):
# The outer fold CV has already been saved as lists
Outer_CV
=
Rep_Outer_CV
[
rep
]
# Pre-allocate memory
Ind_feat_clf0
=
[]
accuracy_arr0
=
np
.
zeros
((
k_out
,
len
(
EEG_features_name_list
),
n_models
,
3
))
Outer_counter
=
0
for
train_index
,
test_index
in
Outer_CV
:
test_df
=
EEG_features_df
.
iloc
[
test_index
]
train_df
=
EEG_features_df
.
iloc
[
train_index
]
# Training data will be standardized
standardizer
=
preprocessing
.
StandardScaler
().
fit
(
train_df
.
iloc
[:,
n_discrete_cols
:])
train_df_standard
=
train_df
.
copy
()
train_df_standard
.
iloc
[:,
n_discrete_cols
:]
=
standardizer
.
transform
(
train_df_standard
.
iloc
[:,
n_discrete_cols
:])
# Test data will also be standardized but using mean and std from training data
test_df_standard
=
test_df
.
copy
()
test_df_standard
.
iloc
[:,
n_discrete_cols
:]
=
standardizer
.
transform
(
test_df_standard
.
iloc
[:,
n_discrete_cols
:])
# Get the training data
X_train
=
train_df_standard
.
copy
().
drop
(
Subject_info_cols
,
axis
=
1
)
y_train
=
train_df_standard
[
Target
]
# Get test data
X_test
=
test_df_standard
.
copy
().
drop
(
Subject_info_cols
,
axis
=
1
)
y_test
=
test_df_standard
[
Target
]
# Part 2 with second mRMR and classifiers in loop
k_fold
=
10
skf2
=
StratifiedKFold
(
n_splits
=
k_fold
,
shuffle
=
True
,
random_state
=
(
rep
+
1
)
*
123
)
# using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Inner_CV
=
[]
for
train_index2
,
test_index2
in
skf2
.
split
(
train_df
,
study_group_status
.
iloc
[
train_index
]):
Inner_CV
.
append
([
train_index2
,
test_index2
])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1
=
[
20
,
30
,
40
,
50
]
# Max features from mRMR for each eye status
max_mRMR_features
=
n_feat_mRMR1
[
-
1
]
eye_status
=
[
"
Eyes Closed
"
,
"
Eyes Open
"
]
n_eye_status
=
len
(
eye_status
)
feat_counter
=
[]
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00
=
[]
for
fset
in
range
(
len
(
EEG_features_name_list
)):
temp_feat
=
EEG_features_name_list
[
fset
]
other_feats
=
EEG_features_name_list
[:
fset
]
+
EEG_features_name_list
[
fset
+
1
:]
other_feats
=
[
item
for
sublist
in
other_feats
for
item
in
sublist
]
# make the list flatten
# Retrieve the dataset for each feature
col_idx
=
np
.
zeros
(
len
(
X_train
.
columns
),
dtype
=
bool
)
for
fsub
in
range
(
len
(
temp_feat
)):
temp_feat0
=
temp_feat
[
fsub
]
col_idx0
=
X_train
.
columns
.
str
.
contains
(
temp_feat0
+
"
_
"
)
col_idx
=
np
.
logical_or
(
col_idx
,
col_idx0
)
# append all trues
temp_X_train
=
X_train
.
loc
[:,
col_idx
]
# Check if any of the other features were wrongly chosen
for
fcheck
in
range
(
len
(
other_feats
)):
if
any
(
temp_X_train
.
columns
.
str
.
contains
(
other_feats
[
fcheck
]
+
"
_
"
)
==
True
):
temp_X_train
=
temp_X_train
.
loc
[:,
np
.
invert
(
temp_X_train
.
columns
.
str
.
contains
(
other_feats
[
fcheck
]))]
if
temp_X_train
.
size
==
0
:
# if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train
=
X_train
.
loc
[:,
X_train
.
columns
.
str
.
contains
(
temp_feat0
+
"
_
"
)]
# Save number of original features fed into mRMR
feat_counter
.
append
([
temp_feat
,
temp_X_train
.
shape
[
1
]])
# If there are no features selected, then skip the rest of the loop
if
temp_X_train
.
shape
[
1
]
==
0
:
continue
# Do not use mRMR if there are fewer than n_features
if
temp_X_train
.
shape
[
1
]
<=
max_mRMR_features
:
filter_features
=
temp_X_train
.
columns
else
:
# mRMR
filter_feat_selector
=
mRMR_feature_select
(
temp_X_train
.
to_numpy
(),
y_train
.
to_numpy
(),
num_features_to_select
=
max_mRMR_features
,
K_MAX
=
1000
,
n_jobs
=
1
,
verbose
=
False
)
# Get selected features
filter_features
=
temp_X_train
.
columns
[
filter_feat_selector
]
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent
=
np
.
linspace
(
-
3
,
1
,
9
)
exponent
=
np
.
round
(
exponent
,
5
)
# sometimes linspace are not so exact
C_parameter_SVM
=
np
.
power
(
np
.
array
([
10
]
*
len
(
exponent
)),
exponent
)
# rbf kernel overfit easier, whereas linear empirically works better in high D data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent
=
np
.
linspace
(
-
3
,
1
,
9
)
exponent
=
np
.
round
(
exponent
,
5
)
# sometimes linspace are not so exact
C_parameter_LogReg
=
np
.
power
(
np
.
array
([
10
]
*
len
(
exponent
)),
exponent
)
# Random forest classifier
# Prepare hyper parameters
trees
=
np
.
array
([
10
,
100
,
500
,
1000
])
depth
=
np
.
linspace
(
1
,
2
,
2
)
# using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
C_parameter_SVM
)))
val_feat_svm
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
C_parameter_SVM
),
max_mRMR_features
))
val_feat_svm
.
fill
(
np
.
nan
)
val_err_logreg
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
C_parameter_LogReg
)))
val_feat_logreg
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
C_parameter_LogReg
),
max_mRMR_features
))
val_feat_logreg
.
fill
(
np
.
nan
)
val_err_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
trees
),
len
(
depth
)))
val_feat_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
trees
),
len
(
depth
),
max_mRMR_features
))
val_feat_rf
.
fill
(
np
.
nan
)
val_feat_import_rf
=
np
.
zeros
((
len
(
n_feat_mRMR1
),
len
(
trees
),
len
(
depth
),
max_mRMR_features
))
val_feat_import_rf
.
fill
(
np
.
nan
)
min_features_to_select
=
1
for
n1
in
range
(
len
(
n_feat_mRMR1
)):
mRMR_features1
=
filter_features
[
0
:
n_feat_mRMR1
[
n1
]]
# Use the selected filter features
temp_X_train_mRMR
=
temp_X_train
[
mRMR_features1
]
# SVM with recursive feature elemination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for
C
in
range
(
len
(
C_parameter_SVM
)):
# Define the model
svc
=
SVC
(
C
=
C_parameter_SVM
[
C
],
kernel
=
"
linear
"
,
tol
=
1e-3
,
cache_size
=
4000
)
# Perform recurive feature elimination with in-built CV
rfecv
=
RFECV
(
estimator
=
svc
,
n_jobs
=
1
,
scoring
=
"
balanced_accuracy
"
,
cv
=
Inner_CV
,
min_features_to_select
=
min_features_to_select
)
rfecv
.
fit
(
temp_X_train_mRMR
,
y_train
.
to_numpy
().
ravel
())
# Get CV score
err
=
rfecv
.
grid_scores_
[
rfecv
.
n_features_
-
min_features_to_select
]
# Save results for hyperparameter optimization
val_err_svm
[
n1
,
C
]
=
err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
mRMR_features1
[
rfecv
.
support_
]))[
0
]
val_feat_svm
[
n1
,
C
,
0
:
len
(
rfe_feat_idx
)]
=
rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
# In-bult CV
k_max
=
np
.
min
([
temp_X_train_mRMR
.
shape
[
1
],
n_feat_mRMR1
[
n1
]])
k_features
=
np
.
arange
(
1
,
k_max
+
1
,
1
)
# try up to all features
inner_cv_scores
=
np
.
zeros
((
len
(
C_parameter_LogReg
),
len
(
k_features
)))
for
C
in
range
(
len
(
C_parameter_LogReg
)):
LogReg
=
LogisticRegression
(
penalty
=
"
l2
"
,
C
=
C_parameter_LogReg
[
C
],
max_iter
=
50000
)
sfs
=
SFS
(
LogReg
,
k_features
=
(
k_features
[
0
],
k_features
[
-
1
]),
forward
=
True
,
scoring
=
"
balanced_accuracy
"
,
verbose
=
0
,
floating
=
False
,
cv
=
Inner_CV
,
n_jobs
=
1
)
sfs
=
sfs
.
fit
(
temp_X_train_mRMR
,
y_train
.
to_numpy
().
ravel
())
# Save CV scores for each SFS step
for
feat
in
range
(
len
(
k_features
)):
inner_cv_scores
[
C
,
feat
]
=
sfs
.
get_metric_dict
()[
k_features
[
feat
]][
"
avg_score
"
]
# Find the best number of features
# I am rounding to make it more smooth and disregard small improvements
K
=
np
.
where
(
np
.
round
(
inner_cv_scores
[
C
,:],
2
)
==
np
.
max
(
np
.
round
(
inner_cv_scores
[
C
,:],
2
)))[
0
]
if
len
(
K
)
>
1
:
K
=
K
[
np
.
where
(
K
==
np
.
min
(
K
))[
0
]]
err
=
inner_cv_scores
[
C
,
K
]
# Save validation error and features
val_err_logreg
[
n1
,
C
]
=
err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
sfs
.
subsets_
[
int
(
K
)
+
1
][
"
feature_names
"
]))[
0
]
# K+1 since it starts with 1 feature
val_feat_logreg
[
n1
,
C
,
0
:
len
(
sfs_feat_idx
)]
=
sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err
=
np
.
zeros
((
k_fold
,
len
(
trees
),
len
(
depth
)))
RF_feat_import
=
np
.
zeros
((
k_fold
,
len
(
trees
),
len
(
depth
),
temp_X_train_mRMR
.
shape
[
1
]))
counter
=
0
for
cv
in
range
(
k_fold
):
# Retrieve CV indices
train_index_rf
=
Inner_CV
[
cv
][
0
]
test_index_rf
=
Inner_CV
[
cv
][
1
]
# Retrieve datasets
X_train2
=
temp_X_train_mRMR
.
iloc
[
train_index_rf
];
X_test2
=
temp_X_train_mRMR
.
iloc
[
test_index_rf
]
y_train2
=
y_train
.
to_numpy
().
ravel
()[
train_index_rf
];
y_test2
=
y_train
.
to_numpy
().
ravel
()[
test_index_rf
]
for
t
in
range
(
len
(
trees
)):
for
d
in
range
(
len
(
depth
)):
RF
=
RandomForestClassifier
(
n_estimators
=
trees
[
t
],
max_features
=
"
sqrt
"
,
max_depth
=
depth
[
d
],
n_jobs
=
1
,
random_state
=
None
)
RF
.
fit
(
X_train2
.
to_numpy
(),
y_train2
)
# Validation error
RF_y
=
RF
.
predict
(
X_test2
.
to_numpy
())
err
=
balanced_accuracy_score
(
y_test2
,
RF_y
)
# Save to array
model_test_err
[
cv
,
t
,
d
]
=
err
# Save feature importance array
RF_feat_import
[
cv
,
t
,
d
]
=
RF
.
feature_importances_
counter
+=
1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean
=
np
.
mean
(
model_test_err
,
axis
=
0
)
val_err_rf
[
n1
,:,:]
=
model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import
=
np
.
mean
(
RF_feat_import
,
axis
=
0
)
val_feat_import_rf
[
n1
,:,:,
0
:
RF_feat_import
.
shape
[
2
]]
=
RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx
=
np
.
where
(
X_train
.
columns
.
isin
(
mRMR_features1
))[
0
]
val_feat_rf
[
n1
,:,:,
0
:
len
(
rf_feat_idx
)]
=
rf_feat_idx
# Print progress
current_progress
=
(
n1
+
1
)
/
(
len
(
n_feat_mRMR1
))
*
100
print
(
"
Finished {}% of inner fold optimization for feat: {}
"
.
format
(
current_progress
,
temp_feat
[
0
]))
# Choose the optimal parameters
### SVM
n1
,
C
=
np
.
where
(
val_err_svm
==
np
.
max
(
val_err_svm
))
if
len
(
C
)
>
1
:
print
(
"
There are multiple SVM runs with the same validation error
"
)
print
(
"
Choosing run with fewest features to alleviate overfitting
"
)
rfe_feat_len
=
[]
for
i2
in
range
(
len
(
C
)):
n1_temp
=
int
(
n1
[
i2
])
C_temp
=
int
(
C
[
i2
])
temp_feats
=
val_feat_svm
[
n1_temp
,
C_temp
][
~
np
.
isnan
(
val_feat_svm
[
n1_temp
,
C_temp
])].
astype
(
int
)
rfe_feat_len
.
append
(
len
(
temp_feats
))
rfe_feat_min
=
np
.
where
(
rfe_feat_len
==
np
.
min
(
rfe_feat_len
))[
0
]
if
len
(
rfe_feat_min
)
>
1
:
print
(
"
Multiple SVM runs with same number of fewest features
"
)
print
(
"
Choosing run with lowest C (highest regularization) to alleviate overfitting
"
)
C_min
=
np
.
argmin
(
C
[
rfe_feat_min
])
n1
,
C
=
[
int
(
n1
[
rfe_feat_min
][
C_min
]),
int
(
C
[
rfe_feat_min
][
C_min
])]
else
:
n1
,
C
=
[
int
(
n1
[
int
(
rfe_feat_min
)]),
int
(
C
[
int
(
rfe_feat_min
)])]
else
:
n1
,
C
=
[
int
(
n1
),
int
(
C
)]
mRMR_chosen1
=
n_feat_mRMR1
[
n1
]
C_chosen
=
C_parameter_SVM
[
C
]
# Save the best validation error
val_error
=
val_err_svm
[
n1
,
C
]
# Get the subsets chosen
chosen_feats
=
val_feat_svm
[
n1
,
C
][
~
np
.
isnan
(
val_feat_svm
[
n1
,
C
])].
astype
(
int
)
rfe_features
=
list
(
X_train
.
columns
[
chosen_feats
])
n_final_feat
=
len
(
rfe_features
)
x_train_mRMR2_rfe
=
X_train
[
rfe_features
]
# Fit on all training data
model
=
SVC
(
C
=
C_chosen
,
kernel
=
"
linear
"
,
tol
=
1e-3
,
cache_size
=
4000
)
model
=
model
.
fit
(
x_train_mRMR2_rfe
,
y_train
.
to_numpy
().
ravel
())
# Get training output
model_train_y
=
model
.
predict
(
x_train_mRMR2_rfe
)
# Get training error
train_error
=
balanced_accuracy_score
(
y_train
,
model_train_y
)
# Get prediction of test data
y_pred
=
model
.
predict
(
X_test
[
rfe_features
])
# Use model to predict class on test data
test_error
=
balanced_accuracy_score
(
y_test
,
y_pred
)
# Save or prepare to save
accuracy_arr0
[
Outer_counter
,
fset
,
0
,:]
=
[
train_error
,
val_error
,
test_error
]
SVM_model_par
=
[
mRMR_chosen1
,
C_chosen
,
n_final_feat
]
SVM_model
=
model
SVM_y_pred
=
[[
model_train_y
],[
y_pred
]]
### LogReg with SFS
n1
,
C
=
np
.
where
(
val_err_logreg
==
np
.
max
(
val_err_logreg
))
if
len
(
C
)
>
1
:
print
(
"
There are multiple LogReg runs with the same validation error
"
)
print
(
"
Choosing run with fewest features to alleviate overfitting
"
)
sfs_feat_len
=
[]
for
i2
in
range
(
len
(
C
)):
n1_temp
=
int
(
n1
[
i2
])
C_temp
=
int
(
C
[
i2
])
temp_feats
=
val_feat_logreg
[
n1_temp
,
C_temp
][
~
np
.
isnan
(
val_feat_logreg
[
n1_temp
,
C_temp
])].
astype
(
int
)
sfs_feat_len
.
append
(
len
(
temp_feats
))
sfs_feat_min
=
np
.
where
(
sfs_feat_len
==
np
.
min
(
sfs_feat_len
))[
0
]
if
len
(
sfs_feat_min
)
>
1
:
print
(
"
Multiple LogReg runs with same number of fewest features
"
)
print
(
"
Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[sfs_feat_min])
                    n1, C = [int(n1[sfs_feat_min][C_min]),
                             int(C[sfs_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(sfs_feat_min)]),
                             int(C[int(sfs_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_LogReg[C]
            # Save the best validation error
            val_error = val_err_logreg[n1, C]
            # Get the subsets chosen
            chosen_feats = val_feat_logreg[n1, C][~np.isnan(val_feat_logreg[n1, C])].astype(int)
            sfs_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(sfs_features)
            x_train_mRMR2_sfs = X_train[sfs_features]
            # Fit on all training data
            model = LogisticRegression(penalty="l2", C=C_chosen, max_iter=50000)
            model = model.fit(x_train_mRMR2_sfs, y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_sfs)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            # Get prediction of test data
            y_pred = model.predict(X_test[sfs_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            # Save or prepare to save
            accuracy_arr0[Outer_counter, fset, 1, :] = [train_error, val_error, test_error]
            LogReg_model_par = [mRMR_chosen1, C_chosen, n_final_feat]
            LogReg_model = model
            LogReg_y_pred = [[model_train_y], [y_pred]]
            ### RF
            n1, t, d = np.where(val_err_rf == np.max(val_err_rf))
            if len(d) > 1:
                print("There are multiple RF runs with the same validation error")
                print("Choosing run with lowest depth to alleviate overfitting")
                d_min = np.where(d == np.min(d))[0]
                if len(d_min) > 1:
                    print("Multiple RF runs with the same number of depths")
                    print("Choosing run with fewest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min])  # argmin just takes the first
                    # If there are multiple with the same parameters and validation error it is most likely the same
                    n1, t, d = [int(n1[d_min][t_min]),
                                int(t[d_min][t_min]),
                                int(d[d_min][t_min])]
                else:
                    n1, t, d = [int(n1[int(d_min)]),
                                int(t[int(d_min)]),
                                int(d[int(d_min)])]
            else:
                n1, t, d = [int(n1), int(t), int(d)]
            mRMR_chosen1 = n_feat_mRMR1[n1]
            t_chosen = trees[t]
            d_chosen = depth[d]
            # Save the best validation error
            val_error = val_err_rf[n1, t, d]
            # Get the chosen features and feature importances
            chosen_feats = val_feat_rf[n1, t, d][~np.isnan(val_feat_rf[n1, t, d])].astype(int)
            rf_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rf_features)
            x_train_mRMR2_rf = X_train[rf_features]
            rf_feat_importances = val_feat_import_rf[n1, t, d][~np.isnan(val_feat_import_rf[n1, t, d])]
            # Fit on all training data
            model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt",
                                           max_depth=d_chosen, n_jobs=1, random_state=None)
            model = model.fit(x_train_mRMR2_rf, y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rf)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            # Get prediction of test data
            y_pred = model.predict(X_test[rf_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            # Save or prepare to save
            accuracy_arr0[Outer_counter, fset, 2, :] = [train_error, val_error, test_error]
            RF_model_par = [mRMR_chosen1, t_chosen, d_chosen]
            RF_model = model
            RF_y_pred = [[model_train_y], [y_pred]]
            # Save the rest
            model_par00 = [SVM_model_par, LogReg_model_par, RF_model_par]
            final_models00 = [SVM_model, LogReg_model, RF_model]
            final_features00 = [rfe_features, sfs_features, [rf_features, rf_feat_importances]]
            final_y_preds00 = [SVM_y_pred, LogReg_y_pred, RF_y_pred]
            data_splits00 = [X_train, y_train, X_test, y_test,
                             standardizer.mean_, standardizer.scale_]
            # Save the results for the specific feature
            res = [temp_feat, model_par00, final_models00, final_features00,
                   final_y_preds00, data_splits00]
            Ind_feat_clf00.append(res)
            print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter + 1, k_out, rep + 1, temp_feat))
        # Check that no features were excluded when checking for wrongly chosen
        total_features = 0
        for feat_c in range(len(feat_counter)):
            total_features += feat_counter[feat_c][1]
        assert total_features == X_train.shape[1]
        # Save results over outer folds
        Ind_feat_clf0.append(Ind_feat_clf00)
        print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter + 1, k_out, rep + 1))
        # Move counter
        Outer_counter += 1
    out = [accuracy_arr0, Ind_feat_clf0]  # [Rep][Outer CV][Feature][Variable]
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep + 1, n_repetitions))
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time", c_time2)
    return rep, out

Ind_feat_clf = [0] * n_repetitions
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)):  # Function and arguments
        Ind_feat_clf[rep] = result  # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]

# Save results to file and overwrite when new results arrive
with open(Model_savepath + "Each_feat_Rep10_10x10CV_mRMR_SVM_LogReg_RF_results_070222.pkl", "wb") as filehandle:
    pickle.dump(Ind_feat_clf, filehandle)
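# mRMR_feature_select is defined in the Preamble and is used above and again below.
# As a rough, hedge-labeled sketch of the minimum-redundancy-maximum-relevance idea
# it implements (greedy selection maximizing relevance to the label minus mean
# redundancy to the already chosen features), using absolute Pearson correlation
# as a simple stand-in for mutual information (my simplification, not the
# pipeline's implementation):
def _demo_mrmr(X, y, n_select):
    n_feat = X.shape[1]
    # Relevance of each feature to the label
    relevance = np.array([abs(np.corrcoef(X[:, j], y)[0, 1]) for j in range(n_feat)])
    selected = [int(np.argmax(relevance))]
    while len(selected) < n_select:
        candidates = [j for j in range(n_feat) if j not in selected]
        # Relevance minus mean redundancy to the already selected features
        scores = [relevance[j] - np.mean([abs(np.corrcoef(X[:, j], X[:, s])[0, 1]) for s in selected])
                  for j in candidates]
        selected.append(candidates[int(np.argmax(scores))])
    return selected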
# %% Sparse clustering of all EEG features to look for subtypes
EEG_features_df, EEG_features_name_list = load_features_df()

# Add group status
Group_status = np.array([0] * EEG_features_df.shape[0])  # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1  # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)

# Only use the PTSD patient group
EEG_features_df2 = EEG_features_df.loc[EEG_features_df["Group_status"] == 1, :]

Subject_info_cols = ["Subject_ID", "Group_status"]

# Standardize the values
X = np.array(EEG_features_df2.copy().drop(Subject_info_cols, axis=1))
standardizer = preprocessing.StandardScaler().fit(X)
X = standardizer.transform(X)

# Use grid search and permutations to estimate the gap statistic, and use it to
# determine the number of clusters and the sparsity s
# I will use 100 permutations and test 2 to 6 clusters, as in Zhang 2020
max_clusters = 6
n_sparsity_feat = 20
perm_res = []
Timestamps = []
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
Timestamps.append(c_time1)

for k in range(1, max_clusters):  # Cannot permute with 1 cluster
    n_clusters = k + 1
    perm = pysparcl.cluster.permute_modified(X, k=n_clusters, verbose=True,
                                             nvals=n_sparsity_feat, nperms=100)
    perm_res.append(perm)
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time", c_time2)
    Timestamps.append(c_time2)

# Save the results
with open(Feature_savepath + "All_feats_no_coh_plv_kmeans_perm.pkl", "wb") as file:
    pickle.dump(perm_res, file)
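# For reference, the gap statistic (Tibshirani et al., 2001) compares the observed
# within-cluster dispersion to that of permuted reference data:
#     Gap(k) = (1/B) * sum_b log(W*_kb) - log(W_k)
# A toy computation under that definition (made-up dispersions, not pysparcl output):
_W_k = 10.0                             # observed within-cluster dispersion at some k
_W_star = np.array([25.0, 22.0, 27.0])  # dispersions of B = 3 permuted reference datasets
_gap = np.mean(np.log(_W_star)) - np.log(_W_k)
print(round(_gap, 3))  # ~0.9; larger gaps indicate stronger clustering than chance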
# # Load
# with open(Feature_savepath+"All_feats_no_coh_plv_kmeans_perm.pkl", "rb") as file:
#     perm_res = pickle.load(file)

# Convert results to array
perm_res_arr = np.zeros((len(perm_res) * n_sparsity_feat, 4))
for i in range(len(perm_res)):
    _, gaps, sdgaps, wbounds, _ = perm_res[i].values()
    for i2 in range(n_sparsity_feat):
        perm_res_arr[n_sparsity_feat * i + i2, 0] = i + 2        # cluster size
        perm_res_arr[n_sparsity_feat * i + i2, 1] = gaps[i2]     # gap statistic
        perm_res_arr[n_sparsity_feat * i + i2, 2] = sdgaps[i2]   # gap statistic std
        perm_res_arr[n_sparsity_feat * i + i2, 3] = wbounds[i2]  # sparsity feature s
# For each sparsity s, determine the best k using the one-standard-error criterion,
# meaning that for a fixed s the smallest value of k is chosen
# that fulfills Gap(k) >= Gap(k+1) - std(k+1)
def one_standard_deviation_search(gaps, std):
    best_gaps = np.argmax(gaps)
    current_gaps = gaps[best_gaps]
    current_std = std[best_gaps]
    current_gaps_idx = best_gaps
    while (gaps[current_gaps_idx - 1] >= current_gaps - current_std):
        if current_gaps_idx == 0:
            break
        else:
            current_gaps_idx -= 1
            current_gaps = gaps[current_gaps_idx]
            current_std = std[current_gaps_idx]
    out = current_gaps, current_std, current_gaps_idx
    return out
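# A minimal sketch of the criterion above, with made-up numbers (not study data):
# the gap peaks at index 2, but index 1 is within one standard error of the peak,
# so the search walks back and returns the smaller k.
_demo_gaps = np.array([0.10, 0.28, 0.30, 0.29])  # gap statistic for k = 2, 3, 4, 5
_demo_std = np.array([0.02, 0.03, 0.03, 0.03])   # their standard errors
_, _, _demo_idx = one_standard_deviation_search(_demo_gaps, _demo_std)
print(_demo_idx)  # 1, i.e. the smallest k within one std of the maximum gap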
best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:, 3])
plt.figure(figsize=(12, 12))
for i2 in range(n_sparsity_feat):
    current_s = all_s[i2]
    gaps = perm_res_arr[perm_res_arr[:, 3] == current_s, 1]
    std = perm_res_arr[perm_res_arr[:, 3] == current_s, 2]
    _, _, idx = one_standard_deviation_search(gaps, std)
    # Save to array
    best_ks[i2, 0] = current_s
    best_ks[i2, 1] = perm_res_arr[perm_res_arr[:, 3] == current_s, 0][idx]
    # Plot gap
    plt.errorbar(perm_res_arr[perm_res_arr[:, 3] == current_s, 0].astype("int"),
                 gaps, yerr=std, capsize=5, label=np.round(current_s, 3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")

best_k = int(scipy.stats.mode(best_ks[:, 1])[0])
# Determine s using the fixed k, as the lowest s within 1 std of the max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:, 0] == best_k, 1])
best_gaps = perm_res_arr[perm_res_arr[:, 0] == best_k, 1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:, 0] == best_k, 2][best_gaps_idx]
one_std_crit = perm_res_arr[perm_res_arr[:, 0] == best_k, 1] >= best_gaps - best_gaps_std
best_s = np.array([perm_res_arr[perm_res_arr[:, 0] == best_k, 3][one_std_crit][0]])

# Perform clustering with k clusters
sparcl = pysparcl.cluster.kmeans(X, k=best_k, wbounds=best_s)[0]
# # Save the results
# with open(Feature_savepath+"All_feats_sparse_kmeans.pkl", "wb") as file:
#     pickle.dump(sparcl, file)
with open(Feature_savepath + "All_feats_sparse_kmeans.pkl", "rb") as file:
    sparcl = pickle.load(file)
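# Quick sanity check of the loaded solution. The keys used below ("ws" for the
# feature weights and "cs" for the cluster assignments) follow how this script
# indexes pysparcl's output; the exact dict layout is assumed, not verified here.
print(sparcl["ws"].shape)  # one weight per feature; zero weight = feature not used
print(np.unique(sparcl["cs"], return_counts=True))  # cluster labels and their sizes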
# %% Sparse kmeans -> mRMR -> SVM/RF/LogReg with L2
# Using the sparse kmeans selected features; only eyes closed, as they were primarily chosen
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df()

with open(Feature_savepath + "All_feats_sparse_kmeans.pkl", "rb") as file:
    sparcl = pickle.load(file)

# Use concatenated features
EEG_features_name_list = [['Power'],
                          ['Frontal Theta Beta Ratio', 'Asymmetry'],
                          ['Peak Alpha Frequency', 'Global Peak Alpha Frequency'],
                          ["1/f exponent"],
                          ['imcoh'],
                          ['wpli'],
                          ['Power Envelope Correlation'],
                          ['Microstate Transition', 'Microstate Ratio Time', 'Microstate Entropy'],
                          ["Granger Causality"],
                          ['DFA Exponent', 'Global DFA Exponent']]

# Add group status
Group_status = np.array([0] * EEG_features_df.shape[0])  # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1  # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)

# Only keep PTSD subtype 1 by dropping subtype 2
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1, "Subject_ID"])

# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2

# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open")) / len(sparcl_features)  # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols], EEG_features_df[sparcl_features]], axis=1)

# To ensure proper stratification into train/test sets I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
    # Get study index
    study_idx = (EEG_features_df["Subject_ID"] >= (i + 1) * 100000) & (EEG_features_df["Subject_ID"] < (i + 2) * 100000)
    # Assign label
    study_group_status[(study_idx) & (EEG_features_df["Group_status"] == 0)] = 2 * i      # CTRL
    study_group_status[(study_idx) & (EEG_features_df["Group_status"] == 1)] = 2 * i + 1  # PTSD
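# A minimal illustration of the encoding above, with made-up subject IDs (not study
# data): the leading digit of the ID encodes the study, so CTRL/PTSD in study i map
# to labels 2*i and 2*i + 1, giving 6 strata for 3 studies x 2 groups.
_demo_ids = pd.Series([100001, 100002, 200001, 300005])
_demo_group = pd.Series([0, 1, 1, 0])
_demo_labels = _demo_group.copy()
for _i in range(3):
    _idx = (_demo_ids >= (_i + 1) * 100000) & (_demo_ids < (_i + 2) * 100000)
    _demo_labels[_idx & (_demo_group == 0)] = 2 * _i
    _demo_labels[_idx & (_demo_group == 1)] = 2 * _i + 1
print(_demo_labels.tolist())  # [0, 1, 3, 4]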
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:, 1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]

# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
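# The ensembling itself is done later in the analysis; as a rough sketch of what a
# majority vote over the three saved classifiers could look like (the function and
# inputs below are illustrative, not variables from this pipeline):
def _demo_majority_vote(pred_svm, pred_logreg, pred_rf):
    # Stack the three binary prediction vectors and take the per-subject majority
    votes = np.stack([pred_svm, pred_logreg, pred_rf], axis=0)
    return (votes.sum(axis=0) >= 2).astype(int)
print(_demo_majority_vote(np.array([1, 0, 1]), np.array([1, 1, 0]), np.array([0, 0, 1])))  # [1 0 1]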
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions, k_out, n_models, 3))
model_par = []
final_models = []
final_features = []
final_y_preds = []

np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = []  # a list with only the test indices
for rep in range(n_repetitions):
    skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep + 1) * 123)  # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
    Outer_CV = []
    Outer_CV_test = []
    for train_index, test_index in skf.split(EEG_features_df, study_group_status):
        Outer_CV.append([train_index, test_index])
        Outer_CV_test.append(test_index)
    Rep_Outer_CV.append(Outer_CV)
    Rep_Outer_CV_test.append(Outer_CV_test)

# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]

def allUnique(x):
    seen = set()
    return not any(i in seen or seen.add(i) for i in x)

assert allUnique(Rep_Outer_CV_test_flat_str)

# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
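# Before the main loop: a minimal, self-contained sketch of the RFECV pattern used
# below, on synthetic data (variable names here are illustrative only). RFECV
# recursively drops the weakest coefficients of a linear SVM and keeps the feature
# count with the best cross-validated balanced accuracy.
from sklearn.datasets import make_classification
_demo_X, _demo_y = make_classification(n_samples=60, n_features=20, n_informative=4, random_state=0)
_demo_rfecv = RFECV(estimator=SVC(kernel="linear", C=1.0), scoring="balanced_accuracy",
                    cv=5, min_features_to_select=1)
_demo_rfecv.fit(_demo_X, _demo_y)
print(_demo_rfecv.n_features_, _demo_rfecv.support_[:5])  # chosen count and a mask over features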
for rep in range(n_repetitions):
    # The outer fold CV has already been saved as lists
    Outer_CV = Rep_Outer_CV[rep]
    # Pre-allocate memory
    model_par0 = []
    final_models0 = []
    final_features0 = []
    final_y_preds0 = []
    Outer_counter = 0
    for train_index, test_index in Outer_CV:
        test_df = EEG_features_df.iloc[test_index]
        train_df = EEG_features_df.iloc[train_index]
        # Training data will be standardized
        standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:, n_discrete_cols:])
        train_df_standard = train_df.copy()
        train_df_standard.iloc[:, n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:, n_discrete_cols:])
        # Test data will also be standardized, but using the mean and std from the training data
        test_df_standard = test_df.copy()
        test_df_standard.iloc[:, n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:, n_discrete_cols:])
        # Get the training data
        X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_train = train_df_standard[Target]
        # Get test data
        X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_test = test_df_standard[Target]
        # Prepare initial filtering of feature types to alleviate imbalance in number of features
        # Use a variable that is optimized using inner CV
        n_feat_mRMR1 = [30, 40, 50, 60]
        # Max features from mRMR for each eye status
        max_mRMR_features = n_feat_mRMR1[-1]
        # mRMR on all the sparse kmeans selected features
        filter_feat_selector = mRMR_feature_select(X_train.to_numpy(), y_train.to_numpy(),
                                                   num_features_to_select=max_mRMR_features,
                                                   K_MAX=1000, n_jobs=-1, verbose=False)
        # Part 2 with second mRMR and classifiers in loop
        k_fold = 10
        skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep + 1) * 123)  # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
        Inner_CV = []
        for train_index2, test_index2 in skf2.split(train_df, study_group_status.iloc[train_index]):
            Inner_CV.append([train_index2, test_index2])
        # SVM with L2-norm (it is by default squared)
        # Prepare hyper parameters
        exponent = np.linspace(-3, 1, 9)
        exponent = np.round(exponent, 5)  # sometimes linspace is not so exact
        C_parameter_SVM = np.power(np.array([10] * len(exponent)), exponent)
        kernels = ["linear"]  # error when using rbf and RFECV
        # rbf also overfits more easily, whereas linear empirically works better in high-dimensional data
        # Sequential Feature Selection for Logistic Regression
        # In-built model selection CV
        # The L2 is the inverse of C for LogReg
        exponent = np.linspace(-3, 1, 9)
        exponent = np.round(exponent, 5)  # sometimes linspace is not so exact
        C_parameter_LogReg = np.power(np.array([10] * len(exponent)), exponent)
        # Random forest classifier
        # Prepare hyper parameters
        trees = np.array([10, 100, 500, 1000])
        depth = np.linspace(1, 2, 2)  # using more depth leads to a lot of overfitting
        # Prepare arrays for validation errors
        val_err_svm = np.zeros((len(n_feat_mRMR1), len(C_parameter_SVM), len(kernels)))
        val_feat_svm = np.zeros((len(n_feat_mRMR1), len(C_parameter_SVM), len(kernels), max_mRMR_features))
        val_feat_svm.fill(np.nan)
        val_err_logreg = np.zeros((len(n_feat_mRMR1), len(C_parameter_LogReg)))
        val_feat_logreg = np.zeros((len(n_feat_mRMR1), len(C_parameter_LogReg), max_mRMR_features))
        val_feat_logreg.fill(np.nan)
        val_err_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth)))
        val_feat_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth), max_mRMR_features))
        val_feat_rf.fill(np.nan)
        val_feat_import_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth), max_mRMR_features))
        val_feat_import_rf.fill(np.nan)
        for n1 in range(len(n_feat_mRMR1)):
            # Get selected features from mRMR
            filter_features = X_train.columns[filter_feat_selector[0:n_feat_mRMR1[n1]]]
            X_train_mRMR = X_train[filter_features].copy()
            # SVM with recursive feature elimination
            # Stratified CV to find the best regularization strength and number of features
            # CV is built into RFECV
            min_features_to_select = 1
            # Using a normal for loop as RFECV has in-built multiprocessing
            for C in range(len(C_parameter_SVM)):
                for K in range(len(kernels)):
                    # Define the model
                    svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
                    rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
                                  cv=Inner_CV, min_features_to_select=min_features_to_select)
                    rfecv.fit(X_train_mRMR, y_train.to_numpy().ravel())
                    # Get CV score
                    err = rfecv.grid_scores_[rfecv.n_features_ - min_features_to_select]
                    # Save results for hyperparameter optimization
                    val_err_svm[n1, C, K] = err
                    # Save the features chosen by RFE based on index from pre mRMR
                    rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
                    val_feat_svm[n1, C, K, 0:len(rfe_feat_idx)] = rfe_feat_idx
                    # print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
            # Logistic regression with sequential forward selection
            # In-built CV
            k_features = np.arange(1, n_feat_mRMR1[n1] + 1, 1)  # try up to all features
            inner_cv_scores = np.zeros((len(C_parameter_LogReg), len(k_features)))
            for C in range(len(C_parameter_LogReg)):
                LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter=50000)
                sfs = SFS(LogReg, k_features=(k_features[0], k_features[-1]),
                          forward=True, scoring="balanced_accuracy",
                          verbose=0, floating=False, cv=Inner_CV, n_jobs=-1)
                sfs = sfs.fit(X_train_mRMR, y_train.to_numpy().ravel())
                # Save CV scores for each SFS step
                for feat in range(len(k_features)):
                    inner_cv_scores[C, feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
                # Find the best number of features
                # I am rounding to make it more smooth and disregard small improvements
                K = np.where(np.round(inner_cv_scores[C, :], 2) == np.max(np.round(inner_cv_scores[C, :], 2)))[0]
                if len(K) > 1:
                    K = K[np.where(K == np.min(K))[0]]
                err = inner_cv_scores[C, K]
                # Save validation error and features
                val_err_logreg[n1, C] = err
                # Save the features chosen by SFS based on index from pre mRMR
                sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K) + 1]["feature_names"]))[0]  # K+1 since it starts with 1 feature
                val_feat_logreg[n1, C, 0:len(sfs_feat_idx)] = sfs_feat_idx
                # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
            # Random forest
            model_test_err = np.zeros((k_fold, len(trees), len(depth)))
            RF_feat_import = np.zeros((k_fold, len(trees), len(depth), X_train_mRMR.shape[1]))
            counter = 0
            for cv in range(k_fold):
                # Retrieve CV indices
                train_index_rf = Inner_CV[cv][0]
                test_index_rf = Inner_CV[cv][1]
                # Retrieve datasets
                X_train2 = X_train_mRMR.iloc[train_index_rf]
                X_test2 = X_train_mRMR.iloc[test_index_rf]
                y_train2 = y_train.to_numpy().ravel()[train_index_rf]
                y_test2 = y_train.to_numpy().ravel()[test_index_rf]
                for t in range(len(trees)):
                    for d in range(len(depth)):
                        RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt",
                                                    max_depth=depth[d], n_jobs=-1, random_state=None)
                        RF.fit(X_train2.to_numpy(), y_train2)
                        # Validation error
                        RF_y = RF.predict(X_test2.to_numpy())
                        err = balanced_accuracy_score(y_test2, RF_y)
                        # Save to array
                        model_test_err[cv, t, d] = err
                        # Save feature importance array
                        RF_feat_import[cv, t, d] = RF.feature_importances_
                counter += 1
                # print("Finished RF run {} out of {}".format(counter, k_fold))
            # Average the errors over the CV folds
            model_test_err_mean = np.mean(model_test_err, axis=0)
            val_err_rf[n1, :, :] = model_test_err_mean
            # Average the feature importances over the CV folds
            RF_feat_import = np.mean(RF_feat_import, axis=0)
            val_feat_import_rf[n1, :, :, 0:n_feat_mRMR1[n1]] = RF_feat_import
            # Save the features used by the RF based on index from pre mRMR
            rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
            val_feat_rf[n1, :, :, 0:len(rf_feat_idx)] = rf_feat_idx
            print("Finished {}% of total run".format((n1 + 1) / len(n_feat_mRMR1) * 100))
        # Choose the optimal parameters
        ### SVM
        n1, C, K = np.where(val_err_svm == np.max(val_err_svm))
        if len(C) > 1:
            print("There are multiple SVM runs with the same validation error")
            print("Choosing run with fewest features to alleviate overfitting")
            rfe_feat_len = []
            for i2 in range(len(C)):
                n1_temp = int(n1[i2])
                C_temp = int(C[i2])
                K_temp = int(K[i2])
                temp_feats = val_feat_svm[n1_temp, C_temp, K_temp][~np.isnan(val_feat_svm[n1_temp, C_temp, K_temp])].astype(int)
                rfe_feat_len.append(len(temp_feats))
            rfe_feat_min = np.where(rfe_feat_len == np.min(rfe_feat_len))[0]
            if len(rfe_feat_min) > 1:
                print("Multiple SVM runs with the same number of fewest features")
                print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                C_min = np.argmin(C[rfe_feat_min])
                n1, C, K = [int(n1[rfe_feat_min][C_min]),
                            int(C[rfe_feat_min][C_min]),
                            int(K[rfe_feat_min][C_min])]
            else:
                n1, C, K = [int(n1[int(rfe_feat_min)]),
                            int(C[int(rfe_feat_min)]),
                            int(K[int(rfe_feat_min)])]
        else:
            n1, C, K = [int(n1), int(C), int(K)]
        mRMR_chosen1 = n_feat_mRMR1[n1]
        C_chosen = C_parameter_SVM[C]
        K_chosen = kernels[K]
        # Save the best validation error
        val_error = val_err_svm[n1, C, K]
        # Get the subsets chosen
        chosen_feats = val_feat_svm[n1, C, K][~np.isnan(val_feat_svm[n1, C, K])].astype(int)
        rfe_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(rfe_features)
        x_train_mRMR_rfe = X_train[rfe_features]
        # Fit on all training data
        model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
        model = model.fit(x_train_mRMR_rfe, y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_rfe)
        # Get training error
        train_error = balanced_accuracy_score(y_train, model_train_y)
        # Get prediction of test data
        y_pred = model.predict(X_test[rfe_features])
        # Use model to predict class on test data
        test_error = balanced_accuracy_score(y_test, y_pred)
        # Save or prepare to save
        accuracy_arr[rep, Outer_counter, 0, :] = [train_error, val_error, test_error]
        SVM_model_par = [mRMR_chosen1, C_chosen, n_final_feat, K_chosen]
        SVM_model = model
        SVM_y_pred = [[model_train_y], [y_pred]]
        ### LogReg with SFS
        n1, C = np.where(val_err_logreg == np.max(val_err_logreg))
        if len(C) > 1:
            print("There are multiple LogReg runs with the same validation error")
            print("Choosing run with fewest features to alleviate overfitting")
            sfs_feat_len = []
            for i2 in range(len(C)):
                n1_temp = int(n1[i2])
                C_temp = int(C[i2])
                temp_feats = val_feat_logreg[n1_temp, C_temp][~np.isnan(val_feat_logreg[n1_temp, C_temp])].astype(int)
                sfs_feat_len.append(len(temp_feats))
            sfs_feat_min = np.where(sfs_feat_len == np.min(sfs_feat_len))[0]
            if len(sfs_feat_min) > 1:
                print("Multiple LogReg runs with the same number of fewest features")
                print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                C_min = np.argmin(C[sfs_feat_min])
                n1, C = [int(n1[sfs_feat_min][C_min]),
                         int(C[sfs_feat_min][C_min])]
            else:
                n1, C = [int(n1[int(sfs_feat_min)]),
                         int(C[int(sfs_feat_min)])]
        else:
            n1, C = [int(n1), int(C)]
        mRMR_chosen1 = n_feat_mRMR1[n1]
        C_chosen = C_parameter_LogReg[C]
        # Save the best validation error
        val_error = val_err_logreg[n1, C]
        # Get the subsets chosen
        chosen_feats = val_feat_logreg[n1, C][~np.isnan(val_feat_logreg[n1, C])].astype(int)
        sfs_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(sfs_features)
        x_train_mRMR_sfs = X_train[sfs_features]
        # Fit on all training data
        model = LogisticRegression(penalty="l2", C=C_chosen, max_iter=50000)
        model = model.fit(x_train_mRMR_sfs, y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_sfs)
        # Get training error
        train_error = balanced_accuracy_score(y_train, model_train_y)
        # Get prediction of test data
        y_pred = model.predict(X_test[sfs_features])
        # Use model to predict class on test data
        test_error = balanced_accuracy_score(y_test, y_pred)
        # Save or prepare to save
        accuracy_arr[rep, Outer_counter, 1, :] = [train_error, val_error, test_error]
        LogReg_model_par = [mRMR_chosen1, C_chosen, n_final_feat]
        LogReg_model = model
        LogReg_y_pred = [[model_train_y], [y_pred]]
        ### RF
        n1, t, d = np.where(val_err_rf == np.max(val_err_rf))
        if len(d) > 1:
            print("There are multiple RF runs with the same validation error")
            print("Choosing run with lowest depth to alleviate overfitting")
            d_min = np.where(d == np.min(d))[0]
            if len(d_min) > 1:
                print("Multiple RF runs with the same number of depths")
                print("Choosing run with fewest trees to alleviate overfitting")
                t_min = np.argmin(t[d_min])  # argmin just takes the first
                # If there are multiple with the same parameters and validation error it is most likely the same
                n1, t, d = [int(n1[d_min][t_min]),
                            int(t[d_min][t_min]),
                            int(d[d_min][t_min])]
            else:
                n1, t, d = [int(n1[int(d_min)]),
                            int(t[int(d_min)]),
                            int(d[int(d_min)])]
        else:
            n1, t, d = [int(n1), int(t), int(d)]
        mRMR_chosen1 = n_feat_mRMR1[n1]
        t_chosen = trees[t]
        d_chosen = depth[d]
        # Save the best validation error
        val_error = val_err_rf[n1, t, d]
        # Get the chosen features and feature importances
        chosen_feats = val_feat_rf[n1, t, d][~np.isnan(val_feat_rf[n1, t, d])].astype(int)
        rf_features = list(X_train.columns[chosen_feats])
        n_final_feat = len(rf_features)
        x_train_mRMR_rf = X_train[rf_features]
        rf_feat_importances = val_feat_import_rf[n1, t, d][~np.isnan(val_feat_import_rf[n1, t, d])]
        # Fit on all training data
        model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt",
                                       max_depth=d_chosen, n_jobs=-1, random_state=None)
        model = model.fit(x_train_mRMR_rf, y_train.to_numpy().ravel())
        # Get training output
        model_train_y = model.predict(x_train_mRMR_rf)
        # Get training error
        train_error = balanced_accuracy_score(y_train, model_train_y)
        # Get prediction of test data
        y_pred = model.predict(X_test[rf_features])
        # Use model to predict class on test data
        test_error = balanced_accuracy_score(y_test, y_pred)
        # Save or prepare to save
        accuracy_arr[rep, Outer_counter, 2, :] = [train_error, val_error, test_error]
        RF_model_par = [mRMR_chosen1, t_chosen, d_chosen]
        RF_model = model
        RF_y_pred = [[model_train_y], [y_pred]]
        # Save the rest
        model_par0.append([SVM_model_par, LogReg_model_par, RF_model_par])
        final_models0.append([SVM_model, LogReg_model, RF_model])
        final_features0.append([rfe_features, sfs_features, [rf_features, rf_feat_importances]])
        final_y_preds0.append([SVM_y_pred, LogReg_y_pred, RF_y_pred])
        # Move counter
        Outer_counter += 1
        print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter, k_out, rep + 1))
    # Save results from all outer folds
    model_par.append(model_par0)
    final_models.append(final_models0)
    final_features.append(final_features0)
    final_y_preds.append(final_y_preds0)
    # Save results to file
    Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
    # Run with variable feat in first and second mRMR
    with open(Model_savepath + "Rep10_10x10CV_SparsKmean_mRMR_SVM_LogReg_RF_Subtype1_results_210122.pkl", "wb") as filehandle:
        pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time", c_time2)
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep + 1, n_repetitions))
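# For reference, a minimal, self-contained sketch of the sequential forward
# selection (mlxtend's SFS) pattern used inside the loops above, on synthetic data
# (illustrative names only): features are added one at a time, and the
# cross-validated balanced accuracy of each subset size is stored in a metric dict.
from sklearn.datasets import make_classification
_demo_X, _demo_y = make_classification(n_samples=60, n_features=10, n_informative=3, random_state=0)
_demo_sfs = SFS(LogisticRegression(penalty="l2", C=1.0, max_iter=50000),
                k_features=(1, 5), forward=True, floating=False,
                scoring="balanced_accuracy", cv=5)
_demo_sfs = _demo_sfs.fit(_demo_X, _demo_y)
print(_demo_sfs.get_metric_dict()[3]["avg_score"])  # mean CV score with 3 features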
# %% Each feat mRMR -> mRMR -> SVM/RF/LogReg with L2
# Using each feature (of the sparse Kmeans EC chosen ones)
# For Subtype 1
# 10 repetitions of 10-fold outer and 10-fold inner two-layer CV
# Prediction of subtype 1
EEG_features_df, EEG_features_name_list = load_features_df()  # removed coh and plv and renamed Hurst in v4

with open(Feature_savepath + "All_feats_sparse_kmeans.pkl", "rb") as file:
    sparcl = pickle.load(file)

# Add group status
Group_status = np.array([0] * EEG_features_df.shape[0])  # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1  # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)

# Only keep PTSD subtype 1 by dropping subtype 2
PTSD_idx = np.array([i in cases for i in Subject_id])
CTRL_idx = np.array([i not in cases for i in Subject_id])
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype2)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype1 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1, "Subject_ID"])

# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2

# Get features from sparse kmeans
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = EEG_features_df.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
sum(sparcl_features.str.contains("Eyes Open")) / len(sparcl_features)  # less than 3% are EO
# Only use eyes closed (2483 features)
sparcl_features = sparcl_features[sparcl_features.str.contains("Eyes Closed")]
EEG_features_df = pd.concat([EEG_features_df[Subject_info_cols], EEG_features_df[sparcl_features]], axis=1)

# To ensure proper stratification into train/test sets I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
    # Get study index
    study_idx = (EEG_features_df["Subject_ID"] >= (i + 1) * 100000) & (EEG_features_df["Subject_ID"] < (i + 2) * 100000)
    # Assign label
    study_group_status[(study_idx) & (EEG_features_df["Group_status"] == 0)] = 2 * i      # CTRL
    study_group_status[(study_idx) & (EEG_features_df["Group_status"] == 1)] = 2 * i + 1  # PTSD

# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:, 1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]

# Make 3 models and save them to use as an ensemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)

# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions, k_out, n_models, 3))
model_par = []
final_models = []
final_features = []
final_y_preds = []

np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = []  # a list with only the test indices
for rep in range(n_repetitions):
    skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep + 1) * 123)  # using 10% equals around 21 test subjects
    # I am also converting it to an iterable list instead of the generator to promote reuse
    Outer_CV = []
    Outer_CV_test = []
    for train_index, test_index in skf.split(EEG_features_df, study_group_status):
        Outer_CV.append([train_index, test_index])
        Outer_CV_test.append(test_index)
    Rep_Outer_CV.append(Outer_CV)
    Rep_Outer_CV_test.append(Outer_CV_test)

# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele]) for ele in Rep_Outer_CV_test_flat]

def allUnique(x):
    seen = set()
    return not any(i in seen or seen.add(i) for i in x)

assert allUnique(Rep_Outer_CV_test_flat_str)

# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
def Each_feat_10rep_10x10CV(rep):
    # The outer fold CV has already been saved as lists
    Outer_CV = Rep_Outer_CV[rep]
    # Pre-allocate memory
    Ind_feat_clf0 = []
    accuracy_arr0 = np.zeros((k_out, len(EEG_features_name_list), n_models, 3))
    Outer_counter = 0
    for train_index, test_index in Outer_CV:
        test_df = EEG_features_df.iloc[test_index]
        train_df = EEG_features_df.iloc[train_index]
        # Training data will be standardized
        standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:, n_discrete_cols:])
        train_df_standard = train_df.copy()
        train_df_standard.iloc[:, n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:, n_discrete_cols:])
        # Test data will also be standardized, but using the mean and std from the training data
        test_df_standard = test_df.copy()
        test_df_standard.iloc[:, n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:, n_discrete_cols:])
        # Get the training data
        X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_train = train_df_standard[Target]
        # Get test data
        X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
        y_test = test_df_standard[Target]
        # Part 2 with second mRMR and classifiers in loop
        k_fold = 10
        skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep + 1) * 123)  # using 10% equals around 21 test subjects
        # I am also converting it to an iterable list instead of the generator to promote reuse
        Inner_CV = []
        for train_index2, test_index2 in skf2.split(train_df, study_group_status.iloc[train_index]):
            Inner_CV.append([train_index2, test_index2])
        # Prepare initial filtering of feature types to alleviate imbalance in number of features
        # Use a variable that is optimized using inner CV
        n_feat_mRMR1 = [20, 30, 40, 50]
        # Max features from mRMR for each eye status
        max_mRMR_features = n_feat_mRMR1[-1]
        eye_status = ["Eyes Closed", "Eyes Open"]
        n_eye_status = len(eye_status)
        feat_counter = []
        # Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
        Ind_feat_clf00 = []
        for fset in range(len(EEG_features_name_list)):
            temp_feat = EEG_features_name_list[fset]
            other_feats = EEG_features_name_list[:fset] + EEG_features_name_list[fset + 1:]
            other_feats = [item for sublist in other_feats for item in sublist]  # flatten the list
            # Retrieve the dataset for each feature
            col_idx = np.zeros(len(X_train.columns), dtype=bool)
            for fsub in range(len(temp_feat)):
                temp_feat0 = temp_feat[fsub]
                col_idx0 = X_train.columns.str.contains(temp_feat0 + "_")
                col_idx = np.logical_or(col_idx, col_idx0)  # append all trues
            temp_X_train = X_train.loc[:, col_idx]
            # Check if any of the other features were wrongly chosen
            for fcheck in range(len(other_feats)):
                if any(temp_X_train.columns.str.contains(other_feats[fcheck] + "_") == True):
                    temp_X_train = temp_X_train.loc[:, np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))]
            if temp_X_train.size == 0:
                # if no columns are left, e.g. imcoh when removing coh, then add the features again
                temp_X_train = X_train.loc[:, X_train.columns.str.contains(temp_feat0 + "_")]
            # Save number of original features fed into mRMR
            feat_counter.append([temp_feat, temp_X_train.shape[1]])
            # If there are no features selected, then skip the rest of the loop
            if temp_X_train.shape[1] == 0:
                continue
            # Do not use mRMR if there are fewer than n_features
            if temp_X_train.shape[1] <= max_mRMR_features:
                filter_features = temp_X_train.columns
            else:
                # mRMR
                filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(), y_train.to_numpy(),
                                                           num_features_to_select=max_mRMR_features,
                                                           K_MAX=1000, n_jobs=1, verbose=False)
                # Get selected features
                filter_features = temp_X_train.columns[filter_feat_selector]
            # SVM with L2-norm (it is by default squared)
            # Prepare hyper parameters
            exponent = np.linspace(-3, 1, 9)
            exponent = np.round(exponent, 5)  # sometimes linspace is not so exact
            C_parameter_SVM = np.power(np.array([10] * len(exponent)), exponent)
            # rbf kernel overfits more easily, whereas linear empirically works better in high-dimensional data
            # Sequential Feature Selection for Logistic Regression
            # In-built model selection CV
            # The L2 is the inverse of C for LogReg
            exponent = np.linspace(-3, 1, 9)
            exponent = np.round(exponent, 5)  # sometimes linspace is not so exact
            C_parameter_LogReg = np.power(np.array([10] * len(exponent)), exponent)
            # Random forest classifier
            # Prepare hyper parameters
            trees = np.array([10, 100, 500, 1000])
            depth = np.linspace(1, 2, 2)  # using more depth leads to a lot of overfitting
            # Prepare arrays for validation errors
            val_err_svm = np.zeros((len(n_feat_mRMR1), len(C_parameter_SVM)))
            val_feat_svm = np.zeros((len(n_feat_mRMR1), len(C_parameter_SVM), max_mRMR_features))
            val_feat_svm.fill(np.nan)
            val_err_logreg = np.zeros((len(n_feat_mRMR1), len(C_parameter_LogReg)))
            val_feat_logreg = np.zeros((len(n_feat_mRMR1), len(C_parameter_LogReg), max_mRMR_features))
            val_feat_logreg.fill(np.nan)
            val_err_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth)))
            val_feat_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth), max_mRMR_features))
            val_feat_rf.fill(np.nan)
            val_feat_import_rf = np.zeros((len(n_feat_mRMR1), len(trees), len(depth), max_mRMR_features))
            val_feat_import_rf.fill(np.nan)
            min_features_to_select = 1
            for n1 in range(len(n_feat_mRMR1)):
                mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
                # Use the selected filter features
                temp_X_train_mRMR = temp_X_train[mRMR_features1]
                # SVM with recursive feature elimination
                # Stratified CV to find the best regularization strength and number of features
                # CV is built into RFECV
                for C in range(len(C_parameter_SVM)):
                    # Define the model
                    svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
                    # Perform recursive feature elimination with in-built CV
                    rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
                                  cv=Inner_CV, min_features_to_select=min_features_to_select)
                    rfecv.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
                    # Get CV score
                    err = rfecv.grid_scores_[rfecv.n_features_ - min_features_to_select]
                    # Save results for hyperparameter optimization
                    val_err_svm[n1, C] = err
                    # Save the features chosen by RFE based on index from pre mRMR
                    rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
                    val_feat_svm[n1, C, 0:len(rfe_feat_idx)] = rfe_feat_idx
                    # print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
                # Logistic regression with sequential forward selection
                # In-built CV
                k_max = np.min([temp_X_train_mRMR.shape[1], n_feat_mRMR1[n1]])
                k_features = np.arange(1, k_max + 1, 1)  # try up to all features
                inner_cv_scores = np.zeros((len(C_parameter_LogReg), len(k_features)))
                for C in range(len(C_parameter_LogReg)):
                    LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C], max_iter=50000)
                    sfs = SFS(LogReg, k_features=(k_features[0], k_features[-1]),
                              forward=True, scoring="balanced_accuracy",
                              verbose=0, floating=False, cv=Inner_CV, n_jobs=1)
                    sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
                    # Save CV scores for each SFS step
                    for feat in range(len(k_features)):
                        inner_cv_scores[C, feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
                    # Find the best number of features
                    # I am rounding to make it more smooth and disregard small improvements
                    K = np.where(np.round(inner_cv_scores[C, :], 2) == np.max(np.round(inner_cv_scores[C, :], 2)))[0]
                    if len(K) > 1:
                        K = K[np.where(K == np.min(K))[0]]
                    err = inner_cv_scores[C, K]
                    # Save validation error and features
                    val_err_logreg[n1, C] = err
                    # Save the features chosen by SFS based on index from pre mRMR
                    sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K) + 1]["feature_names"]))[0]  # K+1 since it starts with 1 feature
                    val_feat_logreg[n1, C, 0:len(sfs_feat_idx)] = sfs_feat_idx
                    # print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
                # Random forest
                model_test_err = np.zeros((k_fold, len(trees), len(depth)))
                RF_feat_import = np.zeros((k_fold, len(trees), len(depth), temp_X_train_mRMR.shape[1]))
                counter = 0
                for cv in range(k_fold):
                    # Retrieve CV indices
                    train_index_rf = Inner_CV[cv][0]
                    test_index_rf = Inner_CV[cv][1]
                    # Retrieve datasets
                    X_train2 = temp_X_train_mRMR.iloc[train_index_rf]
                    X_test2 = temp_X_train_mRMR.iloc[test_index_rf]
                    y_train2 = y_train.to_numpy().ravel()[train_index_rf]
                    y_test2 = y_train.to_numpy().ravel()[test_index_rf]
                    for t in range(len(trees)):
                        for d in range(len(depth)):
                            RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt",
                                                        max_depth=depth[d], n_jobs=1, random_state=None)
                            RF.fit(X_train2.to_numpy(), y_train2)
                            # Validation error
                            RF_y = RF.predict(X_test2.to_numpy())
                            err = balanced_accuracy_score(y_test2, RF_y)
                            # Save to array
                            model_test_err[cv, t, d] = err
                            # Save feature importance array
                            RF_feat_import[cv, t, d] = RF.feature_importances_
                    counter += 1
                    # print("Finished RF run {} out of {}".format(counter, k_fold))
                # Average the errors over the CV folds
                model_test_err_mean = np.mean(model_test_err, axis=0)
                val_err_rf[n1, :, :] = model_test_err_mean
                # Average the feature importances over the CV folds
                RF_feat_import = np.mean(RF_feat_import, axis=0)
                val_feat_import_rf[n1, :, :, 0:RF_feat_import.shape[2]] = RF_feat_import
                # Save the features used by the RF based on index from pre mRMR
                rf_feat_idx = np.where(X_train.columns.isin(mRMR_features1))[0]
                val_feat_rf[n1, :, :, 0:len(rf_feat_idx)] = rf_feat_idx
                # Print progress
                current_progress = (n1 + 1) / (len(n_feat_mRMR1)) * 100
                print("Finished {}% of inner fold optimization for feat: {}".format(current_progress, temp_feat[0]))
            # Choose the optimal parameters
            ### SVM
            n1, C = np.where(val_err_svm == np.max(val_err_svm))
            if len(C) > 1:
                print("There are multiple SVM runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                rfe_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_svm[n1_temp, C_temp][~np.isnan(val_feat_svm[n1_temp, C_temp])].astype(int)
                    rfe_feat_len.append(len(temp_feats))
                rfe_feat_min = np.where(rfe_feat_len == np.min(rfe_feat_len))[0]
                if len(rfe_feat_min) > 1:
                    print("Multiple SVM runs with the same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[rfe_feat_min])
                    n1, C = [int(n1[rfe_feat_min][C_min]),
                             int(C[rfe_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(rfe_feat_min)]),
                             int(C[int(rfe_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_SVM[C]
            # Save the best validation error
            val_error = val_err_svm[n1, C]
            # Get the subsets chosen
            chosen_feats = val_feat_svm[n1, C][~np.isnan(val_feat_svm[n1, C])].astype(int)
            rfe_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rfe_features)
            x_train_mRMR2_rfe = X_train[rfe_features]
            # Fit on all training data
            model = SVC(C=C_chosen, kernel="linear", tol=1e-3, cache_size=4000)
            model = model.fit(x_train_mRMR2_rfe, y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rfe)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            # Get prediction of test data
            y_pred = model.predict(X_test[rfe_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            # Save or prepare to save
            accuracy_arr0[Outer_counter, fset, 0, :] = [train_error, val_error, test_error]
            SVM_model_par = [mRMR_chosen1, C_chosen, n_final_feat]
            SVM_model = model
            SVM_y_pred = [[model_train_y], [y_pred]]
            ### LogReg with SFS
            n1, C = np.where(val_err_logreg == np.max(val_err_logreg))
            if len(C) > 1:
                print("There are multiple LogReg runs with the same validation error")
                print("Choosing run with fewest features to alleviate overfitting")
                sfs_feat_len = []
                for i2 in range(len(C)):
                    n1_temp = int(n1[i2])
                    C_temp = int(C[i2])
                    temp_feats = val_feat_logreg[n1_temp, C_temp][~np.isnan(val_feat_logreg[n1_temp, C_temp])].astype(int)
                    sfs_feat_len.append(len(temp_feats))
                sfs_feat_min = np.where(sfs_feat_len == np.min(sfs_feat_len))[0]
                if len(sfs_feat_min) > 1:
                    print("Multiple LogReg runs with the same number of fewest features")
                    print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
                    C_min = np.argmin(C[sfs_feat_min])
                    n1, C = [int(n1[sfs_feat_min][C_min]),
                             int(C[sfs_feat_min][C_min])]
                else:
                    n1, C = [int(n1[int(sfs_feat_min)]),
                             int(C[int(sfs_feat_min)])]
            else:
                n1, C = [int(n1), int(C)]
            mRMR_chosen1 = n_feat_mRMR1[n1]
            C_chosen = C_parameter_LogReg[C]
            # Save the best validation error
            val_error = val_err_logreg[n1, C]
            # Get the subsets chosen
            chosen_feats = val_feat_logreg[n1, C][~np.isnan(val_feat_logreg[n1, C])].astype(int)
            sfs_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(sfs_features)
            x_train_mRMR2_sfs = X_train[sfs_features]
            # Fit on all training data
            model = LogisticRegression(penalty="l2", C=C_chosen, max_iter=50000)
            model = model.fit(x_train_mRMR2_sfs, y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_sfs)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            # Get prediction of test data
            y_pred = model.predict(X_test[sfs_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            # Save or prepare to save
            accuracy_arr0[Outer_counter, fset, 1, :] = [train_error, val_error, test_error]
            LogReg_model_par = [mRMR_chosen1, C_chosen, n_final_feat]
            LogReg_model = model
            LogReg_y_pred = [[model_train_y], [y_pred]]
            ### RF
            n1, t, d = np.where(val_err_rf == np.max(val_err_rf))
            if len(d) > 1:
                print("There are multiple RF runs with the same validation error")
                print("Choosing run with lowest depth to alleviate overfitting")
                d_min = np.where(d == np.min(d))[0]
                if len(d_min) > 1:
                    print("Multiple RF runs with the same number of depths")
                    print("Choosing run with fewest trees to alleviate overfitting")
                    t_min = np.argmin(t[d_min])  # argmin just takes the first
                    # If there are multiple with the same parameters and validation error it is most likely the same
                    n1, t, d = [int(n1[d_min][t_min]),
                                int(t[d_min][t_min]),
                                int(d[d_min][t_min])]
                else:
                    n1, t, d = [int(n1[int(d_min)]),
                                int(t[int(d_min)]),
                                int(d[int(d_min)])]
            else:
                n1, t, d = [int(n1), int(t), int(d)]
            mRMR_chosen1 = n_feat_mRMR1[n1]
            t_chosen = trees[t]
            d_chosen = depth[d]
            # Save the best validation error
            val_error = val_err_rf[n1, t, d]
            # Get the chosen features and feature importances
            chosen_feats = val_feat_rf[n1, t, d][~np.isnan(val_feat_rf[n1, t, d])].astype(int)
            rf_features = list(X_train.columns[chosen_feats])
            n_final_feat = len(rf_features)
            x_train_mRMR2_rf = X_train[rf_features]
            rf_feat_importances = val_feat_import_rf[n1, t, d][~np.isnan(val_feat_import_rf[n1, t, d])]
            # Fit on all training data
            model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt",
                                           max_depth=d_chosen, n_jobs=1, random_state=None)
            model = model.fit(x_train_mRMR2_rf, y_train.to_numpy().ravel())
            # Get training output
            model_train_y = model.predict(x_train_mRMR2_rf)
            # Get training error
            train_error = balanced_accuracy_score(y_train, model_train_y)
            # Get prediction of test data
            y_pred = model.predict(X_test[rf_features])
            # Use model to predict class on test data
            test_error = balanced_accuracy_score(y_test, y_pred)
            # Save or prepare to save
            accuracy_arr0[Outer_counter, fset, 2, :] = [train_error, val_error, test_error]
            RF_model_par = [mRMR_chosen1, t_chosen, d_chosen]
            RF_model = model
            RF_y_pred = [[model_train_y], [y_pred]]
            # Save the rest
            model_par00 = [SVM_model_par, LogReg_model_par, RF_model_par]
            final_models00 = [SVM_model, LogReg_model, RF_model]
            final_features00 = [rfe_features, sfs_features, [rf_features, rf_feat_importances]]
            final_y_preds00 = [SVM_y_pred, LogReg_y_pred, RF_y_pred]
            data_splits00 = [X_train, y_train, X_test, y_test,
                             standardizer.mean_, standardizer.scale_]
            # Save the results for the specific feature
            res = [temp_feat, model_par00, final_models00, final_features00,
                   final_y_preds00, data_splits00]
            Ind_feat_clf00.append(res)
            print("Finished outer fold {} out of {} for rep: {} for feat {}".format(Outer_counter + 1, k_out, rep + 1, temp_feat))
        # Check that no features were excluded when checking for wrongly chosen
        total_features = 0
        for feat_c in range(len(feat_counter)):
            total_features += feat_counter[feat_c][1]
        assert total_features == X_train.shape[1]
        # Save results over outer folds
        Ind_feat_clf0.append(Ind_feat_clf00)
        print("Finished outer fold {} out of {} for rep: {} for all features".format(Outer_counter + 1, k_out, rep + 1))
        # Move counter
        Outer_counter += 1
    out = [accuracy_arr0, Ind_feat_clf0]  # [Rep][Outer CV][Feature][Variable]
    # Print total progress
    print("Finished outer fold repetition {} out of {}".format(rep + 1, n_repetitions))
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nCurrent Time", c_time2)
    return rep, out
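# Run the 10 repetitions in parallel. executor.map yields results in the order of
# the inputs, and each worker also returns its own rep index, so the results can
# be slotted into Ind_feat_clf by rep either way.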
Ind_feat_clf = [0] * n_repetitions
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    for rep, result in executor.map(Each_feat_10rep_10x10CV, range(n_repetitions)):  # Function and arguments
        Ind_feat_clf[rep] = result  # [acc or parameters], for parameters: [Rep][Outer CV][Feature][Variable]

# Save results to file and overwrite when new results arrive
with open(Model_savepath + "Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "wb") as filehandle:
    pickle.dump(Ind_feat_clf, filehandle)
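# A hedged sketch of how the saved results could be summarized afterwards.
# The indexing follows the accuracy array layout used in this script
# ([outer fold, feature set, model, train/val/test]); the aggregation choice is mine.
with open(Model_savepath + "Each_feat_Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Subtype1_results_250122.pkl", "rb") as filehandle:
    _res = pickle.load(filehandle)
_acc = np.stack([_res[r][0] for r in range(n_repetitions)])  # (rep, fold, feature set, model, 3)
print(_acc[..., 2].mean(axis=(0, 1)))  # mean test balanced accuracy per feature set and model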
# %% For subtype 2
# The two previous sections are rerun, but for subtype 2
# The only modification was the indexing of the subjects
# Only keep PTSD subtype 2 by dropping subtype 1
subtype1 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 0]
subtype2 = np.array(Subject_id)[PTSD_idx][sparcl["cs"] == 1]
EEG_features_df = EEG_features_df.set_index("Subject_ID")
EEG_features_df = EEG_features_df.drop(subtype1)
EEG_features_df = EEG_features_df.reset_index()
# Check it was dropped correctly
assert all(subtype2 == EEG_features_df.loc[EEG_features_df["Group_status"] == 1, "Subject_ID"])