Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
Updated Oct 18 2022
@author: Qianliang Li (glia@dtu.dk)
This script contains the code for the machine learning analysis
It should be run after Preprocessing.py and FeatureEstimation.py
"""
# Set working directory
import os
wkdir = "/home/glia/EEG/Final_scripts/"
os.chdir(wkdir)
# Load all libraries from the Preamble
from Preamble import *
# Load the questionnaire data
final_qdf = pd.read_csv("final_qdf.csv", sep=",", na_values=' ')
# Define cases as >= 44 total PCL
# Type: numpy array with subject id
cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44])
n_groups = 2
Groups = ["CTRL", "PTSD"]
# Make function to load EEG features
def load_features_df():
# Load all features
power_df = pd.read_pickle(Feature_savepath+"Power_df.pkl")
fTBR_data_df = pd.read_pickle(Feature_savepath+"fTBR_df.pkl")
asymmetry_df = pd.read_pickle(Feature_savepath+"asymmetry_df.pkl")
PAF_data_df = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_df.pkl")
PAF_data_df_global = pd.read_pickle(Feature_savepath+"PAF_data_FOOOF_global_df.pkl")
OOF_data_df = pd.read_pickle(Feature_savepath+"OOF_data_FOOOF_df.pkl")
con_data_df = pd.read_pickle(Feature_savepath+"con_data_source_drop_interpol_df.pkl")
pec_data_df = pd.read_pickle(Feature_savepath+"pec_data_drop_interpol_ch_df.pkl")
microstate_transition_data_df = pd.read_pickle(Feature_savepath+"microstate_transition_data_df.pkl")
microstate_time_df = pd.read_pickle(Feature_savepath+"microstate_time_df.pkl")
microstate_entropy_df = pd.read_pickle(Feature_savepath+"microstate_entropy_df.pkl")
GC_data_df = pd.read_pickle(Feature_savepath+"GC_data_source_drop_interpol_df.pkl")
H_data_df = pd.read_pickle(Feature_savepath+"H_data_df.pkl")
H_data_df_global = pd.read_pickle(Feature_savepath+"H_data_global_df.pkl")
# List of features
EEG_features_name_list = [['Power'],
['Frontal Theta Beta Ratio',
'Asymmetry'],
['Peak Alpha Frequency',
'Global Peak Alpha Frequency'],
["1/f exponent"],
['imcoh'],
['wpli'],
['Power Envelope Correlation'],
['Microstate Transition',
'Microstate Ratio Time',
'Microstate Entropy'],
["Granger Causality"],
['DFA Exponent',
'Global DFA Exponent']]
# Arrange them to fit one 2D dataframe
# Make function to add measurement column for indexing
def add_measurement_column(df, measurement = "Text"):
dummy_variable = [measurement]*df.shape[0]
df.insert(1, "Measurement", dummy_variable)
return df
# Make function to convert column tuple to string
def convertTupleHeader(header):
header = list(header)
str = "_".join(header)
return str
# Prepare overall dataframe
EEG_features_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"])
# Add power spectral densities
power_df = add_measurement_column(power_df, "Power")
temp_df = power_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Quant_status", "Eye_status",
"Freq_band", "Channel"], dropna=False,
values="PSD").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add frontal theta/beta ratio
# fTBR_data_df = add_measurement_column(fTBR_data_df, "Frontal Theta Beta Ratio")
temp_df = fTBR_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="TBR").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add power asymmetry
asymmetry_df = add_measurement_column(asymmetry_df, "Asymmetry")
temp_df = asymmetry_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band", "ROI"], dropna=False,
values="Asymmetry_score").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add peak alpha frequency
PAF_data_df = add_measurement_column(PAF_data_df, "Peak Alpha Frequency")
temp_df = PAF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("Peak Alpha Frequency",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("Peak Alpha Frequency",ee)] = temp.to_numpy()
# If there are still NaN the values are interpolated across channels and condition
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add 1/f exponent
OOF_data_df = add_measurement_column(OOF_data_df, "1/f exponent")
temp_df = OOF_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated with means across channels for each condition
eye_status = list(final_epochs[0].event_id.keys())
for ee in eye_status:
temp = temp_df.loc[:,("1/f exponent",ee)] # get data
temp = temp.T.fillna(temp.mean(axis=1)).T # fill (transpose used because fillna is axis=0)
temp_df.loc[:,("1/f exponent",ee)] = temp.to_numpy()
# If there are still NaN the values are interpolated across channels and condition
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add global peak alpha frequency
#PAF_data_df_global = add_measurement_column(PAF_data_df_global, "Global_Peak_Alpha_Frequency") # already exists
temp_df = PAF_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
# NaN values are interpolated across eye condition
temp_df = temp_df.T.fillna(temp_df.mean(axis=1)).T
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add connectivity measurements
#con_data_df = add_measurement_column(con_data_df, "Connectivity") # already exists
temp_df = con_data_df.pivot_table(index="Subject_ID",columns=["Con_measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
# Drop coh and plv, which are more susceptible to volume conduction
temp_df = temp_df.drop("coh",axis=1)
temp_df = temp_df.drop("plv",axis=1)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add orthogonalized power enveloped correlations
pec_data_df = add_measurement_column(pec_data_df, "Power Envelope Correlation")
temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate transition probabilities
microstate_transition_data_df = add_measurement_column(microstate_transition_data_df, "Microstate Transition")
temp_df = microstate_transition_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Transition"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate time covered
microstate_time_df = add_measurement_column(microstate_time_df, "Microstate Ratio Time")
# Convert microstate to str before using it as column
microstate_time_df = microstate_time_df.astype({"Microstate": str})
temp_df = microstate_time_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Microstate"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add microstate entropy
#microstate_entropy_df = add_measurement_column(microstate_entropy_df, "Microstate_Entropy") # already exists
microstate_entropy_df["Measurement"] = ["Microstate Entropy"]*microstate_entropy_df.shape[0]
temp_df = microstate_entropy_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Granger Causality # fixed so it is always chx --> chy
GC_data_df = add_measurement_column(GC_data_df, "Granger Causality")
temp_df = GC_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "chx", "chy", "Freq_band"], dropna=True,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
# Check if any GC was not calculated properly
assert temp_df.shape[1] == (31*31-31)*5*2 # expected number of features with 31 sources
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add DFA exponent
H_data_df = add_measurement_column(H_data_df, "DFA Exponent")
temp_df = H_data_df.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Channel", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
# Add Global Hurst exponent
H_data_df_global = H_data_df_global.drop("Measurement",axis=1)
H_data_df_global = add_measurement_column(H_data_df_global, "Global DFA Exponent") # already exists
temp_df = H_data_df_global.pivot_table(index="Subject_ID",columns=["Measurement",
"Eye_status", "Freq_band"], dropna=False,
values="Value").reset_index(drop=True)
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
EEG_features_df = pd.concat([EEG_features_df,temp_df], axis=1)
return EEG_features_df, EEG_features_name_list
# %% ML predction of PTSD status using all features
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use enseemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
accuracy_arr = np.zeros((n_repetitions,k_out,n_models,3))
model_par = []
final_models = []
final_features = []
final_y_preds = []
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
for rep in range(n_repetitions):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
model_par0 = []
final_models0 = []
final_features0 = []
final_y_preds0 = []
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
n_feat_mRMR2 = [30,40,50,60]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
max_mRMR_features2 = n_feat_mRMR2[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
mRMR_features = []
feat_counter = []
# Perform mRMR for each feature type
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten
for e in range(n_eye_status):
ee = eye_status[e]
temp_ee = "{}_{}".format(temp_feat,ee)
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]+"_"))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Index for eye status
temp_X_train = temp_X_train.loc[:,temp_X_train.columns.str.contains(ee)]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# Do not use mRMR if there are fewer than max_mRMR_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=-1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# Save features selected
mRMR_features.append(list(filter_features))
# Check that no features were excluded when checking for wrongly chosen
total_features = 0
for feat_c in range(len(feat_counter)):
total_features += feat_counter[feat_c][1]
assert total_features == X_train.shape[1]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
kernels = ["linear"]
# rbf overfit easier, whereas linear empirically works better in high D data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_SVM),len(kernels),max_mRMR_features2))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(C_parameter_LogReg),max_mRMR_features2))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(n_feat_mRMR2),len(trees),len(depth),max_mRMR_features2))
val_feat_import_rf.fill(np.nan)
for n1 in range(len(n_feat_mRMR1)):
# Get the features from first mRMR
mRMR_features1 = [item for sublist in mRMR_features for item in sublist[0:n_feat_mRMR1[n1]]]
# print("{} features are left after first mRMR filtering".format(len(mRMR_features1)))
X_train_mRMR = X_train[mRMR_features1].copy()
# Second round of mRMR on the reduced list
filter_feat_selector = mRMR_feature_select(X_train_mRMR.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features2,
K_MAX=1000,n_jobs=-1,verbose=False)
for n2 in range(len(n_feat_mRMR2)):
# Get selected features from mRMR2
filter_features = X_train_mRMR.columns[filter_feat_selector[0:n_feat_mRMR2[n2]]]
X_train_mRMR2 = X_train_mRMR[filter_features].copy()
# SVM with recursive feature elemination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
min_features_to_select = 1
# Using normal for loop as RFECV has inbuilt multiprocessing
for C in range(len(C_parameter_SVM)):
for K in range(len(kernels)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel=kernels[K], tol=1e-3, cache_size=4000)
# Perform recurive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=-1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(X_train_mRMR2,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,n2,C,K] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(filter_features[rfecv.support_]))[0]
val_feat_svm[n1,n2,C,K,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,np.prod([len(C_parameter_SVM),len(kernels)])))
# Logistic regression with sequential forward selection
# In-bult CV
k_features = np.arange(1,n_feat_mRMR2[n2]+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = -1)
sfs = sfs.fit(X_train_mRMR2, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# I am rounding to make it more smooth and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]
if len(K) > 1:
K = K[np.where(K == np.min(K))[0]]
err = inner_cv_scores[C,K]
# Save validation error and features
val_err_logreg[n1,n2,C] = err
# Save the features chosen by RFE based on index from pre mRMR
sfs_feat_idx = np.where(X_train.columns.isin(sfs.subsets_[int(K)+1]["feature_names"]))[0] # K+1 since it starts with 1 feature
val_feat_logreg[n1,n2,C,0:len(sfs_feat_idx)] = sfs_feat_idx
# print("Finished LogReg run {} out of {}".format(C+1,len(C_parameter_LogReg)))
# Random forest
model_test_err = np.zeros((k_fold,len(trees),len(depth)))
RF_feat_import = np.zeros((k_fold,len(trees),len(depth),X_train_mRMR2.shape[1]))
counter = 0
for cv in range(k_fold):
# Retrieve CV indices
train_index_rf = Inner_CV[cv][0]
test_index_rf = Inner_CV[cv][1]
# Retrieve datasets
X_train2 = X_train_mRMR2.iloc[train_index_rf]; X_test2 = X_train_mRMR2.iloc[test_index_rf]
y_train2 = y_train.to_numpy().ravel()[train_index_rf]; y_test2 = y_train.to_numpy().ravel()[test_index_rf]
for t in range(len(trees)):
for d in range(len(depth)):
RF = RandomForestClassifier(n_estimators=trees[t], max_features="sqrt", max_depth=depth[d],
n_jobs=-1, random_state=None)
RF.fit(X_train2.to_numpy(),y_train2)
# Validation error
RF_y = RF.predict(X_test2.to_numpy())
err = balanced_accuracy_score(y_test2, RF_y)
# Save to array
model_test_err[cv,t,d] = err
# Save feature importance array
RF_feat_import[cv,t,d] = RF.feature_importances_
counter += 1
# print("Finished RF run {} out of {}".format(counter, k_fold))
# Average the errors over the CV folds
model_test_err_mean = np.mean(model_test_err, axis=0)
val_err_rf[n1,n2,:,:] = model_test_err_mean
# Average the feature importances over the CV folds
RF_feat_import = np.mean(RF_feat_import, axis=0)
val_feat_import_rf[n1,n2,:,:,0:n_feat_mRMR2[n2]] = RF_feat_import
# Save the features used by the RF based on index from pre mRMR
rf_feat_idx = np.where(X_train.columns.isin(filter_features))[0]
val_feat_rf[n1,n2,:,:,0:len(rf_feat_idx)] = rf_feat_idx
print("Finished {}% of total run".format((n1+1)*1/len(n_feat_mRMR1)*100))
# Choose the optimal parameters
### SVM
n1, n2, C, K = np.where(val_err_svm==np.max(val_err_svm))
if len(C) > 1:
print("There are multiple SVM runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
rfe_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
K_temp = int(K[i2])
temp_feats = val_feat_svm[n1_temp,n2_temp,C_temp,K_temp][~np.isnan(val_feat_svm[n1_temp,n2_temp,C_temp,K_temp])].astype(int)
rfe_feat_len.append(len(temp_feats))
rfe_feat_min = np.where(rfe_feat_len==np.min(rfe_feat_len))[0]
if len(rfe_feat_min) > 1:
print("Multiple SVM runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[rfe_feat_min])
n1, n2, C, K = [int(n1[rfe_feat_min][C_min]), int(n2[rfe_feat_min][C_min]),
int(C[rfe_feat_min][C_min]), int(K[rfe_feat_min][C_min])]
else:
n1, n2, C, K = [int(n1[int(rfe_feat_min)]), int(n2[int(rfe_feat_min)]),
int(C[int(rfe_feat_min)]), int(K[int(rfe_feat_min)])]
else:
n1, n2, C, K = [int(n1), int(n2), int(C), int(K)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_SVM[C]
K_chosen = kernels[K]
# Save the best validation erro
val_error = val_err_svm[n1, n2, C, K]
# Get the subsets chosen
chosen_feats = val_feat_svm[n1,n2,C,K][~np.isnan(val_feat_svm[n1,n2,C,K])].astype(int)
rfe_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rfe_features)
x_train_mRMR2_rfe = X_train[rfe_features]
# Fit on all training data
model = SVC(C=C_chosen, kernel=K_chosen, tol=1e-3, cache_size=4000)
model = model.fit(x_train_mRMR2_rfe,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rfe)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rfe_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,0,:] = [train_error,val_error,test_error]
SVM_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat,K_chosen]
SVM_model = model
SVM_y_pred = [[model_train_y],[y_pred]]
### LogReg with SFS
n1, n2, C = np.where(val_err_logreg==np.max(val_err_logreg))
if len(C) > 1:
print("There are multiple LogReg runs with the same validation error")
print("Choosing run with fewest features to alleviate overfitting")
sfs_feat_len = []
for i2 in range(len(C)):
n1_temp = int(n1[i2])
n2_temp = int(n2[i2])
C_temp = int(C[i2])
temp_feats = val_feat_logreg[n1_temp,n2_temp,C_temp][~np.isnan(val_feat_logreg[n1_temp,n2_temp,C_temp])].astype(int)
sfs_feat_len.append(len(temp_feats))
sfs_feat_min = np.where(sfs_feat_len==np.min(sfs_feat_len))[0]
if len(sfs_feat_min) > 1:
print("Multiple LogReg runs with same number of fewest features")
print("Choosing run with lowest C (highest regularization) to alleviate overfitting")
C_min = np.argmin(C[sfs_feat_min])
n1, n2, C = [int(n1[sfs_feat_min][C_min]), int(n2[sfs_feat_min][C_min]),
int(C[sfs_feat_min][C_min])]
else:
n1, n2, C = [int(n1[int(sfs_feat_min)]), int(n2[int(sfs_feat_min)]),
int(C[int(sfs_feat_min)])]
else:
n1, n2, C = [int(n1), int(n2), int(C)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
C_chosen = C_parameter_LogReg[C]
# Save the best validation erro
val_error = val_err_logreg[n1, n2, C]
# Get the subsets chosen
chosen_feats = val_feat_logreg[n1,n2,C][~np.isnan(val_feat_logreg[n1,n2,C])].astype(int)
sfs_features = list(X_train.columns[chosen_feats])
n_final_feat = len(sfs_features)
x_train_mRMR2_sfs = X_train[sfs_features]
# Fit on all training data
model = LogisticRegression(penalty="l2", C=C_chosen, max_iter = 50000)
model = model.fit(x_train_mRMR2_sfs,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_sfs)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[sfs_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,1,:] = [train_error,val_error,test_error]
LogReg_model_par = [mRMR_chosen1,mRMR_chosen2,C_chosen,n_final_feat]
LogReg_model = model
LogReg_y_pred = [[model_train_y],[y_pred]]
### RF
n1, n2, t, d = np.where(val_err_rf==np.max(val_err_rf))
if len(d) > 1:
print("There are multiple RF runs with the same validation error")
print("Choosing run with lowest depth to alleviate overfitting")
d_min = np.where(d==np.min(d))[0]
if len(d_min) > 1:
print("Multiple RF runs with same number number of depths")
print("Choosing run with lowest trees to alleviate overfitting")
t_min = np.argmin(t[d_min]) # argmin just takes the first
# If there are multiple with same parameters and validation error it is most likely the same
n1, n2, t, d = [int(n1[d_min][t_min]), int(n2[d_min][t_min]),
int(t[d_min][t_min]), int(d[d_min][t_min])]
else:
n1, n2, t, d = [int(n1[int(d_min)]), int(n2[int(d_min)]),
int(t[int(d_min)]), int(d[int(d_min)])]
else:
n1, n2, t, d = [int(n1), int(n2), int(t), int(d)]
mRMR_chosen1 = n_feat_mRMR1[n1]
mRMR_chosen2 = n_feat_mRMR2[n2]
t_chosen = trees[t]
d_chosen = depth[d]
# Save the best validation erro
val_error = val_err_rf[n1, n2, t, d]
# Get the chosen features and feature importances
chosen_feats = val_feat_rf[n1,n2,t,d][~np.isnan(val_feat_rf[n1,n2,t,d])].astype(int)
rf_features = list(X_train.columns[chosen_feats])
n_final_feat = len(rf_features)
x_train_mRMR2_rf = X_train[rf_features]
rf_feat_importances = val_feat_import_rf[n1,n2,t,d][~np.isnan(val_feat_import_rf[n1,n2,t,d])]
# Fit on all training data
model = RandomForestClassifier(n_estimators=t_chosen, max_features="sqrt", max_depth=d_chosen,
n_jobs=-1, random_state=None)
model = model.fit(x_train_mRMR2_rf,y_train.to_numpy().ravel())
# Get training output
model_train_y = model.predict(x_train_mRMR2_rf)
# Get training error
train_error = balanced_accuracy_score(y_train, model_train_y)
# Get prediction of test data
y_pred = model.predict(X_test[rf_features])
# Use model to predict class on test data
test_error = balanced_accuracy_score(y_test, y_pred)
# Save or prepare to save
accuracy_arr[rep,Outer_counter,2,:] = [train_error,val_error,test_error]
RF_model_par = [mRMR_chosen1,mRMR_chosen2,t_chosen,d_chosen]
RF_model = model
RF_y_pred = [[model_train_y],[y_pred]]
# Save the rest
model_par0.append([SVM_model_par,LogReg_model_par,RF_model_par])
final_models0.append([SVM_model,LogReg_model,RF_model])
final_features0.append([rfe_features, sfs_features, [rf_features,rf_feat_importances]])
final_y_preds0.append([SVM_y_pred,LogReg_y_pred,RF_y_pred])
# Move counter
Outer_counter += 1
print("Finished outer fold {} out of {} for rep: {}".format(Outer_counter,k_out,rep+1))
# Save results from all outer folds
model_par.append(model_par0)
final_models.append(final_models0)
final_features.append(final_features0)
final_y_preds.append(final_y_preds0)
# Save results to file
Rep_mRMR2_SVM_LogReg_RF = [accuracy_arr, model_par, final_models, final_features, final_y_preds]
# Run with variable feat in first and second mRMR
with open(Model_savepath+"Rep10_10x10CV_mRMR2_SVM_LogReg_RF_Group_Status_results_010222.pkl", "wb") as filehandle:
pickle.dump(Rep_mRMR2_SVM_LogReg_RF, filehandle)
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Print total progress
print("Finished outer fold repetition {} out of {}".format(rep+1,n_repetitions))
# %% ML predction of PTSD status using each feature separately
# mRMR -> mRMR -> SVM/RF/LogReg with L2
# 10 repetitions of 10 fold outer and 10 fold inner two-layer CV
# With concurrent processes for faster computation
EEG_features_df, EEG_features_name_list = load_features_df()
# Add group status
Group_status = np.array([0]*EEG_features_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in EEG_features_df["Subject_ID"]])] = 1 # PTSD = 1
EEG_features_df.insert(1, "Group_status", Group_status)
# Subject info columns
Subject_info_cols = list(EEG_features_df.columns[0:2])
n_subject_info_cols = len(Subject_info_cols)
n_discrete_cols = 2
# To ensure proper stratification into train/test set I will stratify using group status and study status
# A variable that encodes for this is created
n_studies = 3
study_group_status = EEG_features_df["Group_status"].copy()
for i in range(n_studies):
# Get study index
study_idx = (EEG_features_df["Subject_ID"]>=(i+1)*100000)&(EEG_features_df["Subject_ID"]<(i+2)*100000)
# Assign label
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==0)] = 2*i # CTRL
study_group_status[(study_idx)&(EEG_features_df["Group_status"]==1)] = 2*i+1 # PTSD
# Target variable
Target = ["Group_status"]
Target_col = EEG_features_df.iloc[:,1:].columns.isin(Target)
Target_col_idx = np.where(Target_col == True)[0]
# Make 3 models and save them to use enseemble in the end
CLF_models = ["SVM", "LogReg", "RF"]
n_models = len(CLF_models)
# Repeat the classification to see if I just got a lucky seed
n_repetitions = 10
k_out = 10
# accuracy_arr = np.zeros((n_repetitions,k_out,len(EEG_features_name_list),n_models,3))
Ind_feat_clf = [0]*n_repetitions
np.random.seed(42)
# Prepare the splits beforehand to make sure the repetitions are not the same
Rep_Outer_CV = []
Rep_Outer_CV_test = [] # a list with only the test indices
for rep in range(n_repetitions):
skf = StratifiedKFold(n_splits=k_out, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Outer_CV = []
Outer_CV_test = []
for train_index, test_index in skf.split(EEG_features_df,study_group_status):
Outer_CV.append([train_index,test_index])
Outer_CV_test.append(test_index)
Rep_Outer_CV.append(Outer_CV)
Rep_Outer_CV_test.append(Outer_CV_test)
# Check none of the repetitions are the same by using only the test sets
# The list is first flattened
Rep_Outer_CV_test_flat = [item for sublist in Rep_Outer_CV_test for item in sublist]
# All elements are converted to strings
# This makes it easier to look for uniques, and the indices are already in numerical order
Rep_Outer_CV_test_flat_str = ["".join([str(x) for x in ele])for ele in Rep_Outer_CV_test_flat]
def allUnique(x):
seen = set()
return not any(i in seen or seen.add(i) for i in x)
assert allUnique(Rep_Outer_CV_test_flat_str)
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
def Each_feat_10rep_10x10CV(rep):
# The outer fold CV has already been saved as lists
Outer_CV = Rep_Outer_CV[rep]
# Pre-allocate memory
Ind_feat_clf0 = []
accuracy_arr0 = np.zeros((k_out,len(EEG_features_name_list),n_models,3))
Outer_counter = 0
for train_index, test_index in Outer_CV:
test_df = EEG_features_df.iloc[test_index]
train_df = EEG_features_df.iloc[train_index]
# Training data will be standardized
standardizer = preprocessing.StandardScaler().fit(train_df.iloc[:,n_discrete_cols:])
train_df_standard = train_df.copy()
train_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(train_df_standard.iloc[:,n_discrete_cols:])
# Test data will also be standardized but using mean and std from training data
test_df_standard = test_df.copy()
test_df_standard.iloc[:,n_discrete_cols:] = standardizer.transform(test_df_standard.iloc[:,n_discrete_cols:])
# Get the training data
X_train = train_df_standard.copy().drop(Subject_info_cols, axis=1)
y_train = train_df_standard[Target]
# Get test data
X_test = test_df_standard.copy().drop(Subject_info_cols, axis=1)
y_test = test_df_standard[Target]
# Part 2 with second mRMR and classifiers in loop
k_fold = 10
skf2 = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=(rep+1)*123) # using 10% equals around 21 test subjects
# I am also using converting it to an iterable list instad of the generator to promote reuse
Inner_CV = []
for train_index2, test_index2 in skf2.split(train_df,study_group_status.iloc[train_index]):
Inner_CV.append([train_index2,test_index2])
# Prepare initial filtering of feature types to alleviate imbalance in number of features
# Use a variable that is optimized using inner CV
n_feat_mRMR1 = [20,30,40,50]
# Max features from mRMR for each eye status
max_mRMR_features = n_feat_mRMR1[-1]
eye_status = ["Eyes Closed", "Eyes Open"]
n_eye_status = len(eye_status)
feat_counter = []
# Perform mRMR for each feature type followed by RFE SVM/SFS LogReg/RF
Ind_feat_clf00 = []
for fset in range(len(EEG_features_name_list)):
temp_feat = EEG_features_name_list[fset]
other_feats = EEG_features_name_list[:fset]+EEG_features_name_list[fset+1:]
other_feats = [item for sublist in other_feats for item in sublist] # make the list flatten
# Retrieve the dataset for each feature
col_idx = np.zeros(len(X_train.columns), dtype=bool)
for fsub in range(len(temp_feat)):
temp_feat0 = temp_feat[fsub]
col_idx0 = X_train.columns.str.contains(temp_feat0+"_")
col_idx = np.logical_or(col_idx,col_idx0) # append all trues
temp_X_train = X_train.loc[:,col_idx]
# Check if any of the other features were wrongly chosen
for fcheck in range(len(other_feats)):
if any(temp_X_train.columns.str.contains(other_feats[fcheck]+"_")==True):
temp_X_train = temp_X_train.loc[:,np.invert(temp_X_train.columns.str.contains(other_feats[fcheck]))]
if temp_X_train.size == 0: # if no columns are left, e.g. imcoh when removing coh, then add features again
temp_X_train = X_train.loc[:,X_train.columns.str.contains(temp_feat0+"_")]
# Save number of original features fed into mRMR
feat_counter.append([temp_feat,temp_X_train.shape[1]])
# If there are no features selected, then skip the rest of the loop
if temp_X_train.shape[1] == 0:
continue
# Do not use mRMR if there are fewer than n_features
if temp_X_train.shape[1] <= max_mRMR_features:
filter_features = temp_X_train.columns
else:
# mRMR
filter_feat_selector = mRMR_feature_select(temp_X_train.to_numpy(),y_train.to_numpy(),
num_features_to_select=max_mRMR_features,
K_MAX=1000,n_jobs=1,verbose=False)
# Get selected features
filter_features = temp_X_train.columns[filter_feat_selector]
# SVM with L2-norm (it is by default squared)
# Prepare hyper parameters
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_SVM = np.power(np.array([10]*len(exponent)),exponent)
# rbf kernel overfit easier, whereas linear empirically works better in high D data
# Sequential Feature Selection for Logistic Regression
# In-built model selection CV
# The L2 is the inverse of C for LogReg
exponent = np.linspace(-3,1,9)
exponent = np.round(exponent,5) # sometimes linspace are not so exact
C_parameter_LogReg = np.power(np.array([10]*len(exponent)),exponent)
# Random forest classifier
# Prepare hyper parameters
trees = np.array([10, 100, 500, 1000])
depth = np.linspace(1,2,2) # using more depth leads to a lot of overfitting
# Prepare arrays for validation errors
val_err_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM)))
val_feat_svm = np.zeros((len(n_feat_mRMR1),len(C_parameter_SVM),max_mRMR_features))
val_feat_svm.fill(np.nan)
val_err_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg)))
val_feat_logreg = np.zeros((len(n_feat_mRMR1),len(C_parameter_LogReg),max_mRMR_features))
val_feat_logreg.fill(np.nan)
val_err_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth)))
val_feat_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_rf.fill(np.nan)
val_feat_import_rf = np.zeros((len(n_feat_mRMR1),len(trees),len(depth),max_mRMR_features))
val_feat_import_rf.fill(np.nan)
min_features_to_select = 1
for n1 in range(len(n_feat_mRMR1)):
mRMR_features1 = filter_features[0:n_feat_mRMR1[n1]]
# Use the selected filter features
temp_X_train_mRMR = temp_X_train[mRMR_features1]
# SVM with recursive feature elemination
# Stratified CV to find best regularization strength and number of features
# CV is built-in RFECV
for C in range(len(C_parameter_SVM)):
# Define the model
svc = SVC(C=C_parameter_SVM[C], kernel="linear", tol=1e-3, cache_size=4000)
# Perform recurive feature elimination with in-built CV
rfecv = RFECV(estimator=svc, n_jobs=1, scoring="balanced_accuracy",
cv=Inner_CV,
min_features_to_select=min_features_to_select)
rfecv.fit(temp_X_train_mRMR,y_train.to_numpy().ravel())
# Get CV score
err = rfecv.grid_scores_[rfecv.n_features_-min_features_to_select]
# Save results for hyperparameter optimization
val_err_svm[n1,C] = err
# Save the features chosen by RFE based on index from pre mRMR
rfe_feat_idx = np.where(X_train.columns.isin(mRMR_features1[rfecv.support_]))[0]
val_feat_svm[n1,C,0:len(rfe_feat_idx)] = rfe_feat_idx
# print("Finished SVM run {} out of {}".format(C+1,len(C_parameter_SVM)))
# Logistic regression with sequential forward selection
# In-bult CV
k_max = np.min([temp_X_train_mRMR.shape[1],n_feat_mRMR1[n1]])
k_features = np.arange(1,k_max+1,1) # try up to all features
inner_cv_scores = np.zeros((len(C_parameter_LogReg),len(k_features)))
for C in range(len(C_parameter_LogReg)):
LogReg = LogisticRegression(penalty="l2", C=C_parameter_LogReg[C],
max_iter = 50000)
sfs = SFS(LogReg, k_features = (k_features[0],k_features[-1]),
forward = True, scoring = "balanced_accuracy",
verbose = 0, floating = False,
cv = Inner_CV, n_jobs = 1)
sfs = sfs.fit(temp_X_train_mRMR, y_train.to_numpy().ravel())
# Save CV scores for each SFS step
for feat in range(len(k_features)):
inner_cv_scores[C,feat] = sfs.get_metric_dict()[k_features[feat]]["avg_score"]
# Find the best number of features
# I am rounding to make it more smooth and disregard small improvements
K = np.where(np.round(inner_cv_scores[C,:],2)==np.max(np.round(inner_cv_scores[C,:],2)))[0]