diff --git a/FeatureEstimation.py b/FeatureEstimation.py
index af84e9f1c540aae9cdb93c92c207d1ff2b2d3040..4c599d466a1c402e30f172a778f9e4f5e7c04a73 100644
--- a/FeatureEstimation.py
+++ b/FeatureEstimation.py
@@ -53,51 +53,362 @@ for r, d, f in os.walk(load_path):
         files.append(os.path.join(r, file))
 files.sort()
+
 # Subject IDs
 Subject_id = [0] * len(files)
 for i in range(len(files)):
-    temp = files[i].split("\\")
-    temp = temp[-1].split("_")
-    Subject_id[i] = int(temp[0][-1]) # Subject_id[i] = int(temp[0])
+    temp = files[i].split("/")
+    temp = temp[-1].split(".")
+    temp = temp[0].split("_")
+    Subject_id[i] = int(temp[0])
+
+# Should exclude subject 100326 due to 7 bad channels
+# Exclude 200013 and 200015 due to too many dropped epochs
+# (200001, 200004, 200053 and 200072 were already excluded prior to preprocessing)
+# Exclude 302215, 302224, 302227, 302233, 302264, 302268, 302275 due to too many dropped epochs
+# 13 subjects excluded in total, plus 4 I did not receive because they were marked as bad by Helse
+bad_subjects = [100326, 200013, 200015, 302224, 302227, 302233, 302264, 302268, 302275]
+good_subject_idx = [not i in bad_subjects for i in Subject_id]
+
+Subject_id = list(np.array(Subject_id)[good_subject_idx])
+files = list(np.array(files)[good_subject_idx])
 n_subjects = len(Subject_id)
 
-# Load Final epochs
-final_epochs = [0]*n_subjects
-for n in range(n_subjects):
-    final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n]),
+# Load ISAF
+n_ISAF = 51-1
+ISAF7_final_epochs = [0]*n_ISAF
+for n in range(len(ISAF7_final_epochs)):
+    ISAF7_final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n]),
+                                            verbose=0)
+
+# Load HELSE
+n_HELSE = 70-2
+HELSE_final_epochs = [0]*n_HELSE
+for n in range(len(HELSE_final_epochs)):
+    HELSE_final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n+n_ISAF]),
                                             verbose=0)
+    # Rename channels to match ISAF (using the 10-20 MCN nomenclature)
+    mne.rename_channels(HELSE_final_epochs[n].info, {"T3":"T7",
+                                                     "T4":"T8",
+                                                     "T5":"P7",
+                                                     "T6":"P8"})
+# The warning about chronological order is due to interleaved EO->EC->EO->EC being concatenated as 5xEC->5xEO
+
+# Load Baseline
+n_Base = 91-6
+Base_final_epochs = [0]*n_Base
+for n in range(len(Base_final_epochs)):
+    Base_final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n+n_ISAF+n_HELSE]),
+                                           verbose=0)
+
+# I will use the union of the channels in the datasets (except mastoids)
+# This means I add empty channels and interpolate where a channel is missing
+# The Helsefond montage has the wrong calibration; the head sizes are too big.
+# Thus I will need to re-calibrate by comparing with ISAF7
+# Notice I already used the final dig montage for the Baseline data
+
+# Get channel names
+ISAF7_chs = ISAF7_final_epochs[0].copy().info["ch_names"]
+Helse_chs = HELSE_final_epochs[0].copy().info["ch_names"]
+Base_chs = Base_final_epochs[0].copy().info["ch_names"]
+
+# Get intersection of channels
+intersect_ch = list(set(Helse_chs) & set(ISAF7_chs))
+intersect_ch_ratio = [0]*len(intersect_ch)
+for i in range(len(intersect_ch)):
+    # Get channel name
+    ch_name0 = intersect_ch[i]
+    # Get index and electrode location
+    ISAF7_ch_idx = np.where(np.array(ISAF7_chs) == ch_name0)[0]
+    ISAF7_ch_loc = ISAF7_final_epochs[0].info["chs"][int(ISAF7_ch_idx)]["loc"]
+    Helse_ch_idx = np.where(np.array(Helse_chs) == ch_name0)[0]
+    Helse_ch_loc = HELSE_final_epochs[0].info["chs"][int(Helse_ch_idx)]["loc"]
+    # Calculate ratio
+    intersect_ch_ratio[i] = [ch_name0,ISAF7_ch_loc[0:3]/Helse_ch_loc[0:3]]
+# Most of the ratios are either around 0.095, or 0/Inf when dividing by 0
+# We also see that Cz for ISAF is defined as (0,0,0.095m). For Helse, Cz is (0,0,1m)
+Helse_to_ISAF7_cal = 0.095
+
+# Now we are ready to make a combined montage (info["dig"])
+# Get list of channels to add
+ISAF7_add_ch_list = list(set(Helse_chs)-set(ISAF7_chs))
+Helse_add_ch_list = list(set(ISAF7_chs)-set(Helse_chs))
+combined_ch_list = Helse_chs + ISAF7_chs
+# Remove duplicates while maintaining A-P order
+duplicate = set()
+final_ch_list = [x for x in combined_ch_list if not (x in duplicate or duplicate.add(x))]
+# Move AFz and POz to their correct positions
+final_ch_list.insert(3,final_ch_list.pop(-2)) # from second last to 3
+final_ch_list.insert(-5,final_ch_list.pop(-1)) # from last to seventh from last
+
+# The order of DigPoints in info is based on sorted ch names
+Helse_dig = HELSE_final_epochs[0].copy().info["dig"]
+Helse_chs_sorted = Helse_chs.copy()
+Helse_chs_sorted.sort()
+ISAF7_dig = ISAF7_final_epochs[0].copy().info["dig"]
+ISAF7_chs_sorted = ISAF7_chs.copy()
+ISAF7_chs_sorted.sort()
+
+# Make one list with DigPoints from Helse + unique channels from ISAF7
+ch_idx = [i for i, item in enumerate(ISAF7_chs_sorted) if item in set(Helse_add_ch_list)] # indices of unique channels
+final_ch_list_sorted = final_ch_list.copy()
+final_ch_list_sorted.sort()
+dig_insert_idx = [i for i, item in enumerate(final_ch_list_sorted) if item in set(Helse_add_ch_list)] # find where ISAF7 channels should be inserted
+
+# Prepare combined dig montage
+final_dig = Helse_dig.copy()
+# Calibrate Helse digpoints
+for i in range(len(final_dig)):
+    final_dig[i]["r"] = final_dig[i]["r"]*Helse_to_ISAF7_cal
+# Insert ISAF7 digpoints
+for i in range(len(ch_idx)):
+    final_dig.insert(dig_insert_idx[i],ISAF7_dig[ch_idx[i]])
+
+# Remove mastoids
+Mastoid_ch = ["M1", "M2"]
+M_idx = [i for i, item in enumerate(final_ch_list_sorted) if item in set(Mastoid_ch)] # find mastoid chs
+M_idx2 = [i for i, item in enumerate(final_ch_list) if item in set(Mastoid_ch)] # find mastoid chs
+M_idx3 = [i for i, item in enumerate(Helse_add_ch_list) if item in set(Mastoid_ch)] # find mastoid chs
+for i in reversed(range(len(M_idx))):
+    del final_ch_list_sorted[M_idx[i]]
+    del final_dig[M_idx[i]]
+    # Mastoids are placed at the back of final_ch_list and Helse_add_ch_list and are also removed
+    del final_ch_list[M_idx2[i]]
+    del Helse_add_ch_list[M_idx3[i]]
+
+# T7, T8, Pz, P8 and P7 are placed wrongly (probably due to renaming)
+# This is fixed manually
+final_dig.insert(-7,final_dig.pop(-4)) # swap between P8 and T7
+final_dig.insert(-6,final_dig.pop(-3)) # swap between T7 and T8
+final_dig.insert(-4,final_dig.pop(-4)) # swap between Pz and T7
+final_dig.insert(-5,final_dig.pop(-5)) # swap between Pz and POz
+
+# Update EEG identity numbers
+for i in range(len(final_dig)):
+    final_dig[i]["ident"] = i+1
+
+# Make final digital montage
+final_digmon = mne.channels.DigMontage(dig=final_dig, ch_names=final_ch_list_sorted)
+# final_digmon.plot() # visually inspect topographical positions
+# final_digmon.plot(kind="3d") # visually inspect 3D positions
+# final_digmon.save("final_digmon.fif") # Save digital montage
+with open("final_digmon_ch_names.pkl", "wb") as filehandle:
+    # The data is stored as a binary data stream
+    pickle.dump(final_digmon.ch_names, filehandle)
+
+# Remove mastoids from ISAF, add channels from Helse and interpolate
+for n in range(len(ISAF7_final_epochs)):
+    # Remove mastoid channels
+    ISAF7_final_epochs[n].drop_channels(Mastoid_ch)
+    # Add empty channels to interpolate - notice that the locations are set to 0
+    mne.add_reference_channels(ISAF7_final_epochs[n],ISAF7_add_ch_list,copy=False)
+    # Fix channel info (both after removal of mastoids and for the newly added chs)
+    # The ch info locs are linked for all reference channels, and this link is removed
+    for c in range(ISAF7_final_epochs[n].info["nchan"]):
+        ISAF7_final_epochs[n].info["chs"][c]["loc"] = ISAF7_final_epochs[n].info["chs"][c]["loc"].copy()
+        ISAF7_final_epochs[n].info["chs"][c]["scanno"] = c+1
+        ISAF7_final_epochs[n].info["chs"][c]["logno"] = c+1
+    # Set new combined montage
+    ISAF7_final_epochs[n].set_montage(final_digmon)
+    # Set newly added channels as "bad" and interpolate
+    ISAF7_final_epochs[n].info["bads"] = ISAF7_add_ch_list
+    ISAF7_final_epochs[n].interpolate_bads(reset_bads=True)
+    # Fix "picks" in order to reorder channels
+    ISAF7_final_epochs[n].picks = np.array(range(ISAF7_final_epochs[n].info["nchan"]))
+    # Reorder channels
+    ISAF7_final_epochs[n].reorder_channels(final_ch_list)
+
+# Add channels from ISAF to Helse and interpolate
+for n in range(len(HELSE_final_epochs)):
+    # Add empty channels to interpolate
+    mne.add_reference_channels(HELSE_final_epochs[n],Helse_add_ch_list,copy=False)
+    # Fix channel info for the newly added chs
+    # The ch info locs are linked for all reference channels, and this link is removed
+    for c in range(HELSE_final_epochs[n].info["nchan"]):
+        HELSE_final_epochs[n].info["chs"][c]["loc"] = HELSE_final_epochs[n].info["chs"][c]["loc"].copy()
+        HELSE_final_epochs[n].info["chs"][c]["scanno"] = c+1
+        HELSE_final_epochs[n].info["chs"][c]["logno"] = c+1
+    # Set new combined montage
+    HELSE_final_epochs[n].set_montage(final_digmon)
+    # Set newly added channels as "bad" and interpolate
+    HELSE_final_epochs[n].info["bads"] = Helse_add_ch_list
+    HELSE_final_epochs[n].interpolate_bads(reset_bads=True)
+    # Fix "picks" in order to reorder channels
+    HELSE_final_epochs[n].picks = np.array(range(HELSE_final_epochs[n].info["nchan"]))
+    # Reorder channels
+    HELSE_final_epochs[n].reorder_channels(final_ch_list)
+
+# Add missing channels to Baseline and interpolate
+Base_add_ch_list = list(set(final_ch_list)-set(Base_chs))
+for n in range(len(Base_final_epochs)):
+    # Add empty channels to interpolate
+    mne.add_reference_channels(Base_final_epochs[n],Base_add_ch_list,copy=False)
+    # Fix channel info for the newly added chs
+    # The ch info locs are linked for all reference channels, and this link is removed
+    for c in range(Base_final_epochs[n].info["nchan"]):
+        Base_final_epochs[n].info["chs"][c]["loc"] = Base_final_epochs[n].info["chs"][c]["loc"].copy()
+        Base_final_epochs[n].info["chs"][c]["scanno"] = c+1
+        Base_final_epochs[n].info["chs"][c]["logno"] = c+1
+    # Set new combined montage
+    Base_final_epochs[n].set_montage(final_digmon)
+    # Set newly added channels as "bad" and interpolate
+    Base_final_epochs[n].info["bads"] = Base_add_ch_list
+    Base_final_epochs[n].interpolate_bads(reset_bads=True)
+    # Fix "picks" in order to reorder channels
+    Base_final_epochs[n].picks = np.array(range(Base_final_epochs[n].info["nchan"]))
+    # Reorder channels
+    Base_final_epochs[n].reorder_channels(final_ch_list)
+
+# Combine all datasets in one list
+final_epochs = ISAF7_final_epochs+HELSE_final_epochs+Base_final_epochs
+# Check number of epochs
+file_lengths = [0]*len(final_epochs)
+for i in range(len(final_epochs)):
+    file_lengths[i] = len(final_epochs[i])
+# sns.distplot(file_lengths) # visualize
+np.min(file_lengths)/150*100 # Max 20% of epochs dropped; subjects above that were excluded
+n_subjects = len(final_epochs)
+
+# Re-sample files to 200Hz (data was already lowpass filtered at 100Hz, so anything above 200Hz is oversampling)
+for i in range(len(final_epochs)):
+    final_epochs[i].resample(sfreq=200, verbose=2)
+
+# # Save final epochs data
+# save_path = "/home/glia/Analysis/Final_epochs_data"
+# for n in range(len(final_epochs)):
+#     # Make file writeable
+#     final_epochs[n]._times_readonly.flags["WRITEABLE"] = False
+#     # Save file
+#     final_epochs[n].save(fname = os.path.join(save_path,str("{}_final_epoch".format(Subject_id[n]) + "-epo.fif")),
+#                          overwrite=True, verbose=0)
 
 # Load dropped epochs - used for gap idx in microstates
-bad_subjects = [12345] # list with subjects that were excluded due to too many dropped epochs/chs
-Drop_epochs_df = pd.read_pickle("./Preprocessing/dropped_epochs.pkl")
+ISAF7_dropped_epochs_df = pd.read_pickle("ISAF7_dropped_epochs.pkl")
+Helse_dropped_epochs_df = pd.read_pickle("HELSE_dropped_epochs.pkl")
+Base_dropped_epochs_df = pd.read_pickle("Base_dropped_epochs.pkl")
+Drop_epochs_df = pd.concat([ISAF7_dropped_epochs_df,Helse_dropped_epochs_df,
+                            Base_dropped_epochs_df]).reset_index(drop=True)
 
 good_subject_df_idx = [not i in bad_subjects for i in Drop_epochs_df["Subject_ID"]]
 Drop_epochs_df = Drop_epochs_df.loc[good_subject_df_idx,:]
 Drop_epochs_df = Drop_epochs_df.sort_values(by=["Subject_ID"]).reset_index(drop=True)
 
 ### Load questionnaire data
-# For the purposes of this demonstration I will make a dummy dataframe
-# The original code imported csv files with questionnaire data and group status
-final_qdf = pd.read_csv(r'/data/raw/FOR_DTU/Questionnaires_for_DTU.csv')
-
-"""
-final_qdf = pd.DataFrame({"Subject_ID":Subject_id,
-                          "Age":[23,26],
-                          "Gender":[0,0],
-                          "Group_status":[0,1],
-                          "PCL_total":[33,56],
-                          "Q1":[1.2, 2.3],
-                          "Q2":[1.7, 1.5],
-                          "Qn":[2.1,1.0]})
-"""
+# ISAF
+qdf_ISAF7 = pd.read_csv("/data/raw/FOR_DTU/Questionnaires_for_DTU.csv",
+                        sep=";", na_values=' ')
+# Rename Subject_ID column
+qdf_ISAF7.rename({"ID_number": "Subject_ID"}, axis=1, inplace=True)
+# Sort Subject_id to match Subject_id for files
+qdf_ISAF7 = qdf_ISAF7.sort_values(by=["Subject_ID"], ignore_index=True)
+# Get column idx for PCL_t7 columns
+PCL_idx = qdf_ISAF7.columns.str.contains("PCL") & np.invert(qdf_ISAF7.columns.str.contains("PCL_"))
+# Keep subject id
+PCL_idx[qdf_ISAF7.columns=="Subject_ID"] = True
+
+# Make a final df and exclude dropped subjects
+final_qdf0 = qdf_ISAF7.loc[qdf_ISAF7["Subject_ID"].isin(Subject_id),PCL_idx].reset_index(drop=True)
+# Make a column with the sum of all PCL items
+final_qdf0.insert(len(final_qdf0.columns),"PCL_total",np.sum(final_qdf0.iloc[:,1:],axis=1))
+
+# Helse
+qdf_helse = pd.read_csv("/data/may2020/Questionnaires/HelsfondenQuestData_nytLbn.csv",
+                        sep=",", na_values=' ')
+# Rename subject ID column
+qdf_helse.rename(columns={"Nyt_lbn":"Subject_ID"}, inplace=True)
+Helse_ID_modifier = 200000
+# Add 200000 to the IDs
+qdf_helse["Subject_ID"] += Helse_ID_modifier
+# Sort Subject_id to match Subject_id for files
+qdf_helse = qdf_helse.sort_values(by=["Subject_ID"], ignore_index=True)
+# Get column idx for PCL columns (don't use summarized columns with _)
+PCL_idx = qdf_helse.columns.str.contains("PCL") & np.invert(qdf_helse.columns.str.contains("PCL_"))
+# Keep subject id
+PCL_idx[qdf_helse.columns=="Subject_ID"] = True
+
+# Make a final df and exclude dropped subjects
+final_qdf1 = qdf_helse.loc[qdf_helse["Subject_ID"].isin(Subject_id),PCL_idx].reset_index(drop=True)
+# Make a column with the sum of all PCL items
+final_qdf1.insert(len(final_qdf1.columns),"PCL_total",np.sum(final_qdf1.iloc[:,1:],axis=1))
+
+# Baseline
+# antal_børn renamed to antal_boern
+qdf_base = pd.read_csv("/data/sep2020/BaselineForLi.csv", sep=",", na_values=' ')
+
+# Rename subject ID column
+qdf_base.rename(columns={"LbnRand":"Subject_ID"}, inplace=True)
+Base_ID_modifier = 300000
+# Add 300000 to the IDs
+qdf_base["Subject_ID"] += Base_ID_modifier
+# Sort Subject_id to match Subject_id for files
+qdf_base = qdf_base.sort_values(by=["Subject_ID"], ignore_index=True)
+# Get column idx for PCL columns (don't use summarized columns with _)
+PCL_idx = qdf_base.columns.str.contains("PCL") & np.invert(qdf_base.columns.str.contains("PCL_"))
+# Keep subject id
+PCL_idx[qdf_base.columns=="Subject_ID"] = True
+
+# Make a final df and exclude dropped subjects
+final_qdf2 = qdf_base.loc[qdf_base["Subject_ID"].isin(Subject_id),PCL_idx].reset_index(drop=True)
+# Make a column with the sum of all PCL items
+final_qdf2.insert(len(final_qdf2.columns),"PCL_total",np.sum(final_qdf2.iloc[:,1:],axis=1))
+
+# Find NaN
+nan_idx = np.where(final_qdf2.isna()==True)
+final_qdf2.iloc[nan_idx[0],np.concatenate([np.array([0]),nan_idx[1]])] # 2252 has NaN for PCL3 and PCL12
+# Impute with the mean of the column
+final_qdf2 = final_qdf2.fillna(final_qdf2.mean())
+
+# Combine the 3 datasets
+final_qdf0.columns = final_qdf1.columns # fix colnames with t7
+final_qdf = pd.concat([final_qdf0,final_qdf1,final_qdf2], ignore_index=True)
+
+# Ensure all columns are integers
+final_qdf = final_qdf.astype("int")
 
 # Define cases as >= 44 total PCL
 # Type: numpy array with subject id
-cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_t7"]>=44])
+cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44])
 n_groups = 2
 Groups = ["CTRL", "PTSD"]
+# Check the percentage of cases in each dataset
+len(np.where((cases>100000)&(cases<200000))[0])/n_ISAF # around 32%
+len(np.where((cases>200000)&(cases<300000))[0])/n_HELSE # around 51%
+len(np.where((cases>300000)&(cases<400000))[0])/n_Base # around 66%
+# There is clearly class imbalance between studies!
+
+### Get depression scores as binary above-threshold indicators
+# BDI >= 20 is moderate depression
+# DASS-42 >= 14 is moderate depression on the depression subscale
+dep_cases = np.concatenate([np.array(qdf_ISAF7["Subject_ID"][qdf_ISAF7["BDI_t7"] >= 20]),
+                            np.array(qdf_helse["Subject_ID"][qdf_helse["DASS_D_t0"] >= 14]),
+                            np.array(qdf_base["Subject_ID"][qdf_base["DASS_D_t0"] >= 14])])
+dep_cases.sort()
+dep_cases = dep_cases[np.isin(dep_cases,Subject_id)] # only keep subjects that were received and not excluded
+
+# Check the percentage of depression cases in each dataset
+len(np.where((dep_cases>100000)&(dep_cases<200000))[0])/n_ISAF # around 34%
+len(np.where((dep_cases>200000)&(dep_cases<300000))[0])/n_HELSE # around 53%
+len(np.where((dep_cases>300000)&(dep_cases<400000))[0])/n_Base # around 72%
+
+# Normalize to the maximum depression score to combine the two scales
+# Relative scores seem to be consistent between BDI-II, DASS-42 and the clinical label
+max_BDI = 63
+max_DASS = 42
+# Get normalized depression scores for each dataset
+ISAF7_dep_score = qdf_ISAF7["BDI_t7"][qdf_ISAF7["Subject_ID"].isin(Subject_id)]/max_BDI
+Helse_dep_score = qdf_helse["DASS_D_t0"][qdf_helse["Subject_ID"].isin(Subject_id)]/max_DASS
+Base_dep_score = qdf_base["DASS_D_t0"][qdf_base["Subject_ID"].isin(Subject_id)]/max_DASS
+Norm_dep_score = np.concatenate([ISAF7_dep_score.to_numpy(),Helse_dep_score.to_numpy(),Base_dep_score.to_numpy()])
+
+# Check that subject ids match when using concat
+test_d1 = qdf_ISAF7["Subject_ID"][qdf_ISAF7["Subject_ID"].isin(Subject_id)]
+test_d2 = qdf_helse["Subject_ID"][qdf_helse["Subject_ID"].isin(Subject_id)]
+test_d3 = qdf_base["Subject_ID"][qdf_base["Subject_ID"].isin(Subject_id)]
+test_d4 = np.concatenate([test_d1.to_numpy(),test_d2.to_numpy(),test_d3.to_numpy()])
+assert all(np.equal(Subject_id,test_d4))
+
 # Define folder for saving features
 Feature_savepath = "./Features/"
 Stat_savepath = "./Statistics/"
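For reference, a minimal standalone sketch of the group-labelling logic added in this patch (cut-offs taken from the code: PCL total >= 44 for PTSD; BDI-II >= 20 and DASS-42 depression >= 14 for depression; normalization by the scale maxima 63 and 42). The subject IDs and scores below are made up for illustration and are not part of the study data:

import numpy as np
import pandas as pd

# Toy questionnaire data (hypothetical IDs and scores, for illustration only)
qdf = pd.DataFrame({"Subject_ID": [100101, 100102, 200201, 300301],
                    "PCL_total":  [30, 52, 61, 18],
                    "Scale":      ["BDI", "BDI", "DASS", "DASS"],
                    "Dep_score":  [25, 10, 20, 5]})

# PTSD cases: total PCL score at or above the cut-off of 44
cases = qdf.loc[qdf["PCL_total"] >= 44, "Subject_ID"].to_numpy()

# Depression cases: scale-specific cut-offs (BDI-II >= 20, DASS-42 depression >= 14)
dep_cutoff = qdf["Scale"].map({"BDI": 20, "DASS": 14})
dep_cases = qdf.loc[qdf["Dep_score"] >= dep_cutoff, "Subject_ID"].to_numpy()

# Normalized depression score: divide by the maximum of the respective scale (BDI-II: 63, DASS-42: 42)
scale_max = qdf["Scale"].map({"BDI": 63, "DASS": 42})
qdf["Norm_dep_score"] = qdf["Dep_score"] / scale_max

print(cases)                                   # [100102 200201]
print(dep_cases)                               # [100101 200201]
print(qdf["Norm_dep_score"].round(2).tolist()) # [0.4, 0.16, 0.48, 0.12]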