diff --git a/FeatureEstimation.py b/FeatureEstimation.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea34d005991cd0361cf91f9b51c4c9865d8a165 --- /dev/null +++ b/FeatureEstimation.py @@ -0,0 +1,2733 @@ +# -*- coding: utf-8 -*- +""" +Updated Oct 18 2022 + +@author: Qianliang Li (glia@dtu.dk) + +This script contains the code to estimate the following EEG features: + 1. Power Spectral Density + 2. Frontal Theta/Beta Ratio + 3. Asymmetry + 4. Peak Alpha Frequency + 5. 1/f Exponents + 6. Microstates + 7. Long-Range Temporal Correlation (DFA Exponent) +Source localization and functional connectivity + 8. Imaginary part of Coherence + 9. Weighted Phase Lag Index + 10. (Orthogonalized) Power Envelope Correlations + 11. Granger Causality + +It should be run after Preprocessing.py + +All features are saved in pandas DataFrame format for the machine learning +pipeline + +Note that the code has not been changed to fit the demonstration dataset, +thus just running it might introduce some errors. +The code is provided to show how we performed the feature estimation +and if you are adapting the code, you should check if it fits your dataset +e.g. the questionnaire data, sensors and source parcellation etc + +The script was written in Spyder. The outline panel can be used to navigate +the different parts easier (Default shortcut: Ctrl + Shift + O) +""" + +# Set working directory +import os +wkdir = "/home/glia/EEG" +os.chdir(wkdir) + +# Load all libraries from the Preamble +from Preamble import * + +# %% Load preprocessed epochs and questionnaire data +load_path = "./PreprocessedData" + +# Get filenames +files = [] +for r, d, f in os.walk(load_path): + for file in f: + if ".fif" in file: + files.append(os.path.join(r, file)) +files.sort() + +# Subject IDs +Subject_id = [0] * len(files) +for i in range(len(files)): + temp = files[i].split("\\") + temp = temp[-1].split("_") + Subject_id[i] = int(temp[0]) + +n_subjects = len(Subject_id) + +# Load Final epochs +final_epochs = [0]*n_subjects +for n in range(n_subjects): + final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n]), + verbose=0) + +# Load dropped epochs - used for gap idx in microstates +bad_subjects = [12345] # list with subjects that were excluded due to too many dropped epochs/chs +Drop_epochs_df = pd.read_pickle("./Preprocessing/dropped_epochs.pkl") + +good_subject_df_idx = [not i in bad_subjects for i in Drop_epochs_df["Subject_ID"]] +Drop_epochs_df = Drop_epochs_df.loc[good_subject_df_idx,:] +Drop_epochs_df = Drop_epochs_df.sort_values(by=["Subject_ID"]).reset_index(drop=True) + +### Load questionnaire data +# For the purposes of this demonstration I will make a dummy dataframe +# The original code imported csv files with questionnaire data and group status +final_qdf = pd.DataFrame({"Subject_ID":Subject_id, + "Age":[23,26], + "Gender":[0,0], + "Group_status":[0,1], + "PCL_total":[33,56], + "Q1":[1.2, 2.3], + "Q2":[1.7, 1.5], + "Qn":[2.1,1.0]}) + +# Define cases as >= 44 total PCL +# Type: numpy array with subject id +cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44]) +n_groups = 2 +Groups = ["CTRL", "PTSD"] + +# Define folder for saving features +Feature_savepath = "./Features/" +Stat_savepath = "./Statistics/" +Model_savepath = "./Model/" + +# %% Power spectrum features +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +ch_names = final_epochs[0].info["ch_names"] +n_channels = final_epochs[0].info["nchan"] + +# Pre-allocate memory +power_bands = [0]*len(final_epochs) + +def power_band_estimation(n): + # Get index for eyes open and eyes closed + EC_index = final_epochs[n].events[:,2] == 1 + EO_index = final_epochs[n].events[:,2] == 2 + + # Calculate the power spectral density + psds, freqs = psd_multitaper(final_epochs[n], fmin = 1, fmax = 50) # output (epochs, channels, freqs) + + temp_power_band = [] + + for fmin, fmax in Freq_Bands.values(): + # Calculate the power each frequency band + psds_band = psds[:, :, (freqs >= fmin) & (freqs < fmax)].sum(axis=-1) + # Calculate the mean for each eye status + psds_band_eye = np.array([psds_band[EC_index,:].mean(axis=0), + psds_band[EO_index,:].mean(axis=0)]) + # Append for each freq band + temp_power_band.append(psds_band_eye) + # Output: List with the 5 bands, and each element is a 2D array with eye status as 1st dimension and channels as 2nd dimension + + # The list is reshaped and absolute and relative power are calculated + abs_power_band = np.reshape(temp_power_band, (5, 2, n_channels)) + abs_power_band = 10.*np.log10(abs_power_band) # Convert to decibel scale + + rel_power_band = np.reshape(temp_power_band, (5, 2, n_channels)) + rel_power_band = rel_power_band/np.sum(rel_power_band, axis=0, keepdims=True) + # each eye condition and channel have been normalized to power in all 5 frequencies for that given eye condition and channel + + # Make one list in 1 dimension + abs_power_values = np.concatenate(np.concatenate(abs_power_band, axis=0), axis=0) + rel_power_values = np.concatenate(np.concatenate(rel_power_band, axis=0), axis=0) + ## Output: First the channels, then the eye status and finally the frequency bands are concatenated + ## E.g. element 26 is 3rd channel, eyes open, first frequency band + #assert abs_power_values[26] == abs_power_band[0,1,2] + #assert abs_power_values[47] == abs_power_band[0,1,23] # +21 channels to last + #assert abs_power_values[50] == abs_power_band[1,0,2] # once all channels have been changed then the freq is changed and eye status + + # Get result + res = np.concatenate([abs_power_values,rel_power_values],axis=0) + return n, res + +with concurrent.futures.ProcessPoolExecutor() as executor: + for n, result in executor.map(power_band_estimation, range(len(final_epochs))): # Function and arguments + power_bands[n] = result + +# Combine all data into one dataframe +# First the columns are prepared +n_subjects = len(Subject_id) + +# The group status (PTSD/CTRL) is made using the information about the cases +Group_status = np.array(["CTRL"]*n_subjects) +Group_status[np.array([i in cases for i in Subject_id])] = "PTSD" + +# A variable that codes the channels based on A/P localization is also made +Frontal_chs = ["Fp1", "Fpz", "Fp2", "AFz", "Fz", "F3", "F4", "F7", "F8"] +Central_chs = ["Cz", "C3", "C4", "T7", "T8", "FT7", "FC3", "FCz", "FC4", "FT8", "TP7", "CP3", "CPz", "CP4", "TP8"] +Posterior_chs = ["Pz", "P3", "P4", "P7", "P8", "POz", "O1", "O2", "Oz"] + +Brain_region_labels = ["Frontal","Central","Posterior"] +Brain_region = np.array(ch_names, dtype = "<U9") +Brain_region[np.array([i in Frontal_chs for i in ch_names])] = Brain_region_labels[0] +Brain_region[np.array([i in Central_chs for i in ch_names])] = Brain_region_labels[1] +Brain_region[np.array([i in Posterior_chs for i in ch_names])] = Brain_region_labels[2] + +# A variable that codes the channels based on M/L localization +Left_chs = ["Fp1", "F3", "F7", "FC3", "FT7", "C3", "T7", "CP3", "TP7", "P3", "P7", "O1"] +Right_chs = ["Fp2", "F4", "F8", "FC4", "FT8", "C4", "T8", "CP4", "TP8", "P4", "P8", "O2"] +Mid_chs = ["Fpz", "AFz", "Fz", "FCz", "Cz", "CPz", "Pz", "POz", "Oz"] + +Brain_side = np.array(ch_names, dtype = "<U5") +Brain_side[np.array([i in Left_chs for i in ch_names])] = "Left" +Brain_side[np.array([i in Right_chs for i in ch_names])] = "Right" +Brain_side[np.array([i in Mid_chs for i in ch_names])] = "Mid" + +# Eye status is added +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) + +# Frequency bands +freq_bands_name = list(Freq_Bands.keys()) +n_freq_bands = len(freq_bands_name) + +# Quantification (Abs/Rel) +quant_status = ["Absolute", "Relative"] +n_quant_status = len(quant_status) + +# The dataframe is made by combining all the unlisted pds values +# Each row correspond to a different channel. It is reset after all channel names have been used +# Each eye status element is repeated n_channel times, before it is reset +# Each freq_band element is repeated n_channel * n_eye_status times, before it is reset +# Each quantification status element is repeated n_channel * n_eye_status * n_freq_bands times, before it is reset +power_df = pd.DataFrame(data = {"Subject_ID": [ele for ele in Subject_id for i in range(n_eye_status*n_quant_status*n_freq_bands*n_channels)], + "Group_status": [ele for ele in Group_status for i in range(n_eye_status*n_quant_status*n_freq_bands*n_channels)], + "Channel": ch_names*(n_eye_status*n_quant_status*n_freq_bands*n_subjects), + "Brain_region": list(Brain_region)*(n_eye_status*n_quant_status*n_freq_bands*n_subjects), + "Brain_side": list(Brain_side)*(n_eye_status*n_quant_status*n_freq_bands*n_subjects), + "Eye_status": [ele for ele in eye_status for i in range(n_channels)]*n_quant_status*n_freq_bands*n_subjects, + "Freq_band": [ele for ele in freq_bands_name for i in range(n_channels*n_eye_status)]*n_quant_status*n_subjects, + "Quant_status": [ele for ele in quant_status for i in range(n_channels*n_eye_status*n_freq_bands)]*n_subjects, + "PSD": list(np.concatenate(power_bands, axis=0)) + }) +# Absolute power is in decibels (10*log10(power)) + +# Fix Freq_band categorical order +power_df["Freq_band"] = power_df["Freq_band"].astype("category").\ + cat.reorder_categories(list(Freq_Bands.keys()), ordered=True) +# Fix Brain_region categorical order +power_df["Brain_region"] = power_df["Brain_region"].astype("category").\ + cat.reorder_categories(Brain_region_labels, ordered=True) + +# Save the dataframe +power_df.to_pickle(os.path.join(Feature_savepath,"Power_df.pkl")) + + +# %% Theta-beta ratio +# Frontal theta/beta ratio has been implicated in cognitive control of attention +power_df = pd.read_pickle(os.path.join(Feature_savepath,"Power_df.pkl")) + +eye_status = list(final_epochs[0].event_id) +n_eye_status = len(eye_status) + +# Subset frontal absolute power +power_df_sub1 = power_df[(power_df["Quant_status"] == "Absolute")& + (power_df["Brain_region"] == "Frontal")] + +# Calculate average frontal power +frontal_theta_mean_subject = power_df_sub1[power_df_sub1["Freq_band"] == "theta"].\ + groupby(["Subject_ID","Group_status","Eye_status"]).mean().reset_index() +frontal_beta_mean_subject = power_df_sub1[power_df_sub1["Freq_band"] == "beta"].\ + groupby(["Subject_ID","Group_status","Eye_status"]).mean().reset_index() + +# Convert from dB to raw power +frontal_theta_mean_subject["PSD"] = 10**(frontal_theta_mean_subject["PSD"]/10) +frontal_beta_mean_subject["PSD"] = 10**(frontal_beta_mean_subject["PSD"]/10) + +# Calculate mean for each group and take ratio for whole group +# To confirm trend observed in PSD plots +mean_group_f_theta = frontal_theta_mean_subject.iloc[:,1:].groupby(["Group_status","Eye_status"]).mean() +mean_group_f_beta = frontal_beta_mean_subject.iloc[:,1:].groupby(["Group_status","Eye_status"]).mean() +mean_group_f_theta_beta_ratio = mean_group_f_theta/mean_group_f_beta + +# Calculate ratio for each subject +frontal_theta_beta_ratio = frontal_theta_mean_subject.copy() +frontal_theta_beta_ratio["PSD"] = frontal_theta_mean_subject["PSD"]/frontal_beta_mean_subject["PSD"] + +# Take the natural log of ratio +frontal_theta_beta_ratio["PSD"] = np.log(frontal_theta_beta_ratio["PSD"]) + +# Rename and save feature +frontal_theta_beta_ratio.rename(columns={"PSD":"TBR"},inplace=True) +# Add dummy variable for re-using plot code +dummy_variable = ["Frontal Theta Beta Ratio"]*frontal_theta_beta_ratio.shape[0] +frontal_theta_beta_ratio.insert(3, "Measurement", dummy_variable ) + +frontal_theta_beta_ratio.to_pickle(os.path.join(Feature_savepath,"fTBR_df.pkl")) + +# %% Frequency bands asymmetry +# Defined as ln(right) - ln(left) +# Thus we should only work with the absolute values and undo the dB transformation +# Here I avg over all areas. I.e. mean((ln(F4)-ln(F3),(ln(F8)-ln(F7),(ln(Fp2)-ln(Fp1))) for frontal +ROI = ["Frontal", "Central", "Posterior"] +qq = "Absolute" # only calculate asymmetry for absolute +# Pre-allocate memory +asymmetry = np.zeros(shape=(len(np.unique(power_df["Subject_ID"])), + len(np.unique(power_df["Eye_status"])), + len(list(Freq_Bands.keys())), + len(ROI))) + +def calculate_asymmetry(i): + ii = np.unique(power_df["Subject_ID"])[i] + temp_asymmetry = np.zeros(shape=(len(np.unique(power_df["Eye_status"])), + len(list(Freq_Bands.keys())), + len(ROI))) + for e in range(len(np.unique(power_df["Eye_status"]))): + ee = np.unique(power_df["Eye_status"])[e] + for f in range(len(list(Freq_Bands.keys()))): + ff = list(Freq_Bands.keys())[f] + + # Get the specific part of the df + temp_power_df = power_df[(power_df["Quant_status"] == qq) & + (power_df["Eye_status"] == ee) & + (power_df["Subject_ID"] == ii) & + (power_df["Freq_band"] == ff)].copy() + + # Convert from dB to raw power + temp_power_df.loc[:,"PSD"] = np.array(10**(temp_power_df["PSD"]/10)) + + # Calculate the power asymmetry + for r in range(len(ROI)): + rr = ROI[r] + temp_power_roi_df = temp_power_df[(temp_power_df["Brain_region"] == rr)& + ~(temp_power_df["Brain_side"] == "Mid")] + # Sort using channel names to make sure F8-F7 and not F4-F7 etc. + temp_power_roi_df = temp_power_roi_df.sort_values("Channel").reset_index(drop=True) + # Get the log power + R_power = temp_power_roi_df[(temp_power_roi_df["Brain_side"] == "Right")]["PSD"] + ln_R_power = np.log(R_power) # get log power + L_power = temp_power_roi_df[(temp_power_roi_df["Brain_side"] == "Left")]["PSD"] + ln_L_power = np.log(L_power) # get log power + # Pairwise subtraction followed by averaging + asymmetry_value = np.mean(np.array(ln_R_power) - np.array(ln_L_power)) + # Save it to the array + temp_asymmetry[e,f,r] = asymmetry_value + # Print progress + print("{} out of {} finished testing".format(i+1,n_subjects)) + return i, temp_asymmetry + +with concurrent.futures.ProcessPoolExecutor() as executor: + for i, res in executor.map(calculate_asymmetry, range(len(np.unique(power_df["Subject_ID"])))): # Function and arguments + asymmetry[i,:,:,:] = res + +# Prepare conversion of array to df using flatten +n_subjects = len(Subject_id) + +# The group status (PTSD/CTRL) is made using the information about the cases +Group_status = np.array(["CTRL"]*n_subjects) +Group_status[np.array([i in cases for i in Subject_id])] = "PTSD" + +# Eye status is added +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) + +# Frequency bands +freq_bands_name = list(Freq_Bands.keys()) +n_freq_bands = len(freq_bands_name) + +# ROIs +n_ROI = len(ROI) + +# Make the dataframe +asymmetry_df = pd.DataFrame(data = {"Subject_ID": [ele for ele in Subject_id for i in range(n_eye_status*n_freq_bands*n_ROI)], + "Group_status": [ele for ele in Group_status for i in range(n_eye_status*n_freq_bands*n_ROI)], + "Eye_status": [ele for ele in eye_status for i in range(n_freq_bands*n_ROI)]*(n_subjects), + "Freq_band": [ele for ele in freq_bands_name for i in range(n_ROI)]*(n_subjects*n_eye_status), + "ROI": list(ROI)*(n_subjects*n_eye_status*n_freq_bands), + "Asymmetry_score": asymmetry.flatten(order="C") + }) +# Flatten with order=C means that it first goes through last axis, +# then repeat along 2nd last axis, and then repeat along 3rd last etc + +# Asymmetry numpy to pandas conversion check +random_point=321 +asymmetry_df.iloc[random_point] + +i = np.where(np.unique(power_df["Subject_ID"]) == asymmetry_df.iloc[random_point]["Subject_ID"])[0] +e = np.where(np.unique(power_df["Eye_status"]) == asymmetry_df.iloc[random_point]["Eye_status"])[0] +f = np.where(np.array(list(Freq_Bands.keys())) == asymmetry_df.iloc[random_point]["Freq_band"])[0] +r = np.where(np.array(ROI) == asymmetry_df.iloc[random_point]["ROI"])[0] + +assert asymmetry[i,e,f,r] == asymmetry_df.iloc[random_point]["Asymmetry_score"] + +# Save the dataframe +asymmetry_df.to_pickle(os.path.join(Feature_savepath,"asymmetry_df.pkl")) + +# %% Using FOOOF +# Peak alpha frequency (PAF) and 1/f exponent (OOF) +# Using the FOOOF algorithm (Fitting oscillations and one over f) +# Published by Donoghue et al, 2020 in Nature Neuroscience +# To start, FOOOF takes the freqs and power spectra as input +n_channels = final_epochs[0].info["nchan"] +ch_names = final_epochs[0].info["ch_names"] +sfreq = final_epochs[0].info["sfreq"] +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) + +# From visual inspection there seems to be problem if PSD is too steep at the start +# To overcome this problem, we try multiple start freq +OOF_r2_thres = 0.95 # a high threshold as we allow for overfitting +PAF_r2_thres = 0.90 # a more lenient threshold for PAF, as it is usually still captured even if fit for 1/f is not perfect +freq_start_it_range = [2,3,4,5,6] +freq_end = 40 # Stop freq at 40Hz to not be influenced by the Notch Filter + +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) + +PAF_data = np.zeros((n_subjects,n_eye_status,n_channels,3)) # CF, power, band_width +OOF_data = np.zeros((n_subjects,n_eye_status,n_channels,2)) # offset and exponent + +def FOOOF_estimation(i): + PAF_data0 = np.zeros((n_eye_status,n_channels,3)) # CF, power, band_width + OOF_data0 = np.zeros((n_eye_status,n_channels,2)) # offset and exponent + # Get Eye status + eye_idx = [final_epochs[i].events[:,2] == 1, final_epochs[i].events[:,2] == 2] # EC and EO + # Calculate the power spectral density + psd, freqs = psd_multitaper(final_epochs[i], fmin = 1, fmax = 50) # output (epochs, channels, freqs) + # Retrieve psds for the 2 conditions and calculate mean across epochs + psds = [] + for e in range(n_eye_status): + # Get the epochs for specific eye condition + temp_psd = psd[eye_idx[e],:,:] + # Calculate the mean across epochs + temp_psd = np.mean(temp_psd, axis=0) + # Save + psds.append(temp_psd) + # Try multiple start freq + PAF_data00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range),3)) # CF, power, band_width + OOF_data00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range),2)) # offset and exponent + r2s00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range))) + for e in range(n_eye_status): + psds_avg = psds[e] + for f in range(len(freq_start_it_range)): + # Initiate FOOOF group for analysis of multiple PSD + fg = fooof.FOOOFGroup() + # Set the frequency range to fit the model + freq_range = [freq_start_it_range[f], freq_end] # variable freq start to 49Hz + # Fit to each source PSD separately, but in parallel + fg.fit(freqs,psds_avg,freq_range,n_jobs=1) + # Extract aperiodic parameters + aps = fg.get_params('aperiodic_params') + # Extract peak parameters + peaks = fg.get_params('peak_params') + # Extract goodness-of-fit metrics + r2s = fg.get_params('r_squared') + # Save OOF and r2s + OOF_data00[e,:,f] = aps + r2s00[e,:,f] = r2s + # Find the alpha peak with greatest power + for c in range(n_channels): + peaks0 = peaks[peaks[:,3] == c] + # Subset the peaks within the alpha band + in_alpha_band = (peaks0[:,0] >= Freq_Bands["alpha"][0]) & (peaks0[:,0] <= Freq_Bands["alpha"][1]) + if sum(in_alpha_band) > 0: # Any alpha peaks? + # Choose the peak with the highest power + max_alpha_idx = np.argmax(peaks0[in_alpha_band,1]) + # Save results + PAF_data00[e,c,f] = peaks0[in_alpha_band][max_alpha_idx,:-1] + else: + # No alpha peaks + PAF_data00[e,c,f] = [np.nan]*3 + # Check criterias + good_fits_OOF = (r2s00 > OOF_r2_thres) & (OOF_data00[:,:,:,1] > 0) # r^2 > 0.95 and exponent > 0 + good_fits_PAF = (r2s00 > PAF_r2_thres) & (np.isfinite(PAF_data00[:,:,:,0])) # r^2 > 0.90 and detected peak in alpha band + # Save the data or NaN if criterias were not fulfilled + for e in range(n_eye_status): + for c in range(n_channels): + if sum(good_fits_OOF[e,c]) == 0: # no good OOF estimation + OOF_data0[e,c] = [np.nan]*2 + else: # Save OOF associated with greatest r^2 that fulfilled criterias + OOF_data0[e,c] = OOF_data00[e,c,np.argmax(r2s00[e,c,good_fits_OOF[e,c]])] + if sum(good_fits_PAF[e,c]) == 0: # no good PAF estimation + PAF_data0[e,c] = [np.nan]*3 + else: # Save PAF associated with greatest r^2 that fulfilled criterias + PAF_data0[e,c] = PAF_data00[e,c,np.argmax(r2s00[e,c,good_fits_PAF[e,c]])] + print("Finished {} out of {} subjects".format(i+1,n_subjects)) + return i, PAF_data0, OOF_data0 + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +with concurrent.futures.ProcessPoolExecutor() as executor: + for i, PAF_result, OOF_result in executor.map(FOOOF_estimation, range(n_subjects)): # Function and arguments + PAF_data[i] = PAF_result + OOF_data[i] = OOF_result + +# Save data +with open(Feature_savepath+"PAF_data_arr.pkl", "wb") as file: + pickle.dump(PAF_data, file) +with open(Feature_savepath+"OOF_data_arr.pkl", "wb") as file: + pickle.dump(OOF_data, file) + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nFinished",c_time2) + +# Convert to Pandas dataframe (only keep mean parameter for PAF) +# The dimensions will each be a column with numbers and the last column will be the actual values +ori = PAF_data[:,:,:,0] +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, ori.shape), indexing="ij"))) + [ori.ravel()]) +PAF_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Channel", "Value"]) +# Change from numerical coding to actual values + +index_values = [Subject_id,eye_status,ch_names] +temp_df = PAF_data_df.copy() # make temp df to not sequential overwrite what is changed +for col in range(len(index_values)): + col_name = PAF_data_df.columns[col] + for shape in range(ori.shape[col]): + temp_df.loc[PAF_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] +PAF_data_df = temp_df # replace original df + +# Add group status +Group_status = np.array(["CTRL"]*len(PAF_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in PAF_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +PAF_data_df.insert(3, "Group_status", Group_status) + +# Global peak alpha +PAF_data_df_global = PAF_data_df.groupby(["Subject_ID", "Group_status", "Eye_status"]).mean().reset_index() # by default pandas mean skip nan +# Add dummy variable for re-using plot code +dummy_variable = ["Global Peak Alpha Frequency"]*PAF_data_df_global.shape[0] +PAF_data_df_global.insert(3, "Measurement", dummy_variable ) + +# Regional peak alpha +# A variable that codes the channels based on A/P localization is also made +Frontal_chs = ["Fp1", "Fpz", "Fp2", "AFz", "Fz", "F3", "F4", "F7", "F8"] +Central_chs = ["Cz", "C3", "C4", "T7", "T8", "FT7", "FC3", "FCz", "FC4", "FT8", "TP7", "CP3", "CPz", "CP4", "TP8"] +Posterior_chs = ["Pz", "P3", "P4", "P7", "P8", "POz", "O1", "O2", "Oz"] + +Brain_region = np.array(ch_names, dtype = "<U9") +Brain_region[np.array([i in Frontal_chs for i in ch_names])] = "Frontal" +Brain_region[np.array([i in Central_chs for i in ch_names])] = "Central" +Brain_region[np.array([i in Posterior_chs for i in ch_names])] = "Posterior" + +PAF_data_df.insert(4, "Brain_region", list(Brain_region)*int(PAF_data_df.shape[0]/len(Brain_region))) + +# Save the dataframes +PAF_data_df.to_pickle(os.path.join(Feature_savepath,"PAF_data_FOOOF_df.pkl")) +PAF_data_df_global.to_pickle(os.path.join(Feature_savepath,"PAF_data_FOOOF_global_df.pkl")) + +# Convert to Pandas dataframe (only keep exponent parameter for OOF) +# The dimensions will each be a column with numbers and the last column will be the actual values +ori = OOF_data[:,:,:,1] +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, ori.shape), indexing="ij"))) + [ori.ravel()]) +PAF_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Channel", "Value"]) +# Change from numerical coding to actual values + +index_values = [Subject_id,eye_status,ch_names] +temp_df = PAF_data_df.copy() # make temp df to not sequential overwrite what is changed +for col in range(len(index_values)): + col_name = PAF_data_df.columns[col] + for shape in range(ori.shape[col]): + temp_df.loc[PAF_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] +OOF_data_df = temp_df # replace original df + +# Add group status +Group_status = np.array(["CTRL"]*len(OOF_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in OOF_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +OOF_data_df.insert(3, "Group_status", Group_status) + +# Regional OOF +OOF_data_df.insert(4, "Brain_region", list(Brain_region)*int(PAF_data_df.shape[0]/len(Brain_region))) + +# Save the dataframes +OOF_data_df.to_pickle(os.path.join(Feature_savepath,"OOF_data_FOOOF_df.pkl")) + +# %% Microstate analysis +# The function takes the data as a numpy array (n_t, n_ch) +# The data is already re-referenced to common average +# Variables for the clustering function are extracted +sfreq = final_epochs[0].info["sfreq"] +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) +ch_names = final_epochs[0].info["ch_names"] +n_channels = len(ch_names) +locs = np.zeros((n_channels,2)) # xy coordinates of the electrodes +for c in range(n_channels): + locs[c] = final_epochs[0].info["chs"][c]["loc"][0:2] + +# The epochs are transformed to numpy arrays +micro_data = [] +EC_micro_data = [] +EO_micro_data = [] +for i in range(n_subjects): + # Transform data to correct shape + micro_data.append(final_epochs[i].get_data()) # get data + arr_shape = micro_data[i].shape # get shape + micro_data[i] = micro_data[i].swapaxes(1,2) # swap ch and time axis + micro_data[i] = micro_data[i].reshape(arr_shape[0]*arr_shape[2],arr_shape[1]) # reshape by combining epochs and times + # Get indices for eyes open and closed + EC_index = final_epochs[i].events[:,2] == 1 + EO_index = final_epochs[i].events[:,2] == 2 + # Repeat with 4s * sample frequency to correct for concatenation of times and epochs + EC_index = np.repeat(EC_index,4*sfreq) + EO_index = np.repeat(EO_index,4*sfreq) + # Save data where it is divided into eye status + EC_micro_data.append(micro_data[i][EC_index]) + EO_micro_data.append(micro_data[i][EO_index]) + +# Global explained variance and Cross-validation criterion is used to determine number of microstates +# First all data is concatenated to find the optimal number of maps for all data +micro_data_all = np.vstack(micro_data) + +# Determine the number of clusters +# I use a slightly modified kmeans function which returns the cv_min +global_gev = [] +cv_criterion = [] +for n_maps in range(2,7): + maps, L, gfp_peaks, gev, cv_min = kmeans_return_all(micro_data_all, n_maps) + global_gev.append(np.sum(gev)) + cv_criterion.append(cv_min) +# Save run results +cluster_results = np.array([global_gev,cv_criterion]) +np.save("Microstate_n_cluster_test_results.npy", cluster_results) # (gev/cv_crit, n_maps from 2 to 6) + +#cluster_results = np.load("Microstate_n_cluster_test_results.npy") +#global_gev = cluster_results[0,:] +#cv_criterion = cluster_results[1,:] + +# Evaluate best n_maps +plt.figure() +plt.plot(np.linspace(2,6,len(cv_criterion)),(cv_criterion/np.sum(cv_criterion)), label="CV Criterion") +plt.plot(np.linspace(2,6,len(cv_criterion)),(global_gev/np.sum(global_gev)), label="GEV") +plt.legend() +plt.ylabel("Normalized to total") +# The lower CV the better. +# But the higher GEV the better. +# Based on the plots and the recommendation by vong Wegner & Laufs 2018 +# we used 4 microstates + +# In order to compare between groups, I fix the microstates by clustering on data from both groups +# Due to instability of maps when running multiple times, I increased n_maps from 4 to 6 +n_maps = 4 +mode = ["aahc", "kmeans", "kmedoids", "pca", "ica"][1] + +# K-means is stochastic, thus I run it multiple times in order to find the maps with highest GEV +# Each K-means is run 5 times and best map is chosen. But I do this 10 times more, so in total 50 times! +n_run = 10 +# Pre-allocate memory +microstate_cluster_results = [] + +# Parallel processing can only be implemented by ensuring different seeds +# Otherwise the iteration would be the same. +# However the k-means already use parallel processing so making outer loop with +# concurrent processes make it use too many processors +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +for r in range(n_run): + maps = [0]*2 + m_labels = [0]*2 + gfp_peaks = [0]*2 + gev = [0]*2 + # Eyes closed + counter = 0 + maps_, x_, gfp_peaks_, gev_ = clustering( + np.vstack(np.array(EC_micro_data)), sfreq, ch_names, locs, mode, n_maps, doplot=False) # doplot=True is bugged + maps[counter] = maps_ + m_labels[counter] = x_ + gfp_peaks[counter] = gfp_peaks_ + gev[counter] = gev_ + counter += 1 + # Eyes open + maps_, x_, gfp_peaks_, gev_ = clustering( + np.vstack(np.array(EO_micro_data)), sfreq, ch_names, locs, mode, n_maps, doplot=False) # doplot=True is bugged + maps[counter] = maps_ + m_labels[counter] = x_ + gfp_peaks[counter] = gfp_peaks_ + gev[counter] = gev_ + counter += 1 + + microstate_cluster_results.append([maps, m_labels, gfp_peaks, gev]) + print("Finished {} out of {}".format(r+1, n_run)) + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nFinished",c_time2) + +# Save the results +with open(Feature_savepath+"Microstate_4_maps_10x5_k_means_results.pkl", "wb") as file: + pickle.dump(microstate_cluster_results, file) + +# # Load +# with open(Feature_savepath+"Microstate_4_maps_10x5_k_means_results.pkl", "rb") as file: +# microstate_cluster_results = pickle.load(file) + +# Find the best maps (Highest GEV across all the K-means clusters) +EC_total_gevs = np.sum(np.vstack(np.array(microstate_cluster_results)[:,3,0]), axis=1) # (runs, maps/labels/gfp/gev, ec/eo) +EO_total_gevs = np.sum(np.vstack(np.array(microstate_cluster_results)[:,3,1]), axis=1) +Best_EC_idx = np.argmax(EC_total_gevs) +Best_EO_idx = np.argmax(EO_total_gevs) +# Update the variables for the best maps +maps = [microstate_cluster_results[Best_EC_idx][0][0],microstate_cluster_results[Best_EO_idx][0][1]] +m_labels = [microstate_cluster_results[Best_EC_idx][1][0],microstate_cluster_results[Best_EO_idx][1][1]] +gfp_peaks = [microstate_cluster_results[Best_EC_idx][2][0],microstate_cluster_results[Best_EO_idx][2][1]] +gev = [microstate_cluster_results[Best_EC_idx][3][0],microstate_cluster_results[Best_EO_idx][3][1]] + +# Plot the maps +plt.style.use('default') +labels = ["EC", "EO"] +for i in range(len(labels)): + fig, axarr = plt.subplots(1, n_maps, figsize=(20,5)) + fig.patch.set_facecolor('white') + for imap in range(n_maps): + mne.viz.plot_topomap(maps[i][imap,:], pos = final_epochs[0].info, axes = axarr[imap]) # plot + axarr[imap].set_title("GEV: {:.2f}".format(gev[i][imap]), fontsize=16, fontweight="bold") # title + fig.suptitle("Microstates: {}".format(labels[i]), fontsize=20, fontweight="bold") + +# Manual re-order the maps +# Due the random initiation of K-means this have to be modified every time clusters are made! +# Assign map labels (e.g. 0, 2, 1, 3) +order = [0]*2 +order[0] = [3,0,1,2] # EC +order[1] = [3,1,0,2] # EO +for i in range(len(order)): + maps[i] = maps[i][order[i],:] # re-order maps + gev[i] = gev[i][order[i]] # re-order GEV + # Make directory to find and replace map labels + dic0 = {value:key for key, value in enumerate(order[i])} + m_labels[i][:] = [dic0.get(n, n) for n in m_labels[i]] # re-order labels + +# The maps seems to be correlated both negatively and positively (see spatial correlation plots) +# Thus the sign of the map does not really reflect which areas are positive or negative (absolute) +# But more which areas are different during each state (relatively) +# I can therefore change the sign of the map for the visualizaiton +sign_swap = [[1,-1,1,1],[1,1,1,-1]] +for i in range(len(order)): + for m in range(n_maps): + maps[i][m] *= sign_swap[i][m] + +# Plot the maps and save +save_path = "/home/glia/Analysis/Figures/Microstates/" +labels = ["EC", "EO"] +for i in range(len(labels)): + fig, axarr = plt.subplots(1, n_maps, figsize=(20,5)) + fig.patch.set_facecolor('white') + for imap in range(n_maps): + mne.viz.plot_topomap(maps[i][imap,:], pos = final_epochs[0].info, axes = axarr[imap]) # plot + axarr[imap].set_title("GEV: {:.2f}".format(gev[i][imap]), fontsize=16, fontweight="bold") # title + fig.suptitle("Microstates: {} - Total GEV: {:.2f}".format(labels[i],sum(gev[i])), fontsize=20, fontweight="bold") + # Save the figure + fig.savefig(os.path.join(save_path,str("Microstates_{}".format(labels[i]) + ".png"))) + +# Calculate spatial correlation between maps and actual data points (topography) +# The sign of the map is changed so the correlation is positive +# By default the code looks for highest spatial correlation (regardless of sign) +# Thus depending on random initiation point the map might be opposite +plt.style.use('ggplot') +def spatial_correlation(data, maps): + n_t = data.shape[0] + n_ch = data.shape[1] + data = data - data.mean(axis=1, keepdims=True) + + # GFP peaks + gfp = np.std(data, axis=1) + gfp_peaks = locmax(gfp) + gfp_values = gfp[gfp_peaks] + gfp2 = np.sum(gfp_values**2) # normalizing constant in GEV + n_gfp = gfp_peaks.shape[0] + + # Spatial correlation + C = np.dot(data, maps.T) + C /= (n_ch*np.outer(gfp, np.std(maps, axis=1))) + L = np.argmax(C**2, axis=1) # C is squared here which means the maps do no retain information about the sign of the correlation + + return C + +C_EC = spatial_correlation(np.vstack(np.array(EC_micro_data)), maps[0]) +C_EO = spatial_correlation(np.vstack(np.array(EO_micro_data)), maps[1]) +C = [C_EC, C_EO] + +# Plot the distribution of spatial correlation for each label and each map +labels = ["EC", "EO"] +for i in range(len(labels)): + fig, axarr = plt.subplots(n_maps, n_maps, figsize=(16,16)) + for Lmap in range(n_maps): + for Mmap in range(n_maps): + sns.distplot(C[i][m_labels[i] == Lmap,Mmap], ax = axarr[Lmap,Mmap]) + axarr[Lmap,Mmap].set_xlabel("Spatial correlation") + plt.suptitle("Distribution of spatial correlation_{}".format(labels[i]), fontsize=20, fontweight="bold") + # Add common x and y axis labels by making one big axis + fig.add_subplot(111, frameon=False) + plt.tick_params(labelcolor="none", top="off", bottom="off", left="off", right="off") # hide tick labels and ticks + plt.grid(False) # remove global grid + plt.xlabel("Microstate number", labelpad=20) + plt.ylabel("Label number", labelpad=10) + fig.savefig(os.path.join(save_path,str("Microstates_Spatial_Correlation_Label_State_{}".format(labels[i]) + ".png"))) + +# Plot the distribution of spatial correlation for all data and each map +labels = ["EC", "EO"] +for i in range(len(labels)): + fig, axarr = plt.subplots(1,n_maps, figsize=(20,5)) + for imap in range(n_maps): + sns.distplot(C[i][:,imap], ax = axarr[imap]) + plt.xlabel("Spatial correlation") + plt.suptitle("Distribution of spatial correlation", fontsize=20, fontweight="bold") + # Add common x and y axis labels by making one big axis + fig.add_subplot(111, frameon=False) + plt.tick_params(labelcolor="none", top="off", bottom="off", left="off", right="off") # hide tick labels and ticks + plt.grid(False) # remove global grid + plt.xlabel("Microstate number", labelpad=20) + plt.ylabel("Label number") + +# Prepare for calculation of transition matrix +# I modified the function, so it takes the list argument gap_index +# gap_index should have the indices right before gaps in data + +# Gaps: Between dropped epochs, trials (eo/ec) and subjects +# The between subjects gaps is removed by dividing the data into subjects +n_trials = 5 +n_epoch_length = final_epochs[0].get_data().shape[2] + +micro_labels = [] +micro_subject_EC_idx = [0] +micro_subject_EO_idx = [0] +gaps_idx = [] +gaps_trials_idx = [] +for i in range(n_subjects): + # Get indices for subject + micro_subject_EC_idx.append(micro_subject_EC_idx[i]+EC_micro_data[i].shape[0]) + temp_EC = m_labels[0][micro_subject_EC_idx[i]:micro_subject_EC_idx[i+1]] + # Get labels for subject i EO + micro_subject_EO_idx.append(micro_subject_EO_idx[i]+EO_micro_data[i].shape[0]) + temp_EO = m_labels[1][micro_subject_EO_idx[i]:micro_subject_EO_idx[i+1]] + # Save + micro_labels.append([temp_EC,temp_EO]) # (subject, eye) + + # Get indices with gaps + # Dropped epochs are first considered + # Each epoch last 4s, which correspond to 2000 samples and a trial is 15 epochs - dropped epochs + # Get epochs for each condition + EC_drop_epochs = Drop_epochs_df.iloc[i,1:][Drop_epochs_df.iloc[i,1:] <= 75].to_numpy() + EO_drop_epochs = Drop_epochs_df.iloc[i,1:][(Drop_epochs_df.iloc[i,1:] >= 75)& + (Drop_epochs_df.iloc[i,1:] <= 150)].to_numpy() + # Get indices for the epochs for EC that were dropped and correct for changing index due to drop + EC_drop_epochs_gaps_idx = [] + counter = 0 + for d in range(len(EC_drop_epochs)): + drop_epoch_number = EC_drop_epochs[d] + Drop_epoch_idx = (drop_epoch_number-counter)*n_epoch_length # counter subtracted as the drop index is before dropped + EC_drop_epochs_gaps_idx.append(Drop_epoch_idx-1) # -1 for point just before gap + counter += 1 + # Negative index might occur if the first epochs were removed. This index is not needed for transition matrix + if len(EC_drop_epochs_gaps_idx) > 0: + for d in range(len(EC_drop_epochs_gaps_idx)): # check all, e.g. if epoch 0,1,2,3 are dropped then all should be caught + if EC_drop_epochs_gaps_idx[0] == -1: + EC_drop_epochs_gaps_idx = EC_drop_epochs_gaps_idx[1:len(EC_drop_epochs)] + + # Get indices for the epochs for EO that were dropped and correct for changing index due to drop + EO_drop_epochs_gaps_idx = [] + counter = 0 + for d in range(len(EO_drop_epochs)): + drop_epoch_number = EO_drop_epochs[d]-75 + Drop_epoch_idx = (drop_epoch_number-counter)*n_epoch_length # counter subtracted as the drop index is before dropped + EO_drop_epochs_gaps_idx.append(Drop_epoch_idx-1) # -1 for point just before gap + counter += 1 + # Negative index might occur if the first epoch was removed. This index is not needed for transition matrix + if len(EO_drop_epochs_gaps_idx) > 0: + for d in range(len(EO_drop_epochs_gaps_idx)): # check all, e.g. if epoch 0,1,2,3 are dropped then all should be caught + if EO_drop_epochs_gaps_idx[0] == -1: + EO_drop_epochs_gaps_idx = EO_drop_epochs_gaps_idx[1:len(EO_drop_epochs)] + + # Gaps between trials + Trial_indices = [0, 15, 30, 45, 60, 75] # all the indices for start and end of the 5 trials + EC_trial_gaps_idx = [] + EO_trial_gaps_idx = [] + counter_EC = 0 + counter_EO = 0 + for t in range(len(Trial_indices)-2): # -2 as start and end is not used in transition matrix + temp_drop = EC_drop_epochs[(EC_drop_epochs >= Trial_indices[t])& + (EC_drop_epochs < Trial_indices[t+1])] + # Correct the trial id for any potential drops within that trial + counter_EC += len(temp_drop) + trial_idx_corrected_for_drops = 15*(t+1)-counter_EC + EC_trial_gaps_idx.append((trial_idx_corrected_for_drops*n_epoch_length)-1) # multiply id with length of epoch and subtract 1 + + temp_drop = EO_drop_epochs[(EO_drop_epochs >= Trial_indices[t]+75)& + (EO_drop_epochs < Trial_indices[t+1]+75)] + # Correct the trial id for any potential drops within that trial + counter_EO += len(temp_drop) + trial_idx_corrected_for_drops = 15*(t+1)-counter_EO + EO_trial_gaps_idx.append((trial_idx_corrected_for_drops*n_epoch_length)-1) # multiply id with length of epoch and subtract 1 + + # Concatenate all drop indices + gaps_idx.append([np.unique(np.sort(EC_drop_epochs_gaps_idx+EC_trial_gaps_idx)), + np.unique(np.sort(EO_drop_epochs_gaps_idx+EO_trial_gaps_idx))]) + # Make on with trial gaps only for use in LRTC analysis + gaps_trials_idx.append([EC_trial_gaps_idx,EO_trial_gaps_idx]) + +# Save the gap idx files +np.save("Gaps_idx.npy",np.array(gaps_idx)) +np.save("Gaps_trials_idx.npy",np.array(gaps_trials_idx)) + +# %% Calculate microstate features +# Symbol distribution (also called ratio of time covered RTT) +# Transition matrix +# Shannon entropy +EC_p_hat = p_empirical(m_labels[0], n_maps) +EO_p_hat = p_empirical(m_labels[1], n_maps) +# Sanity check: Overall between EC and EO + +microstate_time_data = np.zeros((n_subjects,n_eye_status,n_maps)) +microstate_transition_data = np.zeros((n_subjects,n_eye_status,n_maps,n_maps)) +microstate_entropy_data = np.zeros((n_subjects,n_eye_status)) +for i in range(n_subjects): + # Calculate ratio of time covered + temp_EC_p_hat = p_empirical(micro_labels[i][0], n_maps) + temp_EO_p_hat = p_empirical(micro_labels[i][1], n_maps) + # Calculate transition matrix + temp_EC_T_hat = T_empirical(micro_labels[i][0], n_maps, gaps_idx[i][0]) + temp_EO_T_hat = T_empirical(micro_labels[i][1], n_maps, gaps_idx[i][1]) + # Calculate Shannon entropy + temp_EC_h_hat = H_1(micro_labels[i][0], n_maps) + temp_EO_h_hat = H_1(micro_labels[i][1], n_maps) + + # Save the data + microstate_time_data[i,0,:] = temp_EC_p_hat + microstate_time_data[i,1,:] = temp_EO_p_hat + microstate_transition_data[i,0,:,:] = temp_EC_T_hat + microstate_transition_data[i,1,:,:] = temp_EO_T_hat + microstate_entropy_data[i,0] = temp_EC_h_hat/max_entropy(n_maps) # ratio of max entropy + microstate_entropy_data[i,1] = temp_EO_h_hat/max_entropy(n_maps) # ratio of max entropy + +# Save transition data +np.save(Feature_savepath+"microstate_transition_data.npy", microstate_transition_data) +# Convert transition data to dataframe for further processing with other features +# Transition matrix should be read as probability of row to column +microstate_transition_data_arr =\ + microstate_transition_data.reshape((n_subjects,n_eye_status,n_maps*n_maps)) # flatten 4 x 4 matrix to 1D +transition_info = ["M1->M1", "M1->M2", "M1->M3", "M1->M4", + "M2->M1", "M2->M2", "M2->M3", "M2->M4", + "M3->M1", "M3->M2", "M3->M3", "M3->M4", + "M4->M1", "M4->M2", "M4->M3", "M4->M4"] + +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, microstate_transition_data_arr.shape), indexing="ij"))) + [microstate_transition_data_arr.ravel()]) +microstate_transition_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Transition", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) + +index_values = [Subject_id,eye_status,transition_info] +for col in range(len(index_values)): + col_name = microstate_transition_data_df.columns[col] + for shape in reversed(range(microstate_transition_data_arr.shape[col])): # notice this is the shape of original numpy array. Not shape of DF + microstate_transition_data_df.loc[microstate_transition_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] + +# Add group status +Group_status = np.array(["CTRL"]*len(microstate_transition_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in microstate_transition_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +microstate_transition_data_df.insert(2, "Group_status", Group_status) + +# Save df +microstate_transition_data_df.to_pickle(os.path.join(Feature_savepath,"microstate_transition_data_df.pkl")) + +# Convert time covered data to Pandas dataframe +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, microstate_time_data.shape), indexing="ij"))) + [microstate_time_data.ravel()]) +microstate_time_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Microstate", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) +microstates = [1,2,3,4] + +index_values = [Subject_id,eye_status,microstates] +for col in range(len(index_values)): + col_name = microstate_time_df.columns[col] + for shape in reversed(range(microstate_time_data.shape[col])): # notice this is the shape of original numpy array. Not shape of DF + microstate_time_df.loc[microstate_time_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] +# Reversed in inner loop is used to avoid sequencial data being overwritten. +# E.g. if 0 is renamed to 1, then the next loop all 1's will be renamed to 2 + +# Add group status +Group_status = np.array(["CTRL"]*len(microstate_time_df["Subject_ID"])) +Group_status[np.array([i in cases for i in microstate_time_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +microstate_time_df.insert(2, "Group_status", Group_status) + +# Save df +microstate_time_df.to_pickle(os.path.join(Feature_savepath,"microstate_time_df.pkl")) + +# Transition data - mean +# Get index for groups +PTSD_idx = np.array([i in cases for i in Subject_id]) +CTRL_idx = np.array([not i in cases for i in Subject_id]) +n_groups = 2 + +microstate_transition_data_mean = np.zeros((n_groups,n_eye_status,n_maps,n_maps)) +microstate_transition_data_mean[0,:,:,:] = np.mean(microstate_transition_data[PTSD_idx,:,:,:], axis=0) +microstate_transition_data_mean[1,:,:,:] = np.mean(microstate_transition_data[CTRL_idx,:,:,:], axis=0) + +# Convert entropy data to Pandas dataframe +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, microstate_entropy_data.shape), indexing="ij"))) + [microstate_entropy_data.ravel()]) +microstate_entropy_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) + +index_values = [Subject_id,eye_status] +for col in range(len(index_values)): + col_name = microstate_entropy_df.columns[col] + for shape in reversed(range(microstate_entropy_data.shape[col])): # notice this is the shape of original numpy array. Not shape of DF + microstate_entropy_df.loc[microstate_entropy_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] +# Reversed in inner loop is used to avoid sequencial data being overwritten. +# E.g. if 0 is renamed to 1, then the next loop all 1's will be renamed to 2 + +# Add group status +Group_status = np.array(["CTRL"]*len(microstate_entropy_df["Subject_ID"])) +Group_status[np.array([i in cases for i in microstate_entropy_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +microstate_entropy_df.insert(2, "Group_status", Group_status) +# Add dummy variable for re-using plot code +dummy_variable = ["Entropy"]*len(Group_status) +microstate_entropy_df.insert(3, "Measurement", dummy_variable) + +# Save df +microstate_entropy_df.to_pickle(os.path.join(Feature_savepath,"microstate_entropy_df.pkl")) + +# %% Long-range temporal correlations (LRTC) +""" +See Hardstone et al, 2012 +Hurst exponent estimation steps: + 1. Preprocess + 2. Band-pass filter for frequency band of interest + 3. Hilbert transform to obtain amplitude envelope + 4. Perform DFA + 4.1 Compute cumulative sum of time series to create signal profile + 4.2 Define set of window sizes (see below) + 4.3 Remove the linear trend using least-squares for each window + 4.4 Calculate standard deviation for each window and take the mean + 4.5 Plot fluctuation function (Standard deviation) as function + for all window sizes, on double logarithmic scale + 4.6 The DFA exponent alpha correspond to Hurst exponent + f(L) = sd = L^alpha (with alpha as linear coefficient in log plot) + +If 0 < alpha < 0.5: The process exhibits anti-correlations +If 0.5 < alpha < 1: The process exhibits positive correlations +If alpha = 0.5: The process is indistinguishable from a random process +If 1.0 < alpha < 2.0: The process is non-stationary. H = alpha - 1 + +Window sizes should be equally spaced on a logarithmic scale +Sizes should be at least 4 samples and up to 10% of total signal length +Filters can influence neighboring samples, thus filters should be tested +on white noise to estimate window sizes that are unaffected by filters + +filter_length=str(2*1/fmin)+"s" # cannot be used with default transition bandwidth + +""" +# From simulations with white noise I determined window size thresholds for the 5 frequency bands: +thresholds = [7,7,7,6.5,6.5] +# And their corresponding log step sizes +with open("LRTC_log_win_sizes.pkl", "rb") as filehandle: + log_win_sizes = pickle.load(filehandle) + +# Variables for the the different conditions +# Sampling frequency +sfreq = final_epochs[0].info["sfreq"] +# Channels +ch_names = final_epochs[0].info["ch_names"] +n_channels = len(ch_names) +# Frequency +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) +# Eye status +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) + +### Estimating Hurst exponent for the data +# The data should be re-referenced to common average (Already done) + +# Data are transformed to numpy arrays +# Then divided into EO and EC and further into each of the 5 trials +# So DFA is estimated for each trial separately, which was concluded from simulations +gaps_trials_idx = np.load("Gaps_trials_idx.npy") # re-used from microstate analysis +n_trials = 5 + +H_data = [] +for i in range(n_subjects): + # Transform data to correct shape + temp_arr = final_epochs[i].get_data() # get data + arr_shape = temp_arr.shape # get shape + temp_arr = temp_arr.swapaxes(1,2) # swap ch and time axis + temp_arr = temp_arr.reshape(arr_shape[0]*arr_shape[2],arr_shape[1]) # reshape by combining epochs and times + # Get indices for eyes open and closed + EC_index = final_epochs[i].events[:,2] == 1 + EO_index = final_epochs[i].events[:,2] == 2 + # Repeat with 4s * sample frequency to correct for concatenation of times and epochs + EC_index = np.repeat(EC_index,4*sfreq) + EO_index = np.repeat(EO_index,4*sfreq) + # Divide into eye status + EC_data = temp_arr[EC_index] + EO_data = temp_arr[EO_index] + # Divide into trials + EC_gap_idx = np.array([0]+list(gaps_trials_idx[i,0])+[len(EC_data)]) + EO_gap_idx = np.array([0]+list(gaps_trials_idx[i,1])+[len(EO_data)]) + + EC_trial_data = [] + EO_trial_data = [] + for t in range(n_trials): + EC_trial_data.append(EC_data[EC_gap_idx[t]:EC_gap_idx[t+1]]) + EO_trial_data.append(EO_data[EO_gap_idx[t]:EO_gap_idx[t+1]]) + + # Save data + H_data.append([EC_trial_data,EO_trial_data]) # output [subject][eye][trial][time,ch] + +# Calculate H for each subject, eye status, trial, freq and channel +H_arr = np.zeros((n_subjects,n_eye_status,n_trials,n_channels,n_freq_bands)) +w_len = [len(ele) for ele in log_win_sizes] +DFA_arr = np.empty((n_subjects,n_eye_status,n_trials,n_channels,n_freq_bands,2,np.max(w_len))) +DFA_arr[:] = np.nan + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print("Started",c_time1) + +# Nolds are already using all cores so multiprocessing with make it slower +# Warning occurs when R2 is estimated during detrending - but R2 is not used +warnings.simplefilter("ignore") +for i in range(n_subjects): + # Pre-allocate memory + DFA_temp = np.empty((n_eye_status,n_trials,n_channels,n_freq_bands,2,np.max(w_len))) + DFA_temp[:] = np.nan + H_temp = np.empty((n_eye_status,n_trials,n_channels,n_freq_bands)) + for e in range(n_eye_status): + for trial in range(n_trials): + for c in range(n_channels): + # Get the data + signal = H_data[i][e][trial][:,c] + + counter = 0 # prepare counter + for fmin, fmax in Freq_Bands.values(): + # Filter for each freq band + signal_filtered = mne.filter.filter_data(signal, sfreq=sfreq, verbose=0, + l_freq=fmin, h_freq=fmax) + # Hilbert transform + analytic_signal = scipy.signal.hilbert(signal_filtered) + # Get Amplitude envelope + # np.abs is the same as np.linalg.norm, i.e. the length for complex input which is the amplitude + ampltude_envelope = np.abs(analytic_signal) + # Perform DFA using predefined window sizes from simulation + a, dfa_data = nolds.dfa(ampltude_envelope, + nvals=np.exp(log_win_sizes[counter]).astype("int"), + debug_data=True) + # Save DFA results + DFA_temp[e,trial,c,counter,:,0:w_len[counter]] = dfa_data[0:2] + H_temp[e,trial,c,counter] = a + # Update counter + counter += 1 + + # Print run status + print("Finished {} out of {}".format(i+1,n_subjects)) + # Save the results + H_arr[i] = H_temp + DFA_arr[i] = DFA_temp + +warnings.simplefilter("default") + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nCurrent Time",c_time2) + +# Save the DFA analysis data +np.save(Feature_savepath+"DFA_arr.npy", DFA_arr) +np.save(Feature_savepath+"H_arr.npy", H_arr) + +# Load +DFA_arr = np.load(Feature_savepath+"DFA_arr.npy") +H_arr = np.load(Feature_savepath+"H_arr.npy") + +# Average the Hurst Exponent across trials +H_arr = np.mean(H_arr, axis=2) + +# Convert to Pandas dataframe (Hurst exponent) +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, H_arr.shape), indexing="ij"))) + [H_arr.ravel()]) +H_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Channel", "Freq_band", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) +ch_name = final_epochs[0].info["ch_names"] + +index_values = [Subject_id,eye_status,ch_name,list(Freq_Bands.keys())] +for col in range(len(index_values)): + col_name = H_data_df.columns[col] + for shape in range(H_arr.shape[col]): # notice this is the shape of original numpy array. Not shape of DF + H_data_df.loc[H_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] + +# Add group status +Group_status = np.array(["CTRL"]*len(H_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in H_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +H_data_df.insert(2, "Group_status", Group_status) + +# Fix Freq_band categorical order +H_data_df["Freq_band"] = H_data_df["Freq_band"].astype("category").\ + cat.reorder_categories(list(Freq_Bands.keys()), ordered=True) + +# Global Hurst exponent +H_data_df_global = H_data_df.groupby(["Subject_ID", "Eye_status", "Freq_band"]).mean().reset_index() # by default pandas mean skip nan +# Add group status (cannot use group_by as each subject only have 1 group, not both) +Group_status = np.array(["CTRL"]*len(H_data_df_global["Subject_ID"])) +Group_status[np.array([i in cases for i in H_data_df_global["Subject_ID"]])] = "PTSD" +# Add to dataframe +H_data_df_global.insert(2, "Group_status", Group_status) +# Add dummy variable for re-using plot code +dummy_variable = ["Global Hurst Exponent"]*H_data_df_global.shape[0] +H_data_df_global.insert(3, "Measurement", dummy_variable ) + +# Save the data +H_data_df.to_pickle(os.path.join(Feature_savepath,"H_data_df.pkl")) +H_data_df_global.to_pickle(os.path.join(Feature_savepath,"H_data_global_df.pkl")) + +# %% Source localization of sensor data +# Using non-interpolated channels +# Even interpolated channels during preprocessing and visual inspection +# are dropped + +# Prepare epochs for estimation of source connectivity +source_epochs = [0]*n_subjects +for i in range(n_subjects): + source_epochs[i] = final_epochs[i].copy() + +### Make forward solutions +# A forward solution is first made for all individuals with no dropped channels +# Afterwards individual forward solutions are made for subjects with bad +# channels that were interpolated in preprocessing and these are dropped +# First forward operator is computed using a template MRI for each dataset +fs_dir = "/home/glia/MNE-fsaverage-data/fsaverage" +subjects_dir = os.path.dirname(fs_dir) +trans = "fsaverage" +src = os.path.join(fs_dir, "bem", "fsaverage-ico-5-src.fif") +bem = os.path.join(fs_dir, "bem", "fsaverage-5120-5120-5120-bem-sol.fif") + +# Read the template sourcespace +sourcespace = mne.read_source_spaces(src) + +temp_idx = 0 # Index with subject that had no bad channels +subject_eeg = source_epochs[temp_idx].copy() +subject_eeg.set_eeg_reference(projection=True) # needed for inverse modelling +# Make forward solution +fwd = mne.make_forward_solution(subject_eeg.info, trans=trans, src=src, + bem=bem, eeg=True, mindist=5.0, n_jobs=1) +# Save forward operator +fname_fwd = "./Source_fwd/fsaverage-fwd.fif" +mne.write_forward_solution(fname_fwd, fwd, overwrite=True) + +# A specific forward solution is also made for each subject with bad channels +with open("./Preprocessing/bad_ch.pkl", "rb") as file: + bad_ch = pickle.load(file) + +All_bad_ch = bad_ch +All_drop_epochs = dropped_epochs_df +All_dropped_ch = [] + +Bad_ch_idx = [idx for idx, item in enumerate(All_bad_ch) if item != 0] +Bad_ch_subjects = All_drop_epochs["Subject_ID"][Bad_ch_idx] +# For each subject with bad channels, drop the channels and make forward operator +for n in range(len(Bad_ch_subjects)): + Subject = Bad_ch_subjects.iloc[n] + try: + Subject_idx = Subject_id.index(Subject) + # Get unique bad channels + Bad_ch0 = All_bad_ch[Bad_ch_idx[n]] + Bad_ch1 = [] + for i2 in range(len(Bad_ch0)): + if type(Bad_ch0[i2]) == list: + for i3 in range(len(Bad_ch0[i2])): + Bad_ch1.append(Bad_ch0[i2][i3]) + elif type(Bad_ch0[i2]) == str: + Bad_ch1.append(Bad_ch0[i2]) + Bad_ch1 = np.unique(Bad_ch1) + # Drop the bad channels + source_epochs[Subject_idx].drop_channels(Bad_ch1) + # Save the overview of dropped channels + All_dropped_ch.append([Subject,Subject_idx,Bad_ch1]) + # Make forward operator + subject_eeg = source_epochs[Subject_idx].copy() + subject_eeg.set_eeg_reference(projection=True) # needed for inverse modelling + # Make forward solution + fwd = mne.make_forward_solution(subject_eeg.info, trans=trans, src=src, + bem=bem, eeg=True, mindist=5.0, n_jobs=1) + # Save forward operator + fname_fwd = "./Source_fwd/fsaverage_{}-fwd.fif".format(Subject) + mne.write_forward_solution(fname_fwd, fwd, overwrite=True) + except: + print(Subject,"was already dropped") + +with open("./Preprocessing/All_datasets_bad_ch.pkl", "wb") as filehandle: + pickle.dump(All_dropped_ch, filehandle) + + +# %% Load forward operators +# Re-use for all subjects without dropped channels +fname_fwd = "./Source_fwd/fsaverage-fwd.fif" +fwd = mne.read_forward_solution(fname_fwd) + +fwd_list = [fwd]*n_subjects + +# Use specific forward solutions for subjects with dropped channels +with open("./Preprocessing/All_datasets_bad_ch.pkl", "rb") as file: + All_dropped_ch = pickle.load(file) + +for i in range(len(All_dropped_ch)): + Subject = All_dropped_ch[i][0] + Subject_idx = All_dropped_ch[i][1] + fname_fwd = "./Source_fwd/fsaverage_{}-fwd.fif".format(Subject) + fwd = mne.read_forward_solution(fname_fwd) + fwd_list[Subject_idx] = fwd + +# Check the correct number of channels are present in fwd +random_point = int(np.random.randint(0,len(All_dropped_ch)-1,1)) +assert len(fwds[All_dropped_ch[random_point][1]].ch_names) == source_epochs[All_dropped_ch[random_point][1]].info["nchan"] + +# %% Make parcellation +# After mapping to source space, I end up with 20484 vertices +# but I wanted to map to fewer sources and not many more +# Thus I need to perform parcellation +# Get labels for FreeSurfer "aparc" cortical parcellation (example with 74 labels/hemi - Destriuex) +labels_aparc = mne.read_labels_from_annot("fsaverage", parc="aparc.a2009s", + subjects_dir=subjects_dir) +labels_aparc = labels_aparc[:-2] # remove unknowns + +labels_aparc_names = [label.name for label in labels_aparc] + +# Manually adding the 31 ROIs (14-lh/rh + 3 in midline) from Toll et al, 2020 +# Making fuction to take subset of a label +def label_subset(label, subset, name="ROI_name"): + label_subset = mne.Label(label.vertices[subset], label.pos[subset,:], + label.values[subset], label.hemi, + name = "{}-{}".format(name,label.hemi), + subject = label.subject, color = None) + return label_subset + +### Visual area 1 (V1 and somatosensory cortex BA1-3) +label_filenames = ["lh.V1.label", "rh.V1.label", + "lh.BA1.label", "rh.BA1.label", + "lh.BA2.label", "rh.BA2.label", + "lh.BA3a.label", "rh.BA3a.label", + "lh.BA3b.label", "rh.BA3b.label"] +labels0 = [0]*len(label_filenames) +for i, filename in enumerate(label_filenames): + labels0[i] = mne.read_label(os.path.join(fs_dir, "label", filename), subject="fsaverage") +# Add V1 to final label variable +labels = labels0[:2] +# Rename to remove redundant hemi information +labels[0].name = "V1-{}".format(labels[0].hemi) +labels[1].name = "V1-{}".format(labels[1].hemi) +# Assign a color +labels[0].color = matplotlib.colors.to_rgba("salmon") +labels[1].color = matplotlib.colors.to_rgba("salmon") +# Combine Brodmann Areas for SMC. Only use vertices ones to avoid duplication error +SMC_labels = labels0[2:] +for hem in range(2): + SMC_p1 = SMC_labels[hem] + for i in range(1,len(SMC_labels)//2): + SMC_p2 = SMC_labels[hem+2*i] + p2_idx = np.isin(SMC_p2.vertices, SMC_p1.vertices, invert=True) + SMC_p21 = label_subset(SMC_p2, p2_idx, "SMC") + SMC_p1 = SMC_p1.__add__(SMC_p21) + SMC_p1.name = SMC_p21.name + # Assign a color + SMC_p1.color = matplotlib.colors.to_rgba("orange") + labels.append(SMC_p1) + +### Inferior frontal junction +# Located at junction between inferior frontal and inferior precentral sulcus +label_aparc_names0 = ["S_front_inf","S_precentral-inf-part"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) + +pos1 = temp_labels[0].pos +pos2 = temp_labels[2].pos +distm = scipy.spatial.distance.cdist(pos1,pos2) +# Find the closest points between the 2 ROIs +l1_idx = np.unique(np.where(distm<np.quantile(distm, 0.001))[0]) # q chosen to correspond to around 10% of ROI +l2_idx = np.unique(np.where(distm<np.quantile(distm, 0.0005))[1]) # q chosen to correspond to around 10% of ROI + +IFJ_label_p1 = label_subset(temp_labels[0], l1_idx, "IFJ") +IFJ_label_p2 = label_subset(temp_labels[2], l2_idx, "IFJ") +# Combine the 2 parts +IFJ_label = IFJ_label_p1.__add__(IFJ_label_p2) +IFJ_label.name = IFJ_label_p1.name +# Assign a color +IFJ_label.color = matplotlib.colors.to_rgba("chartreuse") +# Append to final list +labels.append(IFJ_label) + +# Do the same for the right hemisphere +pos1 = temp_labels[1].pos +pos2 = temp_labels[3].pos +distm = scipy.spatial.distance.cdist(pos1,pos2) +# Find the closest points between the 2 ROIs +l1_idx = np.unique(np.where(distm<np.quantile(distm, 0.00075))[0]) # q chosen to correspond to around 10% of ROI +l2_idx = np.unique(np.where(distm<np.quantile(distm, 0.0005))[1]) # q chosen to correspond to around 10% of ROI +IFJ_label_p1 = label_subset(temp_labels[1], l1_idx, "IFJ") +IFJ_label_p2 = label_subset(temp_labels[3], l2_idx, "IFJ") +# Combine the 2 parts +IFJ_label = IFJ_label_p1.__add__(IFJ_label_p2) +IFJ_label.name = IFJ_label_p1.name +# Assign a color +IFJ_label.color = matplotlib.colors.to_rgba("chartreuse") +# Append to final list +labels.append(IFJ_label) + +### Intraparietal sulcus +label_aparc_names0 = ["S_intrapariet_and_P_trans"] +labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[0])] +for i in range(len(labels_aparc_idx)): + labels.append(labels_aparc[labels_aparc_idx[i]].copy()) + labels[-1].name = "IPS-{}".format(labels[-1].hemi) + +### Frontal eye field as intersection between middle frontal gyrus and precentral gyrus +label_aparc_names0 = ["G_front_middle","G_precentral"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) + +# Take 10% of middle frontal gyrus closest to precentral gyrus (most posterior) +temp_label0 = temp_labels[0] +G_fm_y = temp_label0.pos[:,1] +thres_G_fm_y = np.sort(G_fm_y)[len(G_fm_y)//10] +idx_p1 = np.where(G_fm_y<thres_G_fm_y)[0] +FEF_label_p1 = label_subset(temp_label0, idx_p1, "FEF") +# Take 10% closest for precentral gyrus (most anterior) +temp_label0 = temp_labels[2] +# I cannot only use y (anterior/posterior) but also need to restrict z-position +G_pre_cen_z = temp_label0.pos[:,2] +thres_G_pre_cen_z = 0.04 # visually inspected threshold +G_pre_cen_y = temp_label0.pos[:,1] +thres_G_pre_cen_y = np.sort(G_pre_cen_y[G_pre_cen_z>thres_G_pre_cen_z])[-len(G_pre_cen_y)//10] # notice - for anterior +idx_p2 = np.where((G_pre_cen_y>thres_G_pre_cen_y) & (G_pre_cen_z>thres_G_pre_cen_z))[0] +FEF_label_p2 = label_subset(temp_label0, idx_p2, "FEF") +# Combine the 2 parts +FEF_label = FEF_label_p1.__add__(FEF_label_p2) +FEF_label.name = FEF_label_p1.name +# Assign a color +FEF_label.color = matplotlib.colors.to_rgba("aqua") +# Append to final list +labels.append(FEF_label) + +# Do the same for the right hemisphere +temp_label0 = temp_labels[1] +G_fm_y = temp_label0.pos[:,1] +thres_G_fm_y = np.sort(G_fm_y)[len(G_fm_y)//10] +idx_p1 = np.where(G_fm_y<thres_G_fm_y)[0] +FEF_label_p1 = label_subset(temp_label0, idx_p1, "FEF") + +temp_label0 = temp_labels[3] +G_pre_cen_z = temp_label0.pos[:,2] +thres_G_pre_cen_z = 0.04 # visually inspected threshold +G_pre_cen_y = temp_label0.pos[:,1] +thres_G_pre_cen_y = np.sort(G_pre_cen_y[G_pre_cen_z>thres_G_pre_cen_z])[-len(G_pre_cen_y)//10] # notice - for anterior +idx_p2 = np.where((G_pre_cen_y>thres_G_pre_cen_y) & (G_pre_cen_z>thres_G_pre_cen_z))[0] +FEF_label_p2 = label_subset(temp_label0, idx_p2, "FEF") +# Combine the 2 parts +FEF_label = FEF_label_p1.__add__(FEF_label_p2) +FEF_label.name = FEF_label_p1.name +# Assign a color +FEF_label.color = matplotlib.colors.to_rgba("aqua") +# Append to final list +labels.append(FEF_label) + +### Supplementary eye fields +# Located at caudal end of frontal gyrus and upper part of paracentral sulcus +label_aparc_names0 = ["G_and_S_paracentral","G_front_sup"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) + +pos1 = temp_labels[0].pos +pos2 = temp_labels[2].pos +distm = scipy.spatial.distance.cdist(pos1,pos2) +# Find the closest points between the 2 ROIs +l1_idx = np.unique(np.where(distm<np.quantile(distm, 0.0005))[0]) # q chosen to correspond to around 15% of ROI +l2_idx = np.unique(np.where(distm<np.quantile(distm, 0.005))[1]) # q chosen to correspond to around 10% of ROI +# Notice that superior frontal gyrus is around 4 times bigger than paracentral +len(l1_idx)/pos1.shape[0] +len(l2_idx)/pos2.shape[0] +# Only use upper part +z_threshold = 0.06 # visually inspected +l1_idx = l1_idx[pos1[l1_idx,2] > z_threshold] +l2_idx = l2_idx[pos2[l2_idx,2] > z_threshold] + +SEF_label_p1 = label_subset(temp_labels[0], l1_idx, "SEF") +SEF_label_p2 = label_subset(temp_labels[2], l2_idx, "SEF") +# Combine the 2 parts +SEF_label = SEF_label_p1.__add__(SEF_label_p2) +SEF_label.name = SEF_label_p1.name +# Assign a color +SEF_label.color = matplotlib.colors.to_rgba("royalblue") +# Append to final list +labels.append(SEF_label) + +# Do the same for the right hemisphere +pos1 = temp_labels[1].pos +pos2 = temp_labels[3].pos +distm = scipy.spatial.distance.cdist(pos1,pos2) +# Find the closest points between the 2 ROIs +l1_idx = np.unique(np.where(distm<np.quantile(distm, 0.0005))[0]) # q chosen to correspond to around 15% of ROI +l2_idx = np.unique(np.where(distm<np.quantile(distm, 0.005))[1]) # q chosen to correspond to around 10% of ROI +# Notice that superior frontal gyrus is around 4 times bigger than paracentral +len(l1_idx)/pos1.shape[0] +len(l2_idx)/pos2.shape[0] +# Only use upper part +z_threshold = 0.06 # visually inspected +l1_idx = l1_idx[pos1[l1_idx,2] > z_threshold] +l2_idx = l2_idx[pos2[l2_idx,2] > z_threshold] + +SEF_label_p1 = label_subset(temp_labels[1], l1_idx, "SEF") +SEF_label_p2 = label_subset(temp_labels[3], l2_idx, "SEF") +# Combine the 2 parts +SEF_label = SEF_label_p1.__add__(SEF_label_p2) +SEF_label.name = SEF_label_p1.name +# Assign a color +SEF_label.color = matplotlib.colors.to_rgba("royalblue") +# Append to final list +labels.append(SEF_label) + +### Posterior cingulate cortex +label_aparc_names0 = ["G_cingul-Post-dorsal", "G_cingul-Post-ventral"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) +labels0 = [] +for hem in range(2): + PCC_p1 = temp_labels[hem] + for i in range(1,len(temp_labels)//2): + PCC_p2 = temp_labels[hem+2*i] + PCC_p1 = PCC_p1.__add__(PCC_p2) + PCC_p1.name = "PCC-{}".format(PCC_p1.hemi) + labels0.append(PCC_p1) +# Combine the 2 hemisphere in 1 label +labels.append(labels0[0].__add__(labels0[1])) + +### Medial prefrontal cortex +# From their schematic it looks like rostral 1/4 of superior frontal gyrus +label_aparc_names0 = ["G_front_sup"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels0 = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels0 = temp_labels0.split(4, subjects_dir=subjects_dir)[3] + temp_labels0.name = "mPFC-{}".format(temp_labels0.hemi) + temp_labels.append(temp_labels0) +# Combine the 2 hemisphere in 1 label +labels.append(temp_labels[0].__add__(temp_labels[1])) + +### Angular gyrus +label_aparc_names0 = ["G_pariet_inf-Angular"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels.name = "ANG-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +### Posterior middle frontal gyrus +label_aparc_names0 = ["G_front_middle"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels = temp_labels.split(2, subjects_dir=subjects_dir)[0] + temp_labels.name = "PMFG-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +### Inferior parietal lobule +# From their parcellation figure seems to be rostral angular gyrus and posterior supramarginal gyrus +label_aparc_names0 = ["G_pariet_inf-Angular","G_pariet_inf-Supramar"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) +# Split angular in 2 and get rostral part +temp_labels[0] = temp_labels[0].split(2, subjects_dir=subjects_dir)[1] +temp_labels[1] = temp_labels[1].split(2, subjects_dir=subjects_dir)[1] +# Split supramarginal in 2 and get posterior part +temp_labels[2] = temp_labels[2].split(2, subjects_dir=subjects_dir)[0] +temp_labels[3] = temp_labels[3].split(2, subjects_dir=subjects_dir)[0] + +for hem in range(2): + PCC_p1 = temp_labels[hem] + for i in range(1,len(temp_labels)//2): + PCC_p2 = temp_labels[hem+2*i] + PCC_p1 = PCC_p1.__add__(PCC_p2) + PCC_p1.name = "IPL-{}".format(PCC_p1.hemi) + labels.append(PCC_p1) + +### Orbital gyrus +# From their figure it seems to correspond to orbital part of inferior frontal gyrus +label_aparc_names0 = ["G_front_inf-Orbital"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels.name = "ORB-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +### Middle temporal gyrus +# From their figure it seems to only be 1/4 of MTG at the 2nd to last caudal part +label_aparc_names0 = ["G_temporal_middle"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels = temp_labels.split(4, subjects_dir=subjects_dir)[1] + temp_labels.name = "MTG-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +### Anterior middle frontal gyrus +label_aparc_names0 = ["G_front_middle"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels = temp_labels.split(2, subjects_dir=subjects_dir)[1] + temp_labels.name = "AMFG-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +### Insula +label_aparc_names0 = ["G_Ins_lg_and_S_cent_ins","G_insular_short"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) +for hem in range(2): + PCC_p1 = temp_labels[hem] + for i in range(1,len(temp_labels)//2): + PCC_p2 = temp_labels[hem+2*i] + PCC_p1 = PCC_p1.__add__(PCC_p2) + PCC_p1.name = "INS-{}".format(PCC_p1.hemi) + labels.append(PCC_p1) + +### (Dorsal) Anterior Cingulate Cortex +label_aparc_names0 = ["G_and_S_cingul-Ant"] +temp_labels = [] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels.append(labels_aparc[labels_aparc_idx[i2]].copy()) + temp_labels[-1].name = "ACC-{}".format(temp_labels[-1].hemi) +# Combine the 2 hemisphere in 1 label +labels.append(temp_labels[0].__add__(temp_labels[1])) + +### Supramarginal Gyrus +label_aparc_names0 = ["G_pariet_inf-Supramar"] +for i in range(len(label_aparc_names0)): + labels_aparc_idx = [labels_aparc_names.index(l) for l in labels_aparc_names if l.startswith(label_aparc_names0[i])] + for i2 in range(len(labels_aparc_idx)): + temp_labels = labels_aparc[labels_aparc_idx[i2]].copy() + temp_labels.name = "SUP-{}".format(temp_labels.hemi) + labels.append(temp_labels) + +print("{} ROIs have been defined".format(len(labels))) + +# # Visualize positions +# fig = plt.figure() +# ax = fig.add_subplot(111, projection="3d") +# for i in range(0,3): +# temp_pos = temp_labels[i].pos +# ax.scatter(temp_pos[:,0],temp_pos[:,1],temp_pos[:,2], marker="o", alpha=0.1) +# # Add to plot +# ax.scatter(labels[-1].pos[:,0],labels[-1].pos[:,1],labels[-1].pos[:,2], marker="o") + +# # Visualize the labels +# # temp_l = labels_aparc[labels_aparc_idx[0]] +# temp_l = labels[-2] +# l_stc = stc[100].in_label(temp_l) +# l_stc.vertices + +# l_stc.plot(**surfer_kwargs) + +# Save the annotation file +with open("custom_aparc2009_Li_et_al_2022.pkl", "wb") as file: + pickle.dump(labels, file) + +# %% Calculate orthogonalized power envelope connectivity in source space +# In non-interpolated channels +# Updated 22/1 - 2021 to use delta = 1/81 and assumption +# about non-correlated and equal variance noise covariance matrix for channels + +# Load +with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file: + labels = pickle.load(file) +label_names = [label.name for label in labels] + +# Define function to estimate PEC +def PEC_estimation(x, freq_bands, sfreq=200): + """ + This function takes a source timeseries signal x and performs: + 1. Bandpass filtering + 2. Hilbert transform to yield analytical signal + 3. Compute all to all connectivity by iteratively computing for each pair + a. Orthogonalization + b. Computing power envelopes by squaring the signals |x|^2 + c. Log-transform to enhance normality + d. Pearson's correlation between each pair + e. Fisher's r-to-z transform to enhance normality + The code has been optimized by inspiration from MNE-Python's function: + mne.connectivity.enelope_correlation. + + In MNE-python version < 0.22 there was a bug, but after the fix in 0.22 + the mne function is equivalent to my implementation, although they don't + use epsilon but gives same result with a RuntimeWarning about log(0) + + IMPORTANT NOTE: + Filtering introduce artifacts for first and last timepoint + The values are very low, more than 1e-12 less than the others + If they are not removed, then they will heavily influence Pearson's + correlation as it is outlier sensitive + + Inputs: + x - The signal in source space as np.array with shape (ROIs,Timepoints) + freq_bands - The frequency bands of interest as a dictionary e.g. + {"alpha": [8.0, 13.0], "beta": [13.0, 30.0]} + sfreq - The sampling frequency in Hertz + + Output: + The pairwise connectivity matrix + """ + n_roi, n_timepoints = x.shape + n_freq_bands = len(freq_bands) + + epsilon = 1e-100 # small value to prevent log(0) errors + + # Filter the signal in the different freq bands + PEC_con0 = np.zeros((n_roi,n_roi,n_freq_bands)) + for fname, frange in freq_bands.items(): + fmin, fmax = [float(interval) for interval in frange] + signal_filtered = mne.filter.filter_data(x, sfreq, fmin, fmax, + fir_design="firwin", verbose=0) + # Filtering on finite signals will yield very low values for first + # and last timepoint, which can create outliers. E.g. 1e-29 compared to 1e-14 + # Outlier sensitive methods, like Pearson's correlation, is therefore + # heavily affected and this systematic error is removed by removing + # the first and last timepoint + signal_filtered = signal_filtered[:,1:-1] + + # Hilbert transform + analytic_signal = scipy.signal.hilbert(signal_filtered) + # I will use x and y to keep track of orthogonalization + x0 = analytic_signal + # Get power envelope + x0_mag = np.abs(x0) + # Get scaled conjugate used for orthogonalization estimation + x0_conj_scaled = x0.conj() + x0_conj_scaled /= x0_mag + # Take square power envelope + PEx = np.square(x0_mag) + # Take log transform + lnPEx = np.log(PEx+epsilon) + # Remove mean for Pearson correlation calculation + lnPEx_nomean = lnPEx - np.mean(lnPEx, axis=-1, keepdims=True) # normalize each roi timeseries + # Get std for Pearson correlation calculation + lnPEx_std = np.std(lnPEx, axis=-1) + lnPEx_std[lnPEx_std == 0] = 1 # Prevent std = 0 problems + # Prepare con matrix + con0 = np.zeros((n_roi,n_roi)) + for roi_r, y0 in enumerate(x0): # for each y0 + # Calculate orthogonalized signal y with respect to x for all x + # Using y_ort = imag(y*x_conj/|x|) + # I checked the formula in temp_v3 and it works as intended + # I want to orthogonalize element wise for each timepoint + y0_ort = (y0*x0_conj_scaled).imag + # Here y0_ort.shape = (n_roi, n_timepoints) + # So y is current roi and the first axis gives each x it is orthogonalized to + # Take the abs to get power envelope + y0_ort = np.abs(y0_ort) + # Prevent log(0) error when calculating y_ort on y + y0_ort[roi_r] = 1. # this will be 0 zero after mean subtraction + # Take square power envelope + PEy = np.square(y0_ort) # squared power envelope + # Take log transform + lnPEy = np.log(PEy+epsilon) + # Remove mean for pearson correlation calculation + lnPEy_nomean = lnPEy - np.mean(lnPEy, axis=-1, keepdims=True) + # Get std for Pearson correlation calculation + lnPEy_std = np.std(lnPEy, axis=-1) + lnPEy_std[lnPEy_std == 0] = 1. + # Pearson correlation is expectation of X_nomean * Y_nomean for each time-series divided with standard deviations + PEC = np.mean(lnPEx_nomean*lnPEy_nomean, axis=-1) + PEC /= lnPEx_std + PEC /= lnPEy_std + con0[roi_r] = PEC + # The con0 connectivity matrix should be read as correlation between + # orthogonalized y (row number) and x (column number) + # It is not symmetrical, as cor(roi2_ort, roi1) is not cor(roi1_ort, roi2) + # To make it symmetrical the average of the absolute correlation + # of the 2 possibilities for each pair are taken + con0 = np.abs(con0) + con0 = (con0.T+con0)/2. + # Fisher's z transform - which is equivalent to arctanh + con0 = np.arctanh(con0) + # The diagonal is not 0 as I wanted to avoid numerical errors with log(0) + # and used a small epsilon value. Thus the diagonal is explicitly set to 0 + + # Save to array + PEC_con0[:,:,list(freq_bands.keys()).index(fname)] = con0 + return PEC_con0 + +# Prepare variables +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) +n_roi = len(labels) + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +# PEC analysis +PEC_data_list = [0]*n_subjects +STCs_list = [0]*n_subjects + +# Using inverse operator as generator interferes with concurrent processes +# If I run it for multiple subjects I run out of ram +# Thus concurrent processes are used inside the for loop +def PEC_analysis(input_args): # iterable epoch number and corresponding ts + i2, ts = input_args + # Estimate PEC + PEC_con0 = PEC_estimation(ts, Freq_Bands, sfreq) + print("Finished {} out of {} epochs".format(i2+1,n_epochs)) + return i2, PEC_con0, ts + +for i in range(n_subjects): + n_epochs, n_ch, n_timepoints = source_epochs[i].get_data().shape + # Use different forward solutions depending on number of channels + cur_subject_id = Subject_id[i] + fwd = fwds[i] + + # Using assumption about equal variance and no correlations I make a diagonal matrix + # Using the default option for 0.2µV std for EEG data + noise_cov = mne.make_ad_hoc_cov(source_epochs[i].info, None) + + # Make inverse operator + # Using default depth parameter = 0.8 and free orientation (loose = 1) + inverse_operator = mne.minimum_norm.make_inverse_operator(source_epochs[i].info, + fwd, noise_cov, + loose = 1, depth = 0.8, + verbose = 0) + src_inv = inverse_operator["src"] + # Compute inverse solution and retrieve time series for each label + # Preallocate memory + label_ts = np.full((n_epochs,len(labels),n_timepoints),np.nan) + # Define regularization + snr = 9 # Zhang et al, 2020 used delta = 1/81, which is inverse SNR and correspond to lambda2 + # A for loop is used for each label due to memory issues when doing all labels at the same time + for l in range(len(labels)): + stc = mne.minimum_norm.apply_inverse_epochs(source_epochs[i],inverse_operator, + lambda2 = 1/(snr**2), + label = labels[l], + pick_ori = "vector", + return_generator=False, + method = "MNE", verbose = 0) + # Use PCA to reduce the 3 orthogonal directions to 1 principal direction with max power + # There can be ambiguity about the orientation, thus the one that + # is pointing most "normal", i.e. closest 90 degrees to the skull is used + stc_pca = [0]*len(stc) + for ep in range(n_epochs): + stc_pca[ep], pca_dir = stc[ep].project(directions="pca", src=src_inv) + # Get mean time series for the whole label + temp_label_ts = mne.extract_label_time_course(stc_pca, labels[l], src_inv, mode="mean_flip", + return_generator=False, verbose=0) + # Save to array + label_ts[:,l,:] = np.squeeze(np.array(temp_label_ts)) + print("Finished estimating STC for {} out of {} ROIs".format(l+1,len(labels))) + + # Free up memory + del stc + + # Prepare variables + sfreq=source_epochs[i].info["sfreq"] + n_epochs = len(source_epochs[i]) + # Estimate the pairwise PEC for each epoch + PEC_con_subject = np.zeros((n_epochs,n_roi,n_roi,n_freq_bands)) + stcs0 = np.zeros((n_epochs,n_roi,int(sfreq)*4)) # 4s epochs + # Make list of arguments to pass into PEC_analysis using the helper func + args = [] + for i2 in range(n_epochs): + args.append((i2,label_ts[i2])) + + with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor: + for i2, PEC_result, stc_result in executor.map(PEC_analysis, args): # Function and arguments + PEC_con_subject[i2] = PEC_result + stcs0[i2] = stc_result + + # Save to list + PEC_data_list[i] = PEC_con_subject # [subject](epoch,ch,ch,freq) + STCs_list[i] = stcs0 # [subject][epoch,roi,timepoint] + + # Print progress + print("Finished {} out of {} subjects".format(i+1,n_subjects)) + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nFinished",c_time2) + +with open(Feature_savepath+"PEC_each_epoch_drop_interpol_ch_fix_snr.pkl", "wb") as file: + pickle.dump(PEC_data_list, file) +with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "wb") as file: + pickle.dump(STCs_list, file) + +# # # Load +# with open(Feature_savepath+"PEC_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file: +# PEC_data_list = pickle.load(file) + +# # Load +# with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file: +# STCs_list = pickle.load(file) + +# Average over eye status +eye_status = list(source_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) +pec_data = np.zeros((n_subjects,n_eye_status,n_roi,n_roi,n_freq_bands)) +for i in range(n_subjects): + # Get indices for eyes open and closed + EC_index = source_epochs[i].events[:,2] == 1 + EO_index = source_epochs[i].events[:,2] == 2 + # Average over the indices and save to array + pec_data[i,0] = np.mean(PEC_data_list[i][EC_index], axis=0) + pec_data[i,1] = np.mean(PEC_data_list[i][EO_index], axis=0) + # Only use the lower diagonal as the diagonal should be 0 (or very small due to numerical errors) + # And it is symmetric + for f in range(n_freq_bands): + pec_data[i,0,:,:,f] = np.tril(pec_data[i,0,:,:,f],k=-1) + pec_data[i,1,:,:,f] = np.tril(pec_data[i,1,:,:,f],k=-1) + +# Also save as dataframe format for feature selection +# Convert to Pandas dataframe +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, pec_data.shape), indexing="ij"))) + [pec_data.ravel()]) +pec_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"]) +# Change from numerical coding to actual values +eye_status = list(source_epochs[0].event_id.keys()) +freq_bands_name = list(Freq_Bands.keys()) +label_names = [label.name for label in labels] + +index_values = [Subject_id,eye_status,label_names,label_names,freq_bands_name] +for col in range(len(index_values)): + col_name = pec_data_df.columns[col] + for shape in range(pec_data.shape[col]): # notice not dataframe but the array + pec_data_df.loc[pec_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] + +# Add group status +Group_status = np.array(["CTRL"]*len(pec_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in pec_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +pec_data_df.insert(3, "Group_status", Group_status) + +# Remove all diagonal and upper-matrix entries by removing zeros +pec_data_df = pec_data_df.iloc[pec_data_df["Value"].to_numpy().nonzero()] + +# Save df +pec_data_df.to_pickle(os.path.join(Feature_savepath,"pec_data_drop_interpol_ch_df.pkl")) + +# %% Sparse clustering of PEC for subtyping PTSD group +# They did it for both eye status together, so all data in one matrix +# Load PEC df +# pec_data_df = pd.read_pickle(os.path.join(Feature_savepath,"pec_data_df.pkl")) +pec_data_df = pd.read_pickle(os.path.join(Feature_savepath,"pec_data_drop_interpol_ch_df.pkl")) + +# Convert to wide format +# Make function to add measurement column for indexing +def add_measurement_column(df, measurement = "Text"): + dummy_variable = [measurement]*df.shape[0] + df.insert(1, "Measurement", dummy_variable) + return df +# Make function to convert column tuple to string +def convertTupleHeader(header): + header = list(header) + str = "_".join(header) + return str + +# Prepare overall dataframe +PEC_df = pd.DataFrame(Subject_id, columns = ["Subject_ID"]) +# Add PEC +pec_data_df = add_measurement_column(pec_data_df, "PEC") +temp_df = pec_data_df.pivot_table(index="Subject_ID",columns=["Measurement", + "Eye_status", "chx", "chy", + "Freq_band"], dropna=True, + values="Value").reset_index(drop=True) +# check NaN is properly dropped and subject index is correct +assert pec_data_df.shape[0] == np.prod(temp_df.shape) +test1 = pec_data_df.iloc[np.random.randint(n_subjects),:] +assert test1["Value"] ==\ + temp_df[test1[1]][test1[2]][test1[3]][test1[5]][test1[6]][Subject_id.index(test1[0])] +# Fix column names +temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))] + +PEC_df = pd.concat([PEC_df,temp_df], axis=1) + +# Add group status +Groups = ["CTRL", "PTSD"] +Group_status = np.array([0]*PEC_df.shape[0]) # CTRL = 0 +Group_status[np.array([i in cases for i in PEC_df["Subject_ID"]])] = 1 # PTSD = 1 +PEC_df.insert(1, "Group_status", Group_status) + +# Only use PTSD patient group +PEC_df2 = PEC_df.loc[PEC_df["Group_status"]==1,:] + +Subject_info_cols = ["Subject_ID","Group_status"] + +# Use gridsearch and permutations to estimate gap statistic and use it to +# determine number of clusters and sparsity s +# I will use 100 permutations and test 2 to 6 clusters as Zhang 2020 +# Error when trying to determine Gap statistic for 1 cluster? Also in R package +max_clusters = 6 +n_sparsity_feat = 20 +perm_res = [] +for k in range(1,max_clusters): + # Cannot permute with 1 cluster + n_clusters = k+1 + x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1)) + perm = pysparcl.cluster.permute_modified(x, k=n_clusters, verbose=True, + nvals=n_sparsity_feat, nperms=100) + perm_res.append(perm) + +# Save the results +with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "wb") as file: + pickle.dump(perm_res, file) + +# # Load +# with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "rb") as file: +# perm_res = pickle.load(file) + +# Convert results to array +perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4)) +for i in range(len(perm_res)): + _, gaps, sdgaps, wbounds, _ = perm_res[i].values() + for i2 in range(n_sparsity_feat): + perm_res_arr[20*i+i2,0] = i+2 # cluster size + perm_res_arr[20*i+i2,1] = gaps[i2] # gap statistic + perm_res_arr[20*i+i2,2] = sdgaps[i2] # gap statistic std + perm_res_arr[20*i+i2,3] = wbounds[i2] # sparsity feature s + +# For each sparsity s, determine best k using one-standard-error criterion +# Meaning the cluster and sparsity is chosen for the smallest value of k for a fixed s +# that fulfill Gap(k) >= Gap(k+1)-std(k+1) +def one_standard_deviation_search(gaps, std): + best_gaps = np.argmax(gaps) + current_gaps = gaps[best_gaps] + current_std = std[best_gaps] + current_gaps_idx = best_gaps + while (gaps[current_gaps_idx-1] >= current_gaps - current_std): + if current_gaps_idx == 0: + break + else: + current_gaps_idx -= 1 + current_gaps = gaps[current_gaps_idx] + current_std = std[current_gaps_idx] + out = current_gaps, current_std, current_gaps_idx + return out + +best_ks = np.zeros((n_sparsity_feat, 2)) +all_s = np.unique(perm_res_arr[:,3]) +plt.figure(figsize=(12,12)) +for i2 in range(n_sparsity_feat): + current_s = all_s[i2] + gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1] + std = perm_res_arr[perm_res_arr[:,3] == current_s,2] + _, _, idx = one_standard_deviation_search(gaps, std) + # Save to array + best_ks[i2,0] = current_s + best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx] + # Plot gap + plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"), + gaps, yerr=std, capsize=5, label = np.round(current_s,3)) +plt.title("Gap statistic for different fixed s") +plt.legend(loc=1) +plt.xlabel("Number of clusters") +plt.ylabel("Gap statistic") + +best_k = int(scipy.stats.mode(best_ks[:,1])[0]) + +# Determine s using fixed k as lowest s within 1 std of max gap statistic +# According to Witten & Tibshirani, 2010 +best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1]) +best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx] +best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx] +one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std + +best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]]) + +# Perform clustering with k clusters +x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1)) +sparcl = pysparcl.cluster.kmeans(x, k=best_k, wbounds=best_s)[0] + +# Save the results +with open(Feature_savepath+"PEC_drop_interpol_ch_sparse_kmeans.pkl", "wb") as file: + pickle.dump(sparcl, file) + +# Get overview of the features chosen and summarize feature type with countplot +nonzero_idx = sparcl["ws"].nonzero() +sparcl_features = PEC_df2.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx] + +# Prepare variables +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) +eye_status = list(source_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) + +sparcl_feat = [] +sparcl_feat_counts = [] +for e in range(n_eye_status): + ee = eye_status[e] + for f in range(n_freq_bands): + ff = list(Freq_Bands.keys())[f] + temp_feat = sparcl_features[sparcl_features.str.contains(("_"+ee))] + temp_feat = temp_feat[temp_feat.str.contains(("_"+ff))] + # Save to list + sparcl_feat.append(temp_feat) + sparcl_feat_counts.append(["{}_{}".format(ee,ff), len(temp_feat)]) + +# Convert the list to dataframe to use in countplot +sparcl_feat_counts_df = pd.DataFrame(columns=["Eye_status", "Freq_band"]) +for i in range(len(sparcl_feat_counts)): + # If this feature type does not exist, then skip it + if sparcl_feat_counts[i][1] == 0: + continue + ee, ff = sparcl_feat_counts[i][0].split("_") + counts = sparcl_feat_counts[i][1] + temp_df = pd.DataFrame({"Eye_status":np.repeat(ee,counts), + "Freq_band":np.repeat(ff,counts)}) + sparcl_feat_counts_df = sparcl_feat_counts_df.append(temp_df, ignore_index=True) + +# Fix Freq_band categorical order +cat_type = pd.CategoricalDtype(categories=list(Freq_Bands.keys()), ordered=True) +sparcl_feat_counts_df["Freq_band"] = sparcl_feat_counts_df["Freq_band"].astype(cat_type) + +plt.figure(figsize=(8,8)) +g = sns.countplot(y="Freq_band", hue="Eye_status", data=sparcl_feat_counts_df) +plt.title("PEC Sparse K-means features") +plt.xlabel("Number of non-zero weights") +plt.ylabel("Frequency Band") + +# %% Functional connectivity in source space +# MNE implementation of PLV and wPLI is phase across trials(epochs), e.g. for ERPs +# I'll use my own manually implemented PLV and wPLI across time and then average across epochs +# Notice that the new MNE-connectivity library now also takes phase across time + +sfreq = final_epochs[0].info["sfreq"] +# error when using less than 5 cycles for spectrum estimation +# 1Hz too low with epoch length of 4, thus I changed the fmin to 1.25 for delta +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) +freq_centers = np.array([2.5,6,10.5,21.5,40]) +# Convert to tuples for the mne function +fmin=tuple([list(Freq_Bands.values())[f][0] for f in range(len(Freq_Bands))]) +fmax=tuple([list(Freq_Bands.values())[f][1] for f in range(len(Freq_Bands))]) + +# Make linspace array for morlet waves +freq_centers = np.arange(fmin[0],fmax[-1]+0.25,0.25) +# Prepare Morlets +morlets = mne.time_frequency.tfr.morlet(sfreq,freq_centers,n_cycles=3) + +# Make freqs array for indexing +freqs0 = [0]*n_freq_bands +for f in range(n_freq_bands): + freqs0[f] = freq_centers[(freq_centers>=fmin[f]) & (freq_centers<=fmax[f])] + +# The in-built connectivity function gives an (n_channel, n_channel, freqs output +# For loop over subject ID and eye status is implemented +n_subjects = len(Subject_id) +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) +ch_names = final_epochs[0].info["ch_names"] + +# Load source labels +with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file: + labels = pickle.load(file) +label_names = [label.name for label in labels] +n_sources = len(label_names) + +# Connectivity methods +connectivity_methods = ["coh","imcoh","plv","wpli"] +n_con_methods=len(connectivity_methods) + +# Number of pairwise ch connections +n_ch_connections = scipy.special.comb(n_sources,2, exact=True, repetition=False) + +# Load source time series +with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file: + STCs_list = pickle.load(file) + +# I made my own slightly-optimized PLV & WPLI function +# Version 2 based on Filter + Hilbert instead of Morlets +def calculate_PLV_WPLI_across_time(data): + n_ch, n_time0 = data.shape + x = data.copy() + # Filter the signal in the different freq bands + con_array0 = np.zeros((2,n_ch,n_ch,n_freq_bands)) + # con_array0[con_array0==0] = np.nan + for fname, frange in Freq_Bands.items(): + fmin, fmax = [float(interval) for interval in frange] + signal_filtered = mne.filter.filter_data(x, sfreq, fmin, fmax, + fir_design="firwin", verbose=0) + # Filtering on finite signals will yield very low values for first + # and last timepoint, which can create outliers. E.g. 1e-29 compared to 1e-14 + # This systematic error is removed by removing the first and last timepoint + signal_filtered = signal_filtered[:,1:-1] + # Hilbert transform to get complex signal + analytic_signal = scipy.signal.hilbert(signal_filtered) + # Calculate for the lower diagnonal only as it is symmetric + for ch_r in range(n_ch): + for ch_c in range(n_ch): + if ch_r>ch_c: + # ========================================================================= + # PLV over time correspond to mean across time of the absolute value of + # the circular length of the relative phases. So PLV will be 1 if + # the phases of 2 signals maintain a constant lag + # In equational form: PLV = 1/N * |sum(e^i(phase1-phase2))| + # In code: abs(mean(exp(1i*phase_diff))) + # ========================================================================= + # The real part correspond to the amplitude and the imaginary part can be used to calculate the phase + phase_diff = np.angle(analytic_signal[ch_r])-np.angle(analytic_signal[ch_c]) + # Convert phase difference to complex part i(phase1-phase2) + phase_diff_im = 0*phase_diff+1j*phase_diff + # Take the exponential, then the mean followed by absolute value + PLV = np.abs(np.mean(np.exp(phase_diff_im))) + # Save to array + con_array0[0,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = PLV + # ========================================================================= + # PLI over time correspond to the sign of the sine of relative phase + # differences. So PLI will be 1 if one signal is always leading or + # lagging behind the other signal. But it is insensitive to changes in + # relative phase, as long as it is the same signal that leads. + # If 2 signals are almost in phase, they might shift between lead/lag + # due to small fluctuations from noise. This would lead to unstable + # estimation of "phase" synchronisation. + # The wPLI tries to correct for this by weighting the PLI with the + # magnitude of the lag, to attenuate noise sources giving rise to + # near zero phase lag "synchronization" + # In equational form: WPLI = |E{|phase_diff|*sign(phase_diff)}| / E{|phase_diff|} + # ========================================================================= + # Calculate the magnitude of phase differences + phase_diff_mag = np.abs(np.sin(phase_diff)) + # Calculate the signed phase difference (PLI) + sign_phase_diff = np.sign(np.sin(phase_diff)) + # Calculate the nominator (abs and average across time) + WPLI_nominator = np.abs(np.mean(phase_diff_mag*sign_phase_diff)) + # Calculate denominator for normalization + WPLI_denom = np.mean(phase_diff_mag) + # Calculate WPLI + WPLI = WPLI_nominator/WPLI_denom + # Save to array + con_array0[1,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = WPLI + return con_array0 + +# Pre-allocatate memory +con_data = np.zeros((n_con_methods,n_subjects,n_eye_status,n_sources,n_sources,n_freq_bands)) +n_epochs_matrix = np.zeros((n_subjects,n_eye_status)) + +# Get current time +c_time = time.localtime() +c_time = time.strftime("%H:%M:%S", c_time) +print(c_time) + +def connectivity_estimation(i): + con_data0 = np.zeros((n_con_methods,n_eye_status,n_sources,n_sources,n_freq_bands)) + con_data0[con_data0==0] = np.nan + n_epochs_matrix0 = np.zeros((n_eye_status)) + for e in range(n_eye_status): + ee = eye_status[e] + eye_idx = final_epochs[i].events[:,2] == e+1 # EC = 1 and EO = 2 + # Get source time series + temp_STC = STCs_list[i][eye_idx] + # Calculate the coherence and ImgCoh for the given subject and eye status + con, freqs, times, n_epochs, n_tapers = spectral_connectivity( + temp_STC, method=connectivity_methods[0:2], + mode="multitaper", sfreq=sfreq, fmin=fmin, fmax=fmax, + faverage=True, verbose=0) + # Save the results in array + con_data0[0,e,:,:,:] = con[0] # coherence + con_data0[1,e,:,:,:] = np.abs(con[1]) # Absolute value of ImgCoh to reflect magnitude of ImgCoh + + # Calculate PLV and wPLI for each epoch and then average + n_epochs0 = temp_STC.shape[0] + con_data1 = np.zeros((len(connectivity_methods[2:]),n_epochs0,n_sources,n_sources,n_freq_bands)) + for epoch in range(n_epochs0): + # First the data is retrieved and epoch axis dropped + temp_data = temp_STC[epoch,:,:] + # PLV and WPLI value is calculated across timepoints in each freq band + PLV_WPLI_con = calculate_PLV_WPLI_across_time(temp_data) + # Save results + con_data1[0,epoch,:,:,:] = PLV_WPLI_con[0] # phase locking value + con_data1[1,epoch,:,:,:] = PLV_WPLI_con[1] # weighted phase lag index + # Take average across epochs for PLV and wPLI + con_data2 = np.mean(con_data1,axis=1) + # Save to final array + con_data0[2,e,:,:,:] = con_data2[0] # phase locking value + con_data0[3,e,:,:,:] = con_data2[1] # weighted phase lag index + n_epochs_matrix0[e] = n_epochs + + print("{} out of {} finished".format(i+1,n_subjects)) + return i, con_data0, n_epochs_matrix0 + +with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor: + for i, con_result, n_epochs_mat in executor.map(connectivity_estimation, range(n_subjects)): # Function and arguments + con_data[:,i,:,:,:,:] = con_result + n_epochs_matrix[i] = n_epochs_mat + +# Get current time +c_time = time.localtime() +c_time = time.strftime("%H:%M:%S", c_time) +print(c_time) + +# Save the results +np.save(Feature_savepath+"Source_drop_interpol_ch_connectivity_measures_data.npy", con_data) # (con_measure,subject,eye,ch,ch,freq) + +# Also save as dataframe format for feature selection +# Convert to Pandas dataframe +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, con_data.shape), indexing="ij"))) + [con_data.ravel()]) +con_data_df = pd.DataFrame(arr, columns = ["Con_measurement", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) +freq_bands_name = list(Freq_Bands.keys()) + +index_values = [connectivity_methods,Subject_id,eye_status,label_names,label_names,freq_bands_name] +for col in range(len(index_values)): + col_name = con_data_df.columns[col] + for shape in range(con_data.shape[col]): # notice not dataframe but the array + con_data_df.loc[con_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] + +# Add group status +Group_status = np.array(["CTRL"]*len(con_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in con_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +con_data_df.insert(3, "Group_status", Group_status) + +# Remove all diagonal and upper-matrix entries +con_data_df = con_data_df.iloc[con_data_df["Value"].to_numpy().nonzero()] + +# Save df +con_data_df.to_pickle(os.path.join(Feature_savepath,"con_data_source_drop_interpol_df.pkl")) + +# %% Estimate Granger's Causality in source space +# Load source labels +with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file: + labels = pickle.load(file) +label_names = [label.name for label in labels] +n_sources = len(label_names) + +# Load source time series +with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file: + STCs_list = pickle.load(file) + +# Granger's causality might be influenced by volume conduction, thus working with CSD might be beneficial +# But since I already used source modelling to alleviate this problem I will not apply CSD +# Barrett et al, 2012 also do not apply CSD on source GC + +# GC assumes stationarity, thus I will test for stationarity using ADF test +# The null hypothesis of ADF is that it has unit root, i.e. is non-stationary +# I will test how many can reject the null hypothesis, i.e. are stationary + +# Due to the low numerical values in STC the ADF test is unstable, thus I multiply it to be around 1e0 + +stationary_test_arr = [0]*n_subjects +n_tests = [0]*n_subjects +for i in range(n_subjects): + # Get data + data_arr = STCs_list[i] + # Get shape + n_epochs, n_channels, n_timepoints = data_arr.shape + # Create array for indices to print out progress + ep_progress_idx = np.arange(n_epochs//5,n_epochs,n_epochs//5) + # Calculate number of tests performed for each subject + n_tests[i] = n_epochs*n_channels + # Prepare empty array (with 2's as 0 and 1 will be used) + stationary_test_arr0 = np.zeros((n_epochs,n_channels))+2 # make array of 2's + for ep in range(n_epochs): + for c in range(n_channels): + ADF = adfuller(data_arr[ep,c,:]*1e14) # multilying with a constant does not change ADF, but helps against numerical instability + p_value = ADF[1] + if p_value < 0.05: + stationary_test_arr0[ep,c] = True # Stationary set to 1 + else: + stationary_test_arr0[ep,c] = False # Non-stationary set to 0 + # Print partial progress + if len(np.where(ep_progress_idx==ep)[0]) > 0: + print("Finished epoch number: {} out of {}".format(ep,n_epochs)) + # Indices that were not tested + no_test_idx = np.where(stationary_test_arr0==2)[0] + if len(no_test_idx) > 0: + print("An unexpected error occurred and {} was not tested".format(no_test_idx)) + # Save to list + stationary_test_arr[i] = stationary_test_arr0 + # Print progress + print("Finished subject {} out of {}".format(i+1,n_subjects)) + +with open(Stat_savepath+"Source_drop_interpol_GC_stationarity_tests.pkl", "wb") as filehandle: + # The data is stored as binary data stream + pickle.dump(stationary_test_arr, filehandle) + +# I used a threshold of 0.05 +# This means that on average I would expect 5% false positives among the tests that showed significance for stationarity +ratio_stationary = [0]*n_subjects +for i in range(n_subjects): + # Ratio of tests that showed stationarity + ratio_stationary[i] = np.sum(stationary_test_arr[i])/n_tests[i] + +print("Ratio of stationary time series: {0:.3f}".format(np.mean(ratio_stationary))) # 88% + +# The pre-processing have already ensured that most of the data fulfills the stationarity assumption. + +# Divide the data into eyes closed and open +ch_names = label_names +n_channels = len(ch_names) + +STC_eye_data = [] +for i in range(n_subjects): + # Get index for eyes open and eyes closed + EC_index = final_epochs[i].events[:,2] == 1 + EO_index = final_epochs[i].events[:,2] == 2 + # Get the data + EC_epoch_data = STCs_list[i][EC_index,:,:] # eye index + EO_epoch_data = STCs_list[i][EO_index,:,:] + # Save to list + STC_eye_data.append([EC_epoch_data, EO_epoch_data]) + +# Make each epoch a TimeSeries object +# Input for TimeSeries is: (ch, time) +eye_status = list(final_epochs[0].event_id.keys()) +n_eye_status = len(eye_status) +sfreq = final_epochs[0].info["sfreq"] + +Timeseries_data = [] +for i in range(n_subjects): + temp_list1 = [] + for e in range(n_eye_status): + temp_list2 = [] + n_epochs = STC_eye_data[i][e].shape[0] + for ep in range(n_epochs): + # Convert to TimeSeries + time_series = nts.TimeSeries(STC_eye_data[i][e][ep,:,:], sampling_rate=sfreq) + # Save the object + temp_list2.append(time_series) + # Save the timeseries across eye status + temp_list1.append(temp_list2) + # Save the timeseries across subjects + Timeseries_data.append(temp_list1) # output [subject][eye][epoch](ch,time) + +# Test multiple specified model orders of AR models, each combination has its own model +m_orders = np.linspace(1,25,25) # the model orders tested +m_orders = np.round(m_orders) +n_timepoints = len(Timeseries_data[0][0][0]) +n_ch_combinations = scipy.special.comb(n_channels,2, exact=True, repetition=False) + +# To reduce computation time I only test representative epochs (1 from each 1 min session) +# There will be 5 epochs from eyes closed and 5 from eyes open +n_rep_epoch = 5 +# The subjects have different number of epochs due to dropped epochs +gaps_trials_idx = np.load("Gaps_trials_idx.npy") # time_points between sessions +# I convert the gap time points to epoch number used as representative epoch +epoch_idx = np.zeros((n_subjects,n_eye_status,n_rep_epoch), dtype=int) # prepare array +epoch_idx[:,:,0:4] = np.round(gaps_trials_idx/n_timepoints,0)-8 # take random epoch from sessions 1 to 4 +epoch_idx[:,:,4] = np.round(gaps_trials_idx[:,:,3]/n_timepoints,0)+5 # take random epoch from session 5 + +# Checking if all epoch idx exists +for i in range(n_subjects): + EC_index = final_epochs[i].events[:,2] == 1 + EO_index = final_epochs[i].events[:,2] == 2 + assert np.sum(EC_index) >= epoch_idx[i,0,4] + assert np.sum(EO_index) >= epoch_idx[i,1,4] + +# Prepare model order estimation +AIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations)) +BIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations)) + +def GC_model_order_est(i): + AIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations)) + BIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations)) + for e in range(n_eye_status): + n_epochs = STC_eye_data[i][e].shape[0] + N_total = n_timepoints*n_epochs # total number of datapoints for specific eye condition + for ep in range(n_rep_epoch): + epp = epoch_idx[i,e,ep] + for o in range(len(m_orders)): + order = int(m_orders[o]) + # Make the Granger Causality object + GCA1 = nta.GrangerAnalyzer(Timeseries_data[i][e][epp-1], order=order, + n_freqs=2000) + for c in range(n_ch_combinations): + # Retrieve error covariance matrix for all combinations + ecov = np.array(list(GCA1.error_cov.values())) + # Calculate AIC + AIC = ntsu.akaike_information_criterion(ecov[c,:,:], p = n_channels, + m=order, Ntotal=N_total) + # Calculate BIC + BIC = ntsu.bayesian_information_criterion(ecov[c,:,:], p = n_channels, + m=order, Ntotal=N_total) + # Save the information criterions + AIC_arr0[o,e,ep,c] = AIC + BIC_arr0[o,e,ep,c] = BIC + + print("{} out of {} finished testing".format(i+1,n_subjects)) + return i, AIC_arr0, BIC_arr0 + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +with concurrent.futures.ProcessPoolExecutor() as executor: + for i, AIC_result, BIC_result in executor.map(GC_model_order_est, range(n_subjects)): # Function and arguments + AIC_arr[:,i] = AIC_result + BIC_arr[:,i] = BIC_result + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nCurrent Time",c_time2) + +# Save the AIC and BIC results +np.save(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy", AIC_arr) # (m. order, subject, eye, epoch, combination) +np.save(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy", BIC_arr) # (m. order, subject, eye, epoch, combination) + +# Load data +AIC_arr = np.load(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy") +BIC_arr = np.load(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy") + +# Average across all subjects, eye status, epochs and combinations +plt.figure(figsize=(8,6)) +plt.plot(m_orders, np.nanmean(AIC_arr, axis=(1,2,3,4)), label="AIC") +plt.plot(m_orders, np.nanmean(BIC_arr, axis=(1,2,3,4)), label="BIC") +plt.title("Average information criteria value") +plt.xlabel("Model order (Lag)") +plt.legend() + +np.sum(np.isnan(AIC_arr))/AIC_arr.size # around 0.07% NaN due to non-convergence +np.sum(np.isnan(BIC_arr))/BIC_arr.size # around 0.07% NaN due to non-convergence + +# If we look at each subject +mean_subject_AIC = np.nanmean(AIC_arr, axis=(2,3,4)) + +plt.figure(figsize=(8,6)) +for i in range(n_subjects): + plt.plot(m_orders, mean_subject_AIC[:,i]) +plt.title("Average AIC for each subject") +plt.xlabel("Model order (Lag)") + +mean_subject_BIC = np.nanmean(BIC_arr, axis=(2,3,4)) +plt.figure(figsize=(8,6)) +for i in range(n_subjects): + plt.plot(m_orders, mean_subject_BIC[:,i]) +plt.title("Average BIC for each subject") +plt.xlabel("Model order (Lag)") + +# We see that for many cases in BIC, it does not converge. Monotonic increasing! + +# We can look at the distribution of chosen order for each time series analyzed +# I.e. I will find the minima in model order for each model +AIC_min_arr = np.argmin(AIC_arr, axis=0) +BIC_min_arr = np.argmin(BIC_arr, axis=0) + +# Plot the distributions of the model order chosen +plt.figure(figsize=(8,6)) +sns.distplot(AIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True, + bins=np.linspace(0.75,30.25,60), label="AIC") +plt.ylabel("Frequency density") +plt.xlabel("Model order") +plt.title("AIC Model Order Estimation") + +plt.figure(figsize=(8,6)) +sns.distplot(BIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True, color="blue", + bins=np.linspace(0.75,30.25,60), label="BIC") +plt.ylabel("Frequency density") +plt.xlabel("Model order") +plt.title("BIC Model Order Estimation") +# It is clear from the BIC model that most have model order 1 +# which reflect their monotonic increasing nature without convergence +# Thus I will only use AIC + +# There is a bias variance trade-off with model order [Stokes & Purdon, 2017] +# Lower order is associated with higher bias and higher order with variance +# I will choose the model order that is chosen the most (i.e. majority voting) +AR_order = int(np.nanquantile(AIC_min_arr.reshape(-1), q=0.5)) +# Order = 5 + +# Calculate Granger Causality for each subject, eye and epoch +Freq_Bands = {"delta": [1.25, 4.0], + "theta": [4.0, 8.0], + "alpha": [8.0, 13.0], + "beta": [13.0, 30.0], + "gamma": [30.0, 49.0]} +n_freq_bands = len(Freq_Bands) + +# Pre-allocate memory +GC_data = np.zeros((2,n_subjects,n_eye_status,n_channels,n_channels,n_freq_bands)) + +def GC_analysis(i): + GC_data0 = np.zeros((2,n_eye_status,n_channels,n_channels,n_freq_bands)) + for e in range(n_eye_status): + n_epochs = STC_eye_data[i][e].shape[0] + # Make temporary array to save GC for each epoch + temp_GC_data = np.zeros((2,n_epochs,n_channels,n_channels,n_freq_bands)) + for ep in range(n_epochs): + # Fit the AR model + GCA = nta.GrangerAnalyzer(Timeseries_data[i][e][ep], order=AR_order, + n_freqs=int(800)) # n_Freq=800 correspond to step of 0.25Hz, the same as multitaper for power estimation + for f in range(n_freq_bands): + # Define lower and upper band + f_lb = list(Freq_Bands.values())[f][0] + f_ub = list(Freq_Bands.values())[f][1] + # Get index corresponding to the frequency bands of interest + freq_idx_G = np.where((GCA.frequencies >= f_lb) * (GCA.frequencies < f_ub))[0] + # Calculcate Granger causality quantities + g_xy = np.mean(GCA.causality_xy[:, :, freq_idx_G], -1) # avg on last dimension + g_yx = np.mean(GCA.causality_yx[:, :, freq_idx_G], -1) # avg on last dimension + # Transpose to use same format as con_measurement and save + temp_GC_data[0,ep,:,:,f] = np.transpose(g_xy) + temp_GC_data[1,ep,:,:,f] = np.transpose(g_yx) + + # Average over epochs for each person, eye condition, direction and frequency band + temp_GC_epoch_mean = np.nanmean(temp_GC_data, axis=1) # sometimes Log(Sxx/xx_auto_component) is nan + # Save to array + GC_data0[:,e,:,:,:] = temp_GC_epoch_mean + + print("{} out of {} finished analyzing".format(i+1,n_subjects)) + return i, GC_data0 + +# Get current time +c_time1 = time.localtime() +c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1) +print(c_time1) + +with concurrent.futures.ProcessPoolExecutor() as executor: + for i, GC_result in executor.map(GC_analysis, range(n_subjects)): # Function and arguments + GC_data[:,i] = GC_result + +# Get current time +c_time2 = time.localtime() +c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2) +print("Started", c_time1, "\nCurrent Time",c_time2) + +# Output: GC_data (g_xy/g_yx, subject, eye, chx, chy, freq) +# Notice that for g_xy ([0,...]) it means "chy" Granger causes "chx" +# and for g_yx ([1,...]) it means "chx" Granger causes "chy" +# This is due to the transposing which flipped the results on to the lower-part of the diagonal + +# Save the Granger_Causality data +np.save(Feature_savepath+"Source_drop_interpol_GrangerCausality_data.npy", GC_data) + +# Theoretically negative GC values should be impossible, but in practice +# they can still occur due to problems with model fitting (see Stokes & Purdon, 2017) +print("{:.3f}% negative GC values".\ + format(np.sum(GC_data[~np.isnan(GC_data)]<0)/np.sum(~np.isnan(GC_data))*100)) # 0.08% negative values +# These values cannot be interpreted, but seems to occur mostly for true non-causal connections +# Hence I set them to 0 +with np.errstate(invalid="ignore"): # invalid number refers to np.nan, which will be set to False for comparisons + GC_data[(GC_data<0)] = 0 + +# Save as dataframe for further processing with other features +# Convert to Pandas dataframe +# The dimensions will each be a column with numbers and the last column will be the actual values +arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, GC_data.shape), indexing="ij"))) + [GC_data.ravel()]) +GC_data_df = pd.DataFrame(arr, columns = ["GC_direction", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"]) +# Change from numerical coding to actual values +eye_status = list(final_epochs[0].event_id.keys()) +freq_bands_name = list(Freq_Bands.keys()) +GC_directions_info = ["chy -> chx", "chx -> chy"] + +index_values = [GC_directions_info,Subject_id,eye_status,ch_names,ch_names,freq_bands_name] +for col in range(len(index_values)): + col_name = GC_data_df.columns[col] + for shape in range(GC_data.shape[col]): # notice not dataframe but the array + GC_data_df.loc[GC_data_df.iloc[:,col] == shape,col_name]\ + = index_values[col][shape] + +# Add group status +Group_status = np.array(["CTRL"]*len(GC_data_df["Subject_ID"])) +Group_status[np.array([i in cases for i in GC_data_df["Subject_ID"]])] = "PTSD" +# Add to dataframe +GC_data_df.insert(3, "Group_status", Group_status) + +# Remove all nan (including diagonal and upper-matrix entries) +GC_data_df = GC_data_df.iloc[np.invert(np.isnan(GC_data_df["Value"].to_numpy()))] + +# Swap ch values for GC_direction chy -> chx (so it is always chx -> chy) +tempchy = GC_data_df[GC_data_df["GC_direction"] == "chy -> chx"]["chy"] # save chy +GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chy"] =\ + GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] # overwrite old chy +GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] = tempchy # overwrite chx + +# Drop the GC_direction column +GC_data_df = GC_data_df.drop("GC_direction", axis=1) + +# Save df +GC_data_df.to_pickle(os.path.join(Feature_savepath,"GC_data_source_drop_interpol_df.pkl")) + +# Testing if df was formatted correctly +expected_GC_values = n_subjects*n_eye_status*n_ch_combinations*n_freq_bands*2 # 2 because it is bidirectional +assert GC_data_df.shape[0] == expected_GC_values +# Testing a random GC value +random_connection = np.random.randint(0,GC_data_df.shape[0]) +test_connection = GC_data_df.iloc[random_connection,:] +i = np.where(Subject_id==test_connection["Subject_ID"])[0] +e = np.where(np.array(eye_status)==test_connection["Eye_status"])[0] +chx = np.where(np.array(ch_names)==test_connection["chx"])[0] +chy = np.where(np.array(ch_names)==test_connection["chy"])[0] +f = np.where(np.array(freq_bands_name)==test_connection["Freq_band"])[0] +value = test_connection["Value"] +if chx < chy: # the GC array is only lower diagonal to save memory + assert GC_data[0,i,e,chy,chx,f] == value +if chx > chy: + assert GC_data[1,i,e,chx,chy,f] == value