    # -*- coding: utf-8 -*-
    """
    Updated Oct 18 2022
    
    @author: Qianliang Li (glia@dtu.dk)
    
    This script contains the code to estimate the following EEG features:
        1. Power Spectral Density
        2. Frontal Theta/Beta Ratio
        3. Asymmetry
        4. Peak Alpha Frequency
        5. 1/f Exponents
        6. Microstates
        7. Long-Range Temporal Correlation (DFA Exponent)
    Source localization and functional connectivity
        8. Imaginary part of Coherence
        9. Weighted Phase Lag Index
        10. (Orthogonalized) Power Envelope Correlations
        11. Granger Causality
    
    It should be run after Preprocessing.py
    
    All features are saved in pandas DataFrame format for the machine learning
    pipeline
    
    Note that the code has not been adapted to the demonstration dataset,
    so running it as-is may produce errors.
    The code is provided to show how we performed the feature estimation.
    If you are adapting the code, you should check that it fits your dataset,
    e.g. the questionnaire data, sensors and source parcellation.
    
    The script was written in Spyder. The outline panel can be used to navigate
    the different parts more easily (default shortcut: Ctrl + Shift + O)
    """
    
    
    # Set working directory
    import os
    
    wkdir = "/Users/benj3542/Desktop/Uni/Noter/Semester_6/Bachelor/resting-state-eeg-analysis/"
    
    os.chdir(wkdir)
    
    # Load all libraries from the Preamble
    from Preamble import *
    
    # %% Load preprocessed epochs and questionnaire data
    load_path = "./PreprocessedData"
    
    # Get filenames
    files = []
    for r, d, f in os.walk(load_path):
        for file in f:
            if ".fif" in file:
                files.append(os.path.join(r, file))
    files.sort()
    
    # Subject IDs
    Subject_id = [0] * len(files)
    for i in range(len(files)):
        temp = os.path.basename(files[i]) # filename only, independent of OS path separator
        temp = temp.split("_")
    
        Subject_id[i] = int(temp[0][-1]) # Subject_id[i] = int(temp[0])
    
    
    n_subjects = len(Subject_id)
    
    # Load Final epochs
    final_epochs = [0]*n_subjects
    for n in range(n_subjects):
        final_epochs[n] = mne.read_epochs(fname = os.path.join(files[n]),
                        verbose=0)
    
    # Load dropped epochs - used for gap idx in microstates
    bad_subjects = [12345] # list with subjects that were excluded due to too many dropped epochs/chs
    Drop_epochs_df = pd.read_pickle("./Preprocessing/dropped_epochs.pkl")
    
    good_subject_df_idx = [not i in bad_subjects for i in Drop_epochs_df["Subject_ID"]]
    Drop_epochs_df = Drop_epochs_df.loc[good_subject_df_idx,:]
    Drop_epochs_df = Drop_epochs_df.sort_values(by=["Subject_ID"]).reset_index(drop=True)
    
    ### Load questionnaire data
    # For the purposes of this demonstration I will make a dummy dataframe
    # The original code imported csv files with questionnaire data and group status
    final_qdf = pd.DataFrame({"Subject_ID":Subject_id,
                              "Age":[23,26],
                              "Gender":[0,0],
                              "Group_status":[0,1],
                              "PCL_total":[33,56],
                              "Q1":[1.2, 2.3],
                              "Q2":[1.7, 1.5],
                              "Qn":[2.1,1.0]})
    
    # Define cases as >= 44 total PCL
    # Type: numpy array with subject id
    cases = np.array(final_qdf["Subject_ID"][final_qdf["PCL_total"]>=44])
    n_groups = 2
    Groups = ["CTRL", "PTSD"]
    
    # Define folder for saving features
    Feature_savepath = "./Features/"
    Stat_savepath = "./Statistics/"
    Model_savepath = "./Model/"
    
    # %% Power spectrum features
    Freq_Bands = {"delta": [1.25, 4.0],
                  "theta": [4.0, 8.0],
                  "alpha": [8.0, 13.0],
                  "beta": [13.0, 30.0],
                  "gamma": [30.0, 49.0]}
    ch_names = final_epochs[0].info["ch_names"]
    n_channels = final_epochs[0].info["nchan"]
    
    # Pre-allocate memory
    power_bands = [0]*len(final_epochs)
    
    def power_band_estimation(n):
        # Get index for eyes open and eyes closed
        EC_index = final_epochs[n].events[:,2] == 1
        EO_index = final_epochs[n].events[:,2] == 2
        
        # Calculate the power spectral density
        psds, freqs = psd_multitaper(final_epochs[n], fmin = 1, fmax = 50) # output (epochs, channels, freqs)
        
        temp_power_band = []
        
        for fmin, fmax in Freq_Bands.values():
            # Calculate the power in each frequency band
            psds_band = psds[:, :, (freqs >= fmin) & (freqs < fmax)].sum(axis=-1)
            # Calculate the mean for each eye status
            psds_band_eye = np.array([psds_band[EC_index,:].mean(axis=0),
                                          psds_band[EO_index,:].mean(axis=0)])
            # Append for each freq band
            temp_power_band.append(psds_band_eye)
            # Output: List with the 5 bands, and each element is a 2D array with eye status as 1st dimension and channels as 2nd dimension
        
        # The list is reshaped and absolute and relative power are calculated
        abs_power_band = np.reshape(temp_power_band, (5, 2, n_channels))
        abs_power_band = 10.*np.log10(abs_power_band) # Convert to decibel scale
        
        rel_power_band = np.reshape(temp_power_band, (5, 2, n_channels))
        rel_power_band = rel_power_band/np.sum(rel_power_band, axis=0, keepdims=True)
        # each eye condition and channel have been normalized to power in all 5 frequencies for that given eye condition and channel
        
        # Make one list in 1 dimension
        abs_power_values = np.concatenate(np.concatenate(abs_power_band, axis=0), axis=0)
        rel_power_values = np.concatenate(np.concatenate(rel_power_band, axis=0), axis=0)
        ## Output: First the channels, then the eye status and finally the frequency bands are concatenated
        ## E.g. element 26 is 3rd channel, eyes open, first frequency band
        #assert abs_power_values[26] == abs_power_band[0,1,2]
        #assert abs_power_values[47] == abs_power_band[0,1,23] # +21 channels to last
        #assert abs_power_values[50] == abs_power_band[1,0,2] # once all channels have been changed then the freq is changed and eye status
        
        # Get result
        res = np.concatenate([abs_power_values,rel_power_values],axis=0)
        return n, res
    
    
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for n, result in executor.map(power_band_estimation, range(len(final_epochs))): # Function and arguments
            power_bands[n] = result
    
    """
    
    for i in range(len(power_bands)):
        n, results = power_band_estimation(i)
        power_bands[i] = results
    """ # closing quote for the commented-out serial alternative to the parallel execution above
    
    # Combine all data into one dataframe
    # First the columns are prepared
    n_subjects = len(Subject_id)
    
    # The group status (PTSD/CTRL) is made using the information about the cases
    Group_status = np.array(["CTRL"]*n_subjects)
    Group_status[np.array([i in cases for i in Subject_id])] = "PTSD"
    
    # A variable that codes the channels based on A/P localization is also made
    Frontal_chs = ["Fp1", "Fpz", "Fp2", "AFz", "Fz", "F3", "F4", "F7", "F8"]
    Central_chs = ["Cz", "C3", "C4", "T7", "T8", "FT7", "FC3", "FCz", "FC4", "FT8", "TP7", "CP3", "CPz", "CP4", "TP8"]
    Posterior_chs = ["Pz", "P3", "P4", "P7", "P8", "POz", "O1", "O2", "Oz"]
    
    Brain_region_labels = ["Frontal","Central","Posterior"]
    Brain_region = np.array(ch_names, dtype = "<U9")
    Brain_region[np.array([i in Frontal_chs for i in ch_names])] = Brain_region_labels[0]
    Brain_region[np.array([i in Central_chs for i in ch_names])] = Brain_region_labels[1]
    Brain_region[np.array([i in Posterior_chs for i in ch_names])] = Brain_region_labels[2]
    
    # A variable that codes the channels based on M/L localization
    Left_chs = ["Fp1", "F3", "F7", "FC3", "FT7", "C3", "T7", "CP3", "TP7", "P3", "P7", "O1"]
    Right_chs = ["Fp2", "F4", "F8", "FC4", "FT8", "C4", "T8", "CP4", "TP8", "P4", "P8", "O2"]
    Mid_chs = ["Fpz", "AFz", "Fz", "FCz", "Cz", "CPz", "Pz", "POz", "Oz"]
    
    Brain_side = np.array(ch_names, dtype = "<U5")
    Brain_side[np.array([i in Left_chs for i in ch_names])] = "Left"
    Brain_side[np.array([i in Right_chs for i in ch_names])] = "Right"
    Brain_side[np.array([i in Mid_chs for i in ch_names])] = "Mid"
    
    # Eye status is added
    eye_status = list(final_epochs[0].event_id.keys())
    n_eye_status = len(eye_status)
    
    # Frequency bands
    freq_bands_name = list(Freq_Bands.keys())
    n_freq_bands = len(freq_bands_name)
    
    # Quantification (Abs/Rel)
    quant_status = ["Absolute", "Relative"]
    n_quant_status = len(quant_status)
    
    # The dataframe is made by combining all the flattened PSD values
    # Each row corresponds to a different channel. The channel index is reset after all channel names have been used
    # Each eye status element is repeated n_channel times, before it is reset
    # Each freq_band element is repeated n_channel * n_eye_status times, before it is reset
    # Each quantification status element is repeated n_channel * n_eye_status * n_freq_bands times, before it is reset
    power_df = pd.DataFrame(data = {"Subject_ID": [ele for ele in Subject_id for i in range(n_eye_status*n_quant_status*n_freq_bands*n_channels)],
                                    "Group_status": [ele for ele in Group_status for i in range(n_eye_status*n_quant_status*n_freq_bands*n_channels)],
                                    "Channel": ch_names*(n_eye_status*n_quant_status*n_freq_bands*n_subjects),
                                    "Brain_region": list(Brain_region)*(n_eye_status*n_quant_status*n_freq_bands*n_subjects),
                                    "Brain_side": list(Brain_side)*(n_eye_status*n_quant_status*n_freq_bands*n_subjects),
                                    "Eye_status": [ele for ele in eye_status for i in range(n_channels)]*n_quant_status*n_freq_bands*n_subjects,
                                    "Freq_band": [ele for ele in freq_bands_name for i in range(n_channels*n_eye_status)]*n_quant_status*n_subjects,
                                    "Quant_status": [ele for ele in quant_status for i in range(n_channels*n_eye_status*n_freq_bands)]*n_subjects,
                                    "PSD": list(np.concatenate(power_bands, axis=0))
                                    })
    # Absolute power is in decibels (10*log10(power))
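
    # Optional sanity check (not part of the original analysis): a minimal sketch that
    # assumes the row ordering described above (subject > quant > freq band > eye status > channel)
    # and verifies that a single row of power_df maps back to the matching entry in power_bands.
    # The example indices below are arbitrary.
    _row_len = n_quant_status*n_freq_bands*n_eye_status*n_channels
    _s, _q, _f, _e, _c = 0, 1, 2, 0, 10 # subject, quant, freq band, eye status, channel
    _row_idx = _s*_row_len + _q*(n_freq_bands*n_eye_status*n_channels) + _f*(n_eye_status*n_channels) + _e*n_channels + _c
    _row = power_df.iloc[_row_idx]
    assert _row["Subject_ID"] == Subject_id[_s]
    assert _row["Quant_status"] == quant_status[_q]
    assert _row["Freq_band"] == freq_bands_name[_f]
    assert _row["Eye_status"] == eye_status[_e]
    assert _row["Channel"] == ch_names[_c]
    assert _row["PSD"] == power_bands[_s][_row_idx - _s*_row_len]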
    
    # Fix Freq_band categorical order
    power_df["Freq_band"] = power_df["Freq_band"].astype("category").\
                cat.reorder_categories(list(Freq_Bands.keys()), ordered=True)
    
    # Fix Brain_region categorical order
    power_df["Brain_region"] = power_df["Brain_region"].astype("category").\
                cat.reorder_categories(Brain_region_labels, ordered=True)
    
    # Save the dataframe
    power_df.to_pickle(os.path.join(Feature_savepath,"Power_df.pkl"))
    
    
    # %% Theta-beta ratio
    # Frontal theta/beta ratio has been implicated in cognitive control of attention
    power_df = pd.read_pickle(os.path.join(Feature_savepath,"Power_df.pkl"))
    
    eye_status = list(final_epochs[0].event_id)
    n_eye_status = len(eye_status)
    
    # Subset frontal absolute power
    power_df_sub1 = power_df[(power_df["Quant_status"] == "Absolute")&
                             (power_df["Brain_region"] == "Frontal")]
    
    # Subset frontal, midline absolute power
    power_df_sub2 = power_df[(power_df["Quant_status"] == "Absolute")&
                                (power_df["Brain_region"] == "Frontal")&
                                (power_df["Brain_side"] == "Mid")]
    
    
    
    
    # Calculate average frontal power theta
    
    frontal_theta_mean_subject = power_df_sub1[power_df_sub1["Freq_band"] == "theta"].\
        groupby(["Subject_ID","Group_status","Eye_status"]).mean().reset_index()
    
    
    # Calculate average frontal power beta
    
    frontal_beta_mean_subject = power_df_sub1[power_df_sub1["Freq_band"] == "beta"].\
        groupby(["Subject_ID","Group_status","Eye_status"]).mean().reset_index()
    
    # Extract all values
    frontal_beta_subject_values = power_df_sub1[power_df_sub1["Freq_band"] == "beta"].copy() # copy to avoid SettingWithCopyWarning when converting from dB below
    
    
    
    # Calculate average frontal, midline power theta
    frontal_midline_theta_mean_subject = power_df_sub2[power_df_sub2["Freq_band"] == "theta"].\
        groupby(["Subject_ID","Group_status","Eye_status"]).mean().reset_index()
    
    # Extract all values
    frontal_midline_theta_subject_values = power_df_sub2[power_df_sub2["Freq_band"] == "theta"].copy() # copy to avoid SettingWithCopyWarning when converting from dB below
    
    # Convert from dB to raw power
    frontal_theta_mean_subject["PSD"] = 10**(frontal_theta_mean_subject["PSD"]/10)
    frontal_beta_mean_subject["PSD"] = 10**(frontal_beta_mean_subject["PSD"]/10)
    
    frontal_midline_theta_mean_subject["PSD"] = 10**(frontal_midline_theta_mean_subject["PSD"]/10)
    
    frontal_beta_subject_values["PSD"] = 10**(frontal_beta_subject_values["PSD"]/10)
    frontal_midline_theta_subject_values["PSD"] = 10**(frontal_midline_theta_subject_values["PSD"]/10)
    
    frontal_beta_mean_subject.to_pickle(os.path.join(Feature_savepath,"fBMS_df.pkl"))
    frontal_midline_theta_mean_subject.to_pickle(os.path.join(Feature_savepath,"fMTMS_df.pkl"))
    frontal_beta_subject_values.to_pickle(os.path.join(Feature_savepath,"fBSV_df.pkl"))
    frontal_midline_theta_subject_values.to_pickle(os.path.join(Feature_savepath,"fMTSV_df.pkl"))
    
    
    # Calculate mean for each group and take ratio for whole group
    # To confirm trend observed in PSD plots
    mean_group_f_theta = frontal_theta_mean_subject.iloc[:,1:].groupby(["Group_status","Eye_status"]).mean()
    mean_group_f_beta = frontal_beta_mean_subject.iloc[:,1:].groupby(["Group_status","Eye_status"]).mean()
    mean_group_f_theta_beta_ratio = mean_group_f_theta/mean_group_f_beta
    
    # Calculate ratio for each subject
    frontal_theta_beta_ratio = frontal_theta_mean_subject.copy()
    frontal_theta_beta_ratio["PSD"] = frontal_theta_mean_subject["PSD"]/frontal_beta_mean_subject["PSD"]
    
    # Take the natural log of ratio 
    frontal_theta_beta_ratio["PSD"] = np.log(frontal_theta_beta_ratio["PSD"])
    
    # Rename and save feature
    frontal_theta_beta_ratio.rename(columns={"PSD":"TBR"},inplace=True)
    # Add dummy variable for re-using plot code
    dummy_variable = ["Frontal Theta Beta Ratio"]*frontal_theta_beta_ratio.shape[0]
    frontal_theta_beta_ratio.insert(3, "Measurement", dummy_variable )
    
    
    # frontal_theta_beta_ratio.to_pickle(os.path.join(Feature_savepath,"fTBR_df.pkl"))
    
    # %% Frequency bands asymmetry
    # Defined as ln(right) - ln(left)
    # Thus we should only work with the absolute values and undo the dB transformation
    # Here I avg over all areas. I.e. mean((ln(F4)-ln(F3),(ln(F8)-ln(F7),(ln(Fp2)-ln(Fp1))) for frontal
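
    # Minimal worked example of the asymmetry score (toy numbers, illustration only):
    # with right-side powers R = [F4, F8, Fp2] and left-side powers L = [F3, F7, Fp1] in raw
    # (non-dB) units, the score is mean(ln(R) - ln(L)), so positive values mean more right-sided power.
    _R_toy = np.array([2.0, 1.5, 1.0])
    _L_toy = np.array([1.0, 1.5, 2.0])
    _asym_toy = np.mean(np.log(_R_toy) - np.log(_L_toy)) # = 0 here, since the toy pairs mirror each other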
    ROI = ["Frontal", "Central", "Posterior"]
    qq = "Absolute" # only calculate asymmetry for absolute
    # Pre-allocate memory
    asymmetry = np.zeros(shape=(len(np.unique(power_df["Subject_ID"])),
                                 len(np.unique(power_df["Eye_status"])),
                                 len(list(Freq_Bands.keys())),
                                 len(ROI)))
    
    def calculate_asymmetry(i):
        ii = np.unique(power_df["Subject_ID"])[i]
        temp_asymmetry = np.zeros(shape=(len(np.unique(power_df["Eye_status"])),
                                 len(list(Freq_Bands.keys())),
                                 len(ROI)))
        for e in range(len(np.unique(power_df["Eye_status"]))):
            ee = np.unique(power_df["Eye_status"])[e]
            for f in range(len(list(Freq_Bands.keys()))):
                ff = list(Freq_Bands.keys())[f]
                
                # Get the specific part of the df
                temp_power_df = power_df[(power_df["Quant_status"] == qq) &
                                         (power_df["Eye_status"] == ee) &
                                         (power_df["Subject_ID"] == ii) &
                                         (power_df["Freq_band"] == ff)].copy()
                
                # Convert from dB to raw power
                temp_power_df.loc[:,"PSD"] = np.array(10**(temp_power_df["PSD"]/10))
                
                # Calculate the power asymmetry
                for r in range(len(ROI)):
                    rr = ROI[r]
                    temp_power_roi_df = temp_power_df[(temp_power_df["Brain_region"] == rr)&
                                                      ~(temp_power_df["Brain_side"] == "Mid")]
                    # Sort using channel names to make sure F8-F7 and not F4-F7 etc.
                    temp_power_roi_df = temp_power_roi_df.sort_values("Channel").reset_index(drop=True)
                    # Get the log power
                    R_power = temp_power_roi_df[(temp_power_roi_df["Brain_side"] == "Right")]["PSD"]
                    ln_R_power = np.log(R_power) # get log power
                    L_power = temp_power_roi_df[(temp_power_roi_df["Brain_side"] == "Left")]["PSD"]
                    ln_L_power = np.log(L_power) # get log power
                    # Pairwise subtraction followed by averaging
                    asymmetry_value = np.mean(np.array(ln_R_power) - np.array(ln_L_power))
                    # Save it to the array
                    temp_asymmetry[e,f,r] = asymmetry_value
        # Print progress
        print("{} out of {} subjects finished".format(i+1,n_subjects))
        return i, temp_asymmetry
    
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for i, res in executor.map(calculate_asymmetry, range(len(np.unique(power_df["Subject_ID"])))): # Function and arguments
            asymmetry[i,:,:,:] = res
    
    # Prepare conversion of array to df using flatten
    n_subjects = len(Subject_id)
    
    # The group status (PTSD/CTRL) is made using the information about the cases
    Group_status = np.array(["CTRL"]*n_subjects)
    Group_status[np.array([i in cases for i in Subject_id])] = "PTSD"
    
    # Eye status is added
    eye_status = list(final_epochs[0].event_id.keys())
    n_eye_status = len(eye_status)
    
    # Frequency bands
    freq_bands_name = list(Freq_Bands.keys())
    n_freq_bands = len(freq_bands_name)
    
    # ROIs
    n_ROI = len(ROI)
    
    # Make the dataframe                
    asymmetry_df = pd.DataFrame(data = {"Subject_ID": [ele for ele in Subject_id for i in range(n_eye_status*n_freq_bands*n_ROI)],
                                         "Group_status": [ele for ele in Group_status for i in range(n_eye_status*n_freq_bands*n_ROI)],
                                         "Eye_status": [ele for ele in eye_status for i in range(n_freq_bands*n_ROI)]*(n_subjects),
                                         "Freq_band": [ele for ele in freq_bands_name for i in range(n_ROI)]*(n_subjects*n_eye_status),
                                         "ROI": list(ROI)*(n_subjects*n_eye_status*n_freq_bands),
                                         "Asymmetry_score": asymmetry.flatten(order="C")
                                         })
    # Flatten with order=C means that it first goes through last axis,
    # then repeat along 2nd last axis, and then repeat along 3rd last etc
    
    # Asymmetry numpy to pandas conversion check
    random_point=321
    asymmetry_df.iloc[random_point]
    
    i = np.where(np.unique(power_df["Subject_ID"]) == asymmetry_df.iloc[random_point]["Subject_ID"])[0]
    e = np.where(np.unique(power_df["Eye_status"]) == asymmetry_df.iloc[random_point]["Eye_status"])[0]
    f = np.where(np.array(list(Freq_Bands.keys())) == asymmetry_df.iloc[random_point]["Freq_band"])[0]
    r = np.where(np.array(ROI) == asymmetry_df.iloc[random_point]["ROI"])[0]
    
    assert asymmetry[i,e,f,r] == asymmetry_df.iloc[random_point]["Asymmetry_score"]
    
    # Save the dataframe
    asymmetry_df.to_pickle(os.path.join(Feature_savepath,"asymmetry_df.pkl"))
    
    # %% Using FOOOF
    # Peak alpha frequency (PAF) and 1/f exponent (OOF)
    # Using the FOOOF algorithm (Fitting oscillations and one over f)
    # Published by Donoghue et al, 2020 in Nature Neuroscience
    # To start, FOOOF takes the freqs and power spectra as input
    n_channels = final_epochs[0].info["nchan"]
    ch_names = final_epochs[0].info["ch_names"]
    sfreq = final_epochs[0].info["sfreq"]
    Freq_Bands = {"delta": [1.25, 4.0],
                  "theta": [4.0, 8.0],
                  "alpha": [8.0, 13.0],
                  "beta": [13.0, 30.0],
                  "gamma": [30.0, 49.0]}
    n_freq_bands = len(Freq_Bands)
    
    # From visual inspection there seems to be problem if PSD is too steep at the start
    # To overcome this problem, we try multiple start freq
    OOF_r2_thres = 0.95 # a high threshold as we allow for overfitting
    PAF_r2_thres = 0.90 # a more lenient threshold for PAF, as it is usually still captured even if fit for 1/f is not perfect
    freq_start_it_range = [2,3,4,5,6]
    freq_end = 40 # Stop freq at 40Hz to not be influenced by the Notch Filter
    
    eye_status = list(final_epochs[0].event_id.keys())
    n_eye_status = len(eye_status)
    
    PAF_data = np.zeros((n_subjects,n_eye_status,n_channels,3)) # CF, power, band_width
    OOF_data = np.zeros((n_subjects,n_eye_status,n_channels,2)) # offset and exponent
    
    def FOOOF_estimation(i):
        PAF_data0 = np.zeros((n_eye_status,n_channels,3)) # CF, power, band_width
        OOF_data0 = np.zeros((n_eye_status,n_channels,2)) # offset and exponent
        # Get Eye status
        eye_idx = [final_epochs[i].events[:,2] == 1, final_epochs[i].events[:,2] == 2] # EC and EO
        # Calculate the power spectral density
        psd, freqs = psd_multitaper(final_epochs[i], fmin = 1, fmax = 50) # output (epochs, channels, freqs)
        # Retrieve psds for the 2 conditions and calculate mean across epochs
        psds = []
        for e in range(n_eye_status):
            # Get the epochs for specific eye condition
            temp_psd = psd[eye_idx[e],:,:]
            # Calculate the mean across epochs
            temp_psd = np.mean(temp_psd, axis=0)
            # Save
            psds.append(temp_psd)
        # Try multiple start freq
        PAF_data00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range),3)) # CF, power, band_width
        OOF_data00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range),2)) # offset and exponent
        r2s00 = np.zeros((n_eye_status,n_channels,len(freq_start_it_range)))
        for e in range(n_eye_status):
            psds_avg = psds[e]
            for f in range(len(freq_start_it_range)):
                # Initiate FOOOF group for analysis of multiple PSD
                fg = fooof.FOOOFGroup()
                # Set the frequency range to fit the model
                freq_range = [freq_start_it_range[f], freq_end] # variable freq start to 40Hz (freq_end)
                # Fit to each source PSD separately, but in parallel
                fg.fit(freqs,psds_avg,freq_range,n_jobs=1)
                # Extract aperiodic parameters
                aps = fg.get_params('aperiodic_params')
                # Extract peak parameters
                peaks = fg.get_params('peak_params')
                # Extract goodness-of-fit metrics
                r2s = fg.get_params('r_squared')
                # Save OOF and r2s
                OOF_data00[e,:,f] = aps
                r2s00[e,:,f] = r2s
                # Find the alpha peak with greatest power
                for c in range(n_channels):
                    peaks0 = peaks[peaks[:,3] == c]
                    # Subset the peaks within the alpha band
                    in_alpha_band = (peaks0[:,0] >= Freq_Bands["alpha"][0]) & (peaks0[:,0] <= Freq_Bands["alpha"][1])
                    if sum(in_alpha_band) > 0: # Any alpha peaks?
                        # Choose the peak with the highest power
                        max_alpha_idx = np.argmax(peaks0[in_alpha_band,1])
                        # Save results
                        PAF_data00[e,c,f] = peaks0[in_alpha_band][max_alpha_idx,:-1]
                    else:
                        # No alpha peaks
                        PAF_data00[e,c,f] = [np.nan]*3
        # Check criteria
        good_fits_OOF = (r2s00 > OOF_r2_thres) & (OOF_data00[:,:,:,1] > 0) # r^2 > 0.95 and exponent > 0
        good_fits_PAF = (r2s00 > PAF_r2_thres) & (np.isfinite(PAF_data00[:,:,:,0])) # r^2 > 0.90 and detected peak in alpha band
        # Save the data or NaN if the criteria were not fulfilled
        for e in range(n_eye_status):
            for c in range(n_channels):
                if sum(good_fits_OOF[e,c]) == 0: # no good OOF estimation
                    OOF_data0[e,c] = [np.nan]*2
                else: # Save OOF associated with greatest r^2 that fulfilled the criteria
                    good_idx_OOF = np.where(good_fits_OOF[e,c])[0] # map back to the original freq-start axis
                    OOF_data0[e,c] = OOF_data00[e,c,good_idx_OOF[np.argmax(r2s00[e,c,good_idx_OOF])]]
                if sum(good_fits_PAF[e,c]) == 0: # no good PAF estimation
                    PAF_data0[e,c] = [np.nan]*3
                else: # Save PAF associated with greatest r^2 that fulfilled the criteria
                    good_idx_PAF = np.where(good_fits_PAF[e,c])[0] # map back to the original freq-start axis
                    PAF_data0[e,c] = PAF_data00[e,c,good_idx_PAF[np.argmax(r2s00[e,c,good_idx_PAF])]]
        print("Finished {} out of {} subjects".format(i+1,n_subjects))
        return i, PAF_data0, OOF_data0
    
    # Get current time
    c_time1 = time.localtime()
    c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
    print(c_time1)
    
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for i, PAF_result, OOF_result in executor.map(FOOOF_estimation, range(n_subjects)): # Function and arguments
            PAF_data[i] = PAF_result
            OOF_data[i] = OOF_result
    
    # Save data
    with open(Feature_savepath+"PAF_data_arr.pkl", "wb") as file:
        pickle.dump(PAF_data, file)
    with open(Feature_savepath+"OOF_data_arr.pkl", "wb") as file:
        pickle.dump(OOF_data, file)
    
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nFinished",c_time2)
    
    # Convert to Pandas dataframe (only keep mean parameter for PAF)
    # The dimensions will each be a column with numbers and the last column will be the actual values
    ori = PAF_data[:,:,:,0]
    arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, ori.shape), indexing="ij"))) + [ori.ravel()])
    PAF_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Channel", "Value"])
    # Change from numerical coding to actual values
    
    index_values = [Subject_id,eye_status,ch_names]
    temp_df = PAF_data_df.copy() # make a temp df to avoid sequentially overwriting values that were already changed
    for col in range(len(index_values)):
        col_name = PAF_data_df.columns[col]
        for shape in range(ori.shape[col]):
            temp_df.loc[PAF_data_df.iloc[:,col] == shape,col_name]\
            = index_values[col][shape]
    PAF_data_df = temp_df # replace original df 
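
    # To illustrate the meshgrid/ravel conversion used above (illustration only):
    # for a toy 2 x 2 array, each row of the result holds the index along every dimension
    # followed by the value, i.e. the array in long format.
    _toy = np.array([[10, 11], [12, 13]])
    _toy_long = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, _toy.shape), indexing="ij"))) + [_toy.ravel()])
    # _toy_long -> [[0, 0, 10], [0, 1, 11], [1, 0, 12], [1, 1, 13]]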
    
    # Add group status
    Group_status = np.array(["CTRL"]*len(PAF_data_df["Subject_ID"]))
    Group_status[np.array([i in cases for i in PAF_data_df["Subject_ID"]])] = "PTSD"
    # Add to dataframe
    PAF_data_df.insert(3, "Group_status", Group_status)
    
    # Global peak alpha
    PAF_data_df_global = PAF_data_df.groupby(["Subject_ID", "Group_status", "Eye_status"]).mean().reset_index() # by default pandas mean skip nan
    # Add dummy variable for re-using plot code
    dummy_variable = ["Global Peak Alpha Frequency"]*PAF_data_df_global.shape[0]
    PAF_data_df_global.insert(3, "Measurement", dummy_variable )
    
    # Regional peak alpha
    # A variable that codes the channels based on A/P localization is also made
    Frontal_chs = ["Fp1", "Fpz", "Fp2", "AFz", "Fz", "F3", "F4", "F7", "F8"]
    Central_chs = ["Cz", "C3", "C4", "T7", "T8", "FT7", "FC3", "FCz", "FC4", "FT8", "TP7", "CP3", "CPz", "CP4", "TP8"]
    Posterior_chs = ["Pz", "P3", "P4", "P7", "P8", "POz", "O1", "O2", "Oz"]
    
    Brain_region = np.array(ch_names, dtype = "<U9")
    Brain_region[np.array([i in Frontal_chs for i in ch_names])] = "Frontal"
    Brain_region[np.array([i in Central_chs for i in ch_names])] = "Central"
    Brain_region[np.array([i in Posterior_chs for i in ch_names])] = "Posterior"
    
    PAF_data_df.insert(4, "Brain_region", list(Brain_region)*int(PAF_data_df.shape[0]/len(Brain_region)))
    
    # Save the dataframes
    PAF_data_df.to_pickle(os.path.join(Feature_savepath,"PAF_data_FOOOF_df.pkl"))
    PAF_data_df_global.to_pickle(os.path.join(Feature_savepath,"PAF_data_FOOOF_global_df.pkl"))
    
    # Convert to Pandas dataframe (only keep exponent parameter for OOF)
    # The dimensions will each be a column with numbers and the last column will be the actual values
    ori = OOF_data[:,:,:,1]
    arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, ori.shape), indexing="ij"))) + [ori.ravel()])
    OOF_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Channel", "Value"])
    # Change from numerical coding to actual values
    
    index_values = [Subject_id,eye_status,ch_names]
    temp_df = OOF_data_df.copy() # make a temp df to avoid sequentially overwriting values that were already changed
    for col in range(len(index_values)):
        col_name = OOF_data_df.columns[col]
        for shape in range(ori.shape[col]):
            temp_df.loc[OOF_data_df.iloc[:,col] == shape,col_name]\
            = index_values[col][shape]
    OOF_data_df = temp_df # replace original df
    
    # Add group status
    Group_status = np.array(["CTRL"]*len(OOF_data_df["Subject_ID"]))
    Group_status[np.array([i in cases for i in OOF_data_df["Subject_ID"]])] = "PTSD"
    # Add to dataframe
    OOF_data_df.insert(3, "Group_status", Group_status)
    
    # Regional OOF
    OOF_data_df.insert(4, "Brain_region", list(Brain_region)*int(OOF_data_df.shape[0]/len(Brain_region)))
    
    # Save the dataframes
    OOF_data_df.to_pickle(os.path.join(Feature_savepath,"OOF_data_FOOOF_df.pkl"))
    
    # %% Microstate analysis
    # The function takes the data as a numpy array (n_t, n_ch)
    # The data is already re-referenced to common average
    # Variables for the clustering function are extracted
    sfreq = final_epochs[0].info["sfreq"]
    eye_status = list(final_epochs[0].event_id.keys())
    n_eye_status = len(eye_status)
    ch_names = final_epochs[0].info["ch_names"]
    n_channels = len(ch_names)
    locs = np.zeros((n_channels,2)) # xy coordinates of the electrodes
    for c in range(n_channels):
        locs[c] = final_epochs[0].info["chs"][c]["loc"][0:2]
    
    # The epochs are transformed to numpy arrays
    micro_data = []
    EC_micro_data = []
    EO_micro_data = []
    for i in range(n_subjects):
        # Transform data to correct shape
        micro_data.append(final_epochs[i].get_data()) # get data
        arr_shape = micro_data[i].shape # get shape
        micro_data[i] = micro_data[i].swapaxes(1,2) # swap ch and time axis
        micro_data[i] = micro_data[i].reshape(arr_shape[0]*arr_shape[2],arr_shape[1]) # reshape by combining epochs and times
        # Get indices for eyes open and closed
        EC_index = final_epochs[i].events[:,2] == 1
        EO_index = final_epochs[i].events[:,2] == 2
        # Repeat with 4s * sample frequency to correct for concatenation of times and epochs
        EC_index = np.repeat(EC_index,4*sfreq)
        EO_index = np.repeat(EO_index,4*sfreq)
        # Save data where it is divided into eye status
        EC_micro_data.append(micro_data[i][EC_index])
        EO_micro_data.append(micro_data[i][EO_index])
    
    # Global explained variance and Cross-validation criterion is used to determine number of microstates
    # First all data is concatenated to find the optimal number of maps for all data
    micro_data_all = np.vstack(micro_data)
    
    # Determine the number of clusters
    # I use a slightly modified kmeans function which returns the cv_min
    global_gev = []
    cv_criterion = []
    for n_maps in range(2,7):
        maps, L, gfp_peaks, gev, cv_min = kmeans_return_all(micro_data_all, n_maps)
        global_gev.append(np.sum(gev))
        cv_criterion.append(cv_min)
    # Save run results
    cluster_results = np.array([global_gev,cv_criterion])
    np.save("Microstate_n_cluster_test_results.npy", cluster_results) # (gev/cv_crit, n_maps from 2 to 6)
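
    # For reference, a minimal sketch of how global explained variance (GEV) is typically
    # defined in the microstate literature (this only mirrors the idea; it is not the
    # modified kmeans_return_all/clustering functions imported from the Preamble):
    # GEV is the GFP-weighted squared spatial correlation between each sample and its assigned map.
    def gev_sketch(data, maps, labels):
        data = data - data.mean(axis=1, keepdims=True) # average reference per sample
        gfp = data.std(axis=1) # global field power
        corr = np.array([np.corrcoef(data[t], maps[labels[t]])[0, 1] for t in range(data.shape[0])])
        return np.sum((gfp * corr)**2) / np.sum(gfp**2)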
    
    #cluster_results = np.load("Microstate_n_cluster_test_results.npy")
    #global_gev = cluster_results[0,:]
    #cv_criterion = cluster_results[1,:]
    
    # Evaluate best n_maps
    plt.figure()
    plt.plot(np.linspace(2,6,len(cv_criterion)),(cv_criterion/np.sum(cv_criterion)), label="CV Criterion")
    plt.plot(np.linspace(2,6,len(cv_criterion)),(global_gev/np.sum(global_gev)), label="GEV")
    plt.legend()
    plt.ylabel("Normalized to total")
    # The lower the CV criterion the better, but the higher the GEV the better.
    # Based on the plots and the recommendation by von Wegner & Laufs 2018
    # we used 5 microstates
    n_maps = 5 # fix the number of microstates for the final clustering
    
    
    # In order to compare between groups, I fix the microstates by clustering on data from both groups
    # Due to instability of maps when running multiple times, I increased n_maps from 4 to 6
    
    mode = ["aahc", "kmeans", "kmedoids", "pca", "ica"][1]
    
    # K-means is stochastic, so I run it multiple times and keep the maps with the highest GEV
    # Each K-means call uses 5 initializations and the best map is chosen, and I repeat this 10 times, so 50 runs in total!
    n_run = 10
    # Pre-allocate memory
    microstate_cluster_results = []
    
    # Parallel processing could only be implemented by ensuring different seeds,
    # otherwise each iteration would be identical.
    # However, the K-means already uses parallel processing, so an outer loop with
    # concurrent processes would use too many processors
    # Get current time
    c_time1 = time.localtime()
    c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
    print(c_time1)
    
    for r in range(n_run):
        maps = [0]*2
        m_labels = [0]*2
        gfp_peaks = [0]*2
        gev = [0]*2
        # Eyes closed
        counter = 0
        maps_, x_, gfp_peaks_, gev_ = clustering(
            np.vstack(np.array(EC_micro_data)), sfreq, ch_names, locs, mode, n_maps, doplot=False) # doplot=True is bugged
        maps[counter] = maps_
        m_labels[counter] = x_
        gfp_peaks[counter] = gfp_peaks_
        gev[counter] = gev_
        counter += 1
        # Eyes open
        maps_, x_, gfp_peaks_, gev_ = clustering(
            np.vstack(np.array(EO_micro_data)), sfreq, ch_names, locs, mode, n_maps, doplot=False) # doplot=True is bugged
        maps[counter] = maps_
        m_labels[counter] = x_
        gfp_peaks[counter] = gfp_peaks_
        gev[counter] = gev_
        counter += 1
        
        microstate_cluster_results.append([maps, m_labels, gfp_peaks, gev])
        print("Finished {} out of {}".format(r+1, n_run))
    
    # Get current time
    c_time2 = time.localtime()
    c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
    print("Started", c_time1, "\nFinished",c_time2)
    
    # Save the results
    
    with open(Feature_savepath+"Microstate_5_maps_10x5_k_means_results.pkl", "wb") as file:
    
        pickle.dump(microstate_cluster_results, file)
    
    # # Load
    # with open(Feature_savepath+"Microstate_4_maps_10x5_k_means_results.pkl", "rb") as file:
    #     microstate_cluster_results = pickle.load(file)
    
    # Find the best maps (Highest GEV across all the K-means clusters)
    EC_total_gevs = np.sum(np.vstack(np.array(microstate_cluster_results)[:,3,0]), axis=1) # (runs, maps/labels/gfp/gev, ec/eo)
    EO_total_gevs = np.sum(np.vstack(np.array(microstate_cluster_results)[:,3,1]), axis=1)
    Best_EC_idx = np.argmax(EC_total_gevs)
    Best_EO_idx = np.argmax(EO_total_gevs)
    # Update the variables for the best maps
    maps = [microstate_cluster_results[Best_EC_idx][0][0],microstate_cluster_results[Best_EO_idx][0][1]]
    m_labels = [microstate_cluster_results[Best_EC_idx][1][0],microstate_cluster_results[Best_EO_idx][1][1]]
    gfp_peaks = [microstate_cluster_results[Best_EC_idx][2][0],microstate_cluster_results[Best_EO_idx][2][1]]
    gev = [microstate_cluster_results[Best_EC_idx][3][0],microstate_cluster_results[Best_EO_idx][3][1]]
    
    # Plot the maps
    plt.style.use('default')
    
    labels = ["EC", "EO"] #Eyes-closed, Eyes-open
    
    for i in range(len(labels)):    
        fig, axarr = plt.subplots(1, n_maps, figsize=(20,5))
        fig.patch.set_facecolor('white')
        for imap in range(n_maps):
            mne.viz.plot_topomap(maps[i][imap,:], pos = final_epochs[0].info, axes = axarr[imap]) # plot
            axarr[imap].set_title("GEV: {:.2f}".format(gev[i][imap]), fontsize=16, fontweight="bold") # title
        fig.suptitle("Microstates: {}".format(labels[i]), fontsize=20, fontweight="bold")
    
    # Manually re-order the maps
    # Due to the random initiation of K-means this has to be modified every time new clusters are made!
    # Assign map labels (e.g. 0, 2, 1, 3)
    order = [0]*2
    
    order[0] = [3,0,1,2,4] # EC
    order[1] = [3,1,0,2,4] # EO
    
    for i in range(len(order)):
        maps[i] = maps[i][order[i],:] # re-order maps
        gev[i] = gev[i][order[i]] # re-order GEV
        # Make directory to find and replace map labels
        dic0 = {value:key for key, value in enumerate(order[i])}
        m_labels[i][:] = [dic0.get(n, n) for n in m_labels[i]] # re-order labels
    
    # The maps seem to be correlated both negatively and positively (see spatial correlation plots)
    # Thus the sign of a map does not really reflect which areas are positive or negative (absolute)
    # but rather which areas differ during each state (relative)
    # I can therefore change the sign of the maps for the visualization
    
    sign_swap = [[1,-1,1,1,1],[1,1,1,-1,1]]
    
    for i in range(len(order)):
        for m in range(n_maps):
            maps[i][m] *= sign_swap[i][m]
    
    # Plot the maps and save
    
    save_path = "/Users/benj3542/Desktop/Uni/Noter/Semester_6/Bachelor/resting-state-eeg-analysis/Figures/Microstates"
    
    labels = ["EC", "EO"]
    for i in range(len(labels)):    
        fig, axarr = plt.subplots(1, n_maps, figsize=(20,5))
        fig.patch.set_facecolor('white')
        for imap in range(n_maps):
            mne.viz.plot_topomap(maps[i][imap,:], pos = final_epochs[0].info, axes = axarr[imap]) # plot
            axarr[imap].set_title("GEV: {:.2f}".format(gev[i][imap]), fontsize=16, fontweight="bold") # title
        fig.suptitle("Microstates: {} - Total GEV: {:.2f}".format(labels[i],sum(gev[i])), fontsize=20, fontweight="bold")
        # Save the figure
        fig.savefig(os.path.join(save_path,str("Microstates_{}".format(labels[i]) + ".png")))
    
    # Calculate spatial correlation between maps and actual data points (topography)
    # The sign of the map is changed so the correlation is positive
    # By default the code looks for highest spatial correlation (regardless of sign)
    # Thus depending on random initiation point the map might be opposite
    plt.style.use('ggplot')
    def spatial_correlation(data, maps):
        n_t = data.shape[0]
        n_ch = data.shape[1]
        data = data - data.mean(axis=1, keepdims=True)
    
        # GFP peaks
        gfp = np.std(data, axis=1)
        gfp_peaks = locmax(gfp)
        gfp_values = gfp[gfp_peaks]
        gfp2 = np.sum(gfp_values**2) # normalizing constant in GEV
        n_gfp = gfp_peaks.shape[0]
    
        # Spatial correlation
        C = np.dot(data, maps.T)
        C /= (n_ch*np.outer(gfp, np.std(maps, axis=1)))
        L = np.argmax(C**2, axis=1) # C is squared, so the labels do not retain information about the sign of the correlation
        
        return C
    
    C_EC = spatial_correlation(np.vstack(np.array(EC_micro_data)), maps[0])
    C_EO = spatial_correlation(np.vstack(np.array(EO_micro_data)), maps[1])
    C = [C_EC, C_EO]
    
    # Plot the distribution of spatial correlation for each label and each map
    labels = ["EC", "EO"]
    for i in range(len(labels)):
        fig, axarr = plt.subplots(n_maps, n_maps, figsize=(16,16))
        for Lmap in range(n_maps):
            for Mmap in range(n_maps):
                sns.distplot(C[i][m_labels[i] == Lmap,Mmap], ax = axarr[Lmap,Mmap])
                axarr[Lmap,Mmap].set_xlabel("Spatial correlation")
        plt.suptitle("Distribution of spatial correlation_{}".format(labels[i]), fontsize=20, fontweight="bold")
        # Add common x and y axis labels by making one big axis
        fig.add_subplot(111, frameon=False)
        plt.tick_params(labelcolor="none", top="off", bottom="off", left="off", right="off") # hide tick labels and ticks
        plt.grid(False) # remove global grid
        plt.xlabel("Microstate number", labelpad=20)
        plt.ylabel("Label number", labelpad=10)
        fig.savefig(os.path.join(save_path,str("Microstates_Spatial_Correlation_Label_State_{}".format(labels[i]) + ".png")))
    
    # Plot the distribution of spatial correlation for all data and each map
    labels = ["EC", "EO"]
    for i in range(len(labels)):
        fig, axarr = plt.subplots(1,n_maps, figsize=(20,5))
        for imap in range(n_maps):
            sns.distplot(C[i][:,imap], ax = axarr[imap])
            plt.xlabel("Spatial correlation")
        plt.suptitle("Distribution of spatial correlation", fontsize=20, fontweight="bold")
        # Add common x and y axis labels by making one big axis
        fig.add_subplot(111, frameon=False)
        plt.tick_params(labelcolor="none", top="off", bottom="off", left="off", right="off") # hide tick labels and ticks
        plt.grid(False) # remove global grid
        plt.xlabel("Microstate number", labelpad=20)
        plt.ylabel("Label number")
    
    # Prepare for calculation of transition matrix
    # I modified the function, so it takes the list argument gap_index
    # gap_index should have the indices right before gaps in data
    
    # Gaps: Between dropped epochs, trials (eo/ec) and subjects
    # The between subjects gaps is removed by dividing the data into subjects
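
    # For reference, a minimal sketch of the idea behind a gap-aware transition matrix
    # (illustration only, not the actual modified T_empirical from the Preamble):
    # transitions that would cross a gap index are skipped, and each row is normalized
    # into a probability distribution (row = from-state, column = to-state).
    def transition_matrix_sketch(labels, n_states, gap_idx=()):
        T = np.zeros((n_states, n_states))
        gap_set = set(gap_idx)
        for t in range(len(labels) - 1):
            if t in gap_set: # t is the last sample before a gap, so do not count this transition
                continue
            T[labels[t], labels[t+1]] += 1
        row_sums = T.sum(axis=1, keepdims=True)
        return np.divide(T, row_sums, out=np.zeros_like(T), where=row_sums > 0)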
    n_trials = 5
    n_epoch_length = final_epochs[0].get_data().shape[2]
    
    micro_labels = []
    micro_subject_EC_idx = [0]
    micro_subject_EO_idx = [0]
    gaps_idx = []
    gaps_trials_idx = []
    for i in range(n_subjects):
        # Get indices for subject
        micro_subject_EC_idx.append(micro_subject_EC_idx[i]+EC_micro_data[i].shape[0])
        temp_EC = m_labels[0][micro_subject_EC_idx[i]:micro_subject_EC_idx[i+1]]
        # Get labels for subject i EO
        micro_subject_EO_idx.append(micro_subject_EO_idx[i]+EO_micro_data[i].shape[0])
        temp_EO = m_labels[1][micro_subject_EO_idx[i]:micro_subject_EO_idx[i+1]]
        # Save
        micro_labels.append([temp_EC,temp_EO]) # (subject, eye)
        
        # Get indices with gaps
        # Dropped epochs are considered first
        # Each epoch lasts 4s, which corresponds to 2000 samples, and a trial is 15 epochs minus the dropped epochs
        # Get epochs for each condition
        EC_drop_epochs = Drop_epochs_df.iloc[i,1:][Drop_epochs_df.iloc[i,1:] < 75].to_numpy() # EC epochs are 0-74
        EO_drop_epochs = Drop_epochs_df.iloc[i,1:][(Drop_epochs_df.iloc[i,1:] >= 75)&
                                                (Drop_epochs_df.iloc[i,1:] < 150)].to_numpy() # EO epochs are 75-149
        # Get indices for the epochs for EC that were dropped and correct for changing index due to drop
        EC_drop_epochs_gaps_idx = []
        counter = 0
        for d in range(len(EC_drop_epochs)):
            drop_epoch_number = EC_drop_epochs[d]
            Drop_epoch_idx = (drop_epoch_number-counter)*n_epoch_length # counter subtracted as the drop index is before dropped
            EC_drop_epochs_gaps_idx.append(Drop_epoch_idx-1) # -1 for point just before gap
            counter += 1
        # Negative index might occur if the first epochs were removed. This index is not needed for transition matrix
        if len(EC_drop_epochs_gaps_idx) > 0:
            for d in range(len(EC_drop_epochs_gaps_idx)): # check all, e.g. if epoch 0,1,2,3 are dropped then all should be caught
                if EC_drop_epochs_gaps_idx[0] == -1:
                    EC_drop_epochs_gaps_idx = EC_drop_epochs_gaps_idx[1:len(EC_drop_epochs)]
        
        # Get indices for the epochs for EO that were dropped and correct for changing index due to drop
        EO_drop_epochs_gaps_idx = []
        counter = 0
        for d in range(len(EO_drop_epochs)):
            drop_epoch_number = EO_drop_epochs[d]-75
            Drop_epoch_idx = (drop_epoch_number-counter)*n_epoch_length # counter subtracted as the drop index is before dropped
            EO_drop_epochs_gaps_idx.append(Drop_epoch_idx-1) # -1 for point just before gap
            counter += 1
        # Negative index might occur if the first epoch was removed. This index is not needed for transition matrix
        if len(EO_drop_epochs_gaps_idx) > 0:
            for d in range(len(EO_drop_epochs_gaps_idx)): # check all, e.g. if epoch 0,1,2,3 are dropped then all should be caught
                if EO_drop_epochs_gaps_idx[0] == -1:
                    EO_drop_epochs_gaps_idx = EO_drop_epochs_gaps_idx[1:len(EO_drop_epochs)]
        
        # Gaps between trials
        Trial_indices = [0, 15, 30, 45, 60, 75] # all the indices for start and end of the 5 trials
        EC_trial_gaps_idx = []
        EO_trial_gaps_idx = []
        counter_EC = 0
        counter_EO = 0
        for t in range(len(Trial_indices)-2): # -2 as start and end is not used in transition matrix
            temp_drop = EC_drop_epochs[(EC_drop_epochs >= Trial_indices[t])&
                                (EC_drop_epochs < Trial_indices[t+1])]
            # Correct the trial id for any potential drops within that trial
            counter_EC += len(temp_drop)
            trial_idx_corrected_for_drops = 15*(t+1)-counter_EC
            EC_trial_gaps_idx.append((trial_idx_corrected_for_drops*n_epoch_length)-1) # multiply id with length of epoch and subtract 1
            
            temp_drop = EO_drop_epochs[(EO_drop_epochs >= Trial_indices[t]+75)&
                                (EO_drop_epochs < Trial_indices[t+1]+75)]
            # Correct the trial id for any potential drops within that trial
            counter_EO += len(temp_drop)
            trial_idx_corrected_for_drops = 15*(t+1)-counter_EO
            EO_trial_gaps_idx.append((trial_idx_corrected_for_drops*n_epoch_length)-1) # multiply id with length of epoch and subtract 1
        
        # Concatenate all drop indices
        gaps_idx.append([np.unique(np.sort(EC_drop_epochs_gaps_idx+EC_trial_gaps_idx)),
                        np.unique(np.sort(EO_drop_epochs_gaps_idx+EO_trial_gaps_idx))])
        # Make one with trial gaps only for use in LRTC analysis
        gaps_trials_idx.append([EC_trial_gaps_idx,EO_trial_gaps_idx])
    
    # Save the gap idx files
    np.save("Gaps_idx.npy",np.array(gaps_idx, dtype=object)) # dtype=object as subjects can have different numbers of gaps
    np.save("Gaps_trials_idx.npy",np.array(gaps_trials_idx, dtype=object))
    
    # %% Calculate microstate features
    # Symbol distribution (also called ratio of time covered RTT)
    # Transition matrix
    # Shannon entropy
    EC_p_hat = p_empirical(m_labels[0], n_maps)
    EO_p_hat = p_empirical(m_labels[1], n_maps)
    # Sanity check: Overall between EC and EO
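
    # For reference, minimal sketches of the symbol statistics computed below (these illustrate
    # the standard definitions only; they are not the actual p_empirical/H_1/max_entropy
    # implementations imported via the Preamble):
    def p_empirical_sketch(labels, n_states):
        # fraction of samples spent in each microstate (ratio of time covered)
        return np.bincount(labels, minlength=n_states) / len(labels)
    def shannon_entropy_sketch(labels, n_states):
        # H = -sum(p*ln(p)); dividing by ln(n_states) gives the ratio of maximum entropy
        p = p_empirical_sketch(labels, n_states)
        p = p[p > 0]
        return -np.sum(p * np.log(p))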
    
    microstate_time_data = np.zeros((n_subjects,n_eye_status,n_maps))
    microstate_transition_data = np.zeros((n_subjects,n_eye_status,n_maps,n_maps))
    microstate_entropy_data = np.zeros((n_subjects,n_eye_status))
    for i in range(n_subjects):
        # Calculate ratio of time covered
        temp_EC_p_hat = p_empirical(micro_labels[i][0], n_maps)
        temp_EO_p_hat = p_empirical(micro_labels[i][1], n_maps)
        # Calculate transition matrix
    
        temp_EC_T_hat = T_empirical(micro_labels[i][0], n_maps, gaps_idx[i][0])
        temp_EO_T_hat = T_empirical(micro_labels[i][1], n_maps, gaps_idx[i][1])
        # Alternative without gap correction:
        # temp_EC_T_hat = T_empirical(micro_labels[i][0], n_maps)
        # temp_EO_T_hat = T_empirical(micro_labels[i][1], n_maps)

        # Calculate Shannon entropy
        temp_EC_h_hat = H_1(micro_labels[i][0], n_maps)
        temp_EO_h_hat = H_1(micro_labels[i][1], n_maps)
        
        # Save the data
        microstate_time_data[i,0,:] = temp_EC_p_hat
        microstate_time_data[i,1,:] = temp_EO_p_hat
        microstate_transition_data[i,0,:,:] = temp_EC_T_hat
        microstate_transition_data[i,1,:,:] = temp_EO_T_hat
        microstate_entropy_data[i,0] = temp_EC_h_hat/max_entropy(n_maps) # ratio of max entropy
        microstate_entropy_data[i,1] = temp_EO_h_hat/max_entropy(n_maps) # ratio of max entropy
    
    # Save transition data
    np.save(Feature_savepath+"microstate_transition_data.npy", microstate_transition_data)
    # Convert transition data to dataframe for further processing with other features
    # Transition matrix should be read as probability of row to column
    microstate_transition_data_arr =\
         microstate_transition_data.reshape((n_subjects,n_eye_status,n_maps*n_maps)) # flatten 4 x 4 matrix to 1D
    
    transition_info = ["M1->M1", "M1->M2", "M1->M3", "M1->M4", "M1->M5",
                       "M2->M1", "M2->M2", "M2->M3", "M2->M4", "M2->M5",
                       "M3->M1", "M3->M2", "M3->M3", "M3->M4", "M3->M5",
                       "M4->M1", "M4->M2", "M4->M3", "M4->M4", "M4->M5",
                       "M5->M1", "M5->M2", "M5->M3", "M5->M4", "M5->M5"]
    
    
    arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, microstate_transition_data_arr.shape), indexing="ij"))) + [microstate_transition_data_arr.ravel()])
    microstate_transition_data_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Transition", "Value"])
    # Change from numerical coding to actual values
    eye_status = list(final_epochs[0].event_id.keys())
    
    index_values = [Subject_id,eye_status,transition_info]
    for col in range(len(index_values)):
        col_name = microstate_transition_data_df.columns[col]
        for shape in reversed(range(microstate_transition_data_arr.shape[col])): # notice this is the shape of original numpy array. Not shape of DF
            microstate_transition_data_df.loc[microstate_transition_data_df.iloc[:,col] == shape,col_name]\
            = index_values[col][shape]
    
    # Add group status
    Group_status = np.array(["CTRL"]*len(microstate_transition_data_df["Subject_ID"]))
    Group_status[np.array([i in cases for i in microstate_transition_data_df["Subject_ID"]])] = "PTSD"
    # Add to dataframe
    microstate_transition_data_df.insert(2, "Group_status", Group_status)
    
    # Save df
    microstate_transition_data_df.to_pickle(os.path.join(Feature_savepath,"microstate_transition_data_df.pkl"))
    
    # Convert time covered data to Pandas dataframe
    # The dimensions will each be a column with numbers and the last column will be the actual values
    arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, microstate_time_data.shape), indexing="ij"))) + [microstate_time_data.ravel()])
    microstate_time_df = pd.DataFrame(arr, columns = ["Subject_ID", "Eye_status", "Microstate", "Value"])
    # Change from numerical coding to actual values
    eye_status = list(final_epochs[0].event_id.keys())
    
    
    microstates = ["M1", "M2", "M3", "M4", "M5"] # labels for the 5 microstates
    index_values = [Subject_id,eye_status,microstates]
    for col in range(len(index_values)):
        col_name = microstate_time_df.columns[col]
        for shape in reversed(range(microstate_time_data.shape[col])): # notice this is the shape of original numpy array. Not shape of DF
            microstate_time_df.loc[microstate_time_df.iloc[:,col] == shape,col_name]\
            = index_values[col][shape]