FeatureEstimation.py

assert pec_data_df.shape[0] == np.prod(temp_df.shape)
test1 = pec_data_df.iloc[np.random.randint(n_subjects),:]
assert test1["Value"] ==\
    temp_df[test1[1]][test1[2]][test1[3]][test1[5]][test1[6]][Subject_id.index(test1[0])]
# Fix column names
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]

PEC_df = pd.concat([PEC_df,temp_df], axis=1)

# Add group status
Groups = ["CTRL", "PTSD"]
Group_status = np.array([0]*PEC_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in PEC_df["Subject_ID"]])] = 1 # PTSD = 1
PEC_df.insert(1, "Group_status", Group_status)

# Only use PTSD patient group
PEC_df2 = PEC_df.loc[PEC_df["Group_status"]==1,:]

Subject_info_cols = ["Subject_ID","Group_status"]

# Use gridsearch and permutations to estimate gap statistic and use it to 
# determine number of clusters and sparsity s
# I will use 100 permutations and test 2 to 6 clusters as Zhang 2020
# Error when trying to determine Gap statistic for 1 cluster? Also in R package
max_clusters = 6
n_sparsity_feat = 20
perm_res = []
for k in range(1,max_clusters):
    # Cannot permute with 1 cluster
    n_clusters = k+1
    x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1))
    perm = pysparcl.cluster.permute_modified(x, k=n_clusters, verbose=True,
                                             nvals=n_sparsity_feat, nperms=100)
    perm_res.append(perm)

# Save the results
with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "wb") as file:
    pickle.dump(perm_res, file)

# # Load
# with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "rb") as file:
#     perm_res = pickle.load(file)

# Convert results to array
perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4))
for i in range(len(perm_res)):
    _, gaps, sdgaps, wbounds, _ = perm_res[i].values()
    for i2 in range(n_sparsity_feat):
        perm_res_arr[20*i+i2,0] = i+2 # cluster size
        perm_res_arr[20*i+i2,1] = gaps[i2] # gap statistic
        perm_res_arr[20*i+i2,2] = sdgaps[i2] # gap statistic std
        perm_res_arr[20*i+i2,3] = wbounds[i2] # sparsity feature s

# For each sparsity s, determine best k using one-standard-error criterion
# Meaning the cluster and sparsity is chosen for the smallest value of k for a fixed s
# that fulfill Gap(k) >= Gap(k+1)-std(k+1)
def one_standard_deviation_search(gaps, std):
    best_gaps = np.argmax(gaps)
    current_gaps = gaps[best_gaps]
    current_std = std[best_gaps]
    current_gaps_idx = best_gaps
    while (gaps[current_gaps_idx-1] >= current_gaps - current_std):
        if current_gaps_idx == 0:
            break
        else:
            current_gaps_idx -= 1
            current_gaps = gaps[current_gaps_idx]
            current_std = std[current_gaps_idx]
    out = current_gaps, current_std, current_gaps_idx
    return out

best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:,3])
plt.figure(figsize=(12,12))
for i2 in range(n_sparsity_feat):
    current_s = all_s[i2]
    gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1]
    std = perm_res_arr[perm_res_arr[:,3] == current_s,2]
    _, _, idx = one_standard_deviation_search(gaps, std)
    # Save to array
    best_ks[i2,0] = current_s
    best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx]
    # Plot gap
    plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"),
             gaps, yerr=std, capsize=5, label = np.round(current_s,3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")

best_k = int(scipy.stats.mode(best_ks[:,1])[0])

# Determine s using fixed k as lowest s within 1 std of max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1])
best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx]
one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std

best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]])

# Perform clustering with k clusters
x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1))
sparcl = pysparcl.cluster.kmeans(x, k=best_k, wbounds=best_s)[0]

# Save the results
with open(Feature_savepath+"PEC_drop_interpol_ch_sparse_kmeans.pkl", "wb") as file:
    pickle.dump(sparcl, file)

# Get overview of the features chosen and summarize feature type with countplot
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = PEC_df2.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]

# Prepare variables
Freq_Bands = {"delta": [1.25, 4.0],
              "theta": [4.0, 8.0],
              "alpha": [8.0, 13.0],
              "beta": [13.0, 30.0],
              "gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)
eye_status = list(source_epochs[0].event_id.keys())
n_eye_status = len(eye_status)

sparcl_feat = []
sparcl_feat_counts = []
for e in range(n_eye_status):
    ee = eye_status[e]
    for f in range(n_freq_bands):
        ff = list(Freq_Bands.keys())[f]
        temp_feat = sparcl_features[sparcl_features.str.contains(("_"+ee))]
        temp_feat = temp_feat[temp_feat.str.contains(("_"+ff))]
        # Save to list
        sparcl_feat.append(temp_feat)
        sparcl_feat_counts.append(["{}_{}".format(ee,ff), len(temp_feat)])

# Convert the list to dataframe to use in countplot
sparcl_feat_counts_df = pd.DataFrame(columns=["Eye_status", "Freq_band"])
for i in range(len(sparcl_feat_counts)):
    # If this feature type does not exist, then skip it
    if sparcl_feat_counts[i][1] == 0:
        continue
    ee, ff = sparcl_feat_counts[i][0].split("_")
    counts = sparcl_feat_counts[i][1]
    temp_df = pd.DataFrame({"Eye_status":np.repeat(ee,counts),
                            "Freq_band":np.repeat(ff,counts)})
    sparcl_feat_counts_df = sparcl_feat_counts_df.append(temp_df, ignore_index=True)

# Fix Freq_band categorical order
cat_type = pd.CategoricalDtype(categories=list(Freq_Bands.keys()), ordered=True)
sparcl_feat_counts_df["Freq_band"] = sparcl_feat_counts_df["Freq_band"].astype(cat_type)

plt.figure(figsize=(8,8))
g = sns.countplot(y="Freq_band", hue="Eye_status", data=sparcl_feat_counts_df)
plt.title("PEC Sparse K-means features")
plt.xlabel("Number of non-zero weights")
plt.ylabel("Frequency Band")

# %% Functional connectivity in source space
# MNE implementation of PLV and wPLI is phase across trials(epochs), e.g. for ERPs
# I'll use my own manually implemented PLV and wPLI across time and then average across epochs
# Notice that the new MNE-connectivity library now also takes phase across time

sfreq = final_epochs[0].info["sfreq"]
# error when using less than 5 cycles for spectrum estimation
# 1Hz too low with epoch length of 4, thus I changed the fmin to 1.25 for delta
Freq_Bands = {"delta": [1.25, 4.0],
              "theta": [4.0, 8.0],
              "alpha": [8.0, 13.0],
              "beta": [13.0, 30.0],
              "gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)
freq_centers = np.array([2.5,6,10.5,21.5,40])
# Convert to tuples for the mne function
fmin=tuple([list(Freq_Bands.values())[f][0] for f in range(len(Freq_Bands))])
fmax=tuple([list(Freq_Bands.values())[f][1] for f in range(len(Freq_Bands))])

# Make linspace array for morlet waves
freq_centers = np.arange(fmin[0],fmax[-1]+0.25,0.25)
# Prepare Morlets
morlets = mne.time_frequency.tfr.morlet(sfreq,freq_centers,n_cycles=3)

# Make freqs array for indexing
freqs0 = [0]*n_freq_bands
for f in range(n_freq_bands):
    freqs0[f] = freq_centers[(freq_centers>=fmin[f]) & (freq_centers<=fmax[f])]

# The in-built connectivity function gives an (n_channel, n_channel, freqs output
# For loop over subject ID and eye status is implemented
n_subjects = len(Subject_id)
eye_status = list(final_epochs[0].event_id.keys())
n_eye_status = len(eye_status)
ch_names = final_epochs[0].info["ch_names"]

# Load source labels
with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file:
    labels = pickle.load(file)
label_names = [label.name for label in labels]
n_sources = len(label_names)

# Connectivity methods
connectivity_methods = ["coh","imcoh","plv","wpli"]
n_con_methods=len(connectivity_methods)

# Number of pairwise ch connections
n_ch_connections = scipy.special.comb(n_sources,2, exact=True, repetition=False)

# Load source time series
with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file:
    STCs_list = pickle.load(file)

# I made my own slightly-optimized PLV & WPLI function
# Version 2 based on Filter + Hilbert instead of Morlets
def calculate_PLV_WPLI_across_time(data):
    n_ch, n_time0 = data.shape
    x = data.copy()
    # Filter the signal in the different freq bands
    con_array0 = np.zeros((2,n_ch,n_ch,n_freq_bands))
    # con_array0[con_array0==0] = np.nan
    for fname, frange in Freq_Bands.items():
        fmin, fmax = [float(interval) for interval in frange]
        signal_filtered = mne.filter.filter_data(x, sfreq, fmin, fmax,
                                          fir_design="firwin", verbose=0)
        # Filtering on finite signals will yield very low values for first
        # and last timepoint, which can create outliers. E.g. 1e-29 compared to 1e-14
        # This systematic error is removed by removing the first and last timepoint
        signal_filtered = signal_filtered[:,1:-1]
        # Hilbert transform to get complex signal
        analytic_signal = scipy.signal.hilbert(signal_filtered)
        # Calculate for the lower diagnonal only as it is symmetric
        for ch_r in range(n_ch):
            for ch_c in range(n_ch):
                if ch_r>ch_c:
                    # =========================================================================
                    # PLV over time correspond to mean across time of the absolute value of
                    # the circular length of the relative phases. So PLV will be 1 if
                    # the phases of 2 signals maintain a constant lag
                    # In equational form: PLV = 1/N * |sum(e^i(phase1-phase2))|
                    # In code: abs(mean(exp(1i*phase_diff)))
                    # =========================================================================
                    # The real part correspond to the amplitude and the imaginary part can be used to calculate the phase
                    phase_diff = np.angle(analytic_signal[ch_r])-np.angle(analytic_signal[ch_c])
                    # Convert phase difference to complex part i(phase1-phase2)
                    phase_diff_im = 0*phase_diff+1j*phase_diff
                    # Take the exponential, then the mean followed by absolute value
                    PLV = np.abs(np.mean(np.exp(phase_diff_im)))
                    # Save to array
                    con_array0[0,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = PLV
                    # =========================================================================
                    # PLI over time correspond to the sign of the sine of relative phase
                    # differences. So PLI will be 1 if one signal is always leading or
                    # lagging behind the other signal. But it is insensitive to changes in
                    # relative phase, as long as it is the same signal that leads.
                    # If 2 signals are almost in phase, they might shift between lead/lag
                    # due to small fluctuations from noise. This would lead to unstable
                    # estimation of "phase" synchronisation.
                    # The wPLI tries to correct for this by weighting the PLI with the
                    # magnitude of the lag, to attenuate noise sources giving rise to
                    # near zero phase lag "synchronization"
                    # In equational form: WPLI = |E{|phase_diff|*sign(phase_diff)}| / E{|phase_diff|}
                    # =========================================================================
                    # Calculate the magnitude of phase differences
                    phase_diff_mag = np.abs(np.sin(phase_diff))
                    # Calculate the signed phase difference (PLI)
                    sign_phase_diff = np.sign(np.sin(phase_diff))
                    # Calculate the nominator (abs and average across time)
                    WPLI_nominator = np.abs(np.mean(phase_diff_mag*sign_phase_diff))
                    # Calculate denominator for normalization
                    WPLI_denom = np.mean(phase_diff_mag)
                    # Calculate WPLI
                    WPLI = WPLI_nominator/WPLI_denom
                    # Save to array
                    con_array0[1,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = WPLI
    return con_array0

# Pre-allocatate memory
con_data = np.zeros((n_con_methods,n_subjects,n_eye_status,n_sources,n_sources,n_freq_bands))
n_epochs_matrix = np.zeros((n_subjects,n_eye_status))

# Get current time
c_time = time.localtime()
c_time = time.strftime("%H:%M:%S", c_time)
print(c_time)

def connectivity_estimation(i):
    con_data0 = np.zeros((n_con_methods,n_eye_status,n_sources,n_sources,n_freq_bands))
    con_data0[con_data0==0] = np.nan
    n_epochs_matrix0 = np.zeros((n_eye_status))
    for e in range(n_eye_status):
        ee = eye_status[e]
        eye_idx = final_epochs[i].events[:,2] == e+1 # EC = 1 and EO = 2
        # Get source time series
        temp_STC = STCs_list[i][eye_idx]
        # Calculate the coherence and ImgCoh for the given subject and eye status
        con, freqs, times, n_epochs, n_tapers = spectral_connectivity(
            temp_STC, method=connectivity_methods[0:2],
            mode="multitaper", sfreq=sfreq, fmin=fmin, fmax=fmax,
            faverage=True, verbose=0)
        # Save the results in array
        con_data0[0,e,:,:,:] = con[0] # coherence
        con_data0[1,e,:,:,:] = np.abs(con[1]) # Absolute value of ImgCoh to reflect magnitude of ImgCoh
        
        # Calculate PLV and wPLI for each epoch and then average
        n_epochs0 = temp_STC.shape[0]
        con_data1 = np.zeros((len(connectivity_methods[2:]),n_epochs0,n_sources,n_sources,n_freq_bands))
        for epoch in range(n_epochs0):
            # First the data is retrieved and epoch axis dropped
            temp_data = temp_STC[epoch,:,:]
            # PLV and WPLI value is calculated across timepoints in each freq band
            PLV_WPLI_con = calculate_PLV_WPLI_across_time(temp_data)
            # Save results
            con_data1[0,epoch,:,:,:] = PLV_WPLI_con[0] # phase locking value
            con_data1[1,epoch,:,:,:] = PLV_WPLI_con[1] # weighted phase lag index
        # Take average across epochs for PLV and wPLI
        con_data2 = np.mean(con_data1,axis=1)
        # Save to final array
        con_data0[2,e,:,:,:] = con_data2[0] # phase locking value
        con_data0[3,e,:,:,:] = con_data2[1] # weighted phase lag index
        n_epochs_matrix0[e] = n_epochs
    
    print("{} out of {} finished".format(i+1,n_subjects))
    return i, con_data0, n_epochs_matrix0

with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor:
    for i, con_result, n_epochs_mat in executor.map(connectivity_estimation, range(n_subjects)): # Function and arguments
        con_data[:,i,:,:,:,:] = con_result
        n_epochs_matrix[i] = n_epochs_mat

# Get current time
c_time = time.localtime()
c_time = time.strftime("%H:%M:%S", c_time)
print(c_time)

# Save the results
np.save(Feature_savepath+"Source_drop_interpol_ch_connectivity_measures_data.npy", con_data) # (con_measure,subject,eye,ch,ch,freq)

# Also save as dataframe format for feature selection
# Convert to Pandas dataframe
# The dimensions will each be a column with numbers and the last column will be the actual values
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, con_data.shape), indexing="ij"))) + [con_data.ravel()])
con_data_df = pd.DataFrame(arr, columns = ["Con_measurement", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"])
# Change from numerical coding to actual values
eye_status = list(final_epochs[0].event_id.keys())
freq_bands_name = list(Freq_Bands.keys())

index_values = [connectivity_methods,Subject_id,eye_status,label_names,label_names,freq_bands_name]
for col in range(len(index_values)):
    col_name = con_data_df.columns[col]
    for shape in range(con_data.shape[col]): # notice not dataframe but the array
        con_data_df.loc[con_data_df.iloc[:,col] == shape,col_name]\
        = index_values[col][shape]

# Add group status
Group_status = np.array(["CTRL"]*len(con_data_df["Subject_ID"]))
Group_status[np.array([i in cases for i in con_data_df["Subject_ID"]])] = "PTSD"
# Add to dataframe
con_data_df.insert(3, "Group_status", Group_status)

# Remove all diagonal and upper-matrix entries
con_data_df = con_data_df.iloc[con_data_df["Value"].to_numpy().nonzero()]

# Save df
con_data_df.to_pickle(os.path.join(Feature_savepath,"con_data_source_drop_interpol_df.pkl"))

# %% Estimate Granger's Causality in source space
# Load source labels
with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file:
    labels = pickle.load(file)
label_names = [label.name for label in labels]
n_sources = len(label_names)

# Load source time series
with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file:
    STCs_list = pickle.load(file)

# Granger's causality might be influenced by volume conduction, thus working with CSD might be beneficial
# But since I already used source modelling to alleviate this problem I will not apply CSD
# Barrett et al, 2012 also do not apply CSD on source GC

# GC assumes stationarity, thus I will test for stationarity using ADF test
# The null hypothesis of ADF is that it has unit root, i.e. is non-stationary
# I will test how many can reject the null hypothesis, i.e. are stationary

# Due to the low numerical values in STC the ADF test is unstable, thus I multiply it to be around 1e0

stationary_test_arr = [0]*n_subjects
n_tests = [0]*n_subjects
for i in range(n_subjects):
    # Get data
    data_arr = STCs_list[i]
    # Get shape
    n_epochs, n_channels, n_timepoints = data_arr.shape
    # Create array for indices to print out progress
    ep_progress_idx = np.arange(n_epochs//5,n_epochs,n_epochs//5)
    # Calculate number of tests performed for each subject
    n_tests[i] = n_epochs*n_channels
    # Prepare empty array (with 2's as 0 and 1 will be used)
    stationary_test_arr0 = np.zeros((n_epochs,n_channels))+2 # make array of 2's
    for ep in range(n_epochs):
        for c in range(n_channels):
            ADF = adfuller(data_arr[ep,c,:]*1e14) # multilying with a constant does not change ADF, but helps against numerical instability
            p_value = ADF[1]
            if p_value < 0.05:
                stationary_test_arr0[ep,c] = True # Stationary set to 1
            else:
                stationary_test_arr0[ep,c] = False # Non-stationary set to 0
        # Print partial progress
        if len(np.where(ep_progress_idx==ep)[0]) > 0:
            print("Finished epoch number: {} out of {}".format(ep,n_epochs))
    # Indices that were not tested
    no_test_idx = np.where(stationary_test_arr0==2)[0]
    if len(no_test_idx) > 0:
        print("An unexpected error occurred and {} was not tested".format(no_test_idx))
    # Save to list
    stationary_test_arr[i] = stationary_test_arr0
    # Print progress
    print("Finished subject {} out of {}".format(i+1,n_subjects))

with open(Stat_savepath+"Source_drop_interpol_GC_stationarity_tests.pkl", "wb") as filehandle:
    # The data is stored as binary data stream
    pickle.dump(stationary_test_arr, filehandle)

# I used a threshold of 0.05
# This means that on average I would expect 5% false positives among the tests that showed significance for stationarity
ratio_stationary = [0]*n_subjects
for i in range(n_subjects):
    # Ratio of tests that showed stationarity
    ratio_stationary[i] = np.sum(stationary_test_arr[i])/n_tests[i]

print("Ratio of stationary time series: {0:.3f}".format(np.mean(ratio_stationary))) # 88%

# The pre-processing have already ensured that most of the data fulfills the stationarity assumption.

# Divide the data into eyes closed and open
ch_names = label_names
n_channels = len(ch_names)

STC_eye_data = []
for i in range(n_subjects):
    # Get index for eyes open and eyes closed
    EC_index = final_epochs[i].events[:,2] == 1
    EO_index = final_epochs[i].events[:,2] == 2
    # Get the data
    EC_epoch_data = STCs_list[i][EC_index,:,:] # eye index
    EO_epoch_data = STCs_list[i][EO_index,:,:]
    # Save to list
    STC_eye_data.append([EC_epoch_data, EO_epoch_data])

# Make each epoch a TimeSeries object
# Input for TimeSeries is: (ch, time)
eye_status = list(final_epochs[0].event_id.keys())
n_eye_status = len(eye_status)
sfreq = final_epochs[0].info["sfreq"]

Timeseries_data = []
for i in range(n_subjects):
    temp_list1 = []
    for e in range(n_eye_status):
        temp_list2 = []
        n_epochs = STC_eye_data[i][e].shape[0]
        for ep in range(n_epochs):
            # Convert to TimeSeries
            time_series = nts.TimeSeries(STC_eye_data[i][e][ep,:,:], sampling_rate=sfreq)
            # Save the object
            temp_list2.append(time_series)
        # Save the timeseries across eye status
        temp_list1.append(temp_list2)
    # Save the timeseries across subjects
    Timeseries_data.append(temp_list1) # output [subject][eye][epoch](ch,time)

# Test multiple specified model orders of AR models, each combination has its own model
m_orders = np.linspace(1,25,25) # the model orders tested
m_orders = np.round(m_orders)
n_timepoints = len(Timeseries_data[0][0][0])
n_ch_combinations = scipy.special.comb(n_channels,2, exact=True, repetition=False)

# To reduce computation time I only test representative epochs (1 from each 1 min session)
# There will be 5 epochs from eyes closed and 5 from eyes open
n_rep_epoch = 5
# The subjects have different number of epochs due to dropped epochs
gaps_trials_idx = np.load("Gaps_trials_idx.npy") # time_points between sessions
# I convert the gap time points to epoch number used as representative epoch
epoch_idx = np.zeros((n_subjects,n_eye_status,n_rep_epoch), dtype=int) # prepare array
epoch_idx[:,:,0:4] = np.round(gaps_trials_idx/n_timepoints,0)-8 # take random epoch from sessions 1 to 4
epoch_idx[:,:,4] = np.round(gaps_trials_idx[:,:,3]/n_timepoints,0)+5 # take random epoch from session 5

# Checking if all epoch idx exists
for i in range(n_subjects):
    EC_index = final_epochs[i].events[:,2] == 1
    EO_index = final_epochs[i].events[:,2] == 2
    assert np.sum(EC_index) >= epoch_idx[i,0,4]
    assert np.sum(EO_index) >= epoch_idx[i,1,4]

# Prepare model order estimation
AIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations))
BIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations))

def GC_model_order_est(i):
    AIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations))
    BIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations))
    for e in range(n_eye_status):
        n_epochs = STC_eye_data[i][e].shape[0]
        N_total = n_timepoints*n_epochs # total number of datapoints for specific eye condition
        for ep in range(n_rep_epoch):
            epp = epoch_idx[i,e,ep]
            for o in range(len(m_orders)):
                order = int(m_orders[o])
                # Make the Granger Causality object
                GCA1 = nta.GrangerAnalyzer(Timeseries_data[i][e][epp-1], order=order,
                                           n_freqs=2000)
                for c in range(n_ch_combinations):
                    # Retrieve error covariance matrix for all combinations
                    ecov = np.array(list(GCA1.error_cov.values()))
                    # Calculate AIC
                    AIC = ntsu.akaike_information_criterion(ecov[c,:,:], p = n_channels,
                                                            m=order, Ntotal=N_total)
                    # Calculate BIC
                    BIC = ntsu.bayesian_information_criterion(ecov[c,:,:], p = n_channels,
                                                              m=order, Ntotal=N_total)
                    # Save the information criterions
                    AIC_arr0[o,e,ep,c] = AIC
                    BIC_arr0[o,e,ep,c] = BIC

    print("{} out of {} finished testing".format(i+1,n_subjects))
    return i, AIC_arr0, BIC_arr0

# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)

with concurrent.futures.ProcessPoolExecutor() as executor:
    for i, AIC_result, BIC_result in executor.map(GC_model_order_est, range(n_subjects)): # Function and arguments
        AIC_arr[:,i] = AIC_result
        BIC_arr[:,i] = BIC_result

# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)

# Save the AIC and BIC results
np.save(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy", AIC_arr) # (m. order, subject, eye, epoch, combination)
np.save(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy", BIC_arr) # (m. order, subject, eye, epoch, combination)

# Load data
AIC_arr = np.load(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy")
BIC_arr = np.load(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy")

# Average across all subjects, eye status, epochs and combinations
plt.figure(figsize=(8,6))
plt.plot(m_orders, np.nanmean(AIC_arr, axis=(1,2,3,4)), label="AIC")
plt.plot(m_orders, np.nanmean(BIC_arr, axis=(1,2,3,4)), label="BIC")
plt.title("Average information criteria value")
plt.xlabel("Model order (Lag)")
plt.legend()

np.sum(np.isnan(AIC_arr))/AIC_arr.size # around 0.07% NaN due to non-convergence
np.sum(np.isnan(BIC_arr))/BIC_arr.size # around 0.07% NaN due to non-convergence

# If we look at each subject
mean_subject_AIC = np.nanmean(AIC_arr, axis=(2,3,4))

plt.figure(figsize=(8,6))
for i in range(n_subjects):
    plt.plot(m_orders, mean_subject_AIC[:,i])
plt.title("Average AIC for each subject")
plt.xlabel("Model order (Lag)")

mean_subject_BIC = np.nanmean(BIC_arr, axis=(2,3,4))
plt.figure(figsize=(8,6))
for i in range(n_subjects):
    plt.plot(m_orders, mean_subject_BIC[:,i])
plt.title("Average BIC for each subject")
plt.xlabel("Model order (Lag)")

# We see that for many cases in BIC, it does not converge. Monotonic increasing!

# We can look at the distribution of chosen order for each time series analyzed
# I.e. I will find the minima in model order for each model
AIC_min_arr = np.argmin(AIC_arr, axis=0)
BIC_min_arr = np.argmin(BIC_arr, axis=0)

# Plot the distributions of the model order chosen
plt.figure(figsize=(8,6))
sns.distplot(AIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True,
             bins=np.linspace(0.75,30.25,60), label="AIC")
plt.ylabel("Frequency density")
plt.xlabel("Model order")
plt.title("AIC Model Order Estimation")

plt.figure(figsize=(8,6))
sns.distplot(BIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True, color="blue",
             bins=np.linspace(0.75,30.25,60), label="BIC")
plt.ylabel("Frequency density")
plt.xlabel("Model order")
plt.title("BIC Model Order Estimation")
# It is clear from the BIC model that most have model order 1
# which reflect their monotonic increasing nature without convergence
# Thus I will only use AIC

# There is a bias variance trade-off with model order [Stokes & Purdon, 2017]
# Lower order is associated with higher bias and higher order with variance
# I will choose the model order that is chosen the most (i.e. majority voting)
AR_order = int(np.nanquantile(AIC_min_arr.reshape(-1), q=0.5))
# Order = 5

# Calculate Granger Causality for each subject, eye and epoch
Freq_Bands = {"delta": [1.25, 4.0],
              "theta": [4.0, 8.0],
              "alpha": [8.0, 13.0],
              "beta": [13.0, 30.0],
              "gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)

# Pre-allocate memory
GC_data = np.zeros((2,n_subjects,n_eye_status,n_channels,n_channels,n_freq_bands))

def GC_analysis(i):
    GC_data0 = np.zeros((2,n_eye_status,n_channels,n_channels,n_freq_bands))
    for e in range(n_eye_status):
        n_epochs = STC_eye_data[i][e].shape[0]
        # Make temporary array to save GC for each epoch
        temp_GC_data = np.zeros((2,n_epochs,n_channels,n_channels,n_freq_bands))
        for ep in range(n_epochs):
            # Fit the AR model
            GCA = nta.GrangerAnalyzer(Timeseries_data[i][e][ep], order=AR_order,
                                       n_freqs=int(800)) # n_Freq=800 correspond to step of 0.25Hz, the same as multitaper for power estimation
            for f in range(n_freq_bands):
                # Define lower and upper band
                f_lb = list(Freq_Bands.values())[f][0]
                f_ub = list(Freq_Bands.values())[f][1]
                # Get index corresponding to the frequency bands of interest
                freq_idx_G = np.where((GCA.frequencies >= f_lb) * (GCA.frequencies < f_ub))[0]
                # Calculcate Granger causality quantities
                g_xy = np.mean(GCA.causality_xy[:, :, freq_idx_G], -1) # avg on last dimension
                g_yx = np.mean(GCA.causality_yx[:, :, freq_idx_G], -1) # avg on last dimension
                # Transpose to use same format as con_measurement and save
                temp_GC_data[0,ep,:,:,f] = np.transpose(g_xy)
                temp_GC_data[1,ep,:,:,f] = np.transpose(g_yx)
        
        # Average over epochs for each person, eye condition, direction and frequency band
        temp_GC_epoch_mean = np.nanmean(temp_GC_data, axis=1) # sometimes Log(Sxx/xx_auto_component) is nan
        # Save to array
        GC_data0[:,e,:,:,:] = temp_GC_epoch_mean
        
    print("{} out of {} finished analyzing".format(i+1,n_subjects))
    return i, GC_data0

# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)

with concurrent.futures.ProcessPoolExecutor() as executor:
    for i, GC_result in executor.map(GC_analysis, range(n_subjects)): # Function and arguments
        GC_data[:,i] = GC_result
        
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)

# Output: GC_data (g_xy/g_yx, subject, eye, chx, chy, freq)
# Notice that for g_xy ([0,...]) it means "chy" Granger causes "chx"
# and for g_yx ([1,...]) it means "chx" Granger causes "chy"
# This is due to the transposing which flipped the results on to the lower-part of the diagonal

# Save the Granger_Causality data
np.save(Feature_savepath+"Source_drop_interpol_GrangerCausality_data.npy", GC_data)

# Theoretically negative GC values should be impossible, but in practice
# they can still occur due to problems with model fitting (see Stokes & Purdon, 2017)
print("{:.3f}% negative GC values".\
      format(np.sum(GC_data[~np.isnan(GC_data)]<0)/np.sum(~np.isnan(GC_data))*100)) # 0.08% negative values
# These values cannot be interpreted, but seems to occur mostly for true non-causal connections
# Hence I set them to 0
with np.errstate(invalid="ignore"): # invalid number refers to np.nan, which will be set to False for comparisons
    GC_data[(GC_data<0)] = 0

# Save as dataframe for further processing with other features
# Convert to Pandas dataframe
# The dimensions will each be a column with numbers and the last column will be the actual values
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, GC_data.shape), indexing="ij"))) + [GC_data.ravel()])
GC_data_df = pd.DataFrame(arr, columns = ["GC_direction", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"])
# Change from numerical coding to actual values
eye_status = list(final_epochs[0].event_id.keys())
freq_bands_name = list(Freq_Bands.keys())
GC_directions_info = ["chy -> chx", "chx -> chy"]

index_values = [GC_directions_info,Subject_id,eye_status,ch_names,ch_names,freq_bands_name]
for col in range(len(index_values)):
    col_name = GC_data_df.columns[col]
    for shape in range(GC_data.shape[col]): # notice not dataframe but the array
        GC_data_df.loc[GC_data_df.iloc[:,col] == shape,col_name]\
        = index_values[col][shape]

# Add group status
Group_status = np.array(["CTRL"]*len(GC_data_df["Subject_ID"]))
Group_status[np.array([i in cases for i in GC_data_df["Subject_ID"]])] = "PTSD"
# Add to dataframe
GC_data_df.insert(3, "Group_status", Group_status)

# Remove all nan (including diagonal and upper-matrix entries)
GC_data_df = GC_data_df.iloc[np.invert(np.isnan(GC_data_df["Value"].to_numpy()))]

# Swap ch values for GC_direction chy -> chx (so it is always chx -> chy)
tempchy = GC_data_df[GC_data_df["GC_direction"] == "chy -> chx"]["chy"] # save chy
GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chy"] =\
             GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] # overwrite old chy
GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] = tempchy # overwrite chx

# Drop the GC_direction column
GC_data_df = GC_data_df.drop("GC_direction", axis=1)

# Save df
GC_data_df.to_pickle(os.path.join(Feature_savepath,"GC_data_source_drop_interpol_df.pkl"))

# Testing if df was formatted correctly
expected_GC_values = n_subjects*n_eye_status*n_ch_combinations*n_freq_bands*2 # 2 because it is bidirectional
assert GC_data_df.shape[0] == expected_GC_values
# Testing a random GC value
random_connection = np.random.randint(0,GC_data_df.shape[0])
test_connection = GC_data_df.iloc[random_connection,:]
i = np.where(Subject_id==test_connection["Subject_ID"])[0]
e = np.where(np.array(eye_status)==test_connection["Eye_status"])[0]
chx = np.where(np.array(ch_names)==test_connection["chx"])[0]
chy = np.where(np.array(ch_names)==test_connection["chy"])[0]
f = np.where(np.array(freq_bands_name)==test_connection["Freq_band"])[0]
value = test_connection["Value"]
if chx < chy: # the GC array is only lower diagonal to save memory
    assert GC_data[0,i,e,chy,chx,f] == value
if chx > chy:
    assert GC_data[1,i,e,chx,chy,f] == value