assert pec_data_df.shape[0] == np.prod(temp_df.shape)
test1 = pec_data_df.iloc[np.random.randint(n_subjects),:]
assert test1["Value"] ==\
temp_df[test1[1]][test1[2]][test1[3]][test1[5]][test1[6]][Subject_id.index(test1[0])]
# Fix column names
temp_df.columns = [convertTupleHeader(temp_df.columns[i]) for i in range(len(temp_df.columns))]
PEC_df = pd.concat([PEC_df,temp_df], axis=1)
# Add group status
Groups = ["CTRL", "PTSD"]
Group_status = np.array([0]*PEC_df.shape[0]) # CTRL = 0
Group_status[np.array([i in cases for i in PEC_df["Subject_ID"]])] = 1 # PTSD = 1
PEC_df.insert(1, "Group_status", Group_status)
# Only use PTSD patient group
PEC_df2 = PEC_df.loc[PEC_df["Group_status"]==1,:]
Subject_info_cols = ["Subject_ID","Group_status"]
# Use grid search and permutations to estimate the gap statistic and use it to
# determine the number of clusters and the sparsity parameter s
# I will use 100 permutations and test 2 to 6 clusters, as in Zhang, 2020
# An error occurs when trying to determine the gap statistic for 1 cluster (also in the R package)
max_clusters = 6
n_sparsity_feat = 20
perm_res = []
for k in range(1,max_clusters):
# Cannot permute with 1 cluster
n_clusters = k+1
x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1))
perm = pysparcl.cluster.permute_modified(x, k=n_clusters, verbose=True,
nvals=n_sparsity_feat, nperms=100)
perm_res.append(perm)
# Save the results
with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "wb") as file:
pickle.dump(perm_res, file)
# # Load
# with open(Feature_savepath+"PEC_drop_interpol_ch_kmeans_perm.pkl", "rb") as file:
# perm_res = pickle.load(file)
# Convert results to array
perm_res_arr = np.zeros((len(perm_res)*n_sparsity_feat,4))
for i in range(len(perm_res)):
_, gaps, sdgaps, wbounds, _ = perm_res[i].values()
for i2 in range(n_sparsity_feat):
perm_res_arr[20*i+i2,0] = i+2 # cluster size
perm_res_arr[20*i+i2,1] = gaps[i2] # gap statistic
perm_res_arr[20*i+i2,2] = sdgaps[i2] # gap statistic std
perm_res_arr[20*i+i2,3] = wbounds[i2] # sparsity feature s
# For each sparsity s, determine the best k using the one-standard-error criterion
# Meaning the number of clusters is chosen as the smallest k, for a fixed s,
# that fulfills Gap(k) >= Gap(k+1) - std(k+1)
def one_standard_deviation_search(gaps, std):
best_gaps = np.argmax(gaps)
current_gaps = gaps[best_gaps]
current_std = std[best_gaps]
current_gaps_idx = best_gaps
    while (current_gaps_idx > 0) and (gaps[current_gaps_idx-1] >= current_gaps - current_std):
        current_gaps_idx -= 1
        current_gaps = gaps[current_gaps_idx]
        current_std = std[current_gaps_idx]
out = current_gaps, current_std, current_gaps_idx
return out
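# ---------------------------------------------------------------------------
# Minimal sanity-check sketch of the one-standard-error search (illustrative,
# not part of the original pipeline). The toy gap values and stds below are
# assumptions standing in for gap statistics at k = 2..6. The search starts at
# the maximum gap and walks towards smaller k while the gap stays within one
# std of the running best.
toy_gaps = np.array([0.10, 0.30, 0.32, 0.33, 0.31])  # hypothetical gap statistics for k = 2..6
toy_std = np.array([0.02, 0.02, 0.02, 0.02, 0.02])   # hypothetical standard deviations
_, _, toy_idx = one_standard_deviation_search(toy_gaps, toy_std)
assert toy_idx == 1  # index 1 corresponds to k = 3, the smallest k within one std of the running best
# ---------------------------------------------------------------------------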
best_ks = np.zeros((n_sparsity_feat, 2))
all_s = np.unique(perm_res_arr[:,3])
plt.figure(figsize=(12,12))
for i2 in range(n_sparsity_feat):
current_s = all_s[i2]
gaps = perm_res_arr[perm_res_arr[:,3] == current_s,1]
std = perm_res_arr[perm_res_arr[:,3] == current_s,2]
_, _, idx = one_standard_deviation_search(gaps, std)
# Save to array
best_ks[i2,0] = current_s
best_ks[i2,1] = perm_res_arr[perm_res_arr[:,3] == current_s,0][idx]
# Plot gap
plt.errorbar(perm_res_arr[perm_res_arr[:,3] == current_s,0].astype("int"),
gaps, yerr=std, capsize=5, label = np.round(current_s,3))
plt.title("Gap statistic for different fixed s")
plt.legend(loc=1)
plt.xlabel("Number of clusters")
plt.ylabel("Gap statistic")
best_k = int(scipy.stats.mode(best_ks[:,1])[0])
# Determine s using fixed k as lowest s within 1 std of max gap statistic
# According to Witten & Tibshirani, 2010
best_gaps_idx = np.argmax(perm_res_arr[perm_res_arr[:,0] == best_k,1])
best_gaps = perm_res_arr[perm_res_arr[:,0] == best_k,1][best_gaps_idx]
best_gaps_std = perm_res_arr[perm_res_arr[:,0] == best_k,2][best_gaps_idx]
one_std_crit = perm_res_arr[perm_res_arr[:,0] == best_k,1]>=best_gaps-best_gaps_std
best_s = np.array([perm_res_arr[perm_res_arr[:,0] == best_k,3][one_std_crit][0]])
# Perform clustering with k clusters
x = np.array(PEC_df2.copy().drop(Subject_info_cols, axis=1))
sparcl = pysparcl.cluster.kmeans(x, k=best_k, wbounds=best_s)[0]
# Save the results
with open(Feature_savepath+"PEC_drop_interpol_ch_sparse_kmeans.pkl", "wb") as file:
pickle.dump(sparcl, file)
# Get overview of the features chosen and summarize feature type with countplot
nonzero_idx = sparcl["ws"].nonzero()
sparcl_features = PEC_df2.copy().drop(Subject_info_cols, axis=1).columns[nonzero_idx]
# Prepare variables
Freq_Bands = {"delta": [1.25, 4.0],
"theta": [4.0, 8.0],
"alpha": [8.0, 13.0],
"beta": [13.0, 30.0],
"gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)
eye_status = list(source_epochs[0].event_id.keys())
n_eye_status = len(eye_status)
sparcl_feat = []
sparcl_feat_counts = []
for e in range(n_eye_status):
ee = eye_status[e]
for f in range(n_freq_bands):
ff = list(Freq_Bands.keys())[f]
temp_feat = sparcl_features[sparcl_features.str.contains(("_"+ee))]
temp_feat = temp_feat[temp_feat.str.contains(("_"+ff))]
# Save to list
sparcl_feat.append(temp_feat)
sparcl_feat_counts.append(["{}_{}".format(ee,ff), len(temp_feat)])
# Convert the list to dataframe to use in countplot
sparcl_feat_counts_df = pd.DataFrame(columns=["Eye_status", "Freq_band"])
for i in range(len(sparcl_feat_counts)):
# If this feature type does not exist, then skip it
if sparcl_feat_counts[i][1] == 0:
continue
ee, ff = sparcl_feat_counts[i][0].split("_")
counts = sparcl_feat_counts[i][1]
temp_df = pd.DataFrame({"Eye_status":np.repeat(ee,counts),
"Freq_band":np.repeat(ff,counts)})
sparcl_feat_counts_df = sparcl_feat_counts_df.append(temp_df, ignore_index=True)
# Fix Freq_band categorical order
cat_type = pd.CategoricalDtype(categories=list(Freq_Bands.keys()), ordered=True)
sparcl_feat_counts_df["Freq_band"] = sparcl_feat_counts_df["Freq_band"].astype(cat_type)
plt.figure(figsize=(8,8))
g = sns.countplot(y="Freq_band", hue="Eye_status", data=sparcl_feat_counts_df)
plt.title("PEC Sparse K-means features")
plt.xlabel("Number of non-zero weights")
plt.ylabel("Frequency Band")
# %% Functional connectivity in source space
# The MNE implementation of PLV and wPLI computes phase consistency across trials (epochs), e.g. for ERPs
# I'll use my own manually implemented PLV and wPLI across time and then average across epochs
# Notice that the newer MNE-Connectivity library now also supports phase estimation across time
sfreq = final_epochs[0].info["sfreq"]
# An error occurs when using fewer than 5 cycles for spectrum estimation
# 1 Hz is too low with an epoch length of 4 s, thus fmin for delta was changed to 1.25 Hz
Freq_Bands = {"delta": [1.25, 4.0],
"theta": [4.0, 8.0],
"alpha": [8.0, 13.0],
"beta": [13.0, 30.0],
"gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)
freq_centers = np.array([2.5,6,10.5,21.5,40])
# Convert to tuples for the mne function
fmin=tuple([list(Freq_Bands.values())[f][0] for f in range(len(Freq_Bands))])
fmax=tuple([list(Freq_Bands.values())[f][1] for f in range(len(Freq_Bands))])
# Make frequency array for the Morlet wavelets
freq_centers = np.arange(fmin[0],fmax[-1]+0.25,0.25)
# Prepare Morlets
morlets = mne.time_frequency.tfr.morlet(sfreq,freq_centers,n_cycles=3)
# Make freqs array for indexing
freqs0 = [0]*n_freq_bands
for f in range(n_freq_bands):
freqs0[f] = freq_centers[(freq_centers>=fmin[f]) & (freq_centers<=fmax[f])]
# The built-in connectivity function gives an (n_channel, n_channel, n_freqs) output
# For loop over subject ID and eye status is implemented
n_subjects = len(Subject_id)
eye_status = list(final_epochs[0].event_id.keys())
n_eye_status = len(eye_status)
ch_names = final_epochs[0].info["ch_names"]
# Load source labels
with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file:
labels = pickle.load(file)
label_names = [label.name for label in labels]
n_sources = len(label_names)
# Connectivity methods
connectivity_methods = ["coh","imcoh","plv","wpli"]
n_con_methods=len(connectivity_methods)
# Number of pairwise ch connections
n_ch_connections = scipy.special.comb(n_sources,2, exact=True, repetition=False)
# Load source time series
with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file:
STCs_list = pickle.load(file)
# I made my own slightly optimized PLV & wPLI function
# Version 2 is based on filtering + Hilbert transform instead of Morlet wavelets
def calculate_PLV_WPLI_across_time(data):
n_ch, n_time0 = data.shape
x = data.copy()
# Filter the signal in the different freq bands
con_array0 = np.zeros((2,n_ch,n_ch,n_freq_bands))
# con_array0[con_array0==0] = np.nan
for fname, frange in Freq_Bands.items():
fmin, fmax = [float(interval) for interval in frange]
signal_filtered = mne.filter.filter_data(x, sfreq, fmin, fmax,
fir_design="firwin", verbose=0)
        # Filtering finite signals yields very low values for the first
        # and last timepoints, which can create outliers, e.g. 1e-29 compared to 1e-14
        # This systematic error is removed by dropping the first and last timepoints
signal_filtered = signal_filtered[:,1:-1]
# Hilbert transform to get complex signal
analytic_signal = scipy.signal.hilbert(signal_filtered)
        # Calculate the lower triangle only as the matrix is symmetric
for ch_r in range(n_ch):
for ch_c in range(n_ch):
if ch_r>ch_c:
# =========================================================================
                    # PLV over time corresponds to the absolute value (circular length) of the
                    # time-averaged unit phasors of the relative phases. So PLV will be 1 if
                    # the phases of the 2 signals maintain a constant lag
                    # In equational form: PLV = 1/N * |sum(e^(i*(phase1-phase2)))|
                    # In code: abs(mean(exp(1i*phase_diff)))
# =========================================================================
                    # The absolute value of the analytic signal gives the envelope and np.angle gives the instantaneous phase
phase_diff = np.angle(analytic_signal[ch_r])-np.angle(analytic_signal[ch_c])
# Convert phase difference to complex part i(phase1-phase2)
phase_diff_im = 0*phase_diff+1j*phase_diff
# Take the exponential, then the mean followed by absolute value
PLV = np.abs(np.mean(np.exp(phase_diff_im)))
# Save to array
con_array0[0,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = PLV
# =========================================================================
                    # PLI over time corresponds to the average sign of the sine of the relative
                    # phase differences. So PLI will be 1 if one signal is always leading or
                    # lagging behind the other signal. But it is insensitive to changes in
                    # relative phase, as long as it is the same signal that leads.
                    # If 2 signals are almost in phase, they might shift between lead/lag
                    # due to small fluctuations from noise. This would lead to unstable
                    # estimation of "phase" synchronisation.
                    # The wPLI tries to correct for this by weighting the PLI with the
                    # magnitude of the lag, to attenuate noise sources giving rise to
                    # near zero phase lag "synchronization"
                    # In equational form: WPLI = |E{|sin(phase_diff)| * sign(sin(phase_diff))}| / E{|sin(phase_diff)|}
# =========================================================================
# Calculate the magnitude of phase differences
phase_diff_mag = np.abs(np.sin(phase_diff))
# Calculate the signed phase difference (PLI)
sign_phase_diff = np.sign(np.sin(phase_diff))
                    # Calculate the numerator (average across time, then absolute value)
WPLI_nominator = np.abs(np.mean(phase_diff_mag*sign_phase_diff))
# Calculate denominator for normalization
WPLI_denom = np.mean(phase_diff_mag)
# Calculate WPLI
WPLI = WPLI_nominator/WPLI_denom
# Save to array
con_array0[1,ch_r,ch_c,list(Freq_Bands.keys()).index(fname)] = WPLI
return con_array0
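# ---------------------------------------------------------------------------
# Minimal sanity-check sketch of the function above (illustrative, not part of
# the original pipeline). Two synthetic channels at 10 Hz with a constant pi/4
# phase lag should give PLV and wPLI close to 1 in the alpha band. It relies on
# the sfreq and Freq_Bands globals defined above; the 8 s duration is an
# arbitrary assumption.
_t = np.arange(0, 8, 1/sfreq)
_toy_data = np.vstack([np.sin(2*np.pi*10*_t),
                       np.sin(2*np.pi*10*_t + np.pi/4)])
_toy_con = calculate_PLV_WPLI_across_time(_toy_data)
_alpha_idx = list(Freq_Bands.keys()).index("alpha")
# Lower-triangular entry (ch 1, ch 0); index 0 along the first axis is PLV, 1 is wPLI
print("Toy PLV: {:.3f}".format(_toy_con[0, 1, 0, _alpha_idx]))   # expected to be close to 1
print("Toy wPLI: {:.3f}".format(_toy_con[1, 1, 0, _alpha_idx]))  # expected to be close to 1
# ---------------------------------------------------------------------------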
# Pre-allocate memory
con_data = np.zeros((n_con_methods,n_subjects,n_eye_status,n_sources,n_sources,n_freq_bands))
n_epochs_matrix = np.zeros((n_subjects,n_eye_status))
# Get current time
c_time = time.localtime()
c_time = time.strftime("%H:%M:%S", c_time)
print(c_time)
def connectivity_estimation(i):
con_data0 = np.zeros((n_con_methods,n_eye_status,n_sources,n_sources,n_freq_bands))
con_data0[con_data0==0] = np.nan
n_epochs_matrix0 = np.zeros((n_eye_status))
for e in range(n_eye_status):
ee = eye_status[e]
eye_idx = final_epochs[i].events[:,2] == e+1 # EC = 1 and EO = 2
# Get source time series
temp_STC = STCs_list[i][eye_idx]
# Calculate the coherence and ImgCoh for the given subject and eye status
con, freqs, times, n_epochs, n_tapers = spectral_connectivity(
temp_STC, method=connectivity_methods[0:2],
mode="multitaper", sfreq=sfreq, fmin=fmin, fmax=fmax,
faverage=True, verbose=0)
# Save the results in array
con_data0[0,e,:,:,:] = con[0] # coherence
con_data0[1,e,:,:,:] = np.abs(con[1]) # Absolute value of ImgCoh to reflect magnitude of ImgCoh
# Calculate PLV and wPLI for each epoch and then average
n_epochs0 = temp_STC.shape[0]
con_data1 = np.zeros((len(connectivity_methods[2:]),n_epochs0,n_sources,n_sources,n_freq_bands))
for epoch in range(n_epochs0):
# First the data is retrieved and epoch axis dropped
temp_data = temp_STC[epoch,:,:]
# PLV and WPLI value is calculated across timepoints in each freq band
PLV_WPLI_con = calculate_PLV_WPLI_across_time(temp_data)
# Save results
con_data1[0,epoch,:,:,:] = PLV_WPLI_con[0] # phase locking value
con_data1[1,epoch,:,:,:] = PLV_WPLI_con[1] # weighted phase lag index
# Take average across epochs for PLV and wPLI
con_data2 = np.mean(con_data1,axis=1)
# Save to final array
con_data0[2,e,:,:,:] = con_data2[0] # phase locking value
con_data0[3,e,:,:,:] = con_data2[1] # weighted phase lag index
n_epochs_matrix0[e] = n_epochs
print("{} out of {} finished".format(i+1,n_subjects))
return i, con_data0, n_epochs_matrix0
with concurrent.futures.ProcessPoolExecutor(max_workers=16) as executor:
for i, con_result, n_epochs_mat in executor.map(connectivity_estimation, range(n_subjects)): # Function and arguments
con_data[:,i,:,:,:,:] = con_result
n_epochs_matrix[i] = n_epochs_mat
# Get current time
c_time = time.localtime()
c_time = time.strftime("%H:%M:%S", c_time)
print(c_time)
# Save the results
np.save(Feature_savepath+"Source_drop_interpol_ch_connectivity_measures_data.npy", con_data) # (con_measure,subject,eye,ch,ch,freq)
# Also save as dataframe format for feature selection
# Convert to Pandas dataframe
# Each dimension will be a column of integer indices and the last column will hold the actual values
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, con_data.shape), indexing="ij"))) + [con_data.ravel()])
con_data_df = pd.DataFrame(arr, columns = ["Con_measurement", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"])
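# ---------------------------------------------------------------------------
# Minimal illustration of the index-grid idiom above (illustrative, not part of
# the original pipeline). For a toy 2x2 array, each row of the stacked output
# is (row_index, column_index, value).
_toy_arr = np.array([[10, 11],
                     [12, 13]])
_toy_long = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, _toy_arr.shape), indexing="ij"))) + [_toy_arr.ravel()])
# _toy_long is [[0, 0, 10], [0, 1, 11], [1, 0, 12], [1, 1, 13]]
# ---------------------------------------------------------------------------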
# Change from numerical coding to actual values
eye_status = list(final_epochs[0].event_id.keys())
freq_bands_name = list(Freq_Bands.keys())
index_values = [connectivity_methods,Subject_id,eye_status,label_names,label_names,freq_bands_name]
for col in range(len(index_values)):
col_name = con_data_df.columns[col]
for shape in range(con_data.shape[col]): # notice not dataframe but the array
con_data_df.loc[con_data_df.iloc[:,col] == shape,col_name]\
= index_values[col][shape]
# Add group status
Group_status = np.array(["CTRL"]*len(con_data_df["Subject_ID"]))
Group_status[np.array([i in cases for i in con_data_df["Subject_ID"]])] = "PTSD"
# Add to dataframe
con_data_df.insert(3, "Group_status", Group_status)
# Remove all diagonal and upper-matrix entries
con_data_df = con_data_df.iloc[con_data_df["Value"].to_numpy().nonzero()]
# Save df
con_data_df.to_pickle(os.path.join(Feature_savepath,"con_data_source_drop_interpol_df.pkl"))
# %% Estimate Granger's Causality in source space
# Load source labels
with open("custom_aparc2009_Li_et_al_2022.pkl", "rb") as file:
labels = pickle.load(file)
label_names = [label.name for label in labels]
n_sources = len(label_names)
# Load source time series
with open(Feature_savepath+"STCs_each_epoch_drop_interpol_ch_fix_snr.pkl", "rb") as file:
STCs_list = pickle.load(file)
# Granger causality might be influenced by volume conduction, thus working with CSD might be beneficial
# But since I already used source modelling to alleviate this problem I will not apply CSD
# Barrett et al., 2012 also do not apply CSD on source GC
# GC assumes stationarity, thus I will test for stationarity using the ADF test
# The null hypothesis of the ADF test is that the series has a unit root, i.e. is non-stationary
# I will test how many series reject the null hypothesis, i.e. are stationary
# Due to the low numerical values in the STCs the ADF test is unstable, thus I scale the data to be around 1e0
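# ---------------------------------------------------------------------------
# Minimal illustration of the ADF logic above (illustrative, not part of the
# original pipeline). White noise is stationary, so the unit-root null is
# rejected (small p-value), whereas a random walk is not (typically a large
# p-value). The series length and seed are arbitrary assumptions.
from statsmodels.tsa.stattools import adfuller  # same function used below
_rng = np.random.RandomState(42)
_noise = _rng.randn(500)               # stationary series
_walk = np.cumsum(_rng.randn(500))     # unit-root (non-stationary) series
print("ADF p-value, white noise: {:.3f}".format(adfuller(_noise)[1]))  # expected << 0.05
print("ADF p-value, random walk: {:.3f}".format(adfuller(_walk)[1]))   # typically > 0.05
# ---------------------------------------------------------------------------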
stationary_test_arr = [0]*n_subjects
n_tests = [0]*n_subjects
for i in range(n_subjects):
# Get data
data_arr = STCs_list[i]
# Get shape
n_epochs, n_channels, n_timepoints = data_arr.shape
# Create array for indices to print out progress
ep_progress_idx = np.arange(n_epochs//5,n_epochs,n_epochs//5)
# Calculate number of tests performed for each subject
n_tests[i] = n_epochs*n_channels
# Prepare empty array (with 2's as 0 and 1 will be used)
stationary_test_arr0 = np.zeros((n_epochs,n_channels))+2 # make array of 2's
for ep in range(n_epochs):
for c in range(n_channels):
            ADF = adfuller(data_arr[ep,c,:]*1e14) # multiplying by a constant does not change the ADF result, but helps against numerical instability
p_value = ADF[1]
if p_value < 0.05:
stationary_test_arr0[ep,c] = True # Stationary set to 1
else:
stationary_test_arr0[ep,c] = False # Non-stationary set to 0
# Print partial progress
if len(np.where(ep_progress_idx==ep)[0]) > 0:
print("Finished epoch number: {} out of {}".format(ep,n_epochs))
# Indices that were not tested
no_test_idx = np.where(stationary_test_arr0==2)[0]
if len(no_test_idx) > 0:
print("An unexpected error occurred and {} was not tested".format(no_test_idx))
# Save to list
stationary_test_arr[i] = stationary_test_arr0
# Print progress
print("Finished subject {} out of {}".format(i+1,n_subjects))
with open(Stat_savepath+"Source_drop_interpol_GC_stationarity_tests.pkl", "wb") as filehandle:
# The data is stored as binary data stream
pickle.dump(stationary_test_arr, filehandle)
# I used a threshold of 0.05
# This means that on average I would expect 5% false positives among the tests that showed significance for stationarity
ratio_stationary = [0]*n_subjects
for i in range(n_subjects):
# Ratio of tests that showed stationarity
ratio_stationary[i] = np.sum(stationary_test_arr[i])/n_tests[i]
print("Ratio of stationary time series: {0:.3f}".format(np.mean(ratio_stationary))) # 88%
# The pre-processing has already ensured that most of the data fulfills the stationarity assumption.
# Divide the data into eyes closed and open
ch_names = label_names
n_channels = len(ch_names)
STC_eye_data = []
for i in range(n_subjects):
# Get index for eyes open and eyes closed
EC_index = final_epochs[i].events[:,2] == 1
EO_index = final_epochs[i].events[:,2] == 2
# Get the data
EC_epoch_data = STCs_list[i][EC_index,:,:] # eye index
EO_epoch_data = STCs_list[i][EO_index,:,:]
# Save to list
STC_eye_data.append([EC_epoch_data, EO_epoch_data])
# Make each epoch a TimeSeries object
# Input for TimeSeries is: (ch, time)
eye_status = list(final_epochs[0].event_id.keys())
n_eye_status = len(eye_status)
sfreq = final_epochs[0].info["sfreq"]
Timeseries_data = []
for i in range(n_subjects):
temp_list1 = []
for e in range(n_eye_status):
temp_list2 = []
n_epochs = STC_eye_data[i][e].shape[0]
for ep in range(n_epochs):
# Convert to TimeSeries
time_series = nts.TimeSeries(STC_eye_data[i][e][ep,:,:], sampling_rate=sfreq)
# Save the object
temp_list2.append(time_series)
# Save the timeseries across eye status
temp_list1.append(temp_list2)
# Save the timeseries across subjects
Timeseries_data.append(temp_list1) # output [subject][eye][epoch](ch,time)
# Test multiple specified model orders for the AR models; each channel combination has its own model
m_orders = np.linspace(1,25,25) # the model orders tested
m_orders = np.round(m_orders)
n_timepoints = len(Timeseries_data[0][0][0])
n_ch_combinations = scipy.special.comb(n_channels,2, exact=True, repetition=False)
# To reduce computation time I only test representative epochs (1 from each 1 min session)
# There will be 5 epochs from eyes closed and 5 from eyes open
n_rep_epoch = 5
# The subjects have different numbers of epochs due to dropped epochs
gaps_trials_idx = np.load("Gaps_trials_idx.npy") # time points between sessions
# I convert the gap time points to epoch numbers used as representative epochs
epoch_idx = np.zeros((n_subjects,n_eye_status,n_rep_epoch), dtype=int) # prepare array
epoch_idx[:,:,0:4] = np.round(gaps_trials_idx/n_timepoints,0)-8 # take a representative epoch from sessions 1 to 4
epoch_idx[:,:,4] = np.round(gaps_trials_idx[:,:,3]/n_timepoints,0)+5 # take a representative epoch from session 5
# Checking if all epoch idx exists
for i in range(n_subjects):
EC_index = final_epochs[i].events[:,2] == 1
EO_index = final_epochs[i].events[:,2] == 2
assert np.sum(EC_index) >= epoch_idx[i,0,4]
assert np.sum(EO_index) >= epoch_idx[i,1,4]
# Prepare model order estimation
AIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations))
BIC_arr = np.zeros((len(m_orders),n_subjects,n_eye_status,n_rep_epoch,n_ch_combinations))
def GC_model_order_est(i):
AIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations))
BIC_arr0 = np.zeros((len(m_orders),n_eye_status,n_rep_epoch,n_ch_combinations))
for e in range(n_eye_status):
n_epochs = STC_eye_data[i][e].shape[0]
N_total = n_timepoints*n_epochs # total number of datapoints for specific eye condition
for ep in range(n_rep_epoch):
epp = epoch_idx[i,e,ep]
for o in range(len(m_orders)):
order = int(m_orders[o])
# Make the Granger Causality object
GCA1 = nta.GrangerAnalyzer(Timeseries_data[i][e][epp-1], order=order,
n_freqs=2000)
for c in range(n_ch_combinations):
# Retrieve error covariance matrix for all combinations
ecov = np.array(list(GCA1.error_cov.values()))
# Calculate AIC
AIC = ntsu.akaike_information_criterion(ecov[c,:,:], p = n_channels,
m=order, Ntotal=N_total)
# Calculate BIC
BIC = ntsu.bayesian_information_criterion(ecov[c,:,:], p = n_channels,
m=order, Ntotal=N_total)
# Save the information criterions
AIC_arr0[o,e,ep,c] = AIC
BIC_arr0[o,e,ep,c] = BIC
print("{} out of {} finished testing".format(i+1,n_subjects))
return i, AIC_arr0, BIC_arr0
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
with concurrent.futures.ProcessPoolExecutor() as executor:
for i, AIC_result, BIC_result in executor.map(GC_model_order_est, range(n_subjects)): # Function and arguments
AIC_arr[:,i] = AIC_result
BIC_arr[:,i] = BIC_result
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Save the AIC and BIC results
np.save(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy", AIC_arr) # (m. order, subject, eye, epoch, combination)
np.save(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy", BIC_arr) # (m. order, subject, eye, epoch, combination)
# Load data
AIC_arr = np.load(Feature_savepath+"AIC_Source_drop_interpol_GC_model_order.npy")
BIC_arr = np.load(Feature_savepath+"BIC_Source_drop_interpol_GC_model_order.npy")
# Average across all subjects, eye status, epochs and combinations
plt.figure(figsize=(8,6))
plt.plot(m_orders, np.nanmean(AIC_arr, axis=(1,2,3,4)), label="AIC")
plt.plot(m_orders, np.nanmean(BIC_arr, axis=(1,2,3,4)), label="BIC")
plt.title("Average information criteria value")
plt.xlabel("Model order (Lag)")
plt.legend()
np.sum(np.isnan(AIC_arr))/AIC_arr.size # around 0.07% NaN due to non-convergence
np.sum(np.isnan(BIC_arr))/BIC_arr.size # around 0.07% NaN due to non-convergence
# If we look at each subject
mean_subject_AIC = np.nanmean(AIC_arr, axis=(2,3,4))
plt.figure(figsize=(8,6))
for i in range(n_subjects):
plt.plot(m_orders, mean_subject_AIC[:,i])
plt.title("Average AIC for each subject")
plt.xlabel("Model order (Lag)")
mean_subject_BIC = np.nanmean(BIC_arr, axis=(2,3,4))
plt.figure(figsize=(8,6))
for i in range(n_subjects):
plt.plot(m_orders, mean_subject_BIC[:,i])
plt.title("Average BIC for each subject")
plt.xlabel("Model order (Lag)")
# We see that in many cases BIC does not converge; it is monotonically increasing
# We can look at the distribution of the chosen order for each time series analyzed
# I.e. I will find the model order that minimizes the information criterion for each model
AIC_min_arr = np.argmin(AIC_arr, axis=0)
BIC_min_arr = np.argmin(BIC_arr, axis=0)
# Plot the distributions of the model order chosen
plt.figure(figsize=(8,6))
sns.distplot(AIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True,
bins=np.linspace(0.75,30.25,60), label="AIC")
plt.ylabel("Frequency density")
plt.xlabel("Model order")
plt.title("AIC Model Order Estimation")
plt.figure(figsize=(8,6))
sns.distplot(BIC_min_arr.reshape(-1)+1, kde=False, norm_hist=True, color="blue",
bins=np.linspace(0.75,30.25,60), label="BIC")
plt.ylabel("Frequency density")
plt.xlabel("Model order")
plt.title("BIC Model Order Estimation")
# It is clear from the BIC curves that most models have model order 1,
# which reflects their monotonically increasing nature without convergence
# Thus I will only use AIC
# There is a bias-variance trade-off with model order [Stokes & Purdon, 2017]
# Lower order is associated with higher bias and higher order with higher variance
# I will choose the median of the model orders selected by AIC across all models
AR_order = int(np.nanquantile(AIC_min_arr.reshape(-1), q=0.5))
# Order = 5
# Calculate Granger Causality for each subject, eye and epoch
Freq_Bands = {"delta": [1.25, 4.0],
"theta": [4.0, 8.0],
"alpha": [8.0, 13.0],
"beta": [13.0, 30.0],
"gamma": [30.0, 49.0]}
n_freq_bands = len(Freq_Bands)
# Pre-allocate memory
GC_data = np.zeros((2,n_subjects,n_eye_status,n_channels,n_channels,n_freq_bands))
def GC_analysis(i):
GC_data0 = np.zeros((2,n_eye_status,n_channels,n_channels,n_freq_bands))
for e in range(n_eye_status):
n_epochs = STC_eye_data[i][e].shape[0]
# Make temporary array to save GC for each epoch
temp_GC_data = np.zeros((2,n_epochs,n_channels,n_channels,n_freq_bands))
for ep in range(n_epochs):
# Fit the AR model
GCA = nta.GrangerAnalyzer(Timeseries_data[i][e][ep], order=AR_order,
n_freqs=int(800)) # n_Freq=800 correspond to step of 0.25Hz, the same as multitaper for power estimation
for f in range(n_freq_bands):
# Define lower and upper band
f_lb = list(Freq_Bands.values())[f][0]
f_ub = list(Freq_Bands.values())[f][1]
# Get index corresponding to the frequency bands of interest
freq_idx_G = np.where((GCA.frequencies >= f_lb) * (GCA.frequencies < f_ub))[0]
                # Calculate Granger causality quantities
g_xy = np.mean(GCA.causality_xy[:, :, freq_idx_G], -1) # avg on last dimension
g_yx = np.mean(GCA.causality_yx[:, :, freq_idx_G], -1) # avg on last dimension
# Transpose to use same format as con_measurement and save
temp_GC_data[0,ep,:,:,f] = np.transpose(g_xy)
temp_GC_data[1,ep,:,:,f] = np.transpose(g_yx)
# Average over epochs for each person, eye condition, direction and frequency band
temp_GC_epoch_mean = np.nanmean(temp_GC_data, axis=1) # sometimes Log(Sxx/xx_auto_component) is nan
# Save to array
GC_data0[:,e,:,:,:] = temp_GC_epoch_mean
print("{} out of {} finished analyzing".format(i+1,n_subjects))
return i, GC_data0
# Get current time
c_time1 = time.localtime()
c_time1 = time.strftime("%a %d %b %Y %H:%M:%S", c_time1)
print(c_time1)
with concurrent.futures.ProcessPoolExecutor() as executor:
for i, GC_result in executor.map(GC_analysis, range(n_subjects)): # Function and arguments
GC_data[:,i] = GC_result
# Get current time
c_time2 = time.localtime()
c_time2 = time.strftime("%a %d %b %Y %H:%M:%S", c_time2)
print("Started", c_time1, "\nCurrent Time",c_time2)
# Output: GC_data (g_xy/g_yx, subject, eye, chx, chy, freq)
# Notice that for g_xy ([0,...]) it means "chy" Granger causes "chx"
# and for g_yx ([1,...]) it means "chx" Granger causes "chy"
# This is due to the transposing, which flipped the results onto the lower triangle of the matrix
# Save the Granger_Causality data
np.save(Feature_savepath+"Source_drop_interpol_GrangerCausality_data.npy", GC_data)
# Theoretically negative GC values should be impossible, but in practice
# they can still occur due to problems with model fitting (see Stokes & Purdon, 2017)
print("{:.3f}% negative GC values".\
format(np.sum(GC_data[~np.isnan(GC_data)]<0)/np.sum(~np.isnan(GC_data))*100)) # 0.08% negative values
# These values cannot be interpreted, but they seem to occur mostly for true non-causal connections
# Hence I set them to 0
with np.errstate(invalid="ignore"): # comparisons involving np.nan raise invalid-value warnings and evaluate to False, so the warnings are suppressed here
GC_data[(GC_data<0)] = 0
# Save as dataframe for further processing with other features
# Convert to Pandas dataframe
# Each dimension will be a column of integer indices and the last column will hold the actual values
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, GC_data.shape), indexing="ij"))) + [GC_data.ravel()])
GC_data_df = pd.DataFrame(arr, columns = ["GC_direction", "Subject_ID", "Eye_status", "chx", "chy", "Freq_band", "Value"])
# Change from numerical coding to actual values
eye_status = list(final_epochs[0].event_id.keys())
freq_bands_name = list(Freq_Bands.keys())
GC_directions_info = ["chy -> chx", "chx -> chy"]
index_values = [GC_directions_info,Subject_id,eye_status,ch_names,ch_names,freq_bands_name]
for col in range(len(index_values)):
col_name = GC_data_df.columns[col]
for shape in range(GC_data.shape[col]): # notice not dataframe but the array
GC_data_df.loc[GC_data_df.iloc[:,col] == shape,col_name]\
= index_values[col][shape]
# Add group status
Group_status = np.array(["CTRL"]*len(GC_data_df["Subject_ID"]))
Group_status[np.array([i in cases for i in GC_data_df["Subject_ID"]])] = "PTSD"
# Add to dataframe
GC_data_df.insert(3, "Group_status", Group_status)
# Remove all nan (including diagonal and upper-matrix entries)
GC_data_df = GC_data_df.iloc[np.invert(np.isnan(GC_data_df["Value"].to_numpy()))]
# Swap ch values for GC_direction chy -> chx (so it is always chx -> chy)
tempchy = GC_data_df[GC_data_df["GC_direction"] == "chy -> chx"]["chy"] # save chy
GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chy"] =\
GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] # overwrite old chy
GC_data_df.loc[GC_data_df["GC_direction"] == "chy -> chx","chx"] = tempchy # overwrite chx
# Drop the GC_direction column
GC_data_df = GC_data_df.drop("GC_direction", axis=1)
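# ---------------------------------------------------------------------------
# Minimal illustration of the conditional column swap above (illustrative, not
# part of the original pipeline). A toy dataframe with a "chy -> chx" row shows
# that after the swap the pair always reads as chx -> chy (source -> target).
_toy_gc = pd.DataFrame({"GC_direction": ["chy -> chx", "chx -> chy"],
                        "chx": ["A", "B"], "chy": ["C", "D"]})
_toy_tmp = _toy_gc.loc[_toy_gc["GC_direction"] == "chy -> chx", "chy"]
_toy_gc.loc[_toy_gc["GC_direction"] == "chy -> chx", "chy"] = \
    _toy_gc.loc[_toy_gc["GC_direction"] == "chy -> chx", "chx"]
_toy_gc.loc[_toy_gc["GC_direction"] == "chy -> chx", "chx"] = _toy_tmp
# Row 0 becomes chx = "C", chy = "A", i.e. C -> A; row 1 is unchanged
# ---------------------------------------------------------------------------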
# Save df
GC_data_df.to_pickle(os.path.join(Feature_savepath,"GC_data_source_drop_interpol_df.pkl"))
# Testing if df was formatted correctly
expected_GC_values = n_subjects*n_eye_status*n_ch_combinations*n_freq_bands*2 # 2 because it is bidirectional
assert GC_data_df.shape[0] == expected_GC_values
# Testing a random GC value
random_connection = np.random.randint(0,GC_data_df.shape[0])
test_connection = GC_data_df.iloc[random_connection,:]
i = np.where(Subject_id==test_connection["Subject_ID"])[0]
e = np.where(np.array(eye_status)==test_connection["Eye_status"])[0]
chx = np.where(np.array(ch_names)==test_connection["chx"])[0]
chy = np.where(np.array(ch_names)==test_connection["chy"])[0]
f = np.where(np.array(freq_bands_name)==test_connection["Freq_band"])[0]
value = test_connection["Value"]
if chx < chy: # the GC array is only lower diagonal to save memory
assert GC_data[0,i,e,chy,chx,f] == value
if chx > chy:
assert GC_data[1,i,e,chx,chy,f] == value