From 5ab7a51f86c2c75e6cedbecc8101598847014c3b Mon Sep 17 00:00:00 2001 From: Monica J Emerson <monj@dtu.dk> Date: Wed, 29 Sep 2021 15:47:42 +0200 Subject: [PATCH] added code for sharing --- code/image_preprocessing.py | 156 ++++ code/local_features.py | 117 +++ ...feature_patch_allPatients_singleProtein.py | 645 ++++++++++++++++ ..._feature_patch_allPatients_threeProtein.py | 689 ++++++++++++++++++ ...ng_feature_patch_allPatients_twoProtein.py | 551 ++++++++++++++ code/lung_lif_to_png.py | 51 ++ code/microscopy_analysis.py | 624 ++++++++++++++++ 7 files changed, 2833 insertions(+) create mode 100755 code/image_preprocessing.py create mode 100755 code/local_features.py create mode 100755 code/lung_feature_patch_allPatients_singleProtein.py create mode 100755 code/lung_feature_patch_allPatients_threeProtein.py create mode 100755 code/lung_feature_patch_allPatients_twoProtein.py create mode 100644 code/lung_lif_to_png.py create mode 100755 code/microscopy_analysis.py diff --git a/code/image_preprocessing.py b/code/image_preprocessing.py new file mode 100755 index 0000000..7c50445 --- /dev/null +++ b/code/image_preprocessing.py @@ -0,0 +1,156 @@ +#%% #!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Preprocessing of the images + +1. Compute maximum projections. +2. Explore the intensities of the maximum projection images. +3. Correct the intensities. Normalisation across channels and images, +to equalise the standard deviation for intensity values > th (just considering +the pixels that contain protein, and not those represinting air, so as to avoid +enhancing the background noise) + +Created on Wed Oct 7 10:34:52 2020 + +@author: monj +""" +import os +import microscopy_analysis as ma +import matplotlib.pyplot as plt +from datetime import datetime +import numpy as np +import skimage.io as io +import copy + +# Turn interactive plotting off +plt.ioff() + +plt.close('all') + +start_time_global = datetime.now() + +#%% I/O directories + +diseases = ['emphysema','uip','sarcoidosis','covid'] #I don't include sarcoidosis and uip, I manually create the group later, just so those don't count twice for the std estimation +conditions = diseases[:] +conditions.insert(0,'control') + +# Input directories - images are stored in folders starting with the name 'frame' +base_dir = '../input_data/' +dir_in = base_dir + '/raw/' +base_name = 'frame' + +# Output directories: max projections and postprocessed +dir_maxProj_raw = base_dir + 'maximum_projections/' +ma.make_output_dirs(dir_maxProj_raw,conditions) + +dir_postprocessed = base_dir + 'postprocessed_maxProjs/' + +dir_maxProj_processed = dir_postprocessed + 'rgb/' +ma.make_output_dirs(dir_maxProj_processed,conditions) + +dir_maxProj_processed_cmy = dir_postprocessed + 'cmy/' +ma.make_output_dirs(dir_maxProj_processed_cmy,conditions) + +dir_results_intinspect = dir_postprocessed + 'intensity_inspection/' +ma.make_output_dirs(dir_results_intinspect,conditions) + +#%% Get maximum projection images + +# Automatically decide whether to recompute or read the max. projections images +read_flag = True +for disease in diseases: + disease_flag = len(os.listdir(dir_maxProj_raw + disease + '/'))>2 + #print(disease) + #print(disease_flag) + read_flag = disease_flag&read_flag + + +# Get maximum projection images +if not(read_flag): + # Compute maximum projection images (runtime ~= 135 s) + print('Computing maximum projection images') + startTime = datetime.now() + max_img_list = [] + for condition in conditions: + max_img_list += [ma.comp_max_imgs(dir_in + condition, base_name, dir_maxProj_raw + condition)] + endTime = datetime.now() - startTime + print(endTime) +else: + # Read maximum projection images (runtime ~= 20 s ) + print('Reading maximum projection images') + startTime = datetime.now() + max_img_list = [] + for condition in conditions: + max_img_list += [ma.read_max_imgs(dir_maxProj_raw + condition, base_name)] + endTime = datetime.now() - startTime + print(endTime) + +#%% Plot histograms of raw images (approx. 29s per condition) +for condition, max_img_list_condition in zip(conditions, max_img_list): + ma.plotHist_perChannel_imgset_list(max_img_list_condition, dir_in + condition, dir_results_intinspect + condition, name_tag = 'original') + +#%% Measure the spread of intensities in each channel +"""The spread of intensities is quantified by the standard deviation of the +intensities that are over a threshold. The threshold value is set manually +and it defines the minimum intensity associated with the presence of protein. +Image regions with intensity values under the threshold are considered to +image air, i.e. absence of protein.""" + +# Protein presence/absence (air) intensity threshold +th_int = 15 #could it be set automatically based on the data? + +# Compute stds per image channel and, for each sample, plot intensity histograms per channel. +std_list = [] +for condition, max_img_list_condition in zip(conditions, max_img_list): + std_list += [ma.comp_std_perChannel(max_img_list_condition, th_int, dir_in + condition)] + +# Histogram of standard deviations (std) per image channel +fig, axs = plt.subplots(len(conditions),4, figsize = (4*2,len(conditions)*2), sharex = True, sharey = True) +plt.suptitle('Distribution of intensity spread across images\n Rows: ' + ', '.join(conditions)) +for ind_c,std_list_condition in enumerate(std_list): + for ind,channel_list in enumerate(std_list_condition): + axs[0][ind+1].title.set_text('Channel ' + str(ind+1)) + axs[ind_c][ind+1].hist(channel_list, bins = 50, density = True) + +# Histogram of standard deviations per image +axs[0][0].title.set_text('All channels') +std_list_flat = [] +for ind_c,std_list_condition in enumerate(std_list): + std_list_condition_flat = ma.flatten_list(std_list_condition) + axs[ind_c][0].hist(std_list_condition_flat, bins = 50, density = True) + std_list_flat += std_list_condition_flat +plt.show() + +# Average intesity-spread (std) across all image channels +mean_std = sum(std_list_flat)/len(std_list_flat) +print('The average spread of an image channel is ' + str(round(mean_std))) + +#%% Correct intensities (34s per condition) +max_img_corr_list = [] +for condition, max_img_list_condition in zip(conditions, max_img_list): + max_img_corr_list += [ma.intensity_spread_normalisation(copy.deepcopy(max_img_list_condition), th_int, mean_std, dir_in + condition, base_name, dir_maxProj_processed + condition)] + +#%% Plot histograms of corrected images (29s per condition) +for condition, max_img_corr_list_condition in zip(conditions, max_img_corr_list): + ma.plotHist_perChannel_imgset_list(max_img_corr_list_condition, dir_in + condition, dir_results_intinspect + condition, name_tag = 'corrected') + +#%% Visualise raw vs. corrected images +for max_img_list_condition, max_img_corr_list_condition, condition in zip(max_img_list, max_img_corr_list, conditions): + ma.compare_imgpairs_list(max_img_list_condition, max_img_corr_list_condition , dir_in + condition, dir_results_intinspect + condition) + + #%% Exchange channels: fibrillar collagen with elastin +for max_img_list_condition in max_img_corr_list: + print(len(max_img_list_condition)) + for max_img_list_sample in max_img_list_condition: + print(len(max_img_list_sample)) + for nr_frame,max_img in enumerate(max_img_list_sample): + max_img_list_sample[nr_frame]= max_img[:,:,[0,2,1]] + +#%% Save images in cmy colourspace +for max_img_corr_list_condition, condition in zip(max_img_corr_list, conditions): + ma.rgb2cmy_list(max_img_corr_list_condition, dir_in + condition, base_name, dir_maxProj_processed_cmy + condition) + +#%%% Print total computational time +end_time_global = datetime.now() - startTime +print(end_time_global) \ No newline at end of file diff --git a/code/local_features.py b/code/local_features.py new file mode 100755 index 0000000..11b5b3b --- /dev/null +++ b/code/local_features.py @@ -0,0 +1,117 @@ +""" +Helping functions for extracting features from images +""" + +import skimage.io +import numpy as np +import matplotlib.pyplot as plt +import scipy.ndimage + +def get_gauss_feat_im(im, sigma=1, normalize=False): + """Gauss derivative feaures for every image pixel. + Arguments: + image: a 2D image, shape (r,c). + sigma: standard deviation for Gaussian derivatives. + normalize: flag indicating normalization of features. + Returns: + imfeat: a 3D array of size (r,c,15) with a 15-dimentional feature + vector for every image pixel. + Author: vand@dtu.dk, 2020 + """ + + r,c = im.shape + imfeat = np.zeros((r,c,15)) + imfeat[:,:,0] = scipy.ndimage.gaussian_filter(im,sigma,order=0) + imfeat[:,:,1] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,1]) + imfeat[:,:,2] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,0]) + imfeat[:,:,3] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,2]) + imfeat[:,:,4] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,1]) + imfeat[:,:,5] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,0]) + imfeat[:,:,6] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,3]) + imfeat[:,:,7] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,2]) + imfeat[:,:,8] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,1]) + imfeat[:,:,9] = scipy.ndimage.gaussian_filter(im,sigma,order=[3,0]) + imfeat[:,:,10] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,4]) + imfeat[:,:,11] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,3]) + imfeat[:,:,12] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,2]) + imfeat[:,:,13] = scipy.ndimage.gaussian_filter(im,sigma,order=[3,1]) + imfeat[:,:,14] = scipy.ndimage.gaussian_filter(im,sigma,order=[4,0]) + + if normalize: + imfeat -= np.mean(imfeat, axis=(0,1)) + imfeat *= (1/np.std(imfeat, axis=(0,1))) + + return imfeat + +def im2col(im, patch_size=[3,3], stepsize=1): + """Rearrange image patches into columns + Arguments: + image: a 2D image, shape (r,c). + patch size: size of extracted paches. + stepsize: patch step size. + Returns: + patches: a 2D array which in every column has a patch associated + with one image pixel. For stepsize 1, number of returned column + is (r-patch_size[0]+1)*(c-patch_size[0]+1) due to bounary. The + length of columns is pathc_size[0]*patch_size[1]. + """ + + r,c = im.shape + s0, s1 = im.strides + nrows =r-patch_size[0]+1 + ncols = c-patch_size[1]+1 + shp = patch_size[0],patch_size[1],nrows,ncols + strd = s0,s1,s0,s1 + + out_view = np.lib.stride_tricks.as_strided(im, shape=shp, strides=strd) + return out_view.reshape(patch_size[0]*patch_size[1],-1)[:,::stepsize] + + +def ndim2col(im, block_size=[3,3], stepsize=1): + """Rearrange image blocks into columns for N-D image (e.g. RGB image)""""" + if(im.ndim == 2): + return im2col(im, block_size, stepsize) + else: + r,c,l = im.shape + patches = np.zeros((l*block_size[0]*block_size[1], + (r-block_size[0]+1)*(c-block_size[1]+1))) + for i in range(l): + patches[i*block_size[0]*block_size[1]:(i+1)*block_size[0]*block_size[1], + :] = im2col(im[:,:,i],block_size,stepsize) + return patches + +#%% + +if __name__ == '__main__': + + filename = '../example_data/training_image.png' + I = skimage.io.imread(filename) + I = I.astype(np.float) + + + # features based on gaussian derivatives + sigma = 1; + gf = get_gauss_feat_im(I, sigma) + + fig,ax = plt.subplots(3,5) + for j in range(5): + for i in range(3): + ax[i][j].imshow(gf[:,:,5*i+j]) + ax[i][j].set_title(f'layer {5*i+j}') + + + # features based on image patches + I = I[I.shape[0]//2:I.shape[0]//2+20,I.shape[1]//2:I.shape[1]//2+20] # smaller image such that we can see the difference in layers + pf = im2col(I,[3,3]) + pf = pf.reshape((9,I.shape[0]-2,I.shape[1]-2)) + + fig, ax = plt.subplots() + ax.imshow(I) + + fig,ax = plt.subplots(3,3) + for j in range(3): + for i in range(3): + ax[i][j].imshow(pf[3*i+j]) + ax[i][j].set_title(f'layer {3*i+j}') + + diff --git a/code/lung_feature_patch_allPatients_singleProtein.py b/code/lung_feature_patch_allPatients_singleProtein.py new file mode 100755 index 0000000..a43234d --- /dev/null +++ b/code/lung_feature_patch_allPatients_singleProtein.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Explorative analysis of protein composition and structure in lung ECM to +study disease-induced remodeling of the three proteins that provide structural +integrity to the lung: collagen IV, elastin and fibrillar collagen. + +The single-protein analyses provide the following: + - A 'Data Model' per protein, a dictionary of image patches representative + of the local protein structure found in the data (both + healthy and disease group) at the scale defined by the patch size. + - A 'Sorted Data Model' or disease signature per protein, a sorted version + of the dictionary above, based on how often a dictionary element (called + cluster centre in the publication) is found in the healthy vs. the disease. + - A 'Probability Map' for every image based on each protein, with pixel + correspondance to the input images (maximum projections), highlighting + regions characteristic of health or disease. + - An 'Agreement Map' for every image, which combines the corresponding + probability maps from the three proteins to visualise when structural + remodelling of proteins coexists in space. + - A 'Disease Effect' per protein, to quantify how well the control and + disease groups are separated after assigning a health probability score to + each image, computed as the average of its probability map values. + +by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk + +For more information, read our article (add link and citation)""" + +#%% Load packages and initialise parameters + +import numpy as np +import matplotlib.pyplot as plt +import sklearn.cluster +import microscopy_analysis as ma +import skimage.io +import skimage.transform + +import os +from datetime import datetime +import sys +from matplotlib.offsetbox import OffsetImage, AnnotationBbox +from math import ceil + +startTime = datetime.now() + +plt.close('all') + +sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image +patch_size = 17 +nr_clusters = 100 # number of clusters +protein_mode = 'single' +colour_mode = 'cmy' #for image visualisation +th_back = 15 +nr_runs = 10 + +#%% Define and create directories + +disease = 'emphysema' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid' +control = 'control' +conditions = [disease, control] + +# input directories - images start with the name 'frame' +base_dir_in = '../input_data/' +#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation +dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy +base_name = 'frame' + +#output directory +dir_results = '../results/' #rerun just to make sure not to muck up the results +os.makedirs(dir_results, exist_ok = True) +os.makedirs(dir_results + '/statistics', exist_ok = True) + + #%% Read (preprocessed) maximum corrected images +#Read maximum projection images (runtime ~= 20 s ) +print('Reading maximum projection images') + +max_img_list_control = ma.read_max_imgs(dir_in + control, base_name) +max_img_list_disease = ma.read_max_imgs(dir_in + disease, base_name) + +#%% Exchange channels: fibrillar collagen with elastin +for max_img_sample_control in max_img_list_control: + for nr_frame, max_img_control in enumerate(max_img_sample_control): + max_img_sample_control[nr_frame] = max_img_control[:,:,[0,2,1]] + +for max_img_sample_disease in max_img_list_disease: + for nr_frame, max_img_disease in enumerate(max_img_sample_disease): + max_img_sample_disease[nr_frame] = max_img_disease[:,:,[0,2,1]] + +#%% Prepare maximum projection images for feature extraction + +#Flatten and merge conditions +max_img_list_control_flat = ma.flatten_list(max_img_list_control) +max_img_list_disease_flat = ma.flatten_list(max_img_list_disease) + +#Rescale +max_img_list_control_processed = [] +for max_img in ma.flatten_list(max_img_list_control): + max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)] + +max_img_list_disease_processed = [] +for max_img in ma.flatten_list(max_img_list_disease) : + max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')] + +#%% Compute patches +patch_feat_list_control = [[],[],[]] +for max_img in max_img_list_control_processed: + for channel in range(max_img.shape[2]): + patch_feat_list_control[channel] += [ma.ndim2col_pad(max_img[...,channel], (patch_size, patch_size),norm=False).transpose()] + +patch_feat_list_disease = [[],[],[]] +for max_img in max_img_list_disease_processed: + for channel in range(max_img.shape[2]): + patch_feat_list_disease[channel] += [ma.ndim2col_pad(max_img[...,channel], (patch_size, patch_size),norm=False).transpose()] + +for run in range(nr_runs): + #directories for results from the different runs + dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size) + dir_probs += 'run'+str(run)+'/' + ma.make_output_dirs(dir_probs, conditions) + + #%% Run the whole analysis for the 3 channels independently + prob_img_list_control_allchannels = [[],[],[]] + prob_img_list_disease_allchannels = [[],[],[]] + + for channel in range(len(patch_feat_list_control)): + + #k-means clustering + startTime_clustering = datetime.now() + + batch_size = 10000 + th_nr_pathesINcluster = 20 + th_nr_weakClusters = 0 #added for covid and ild + + #If cluster centers were already computed, use those + if os.path.exists(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy'): + cluster_centers = np.load(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy') + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size) + kmeans.cluster_centers_ = cluster_centers + + #Compute cluster centers + else: + #Get features to cluster + patch_feat_total = patch_feat_list_control[channel] + patch_feat_list_disease[channel] + + #features for clustering + nr_keep = 10000 # number of features/image randomly picked for clustering + n_im = len(patch_feat_total) + feat_dim = patch_feat_total[0].shape[1] + patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim)) + + f = 0 + nr_keep_control = nr_keep//(2*len(patch_feat_list_control[channel])//len(patch_feat_list_disease[channel])) + nr_keep_disease = nr_keep//2 + for patch_feat in patch_feat_list_control[channel]: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control] + patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:] + f += nr_keep + + for patch_feat in patch_feat_list_disease[channel]: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease] + patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:] + f += nr_keep + + #Get feature vectors + feat_to_cluster = patch_feat_to_cluster + + #Compute cluster centers and save if there is no weak clusters + nr_weakClusters = th_nr_weakClusters+1 + repetition = 0 + nr_weakClusters_list = [] + while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip + kmeans.fit(feat_to_cluster) + + #Nr. patches that have contributed to each cluster + nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)] + + weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster] + print('Weak cluster IDs channel '+ str(channel) + ': ',weak_cluster_IDs) + + #Weak clusters are those composed by very few patches + nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster]) + + #If there are weak clusters, the clustering should be recomputed + if nr_weakClusters>th_nr_weakClusters: + print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") + else: + np.save(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy', kmeans.cluster_centers_) # .npy extension is added if not given + + print(repetition) + repetition += 1 + nr_weakClusters_list += [nr_weakClusters] + + + endTime_clustering = datetime.now() - startTime_clustering + print(endTime_clustering) + + #%% Plot all cluster centers (grid view) + cluster_centers = kmeans.cluster_centers_ + int_sum_list= ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, protein_mode = protein_mode, colour_mode = colour_mode, channel = channel) + plt.savefig(dir_probs + 'clustercenters_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Histograms for background, healthy and disease + hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control[channel], kmeans) + hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease[channel], kmeans) + + #Show histograms (bar plot) of healthy and disease + fig, ax = plt.subplots(1,1) + ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b') + ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r') + ax.legend() + plt.savefig(dir_probs + 'assignmentHistograms_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Compute populated, occurrence and importance cluster measures from the histogram + #grid dimensions + + #OCCURRENCE RATIO: How more often does it occur in one of the conditions + occurrence_ratio = np.empty((nr_clusters)) + ind_control = hist_control>hist_disease + ind_disease = ~ind_control + control_cond = (hist_disease!=0) & ind_control + occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond] + disease_cond = (hist_control!=0) & ind_disease + occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond] + + #POPULATED: How trustworthy is the cluster? + populated = hist_control+hist_disease + weights = populated/populated.max() + + #Cluster IMPORTANCE: occurrence weighed by POPULATED + def sigmoid(x,k): + return 2 / (1 + np.exp(-k*x)) - 1 + + importance = occurrence_ratio * sigmoid(weights,100) + + #%% Plot cluster centers in the 2d populated/occurrence space + fig_control, ax_control = plt.subplots(figsize=(15,15)) + ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) + ax_control.set_ylim(0.8,7) + ax_control.set_xlim(0,0.08) + + fig_disease, ax_disease = plt.subplots(figsize=(15,15)) + ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) + ax_disease.set_ylim(0.8,7) + ax_disease.set_xlim(0,0.08) + + for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)): + cluster_centre = np.reshape(kmeans.cluster_centers_[cluster_nr,:],(patch_size,patch_size)) + if y0>0: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype('uint8'),cmap='gray'), (x0, y0), frameon=False) + ax_control.add_artist(ab) + else: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype('uint8'),cmap='gray'), (x0, -y0), frameon=False) + ax_disease.add_artist(ab) + plt.figure(fig_control.number) + plt.savefig(dir_probs + 'controlClustercenters_2Dspace_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + plt.figure(fig_disease.number) + plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Plot all clusters sorted by IMPORTANCE + clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0] + clusters_control_occW_sorted.reverse() + + clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0] + + clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())] + clusters_occW_sorted.reverse() + + occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())] + occurrence_ratio_sorted.reverse() + + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = colour_mode, channel = channel) #contrast = 'equal', + plt.suptitle('Cluster centers channel '+str(channel)+' \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occW_sorted_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + + #%% Compute probabilities + #per cluster + prob_control= hist_control/(hist_control+hist_disease) + prob_disease = hist_disease/(hist_control+hist_disease) + + #per image region + r,c,_ = max_img_list_control_flat[0].shape + img_shape = (int(r*sc_fac),int(c*sc_fac)) + prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape) + prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) + prob_img_list_control_allchannels[channel] = prob_img_list_control + prob_img_list_disease_allchannels[channel] = prob_img_list_disease + + #per image and sample + #control + sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, ) + + nr_list = 0 + prob_list_control= [] + for max_img_list_sample,sample_name in zip(max_img_list_control, sample_names_control): #in general, in max_img_list_condition + #print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + for prob_img, frame_nr in zip(prob_img_list_sample,range(nr_frames)): + #compute accumulated probabilities + prob_frame = sum(sum(prob_img))/prob_img.size + #print(prob_frame) + prob_sample[frame_nr] = prob_frame + prob_list_control += [prob_sample] + + #disease + sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True) + + nr_list = 0 + prob_list_disease = [] + for max_img_list_sample,sample_name in zip(max_img_list_disease, sample_names_disease): #in general, in max_img_list_condition + #print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + for prob_img, frame_nr in zip(prob_img_list_sample,range(nr_frames)): + #compute accumulated probabilities + prob_frame = sum(sum(prob_img))/prob_img.size + #print(prob_frame) + prob_sample[frame_nr] = prob_frame + prob_list_disease += [prob_sample] + + prob_list_samples = prob_list_control + prob_list_disease + + + #%% Box plots + fig, ax = plt.subplots(figsize = (15,5)) + ax.set_title('Image health scores') + + sample_sep = 0.85 + patient_sep = 2.35 + condition_sep = 2.5 + positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2) + positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2) + array_probs = np.squeeze(np.asarray(prob_list_samples)) + bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True) + bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True) + + for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_control[element], color='blue') + plt.setp(bp_control[element], linewidth = 2) + + for patch in bp_control['boxes']: + patch.set(facecolor='white') + patch.set(linewidth = 2) + + for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_disease[element], color='red') + plt.setp(bp_disease[element], linewidth = 2) + + for patch in bp_disease['boxes']: + patch.set(facecolor= 'white') + patch.set(linewidth = 2) + #patch.set(alpha=0.2) + + ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax.set_ylim(0.3,0.7) + ax.set_ylabel('Probability health') + #ax.set_ylim(0,1) + + ax.tick_params( + axis='both', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + #labelleft = False, # ticks along the top edge are off + #labelbottom=False + ) + + #plot points + positions = list(positions_control) + list(positions_disease) + for prob_sample, position in zip(prob_list_samples, positions): + print(prob_sample) + print(5*[position]) + # Add some random "jitter" to the x-axis + x = np.random.normal(5*[position], 0) + #plot(x, y, 'r.', alpha=0.2) + ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + plt.savefig(dir_probs + 'scatterBoxplots_channel'+str(channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + fig2, ax2 = plt.subplots(figsize = (15,5)) + #plot points + for prob_sample, position in zip(prob_list_control, positions_control): + ax2.plot(5*[position], prob_sample,'b.', markersize = 4) + + for prob_sample, position in zip(prob_list_disease, positions_disease): + ax2.plot(5*[position], prob_sample,'r.', markersize = 4) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + ax2.set_xticks(ax.get_xticks()) + ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax2.set_ylim(0.3,0.7) + ax2.set_ylabel('Probability health') + + plt.savefig(dir_probs + 'samplewiseScatterPlots_channel'+str(channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Create agreement maps + prob_img_rgb_list_control = [] + for prob_img_c0, prob_img_c1, prob_img_c2 in zip(prob_img_list_control_allchannels[0],prob_img_list_control_allchannels[1],prob_img_list_control_allchannels[2]): + prob_img_rgb = np.stack((prob_img_c0,prob_img_c1, prob_img_c2), axis = 2) + prob_img_rgb_list_control += [prob_img_rgb] + + prob_img_rgb_list_disease = [] + for prob_img_c0, prob_img_c1, prob_img_c2 in zip(prob_img_list_disease_allchannels[0],prob_img_list_disease_allchannels[1],prob_img_list_disease_allchannels[2]): + prob_img_rgb = np.stack((prob_img_c0,prob_img_c1, prob_img_c2), axis = 2) + prob_img_rgb_list_disease += [prob_img_rgb] + + #%% Plot per channel prob images, and agreement maps + nr_frames = len(max_img_list_control[0]) + + min_prob_img = [] + max_prob_img = [] + + #control + sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True) + nr_list = 0 + fig_amaps, ax_amaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True) + fig_maximgs, ax_maximgs = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name, sample_nr in zip(max_img_list_control, sample_names_control,range(len(sample_names_control))): #in general, in max_img_list_condition + print(sample_name) + prob_img_list_sample = prob_img_rgb_list_control[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + fig, axs = plt.subplots(5,nr_frames, figsize = (nr_frames*2,5*2), sharex=True, sharey=True) + for max_img,prob_img, frame_nr in zip(max_img_list_sample,prob_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8') + + #set probs as titles + for channel in range(prob_img.shape[2]): + prob_frame = sum(sum(prob_img[...,channel]))/prob_img[...,channel].size + axs[channel+2][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + + #remove background + ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back) + prob_img[ind_back] = 0.5 + + #plot + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + axs[2][frame_nr].imshow(1-prob_img[...,0],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[2][frame_nr].xaxis.set_ticklabels([]) + axs[2][frame_nr].xaxis.set_ticks_position('none') + axs[2][frame_nr].yaxis.set_ticklabels([]) + axs[2][frame_nr].yaxis.set_ticks_position('none') + + axs[3][frame_nr].imshow(1-prob_img[...,1],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[3][frame_nr].xaxis.set_ticklabels([]) + axs[3][frame_nr].xaxis.set_ticks_position('none') + axs[3][frame_nr].yaxis.set_ticklabels([]) + axs[3][frame_nr].yaxis.set_ticks_position('none') + + axs[4][frame_nr].imshow(1-prob_img[...,2],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[4][frame_nr].xaxis.set_ticklabels([]) + axs[4][frame_nr].xaxis.set_ticks_position('none') + axs[4][frame_nr].yaxis.set_ticklabels([]) + axs[4][frame_nr].yaxis.set_ticks_position('none') + + #normalise range of agreement map so that global max and min values are 0 and 1 + prob_img_norm = (prob_img-0.2)/0.6 + min_prob_img += [np.min(prob_img)] + max_prob_img += [np.max(prob_img)] + + axs[1][frame_nr].imshow(prob_img_norm) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + ax_amaps[frame_nr][sample_nr].imshow(prob_img_norm) + ax_amaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_amaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_amaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_amaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + ax_maximgs[frame_nr][sample_nr].imshow(max_img_ds) + ax_maximgs[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_maximgs[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_maximgs[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_maximgs[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + plt.suptitle('Per-channel health probabilities for sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.figure(fig.number),plt.savefig(dir_probs + '/' + control + '/' + 'channelWise_probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=1000) + #plt.show() + plt.figure(fig_amaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'agreementMaps_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_maximgs.number),plt.savefig(dir_probs + '/' + control + '/' + 'maxIMGs_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + #disease + sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True) + + nr_list = 0 + fig_amaps, ax_amaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True) + fig_maximgs, ax_maximgs = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name, sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general, in max_img_list_condition + print(sample_name) + prob_img_list_sample = prob_img_rgb_list_disease[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + fig, axs = plt.subplots(5,nr_frames, figsize = (nr_frames*2,5*2), sharex=True, sharey=True) + for max_img, prob_img, frame_nr in zip(max_img_list_sample, prob_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8') + + #set probs as titles + for channel in range(prob_img.shape[2]): + prob_frame = sum(sum(prob_img[...,channel]))/prob_img[...,channel].size + axs[channel+2][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + + #remove background + ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back) + prob_img[ind_back] = 0.5 + + #plot + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + axs[2][frame_nr].imshow(1-prob_img[...,0],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[2][frame_nr].xaxis.set_ticklabels([]) + axs[2][frame_nr].xaxis.set_ticks_position('none') + axs[2][frame_nr].yaxis.set_ticklabels([]) + axs[2][frame_nr].yaxis.set_ticks_position('none') + + axs[3][frame_nr].imshow(1-prob_img[...,1],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[3][frame_nr].xaxis.set_ticklabels([]) + axs[3][frame_nr].xaxis.set_ticks_position('none') + axs[3][frame_nr].yaxis.set_ticklabels([]) + axs[3][frame_nr].yaxis.set_ticks_position('none') + + axs[4][frame_nr].imshow(1-prob_img[...,2],cmap = plt.cm.bwr, vmin = 0, vmax = 1) + axs[4][frame_nr].xaxis.set_ticklabels([]) + axs[4][frame_nr].xaxis.set_ticks_position('none') + axs[4][frame_nr].yaxis.set_ticklabels([]) + axs[4][frame_nr].yaxis.set_ticks_position('none') + + #normalise range of agreement map so that global max and min values are 0 and 1 + prob_img_norm = (prob_img-0.2)/0.6 + min_prob_img += [np.min(prob_img)] + max_prob_img += [np.max(prob_img)] + + axs[1][frame_nr].imshow(prob_img_norm) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + ax_amaps[frame_nr][sample_nr].imshow(prob_img_norm) + ax_amaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_amaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_amaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_amaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + ax_maximgs[frame_nr][sample_nr].imshow(max_img_ds) + ax_maximgs[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_maximgs[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_maximgs[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_maximgs[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + plt.suptitle('Per-channel health probabilities for sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.figure(fig.number), plt.savefig(dir_probs + '/' + disease + '/' + 'channelWise_probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=1000) + #plt.show() + plt.figure(fig_amaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'agreementMaps_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_maximgs.number),plt.savefig(dir_probs + '/' + disease + '/' + 'maxIMGs_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + print('Agreement map minimum ',sum(min_prob_img)/len(min_prob_img)) + print('Agreement map maximum ',sum(max_prob_img)/len(max_prob_img)) + + #%% Load packages to Testt the statistical significance of the mean differences + from statsmodels.multivariate.manova import MANOVA + from statsmodels.formula.api import ols + from statsmodels.stats.anova import anova_lm + import pandas as pd + from patsy import ModelDesc + + nr_frames_patient = 10 + + #%%Image probabilities + img_probs_control = [] + for prob_img_list in prob_img_list_control_allchannels: + img_probs_control += [[np.mean(prob_img) for prob_img in prob_img_list]] + + img_probs_disease = [] + for prob_img_list in prob_img_list_disease_allchannels: + img_probs_disease += [np.array([np.mean(prob_img) for prob_img in prob_img_list])] + + img_probs = [np.concatenate((probs_control,probs_disease)) for probs_control,probs_disease in zip(img_probs_control,img_probs_disease)] + + #%%ANOVA SETUP + + img_nr = np.tile(np.arange(nr_frames),img_probs[0].size//nr_frames) + sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),img_probs[0].size//nr_frames_patient) + patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)]) + condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))]) + + for channel in range(len(img_probs)): + dep_var = (img_probs[channel]).reshape(img_probs[0].size,1) + + print(np.mean(dep_var)) + print(np.mean(dep_var[:len(dep_var)//2])) + print(np.mean(dep_var[len(dep_var)//2:])) + + data_dict = {'Image':img_nr, + 'Patient':patient_nr, + 'Sample':sample_nr, + 'Condition':condition_nr, + 'S':dep_var} + + formula = 'S ~ C(Condition)/C(Patient)/C(Sample)' + + desc = ModelDesc.from_formula(formula) + print(desc.describe()) + + lm = ols(formula, data_dict).fit() + print(lm.summary()) + + anova_table = anova_lm(lm) + print(anova_table) + print(disease + ' on channel ' + str(channel)) + writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_' + protein_mode + '_channel' + str(channel) + '_run' + str(run) + '.xlsx', engine='xlsxwriter') + anova_table.to_excel(writer_anova,sheet_name = disease + ' on channel ' + str(channel)) + writer_anova.save() + \ No newline at end of file diff --git a/code/lung_feature_patch_allPatients_threeProtein.py b/code/lung_feature_patch_allPatients_threeProtein.py new file mode 100755 index 0000000..0e7ae7a --- /dev/null +++ b/code/lung_feature_patch_allPatients_threeProtein.py @@ -0,0 +1,689 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Explorative analysis of protein composition and structure in lung ECM to +study disease-induced remodeling of the three proteins that provide structural +integrity to the lung: collagen IV, elastin and fibrillar collagen. + +The three-protein analysis provides the following: + - A 'Data Model', a dictionary of image patches representative of the local + protein structure and composition found in the data (both healthy and + disease group) at the scale defined by the patch size. + - A 'Sorted Data Model' or disease signature, a sorted version of the + dictionary above, based on how often a dictionary element (called + cluster centre in the publication) is found in the healthy vs. the disease. + - A 'Probability Map' for every image based on the three proteins, with + pixel correspondance to the input images (maximum projections), + highlighting regions characteristic of health or disease. + - A 'Disease Effect', to quantify how well the control and + disease groups are separated after assigning a health probability score to + each image, computed as the average of its probability map values. + +by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk + +For more information, read our article (add link and citation)""" + +#%% Load packages and initialise parameters + +import numpy as np +import matplotlib.pyplot as plt +import sklearn.cluster +import microscopy_analysis as ma +import skimage.io +import skimage.transform + +import os +from datetime import datetime +import sys +from matplotlib.offsetbox import OffsetImage, AnnotationBbox +from math import ceil + +startTime = datetime.now() + +plt.close('all') + +sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image +patch_size = 17 +nr_clusters = 100 #49#100 # number of clusters, choose a value with an integer as a square root +protein_mode = 'triple' +colour_mode = 'cmy' #'bnw' #colour +th_back = 15 +nr_runs = 10 #5 + +#%% Define and create directories + +disease = 'covid' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid' +control = 'control' +conditions = [disease, control] + +# input directories - images start with the name 'frame' +base_dir_in = '../input_data/' +#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation +dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy +base_name = 'frame' + +#output directory +dir_results = '../results/' #rerun just to make sure not to muck up the results +os.makedirs(dir_results, exist_ok = True) +os.makedirs(dir_results + '/statistics', exist_ok = True) + +#%% Read (preprocessed) maximum corrected images +#Read maximum projection images (runtime ~= 20 s ) +print('Reading maximum projection images') + +max_img_list_control = ma.read_max_imgs(dir_in + control, base_name) +max_img_list_disease = ma.read_max_imgs(dir_in + disease, base_name) + +#%% Exchange channels: fibrillar collagen with elastin +for max_img_sample_control in max_img_list_control: + for nr_frame, max_img_control in enumerate(max_img_sample_control): + max_img_sample_control[nr_frame] = max_img_control[:,:,[0,2,1]] + +for max_img_sample_disease in max_img_list_disease: + for nr_frame, max_img_disease in enumerate(max_img_sample_disease): + max_img_sample_disease[nr_frame] = max_img_disease[:,:,[0,2,1]] + +#%% Prepare maximum projection images for feature extraction +#Flatten and merge conditions +max_img_list_control_flat = ma.flatten_list(max_img_list_control) +max_img_list_disease_flat = ma.flatten_list(max_img_list_disease) + +#Rescale +max_img_list_control_processed = [] +for max_img in ma.flatten_list(max_img_list_control) : + max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)] + +max_img_list_disease_processed = [] +for max_img in ma.flatten_list(max_img_list_disease) : + max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)] + +#%% Compute patches +patch_feat_list_control = [] +for max_img in max_img_list_control_processed: + patch_feat_list_control += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()] + +patch_feat_list_disease = [] +for max_img in max_img_list_disease_processed: + patch_feat_list_disease += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()] + +patch_feat_total = patch_feat_list_control + patch_feat_list_disease + +for run in range(nr_runs): + + #directories for results from the different runs + dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size) + dir_probs += 'run'+str(run)+'/' + ma.make_output_dirs(dir_probs, conditions) + dir_probs_noBack = dir_probs + 'noBackground/' + os.makedirs(dir_probs_noBack, exist_ok = True) + + #%% k-means clustering + startTime_clustering = datetime.now() + + batch_size = 10000 + th_nr_pathesINcluster = 20 + th_nr_weakClusters = 0 #added for covid and ild + + #If cluster centers were already computed, use those + if os.path.exists(dir_probs + 'array_cluster_centers_colour.npy'): + cluster_centers = np.load(dir_probs + 'array_cluster_centers_colour.npy') + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size) + kmeans.cluster_centers_ = cluster_centers + + #Compute cluster centres + else: + #features for clustering + nr_keep = 10000 # number of features/image randomly picked for clustering + n_im = len(patch_feat_total) + feat_dim = patch_feat_total[0].shape[1] + patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim),dtype = max_img.dtype) + + f = 0 + nr_keep_control = nr_keep//(2*len(patch_feat_list_control)//len(patch_feat_list_disease)) + nr_keep_disease = nr_keep//2 + for patch_feat in patch_feat_list_control: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control] + patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:] + f += nr_keep + + for patch_feat in patch_feat_list_disease: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease] + patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:] + f += nr_keep + + #Get feature vectors + feat_to_cluster = patch_feat_to_cluster + + #Compute cluster centers and save if there is no weak clusters + nr_weakClusters = th_nr_weakClusters+1 + repetition = 0 + nr_weakClusters_list = [] + while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip + kmeans.fit(feat_to_cluster) + + #Nr. patches that have contributed to each cluster + nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)] + + weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster] + print('Weak cluster IDs: ',weak_cluster_IDs) + + #Weak clusters are those composed by very few patches + nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster]) + + #If there are weak clusters, the clustering should be recomputed + if nr_weakClusters>th_nr_weakClusters: + print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") + else: + np.save(dir_probs + 'array_cluster_centers_colour.npy', kmeans.cluster_centers_) # .npy extension is added if not given + + print(repetition) + repetition += 1 + nr_weakClusters_list += [nr_weakClusters] + + + endTime_clustering = datetime.now() - startTime_clustering + print(endTime_clustering) + + #%% Plot all cluster centers (grid view) + cluster_centers = kmeans.cluster_centers_ + int_sum_list = ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, colour_mode = colour_mode) + plt.savefig(dir_probs + 'clustercenters_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Histograms for background, control and disease + hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control, kmeans) + hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease, kmeans) + + #%% Show histograms (bar plot) of control and disease + fig, ax = plt.subplots(1,1) + ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b') + ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r') + ax.legend() + plt.savefig(dir_probs + 'assignmentHistograms_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Compute populated, occurrence and importance cluster measures from the histogram + + #OCCURRENCE RATIO: How more often does it occur in one of the conditions + occurrence_ratio = np.empty((nr_clusters)) + ind_control = hist_control>hist_disease + ind_disease = ~ind_control + control_cond = (hist_disease!=0) & ind_control + occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond] + disease_cond = (hist_control!=0) & ind_disease + occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond] + + #POPULATED: How trustworthy is the cluster? + populated = hist_control+hist_disease + weights = populated/populated.max() + + #Cluster IMPORTANCE: occurrence weighed by POPULATED + def sigmoid(x,k): + return 2 / (1 + np.exp(-k*x)) - 1 + + x = np.linspace(0,1,100) + plt.plot(x,sigmoid(x,100)) + + importance = occurrence_ratio * sigmoid(weights,100) + + #%% Plot all control and disease cluster centers in the 2d populated/occurrence space + fig_control, ax_control = plt.subplots(figsize=(15,15)) + ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) + ax_control.set_ylim(0.8,7) + ax_control.set_xlim(0,0.08) + + fig_disease, ax_disease = plt.subplots(figsize=(15,15)) + ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) + ax_disease.set_ylim(0.8,7) + ax_disease.set_xlim(0,0.08) + + for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)): + cluster_centre = np.transpose((np.reshape(kmeans.cluster_centers_[cluster_nr,:],(3,patch_size,patch_size))),(1,2,0)) + cluster_centre = ma.rgb2cmy(cluster_centre) + if y0>0: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, y0), frameon=False) + ax_control.add_artist(ab) + else: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, -y0), frameon=False) + ax_disease.add_artist(ab) + + plt.figure(fig_control.number) + plt.savefig(dir_probs + 'controlClustercenters_2Dspace_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + plt.figure(fig_disease.number) + plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Plot all clusters sorted by IMPORTANCE + + #According to occurence + clusters_control_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten()) if (occurrence_ratio.flatten())[i]>0] + clusters_control_occ_sorted.reverse() + + clusters_disease_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten()) if (occurrence_ratio.flatten())[i]<0] + + clusters_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten())] + clusters_occ_sorted.reverse() + + occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(occurrence_ratio.flatten())] + occurrence_ratio_sorted.reverse() + + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occ_sorted, patch_size, colour_mode = colour_mode) + plt.suptitle('Cluster centers \n sorted by occ from control to disease with cluster ' + str(len(clusters_control_occ_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occ_sorted_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #According to occurence weighed by populated + clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0] + clusters_control_occW_sorted.reverse() + + clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0] + + clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())] + clusters_occW_sorted.reverse() + + occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())] + occurrence_ratio_sorted.reverse() + + cluster_nums = np.arange(0,nr_clusters) + cluster_nums_sorted = [cluster_nums[i] for i in np.argsort(importance.flatten())] + cluster_nums_sorted.reverse() + + prob_control = hist_control/(hist_control+hist_disease) + prob_control_sorted = [prob_control[i] for i in np.argsort(importance.flatten())] + prob_control_sorted.reverse() + + + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, colour_mode = colour_mode) + plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occW_sorted_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #with probabilities + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, colour_mode = colour_mode, titles = True, occurrence = prob_control_sorted) + plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occW_sorted_wProbs_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #with cluster numbers + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, colour_mode = colour_mode, titles = True, occurrence = cluster_nums_sorted) + plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occW_sorted_wClusternums_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Plot all clusters sorted - split channelwise view + ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 0) + plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel1_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 1) + plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel2_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 2) + plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel3_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + + #%% Find background clusters + clusters_background_intBased = [i for i in range(len(int_sum_list)) if int_sum_list[i] < th_back*patch_size**2] + + clusters_background = clusters_background_intBased + print('Background clusters'+str(clusters_background)) + + #%% Plot grid of ordered, non-background clusters that are populated over a thresh + th_populated = 0.5/nr_clusters + th_occurr = 1.25 + + #control + clusters_control_occW_sorted_noBackground = [i for i in clusters_control_occW_sorted if (i not in clusters_background)&(populated[i]>th_populated)&(occurrence_ratio[i]>th_occurr)] + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_control_occW_sorted_noBackground, patch_size, colour_mode = colour_mode) + #plt.suptitle('Characteristic clusters for control, excluding background, \n from most important to least') + plt.savefig(dir_probs_noBack + 'clustercenters_control_occW_sorted_noBack%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #disease + clusters_disease_occW_sorted_noBackground = [i for i in clusters_disease_occW_sorted if (i not in clusters_background)&(populated[i]>th_populated)&(abs(occurrence_ratio[i])>th_occurr)] + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_disease_occW_sorted_noBackground, patch_size, colour_mode = colour_mode) + #plt.suptitle('Charactertistic clusters for ' + disease + ', excluding background, \n from most important to least') + plt.savefig(dir_probs_noBack + 'clustercenters_disease_occW_sorted_noBack%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Plot channelwise split characteristic clusters + nr_clusters_display = 10 + clusters_control = clusters_control_occW_sorted_noBackground[:nr_clusters_display] + clusters_disease = clusters_disease_occW_sorted_noBackground[:nr_clusters_display] + + #control clusters + cluster_centers_control = kmeans.cluster_centers_[clusters_control] + fig, axs = plt.subplots(1,len(clusters_control), figsize=(len(clusters_control)*3,3), sharex=True, sharey=True) + fig.suptitle('Cluster centers for control') + + fig_split, axs_split = plt.subplots(3,len(clusters_control), figsize=(len(clusters_control)*3,9), sharex=True, sharey=True) + fig_split.suptitle('Cluster centers for control - split') + for l in np.arange(0,len(clusters_control)): + + cluster_centre = np.transpose((np.reshape(cluster_centers_control[l,:],(3,patch_size,patch_size))),(1,2,0)) + + #channel 1 - collagen IV + im = np.zeros(cluster_centre.shape) + im[...,0] = cluster_centre[...,0] + im = ma.rgb2cmy(im) + axs_split[0][l].imshow(im.astype(np.uint8)) + axs_split[0][l].axis('off') + axs_split[0][l].set_title(clusters_control[l]) + + #channel 2 - elastin + im = np.zeros(cluster_centre.shape) + im[...,1] = cluster_centre[...,1] + im = ma.rgb2cmy(im) + axs_split[1][l].imshow(im.astype(np.uint8)) + axs_split[1][l].axis('off') + + #channel 3 - SHG + im = np.zeros(cluster_centre.shape) + im[...,2] = cluster_centre[...,2] + im = ma.rgb2cmy(im) + axs_split[2][l].imshow(im.astype(np.uint8)) + axs_split[2][l].axis('off') + + plt.savefig(dir_probs_noBack + 'clustercenters_control_split_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #cmy merge + cluster_centre = ma.rgb2cmy(cluster_centre) + axs[l].imshow(cluster_centre.astype(np.uint8)) + axs[l].axis('off') + axs[l].set_title(clusters_control[l]) + + plt.figure(fig.number), + plt.savefig(dir_probs_noBack + 'clustercenters_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #disease clusters + cluster_centers_disease = kmeans.cluster_centers_[clusters_disease] + fig, axs = plt.subplots(1,len(clusters_disease), figsize=(len(clusters_disease)*3,3), sharex=True, sharey=True) + fig.suptitle('Cluster centers for disease') + + fig_split, axs_split = plt.subplots(3,len(clusters_disease), figsize=(len(clusters_disease)*3,9), sharex=True, sharey=True) + fig_split.suptitle('Cluster centers for disease - split') + for l in np.arange(0,len(clusters_disease)): + + cluster_centre = np.transpose((np.reshape(cluster_centers_disease[l,:],(3,patch_size,patch_size))),(1,2,0)) + + #channel 1 - collagen IV + im = np.zeros(cluster_centre.shape) + im[...,0] = cluster_centre[...,0] + im = ma.rgb2cmy(im) + axs_split[0][l].imshow(im.astype(np.uint8)) + axs_split[0][l].axis('off') + axs_split[0][l].set_title(clusters_disease[l]) + + #channel 2 - elastin + im = np.zeros(cluster_centre.shape) + im[...,1] = cluster_centre[...,1] + im = ma.rgb2cmy(im) + axs_split[1][l].imshow(im.astype(np.uint8)) + axs_split[1][l].axis('off') + + #channel 3 - SHG + im = np.zeros(cluster_centre.shape) + im[...,2] = cluster_centre[...,2] + im = ma.rgb2cmy(im) + axs_split[2][l].imshow(im.astype(np.uint8)) + axs_split[2][l].axis('off') + + plt.savefig(dir_probs_noBack + 'clustercenters_disease_split_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #cmy merge + cluster_centre = ma.rgb2cmy(cluster_centre) + axs[l].imshow(cluster_centre.astype(np.uint8)) + axs[l].axis('off') + axs[l].set_title(clusters_disease[l]) + + plt.figure(fig.number), + plt.savefig(dir_probs_noBack + 'clustercenters_disease_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Compute probability images + startTime_probs = datetime.now() + + #Assign probabilities of control and disease to clusters + prob_control= hist_control/(hist_control+hist_disease) + prob_disease = hist_disease/(hist_control+hist_disease) + + #Compute probability images + r,c,_ = max_img_list_control_flat[0].shape + img_shape = (int(r*sc_fac),int(c*sc_fac)) + prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape) + prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) + + #%% Display probability images and accumulate probs per frame and sample + nr_frames = len(max_img_list_control[0]) + + #control + sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, ) + + nr_list = 0 + prob_list_control= [] + fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_control, sample_names_control, range(len(sample_names_control))): #in general, in max_img_list_condition + print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True) + for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype) + + #compute image probability + prob_frame = sum(sum(prob_img))/prob_img.size + + #remove background from prob imgs + ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back) + prob_img[ind_back] = 0.5 + + #plot + ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + if colour_mode == 'cmy': + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + #save img probs + prob_sample[frame_nr] = prob_frame + prob_list_control += [prob_sample] + + plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.savefig(dir_probs + '/' + control + '/' + 'probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'probMaps_'+control+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + #disease + sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True) + + nr_list = 0 + prob_list_disease = [] + fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general, in max_img_list_condition + print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True) + for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype) + + #compute image probability + prob_frame = sum(sum(prob_img))/prob_img.size + + #remove background from prob imgs + ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back) + prob_img[ind_back] = 0.5 + + #plot + ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + if colour_mode == 'cmy': + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + #save img probs + prob_sample[frame_nr] = prob_frame + prob_list_disease += [prob_sample] + + plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.savefig(dir_probs + '/' + disease + '/' + 'probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'probMaps_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + prob_list_samples = prob_list_control + prob_list_disease + + #%% Box plots + fig, ax = plt.subplots(figsize = (15,5)) + ax.set_title('Image health scores ') + + sample_sep = 0.85 + patient_sep = 2.35 + condition_sep = 2.5 + positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2) + positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2) + array_probs = np.squeeze(np.asarray(prob_list_samples)) + bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True) + bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True) + + for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_control[element], color='blue') + plt.setp(bp_control[element], linewidth = 2) + + for patch in bp_control['boxes']: + patch.set(facecolor='white') + patch.set(linewidth = 2) + + for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_disease[element], color='red') + plt.setp(bp_disease[element], linewidth = 2) + + for patch in bp_disease['boxes']: + patch.set(facecolor= 'white') + patch.set(linewidth = 2) + #patch.set(alpha=0.2) + + ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax.set_ylim(0.3,0.7) + ax.set_ylabel('Probability health') + #ax.set_ylim(0,1) + + ax.tick_params( + axis='both', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + #labelleft = False, # ticks along the top edge are off + #labelbottom=False + ) + + #plot points + positions = list(positions_control) + list(positions_disease) + for prob_sample, position in zip(prob_list_samples, positions): + print(prob_sample) + print(5*[position]) + # Add some random "jitter" to the x-axis + x = np.random.normal(5*[position], 0) + #plot(x, y, 'r.', alpha=0.2) + ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + plt.savefig(dir_probs + 'scatterBoxplots_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + fig2, ax2 = plt.subplots(figsize = (15,5)) + #plot points + for prob_sample, position in zip(prob_list_control, positions_control): + ax2.plot(5*[position], prob_sample,'b.', markersize = 4) + + for prob_sample, position in zip(prob_list_disease, positions_disease): + ax2.plot(5*[position], prob_sample,'r.', markersize = 4) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + ax2.set_xticks(ax.get_xticks()) + ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax2.set_ylim(0.3,0.7) + ax2.set_ylabel('Probability health') + + plt.savefig(dir_probs + 'samplewiseScatterPlots_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + + #%% Print computational time + endTime = datetime.now() - startTime + print(endTime) + + + #%%ANOVA SETUP + os.makedirs(dir_results + 'statistics/', exist_ok = True) + + from statsmodels.formula.api import ols + from statsmodels.stats.anova import anova_lm + import pandas as pd + from patsy import ModelDesc + + nr_frames_patient = 10 + img_nr = np.tile(np.arange(nr_frames),array_probs.size//nr_frames) + sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),array_probs.size//nr_frames_patient) + #patient_nr = np.tile(np.repeat(np.arange(array_probs.size//nr_frames_patient//len(conditions)),nr_frames_patient),len(conditions)) + patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)]) + condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))]) + + + dep_var = array_probs.flatten() + + data_dict = {'Image':img_nr, + 'Sample':sample_nr, + 'Patient':patient_nr, + 'Condition': condition_nr, + 'S':dep_var} + + formula = 'S ~ C(Condition)/C(Patient)/C(Sample)' + + desc = ModelDesc.from_formula(formula) + print(desc.describe()) + + lm = ols(formula, data_dict).fit() + print(lm.summary()) + + anova_table = anova_lm(lm) + print(anova_table) + writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_' + protein_mode + '_ds'+ str(1/sc_fac) +'_run' + str(run) +'.xlsx', engine='xlsxwriter') + anova_table.to_excel(writer_anova,sheet_name = disease) + writer_anova.save() + + + + + + + diff --git a/code/lung_feature_patch_allPatients_twoProtein.py b/code/lung_feature_patch_allPatients_twoProtein.py new file mode 100755 index 0000000..debe53f --- /dev/null +++ b/code/lung_feature_patch_allPatients_twoProtein.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Explorative analysis of protein composition and structure in lung ECM to +study disease-induced remodeling of the three proteins that provide structural +integrity to the lung: collagen IV, elastin and fibrillar collagen. + +The two-protein analyses provide the following: + - A 'Data Model' per combination of proteins, a dictionary of image patches + representative of the local protein structure and composition found in the + data (both healthy and disease group) at the scale defined by the patch + size. + - A 'Sorted Data Model' or disease signature per combination of proteins, + a sorted version of the dictionary above, based on how often a dictionary + element (called cluster centre in the publication) is found in the healthy + vs. the disease. + - A 'Probability Map' for every image based on each combination of + proteins, with pixel correspondance to the input images (maximum + projections), highlighting regions characteristic of health or disease. + - A 'Disease Effect' per combination of proteins, to quantify how well the + control and disease groups are separated after assigning a health + probability score to each image, computed as the average of its probability + map values. + +by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk + +For more information, read our article (add link and citation)""" + +#%% Load packages and initialise parameters + +import numpy as np +import matplotlib.pyplot as plt +import sklearn.cluster +import microscopy_analysis as ma +import skimage.io +import skimage.transform + +import os +from datetime import datetime +import sys +from matplotlib.offsetbox import OffsetImage, AnnotationBbox +from math import ceil +import copy + +startTime = datetime.now() + +plt.close('all') + +sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image +patch_size = 17 +nr_clusters = 100 #49#100 # number of clusters, choose a value with an integer as a square root +th_back = 15 +nr_runs = 10 + +protein_mode = 'dropout' #'single', 'triple' +colour_mode = 'cmy' + +# drop_channel = 0 +# keep_channels = [x for x in range(3) if x!=drop_channel] + +#%% Define and create directories + +disease = 'emphysema' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid' +control = 'control' +conditions = [disease, control] + +# input directories - images start with the name 'frame' +base_dir_in = '../input_data/' +#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation +dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy +base_name = 'frame' + +#output directory +dir_results = '../results/' #rerun just to make sure not to muck up the results +os.makedirs(dir_results, exist_ok = True) +os.makedirs(dir_results + '/statistics', exist_ok = True) + +#%% Read (preprocessed) maximum corrected images +#Read maximum projection images (runtime ~= 20 s ) +print('Reading maximum projection images') + +max_img_list_control_3c = ma.read_max_imgs(dir_in + control, base_name) +max_img_list_disease_3c = ma.read_max_imgs(dir_in + disease, base_name) + +for drop_channel in range(3): + keep_channels = [x for x in range(3) if x!=drop_channel] + #%% Exchange channels: fibrillar collagen with elastin + max_img_list_control = copy.deepcopy(max_img_list_control_3c) + for max_img_sample_control in max_img_list_control: + for nr_frame, max_img_control in enumerate(max_img_sample_control): + img = max_img_control[:,:,[0,2,1]] + max_img_sample_control[nr_frame] = img[:,:,keep_channels] + + max_img_list_disease = copy.deepcopy(max_img_list_disease_3c) + for max_img_sample_disease in max_img_list_disease: + for nr_frame, max_img_disease in enumerate(max_img_sample_disease): + img = max_img_disease[:,:,[0,2,1]] + max_img_sample_disease[nr_frame] = img[:,:,keep_channels] + + #%% Prepare maximum projection images for feature extraction + #Flatten and merge conditions + max_img_list_control_flat = ma.flatten_list(max_img_list_control) + max_img_list_disease_flat = ma.flatten_list(max_img_list_disease) + + #Rescale + max_img_list_control_processed = [] + for max_img in ma.flatten_list(max_img_list_control) : + max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)] + + max_img_list_disease_processed = [] + for max_img in ma.flatten_list(max_img_list_disease) : + max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)] + + #%% Compute patches + patch_feat_list_control = [] + for max_img in max_img_list_control_processed: + patch_feat_list_control += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()] + + patch_feat_list_disease = [] + for max_img in max_img_list_disease_processed: + patch_feat_list_disease += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()] + + patch_feat_total = patch_feat_list_control + patch_feat_list_disease + + for run in range(nr_runs): + + #directories for results from the different runs + dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size) + dir_probs += 'run'+str(run)+'/' + ma.make_output_dirs(dir_probs, conditions) + + #%% k-means clustering + startTime_clustering = datetime.now() + + batch_size = 10000 + th_nr_pathesINcluster = 20 + th_nr_weakClusters = 0 #added for covid and ild + + #If cluster centers were already computed, use those + if os.path.exists(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy'): + cluster_centers = np.load(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy') + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size) + kmeans.cluster_centers_ = cluster_centers + + #Compute cluster centres + else: + #features for clustering + nr_keep = 10000 # number of features/image randomly picked for clustering + n_im = len(patch_feat_total) + feat_dim = patch_feat_total[0].shape[1] + patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim),dtype = max_img.dtype) + + f = 0 + nr_keep_control = nr_keep//(2*len(patch_feat_list_control)//len(patch_feat_list_disease)) + nr_keep_disease = nr_keep//2 + for patch_feat in patch_feat_list_control: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control] + patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:] + f += nr_keep + + for patch_feat in patch_feat_list_disease: + keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease] + patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:] + f += nr_keep + + #Get feature vectors + feat_to_cluster = patch_feat_to_cluster + + #Compute cluster centers and save if there is no weak clusters + nr_weakClusters = th_nr_weakClusters+1 + repetition = 0 + nr_weakClusters_list = [] + while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute + kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip + kmeans.fit(feat_to_cluster) + + #Nr. patches that have contributed to each cluster + nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)] + + weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster] + print('Weak cluster IDs: ',weak_cluster_IDs) + + #Weak clusters are those composed by very few patches + nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster]) + + #If there are weak clusters, the clustering should be recomputed + if nr_weakClusters>th_nr_weakClusters: + print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") + else: + np.save(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy', kmeans.cluster_centers_) # .npy extension is added if not given + + print(repetition) + repetition += 1 + nr_weakClusters_list += [nr_weakClusters] + + + endTime_clustering = datetime.now() - startTime_clustering + print(endTime_clustering) + + #%% Plot all cluster centers (grid view) + cluster_centers = kmeans.cluster_centers_ + int_sum_list = ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, protein_mode, colour_mode, channel = drop_channel) + plt.savefig(dir_probs + 'clustercenters_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Histograms for background, healthy and disease + hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control, kmeans) + hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease, kmeans) + + # Show histograms (bar plot) of healthy and disease + fig, ax = plt.subplots(1,1) + ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b') + ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r') + ax.legend() + plt.savefig(dir_probs + 'assignmentHistograms_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Compute populated, occurrence and importance cluster measures from the histogram + #grid dimensions + + #OCCURRENCE RATIO: How more often does it occur in one of the conditions + occurrence_ratio = np.empty((nr_clusters)) + ind_control = hist_control>hist_disease + ind_disease = ~ind_control + control_cond = (hist_disease!=0) & ind_control + occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond] + disease_cond = (hist_control!=0) & ind_disease + occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond] + + #POPULATED How trustworthy is the cluster? + populated = hist_control+hist_disease + weights = populated/populated.max() + + #Cluster IMPORTANCE: occurrence weighed by POPULATED + def sigmoid(x,k): + return 2 / (1 + np.exp(-k*x)) - 1 + + importance = occurrence_ratio * sigmoid(weights,100) + + #%% Plot cluster centers in the 2d populated/occurrence space + fig_control, ax_control = plt.subplots(figsize=(15,15)) + ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) + ax_control.set_ylim(0.8,7) + ax_control.set_xlim(0,0.08) + + fig_disease, ax_disease = plt.subplots(figsize=(15,15)) + ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) + ax_disease.set_ylim(0.8,7) + ax_disease.set_xlim(0,0.08) + + for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)): + if protein_mode == 'dropout': + cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype) + cluster_centre[:,:,keep_channels] = np.transpose(np.reshape(kmeans.cluster_centers_[cluster_nr,:],(2,patch_size,patch_size)),(1,2,0)) + else: + cluster_centre = np.transpose((np.reshape(kmeans.cluster_centers_[cluster_nr,:],(3,patch_size,patch_size))),(1,2,0)) + cluster_centre = ma.rgb2cmy(cluster_centre) + if y0>0: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, y0), frameon=False) + ax_control.add_artist(ab) + else: + ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, -y0), frameon=False) + ax_disease.add_artist(ab) + + plt.figure(fig_control.number) + plt.savefig(dir_probs + 'controlClustercenters_2Dspace_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + + plt.figure(fig_disease.number) + plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Plot all clusters sorted by IMPORTANCE + clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0] + clusters_control_occW_sorted.reverse() + + clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0] + + clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())] + clusters_occW_sorted.reverse() + + occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())] + occurrence_ratio_sorted.reverse() + + ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, protein_mode, colour_mode, drop_channel) + plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted)) + ' in between') + plt.savefig(dir_probs + 'clustercenters_occW_sorted_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Compute probability images + + startTime_probs = datetime.now() + + #Assign probabilities of control and disease to clusters + prob_control= hist_control/(hist_control+hist_disease) + prob_disease = hist_disease/(hist_control+hist_disease) + + #Compute probability images + r,c,_ = max_img_list_control_flat[0].shape + img_shape = (int(r*sc_fac),int(c*sc_fac)) + prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape) + prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) + + #%% Display probability images and accumulate probs per frame and sample + nr_frames = len(max_img_list_control[0]) + + #control + sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, ) + + nr_list = 0 + prob_list_control= [] + fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_control, sample_names_control, range(len(sample_names_control))): #in general, in max_img_list_condition + print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True) + for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype) + + #compute image probability + prob_frame = sum(sum(prob_img))/prob_img.size + + #background + ind_back = np.ones((max_img_ds.shape[0],max_img_ds.shape[1]), dtype = bool) + for channel in range(max_img_ds.shape[2]): + ind_back = ind_back & (max_img_ds[:,:,channel]<th_back) + prob_img[ind_back] = 0.5 + + if protein_mode == 'dropout': + max_img_ds_3c = np.zeros((max_img_ds.shape[0],max_img_ds.shape[1],3)) + max_img_ds_3c[:,:,keep_channels] = max_img_ds + max_img_ds = max_img_ds_3c + + #plot + ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + if colour_mode == 'cmy': + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + #compute accumulated probabilities:part 2 + prob_sample[frame_nr] = prob_frame + prob_list_control += [prob_sample] + + plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.savefig(dir_probs + '/' + control + '/' + 'probImHealth_'+sample_name+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'probMaps_'+control+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + #disease + sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True) + + nr_list = 0 + prob_list_disease = [] + fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True) + for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general, in max_img_list_condition + print(sample_name) + nr_frames = len(max_img_list_sample) + prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames] + nr_list += nr_frames + + prob_sample = np.zeros((nr_frames,1)) + fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True) + for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)): + max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype) + + #compute image probability + prob_frame = sum(sum(prob_img))/prob_img.size + + #background + ind_back = np.ones((max_img_ds.shape[0],max_img_ds.shape[1]), dtype = bool) + for channel in range(max_img_ds.shape[2]): + ind_back = ind_back & (max_img_ds[:,:,channel]<th_back) + prob_img[ind_back] = 0.5 + + if protein_mode == 'dropout': + max_img_ds_3c = np.zeros((max_img_ds.shape[0],max_img_ds.shape[1],3)) + max_img_ds_3c[:,:,keep_channels] = max_img_ds + max_img_ds = max_img_ds_3c + + #plot + ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none') + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([]) + ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none') + + axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1) + axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2)) + axs[1][frame_nr].xaxis.set_ticklabels([]) + axs[1][frame_nr].xaxis.set_ticks_position('none') + axs[1][frame_nr].yaxis.set_ticklabels([]) + axs[1][frame_nr].yaxis.set_ticks_position('none') + + if colour_mode == 'cmy': + max_img_ds = ma.rgb2cmy(max_img_ds) + axs[0][frame_nr].imshow(max_img_ds) + axs[0][frame_nr].xaxis.set_ticklabels([]) + axs[0][frame_nr].xaxis.set_ticks_position('none') + axs[0][frame_nr].yaxis.set_ticklabels([]) + axs[0][frame_nr].yaxis.set_ticks_position('none') + + #compute accumulated probabilities:part 2 + prob_sample[frame_nr] = prob_frame + prob_list_disease += [prob_sample] + + plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) + plt.savefig(dir_probs + '/' + disease + '/' + 'probImHealth_'+sample_name+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'probMaps_'+disease+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + #plt.show() + + prob_list_samples = prob_list_control + prob_list_disease + + #%% Box plots + fig, ax = plt.subplots(figsize = (15,5)) + ax.set_title('Image health scores ') + + sample_sep = 0.85 + patient_sep = 2.35 + condition_sep = 2.5 + positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2) + positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2) + array_probs = np.squeeze(np.asarray(prob_list_samples)) + bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True) + bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True) + + for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_control[element], color='blue') + plt.setp(bp_control[element], linewidth = 2) + + for patch in bp_control['boxes']: + patch.set(facecolor='white') + patch.set(linewidth = 2) + + for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']: + plt.setp(bp_disease[element], color='red') + plt.setp(bp_disease[element], linewidth = 2) + + for patch in bp_disease['boxes']: + patch.set(facecolor= 'white') + patch.set(linewidth = 2) + #patch.set(alpha=0.2) + + ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax.set_ylim(0.3,0.7) + ax.set_ylabel('Probability health') + #ax.set_ylim(0,1) + + ax.tick_params( + axis='both', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + #labelleft = False, # ticks along the top edge are off + #labelbottom=False + ) + + #plot points + positions = list(positions_control) + list(positions_disease) + for prob_sample, position in zip(prob_list_samples, positions): + print(prob_sample) + print(5*[position]) + # Add some random "jitter" to the x-axis + x = np.random.normal(5*[position], 0) + #plot(x, y, 'r.', alpha=0.2) + ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + plt.savefig(dir_probs + 'scatterBoxplots_'+protein_mode+'_channel'+str(drop_channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + fig2, ax2 = plt.subplots(figsize = (15,5)) + #plot points + for prob_sample, position in zip(prob_list_control, positions_control): + ax2.plot(5*[position], prob_sample,'b.', markersize = 5) + + for prob_sample, position in zip(prob_list_disease, positions_disease): + ax2.plot(5*[position], prob_sample,'r.', markersize = 5) + + #horizontal line + x = np.linspace(0,position.max()) + y = 0.5*np.ones(x.shape) + ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed') + + ax2.set_xticks(ax.get_xticks()) + ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease]) + ax2.set_ylim(0.3,0.7) + ax2.set_ylabel('Probability health') + + plt.savefig(dir_probs + 'samplewiseScatterPlots_'+protein_mode+'_channel'+str(drop_channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300) + + #%% Print computational time + endTime = datetime.now() - startTime + print(endTime) + + + #%%ANOVA SETUP + os.makedirs(dir_results + 'statistics/', exist_ok = True) + + from statsmodels.formula.api import ols + from statsmodels.stats.anova import anova_lm + import pandas as pd + from patsy import ModelDesc + + nr_frames_patient = 10 + img_nr = np.tile(np.arange(nr_frames),array_probs.size//nr_frames) + sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),array_probs.size//nr_frames_patient) + patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)]) + condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))]) + + + dep_var = array_probs.flatten() + + data_dict = {'Image':img_nr, + 'Sample':sample_nr, + 'Patient':patient_nr, + 'Condition': condition_nr, + 'S':dep_var} + + formula = 'S ~ C(Condition)/C(Patient)/C(Sample)' + + desc = ModelDesc.from_formula(formula) + print(desc.describe()) + + lm = ols(formula, data_dict).fit() + print(lm.summary()) + + anova_table = anova_lm(lm) + print(anova_table) + writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_'+protein_mode+'_channel'+str(drop_channel)+'_ds'+ str(1/sc_fac) +'_run' + str(run) +'.xlsx', engine='xlsxwriter') + anova_table.to_excel(writer_anova,sheet_name = disease) + writer_anova.save() + + diff --git a/code/lung_lif_to_png.py b/code/lung_lif_to_png.py new file mode 100644 index 0000000..d61120c --- /dev/null +++ b/code/lung_lif_to_png.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Transform lif files to png for easy access and visualization +""" + + +import numpy as np +from readlif.reader import LifFile +import skimage.io +import os +import microscopy_analysis as ma + +#%% +# folder containing lif files +in_dir = '../../../covidPatients/scans/' #data101220/'# data_October2020/sarcoidosis/' #'../../5April2020_control/' #healthy #'../../24December19_emphysema/' #sick +# output folder +out_dir = '../data_corrected_bis/covid/' #control/ #/emphysema/ +file_names = ma.listdir_custom(in_dir, string_start_pos = -7, ext = 'lif') + +# make output directory if it does not exist +if not os.path.exists(out_dir): + os.mkdir(out_dir) + +# run through all lif files and save as png for easy access and visualization +for file in file_names: + print(file) + lif_in = LifFile(in_dir + file) + n_im = lif_in.num_images + dir_name_out = (out_dir + (file.split('.')[0]).replace(' ', '_') + '/') + if not os.path.exists(dir_name_out): + os.mkdir(dir_name_out) + for n in range(0,n_im): + print(n) + im_in = lif_in.get_image(n) + r,c,n_frames = im_in.dims[:3] + n_ch = im_in.channels + if (n_ch == 3)|(n_ch==4): #TODO: Fix support for removing channel 1 in covids. Could be generalised to select channels of interest, but we don't need this functionality now + dir_name_out_im = dir_name_out + ('frame_%02d' % n) + if not os.path.exists(dir_name_out_im): + os.mkdir(dir_name_out_im) + im_out = np.zeros((r,c,3), dtype='uint8') + for i in range(0,n_frames): + im_out[:,:,0] = np.array(im_in.get_frame(i,0,0)).transpose() + for j in range(1,3): + if n_ch == 4: #support for removing channel 1 in covids. + im_out[:,:,j] = np.array(im_in.get_frame(i,0,j+1)).transpose() + else: + im_out[:,:,j] = np.array(im_in.get_frame(i,0,j)).transpose() + skimage.io.imsave(dir_name_out_im + '/image_%02d.png' % i, im_out) + diff --git a/code/microscopy_analysis.py b/code/microscopy_analysis.py new file mode 100755 index 0000000..394ea5f --- /dev/null +++ b/code/microscopy_analysis.py @@ -0,0 +1,624 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Helping functions for analysis of lif microscopy images + +Developed by monj@dtu.dk and abda@dtu.dk +""" + +import os +import skimage.io as io +import skimage.transform +import numpy as np +import local_features as lf + +import matplotlib.pyplot as plt +from math import ceil + +#%% General functions + +# sort list of directories from selected string position +def sort_strings_customStart(strings, string_start_pos = 0): + """Sorts list of strings + + Input(s) + strings: list of strings + string_start_pos (default 0): defines the section of the string on which the ordering is based. + + Output(s) + strings_sortered: list of sorted strings + + Author: Monica Jane Emerson (monj@dtu.dk)""" + + strings_roi = [i[string_start_pos:] for i in strings] + #key indicates that the indices (k) are sorted according to subjects_list[k] + ind = np.array(sorted(range(len(strings_roi)), key=lambda k: strings_roi[k])) + strings_sorted = [strings[i] for i in ind] + + return strings_sorted + +#Provide the contents of a directory sorted +def listdir_custom(directory, string_start_pos = 0, ext = '', dir_flag = False, base_name = False): + 'monj@dtu.dk' + + if dir_flag: + if base_name: + list_dirs = [dI for dI in os.listdir(directory) if (os.path.isdir(os.path.join(directory,dI)) & dI[0:len(base_name)]==base_name)] + else: + list_dirs = [dI for dI in os.listdir(directory) if os.path.isdir(os.path.join(directory,dI))] + else: + if base_name: + list_dirs = [dI for dI in os.listdir(directory)if dI[0:len(base_name)]==base_name] + else: + if ext == '': + list_dirs = [f for f in os.listdir(directory) if f.endswith(ext)] + else: + list_dirs = [dI for dI in os.listdir(directory)] + + listdirs_sorted = sort_strings_customStart(list_dirs,string_start_pos) + + return listdirs_sorted + +def flatten_list(list): + 'monj@dtu.dk' + + list_flat = [item for sublist in list for item in sublist] + + return list_flat + +#%% IO functions + +#make directory, and subdirectories within, if they don't exist +def make_output_dirs(directory,subdirectories = False): + 'monj@dtu.dk' + + os.makedirs(directory, exist_ok = True) + + if subdirectories: + for subdir in subdirectories: + os.makedirs(directory + subdir + '/', exist_ok = True) + + # def make_output_dirs(directory,subdirectories): + # 'monj@dtu.dk' + + # os.makedirs(directory, exist_ok = True) + # os.makedirs(directory + 'control/', exist_ok = True) + + # for disease in diseases: + # os.makedirs(directory + disease + '/', exist_ok = True) + + +#Reads images starting with base_name from the subdirectories of the input directory. +#Option for reading scaled down versions and in bnw or colour +def read_max_imgs(dir_condition, base_name, sc_fac = 1, colour_mode = 'colour'): + 'monj@dtu.dk' + + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + #print(sample_list) + + max_img_list = [] + for sample in sample_list: + sample_dir = dir_condition + '/' + sample + '/' + frame_list = listdir_custom(sample_dir, base_name = base_name) + #print(frame_list) + + frame_img_list = [] + for frame in frame_list: + frame_path = sample_dir + frame + + #Option to load in bnw or colour + if colour_mode == 'bnw': + img = io.imread(frame_path, as_gray = True).astype('uint8') + if sc_fac ==1: + frame_img_list += [img] + else: + frame_img_list += [skimage.transform.rescale(img, sc_fac, preserve_range = True).astype('uint8')] + else: + img = io.imread(frame_path).astype('uint8') + if sc_fac == 1: + frame_img_list += [img] + else: + frame_img_list += [skimage.transform.rescale(img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')] + + max_img_list += [frame_img_list] + #print(frame_img_list[0].dtype) + + return max_img_list + +#%% Functions for intensity inspection and image preprocessing + +# computes the maximum projection image from a directory of images +def get_max_img(in_dir, ext = '.png', n_img = 0): + file_names = [f for f in os.listdir(in_dir) if f.endswith(ext)] + file_names.sort() + if ( n_img < 1 ): + n_img = len(file_names) + img_in = io.imread(in_dir + file_names[0]) + for i in range(1,n_img): + img_in = np.maximum(img_in, io.imread(in_dir + file_names[i])) + return img_in + +# computes a list of maximum projection images from a list of directories +def compute_max_img_list(in_dir, ext, base_name, dir_out = ''): + """by abda + Modified by monj""" + + dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir,dI)) and dI[0:len(base_name)]==base_name)] + dir_list.sort() + + max_img_list = [] + for d in dir_list: + image_dir_in = in_dir + d + '/' + max_img = get_max_img(image_dir_in, ext) + if dir_out!='': + os.makedirs(dir_out, exist_ok = True) + io.imsave(dir_out + d + '.png', max_img.astype('uint8')) + + max_img_list += [max_img] + + return max_img_list + +# One more level up from compute_max_img_list. Computes a list of lists of maximum +#projection images from a list of directories, each containing a list of directories +#with the set of images that should be combined into a maximum projection. +def comp_max_imgs(dir_condition, base_name, dir_out = ''): + 'monj@dtu.dk' + + dir_list_condition = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + max_img_list_condition = [] + for directory in dir_list_condition: + dir_in = dir_condition + '/' + directory + '/' + + if dir_out!= '': + subdir_out = dir_in.replace(dir_condition,dir_out) + else: + subdir_out = '' + + max_img_condition = compute_max_img_list(dir_in, '.png', base_name, subdir_out) + max_img_list_condition+= [max_img_condition] + + return max_img_list_condition + + +def comp_std_perChannel(max_im_list, th_int, dir_condition): + 'monj@dtu.dk' + + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + std_list = [[],[],[]] + for sample, frame_img_list in zip(sample_list, max_im_list): + + for ind,img in enumerate(frame_img_list): + h, w, channels = img.shape + + for channel in range(0,channels): + intensities = img[:,:,channel].ravel() + std_list[channel] += [(intensities[intensities>th_int]).std()] + + return std_list + +def intensity_spread_normalisation(img_list, th_int, mean_std, dir_condition, base_name, dir_results): + 'monj@dtu.dk' + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + img_corr_list = [] + for sample, sample_imgs in zip(sample_list, img_list): + sample_dir = dir_condition + '/' + sample + '/' + #print(sample_dir) + frame_list = listdir_custom(sample_dir, base_name = base_name) + #print(frame_list) + frame_img_corr_list = [] + for frame,img in zip(frame_list,sample_imgs): + h, w, channels = img.shape + img_corr = img + for channel in range(0,channels): + img_channel = img_corr[:,:,channel] + img_channel[img_channel>th_int] = img_channel[img_channel>th_int]*(mean_std/img_channel.std()) + img_corr[:,:,channel] = img_channel + + frame_img_corr_list += [img_corr] + os.makedirs(dir_results + '/' + sample, exist_ok = True) + io.imsave(dir_results + '/' + sample + '/' + frame +'.png', img_corr) + + img_corr_list += [frame_img_corr_list] + + return img_corr_list + +def rgb2cmy_list(img_list, dir_condition, base_name, dir_results): + 'monj@dtu.dk' + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + for sample, sample_imgs in zip(sample_list, img_list): + sample_dir = dir_condition + '/' + sample + '/' + #print(sample_dir) + frame_list = listdir_custom(sample_dir, base_name = base_name) + #print(frame_list) + for frame,img in zip(frame_list,sample_imgs): + img_cmy = rgb2cmy(img) + + os.makedirs(dir_results + '/' + sample, exist_ok = True) + io.imsave(dir_results + '/' + sample + '/' + frame +'.png', img_cmy) + + +#def plotHist_perChannel_imgset +def plotHist_perChannel_imgset_list(max_img_list, dir_condition, dir_results = '', name_tag = 'original'): + 'monj@dtu.dk' + + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + for sample, frame_img_list in zip(sample_list, max_img_list): + fig, axs = plt.subplots(4,len(frame_img_list), figsize = (len(frame_img_list)*2,4*2)) + plt.suptitle('Sample '+sample[-4:] + ', acq. date: ' + sample[:6]) + + for ind,img in enumerate(frame_img_list): + h, w, channels = img.shape + axs[0][ind].imshow(img) + + for channel in range(0,channels): + intensities = img[:,:,channel].ravel() + axs[channel+1][ind].hist(intensities, bins = 50) + axs[channel+1][ind].set_aspect(1.0/axs[channel+1][ind].get_data_ratio()) + + if dir_results!= '': + plt.savefig(dir_results + '/' + sample[-4:] + '_' + name_tag + '_perChannelHistograms.png', dpi = 300) + plt.close(fig) + +#def compare_imgpairs +def compare_imgpairs_list(list1_imgsets, list2_imgsets, dir_condition, dir_results = ''): + 'monj@dtu.dk' + + sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True) + + for imgset1, imgset2, sample in zip(list1_imgsets, list2_imgsets, sample_list): + fig, axs = plt.subplots(2,len(imgset1), figsize = (2*len(imgset1),2*2),sharex=True,sharey=True) + plt.suptitle('Sample '+sample[-4:] + ', acq. date: ' + sample[:6]) + + for ind, img1, img2 in zip(range(0,len(imgset1)), imgset1, imgset2): + axs[0][ind].imshow(img1) + axs[1][ind].imshow(img2) + + if dir_results!= '': + plt.savefig(dir_results + '/' + sample[-4:] + '_originalVScorrected.png', dpi=300) + plt.close(fig) + +def black2white_background(im_n): + """ + + Parameters + ---------- + im_n : Normalized RGB image (intensities in range [0,1] of type float) + + Returns + ------- + im_conv : Normalized RGB image where black has been converted to white (or visa versa of type float) + + abda@dtu.dk + """ + im_conv = np.zeros((im_n.shape[:2]+(3,))) + im_conv[:,:,0] = (1-im_n[:,:,1])*(1-im_n[:,:,2]) + im_n[:,:,0]*np.maximum(im_n[:,:,1],im_n[:,:,2]) + im_conv[:,:,1] = (1-im_n[:,:,0])*(1-im_n[:,:,2]) + im_n[:,:,1]*np.maximum(im_n[:,:,0],im_n[:,:,2]) + im_conv[:,:,2] = (1-im_n[:,:,0])*(1-im_n[:,:,1]) + im_n[:,:,2]*np.maximum(im_n[:,:,0],im_n[:,:,1]) + return im_conv + +def rgb2cmy(img): + 'monj@dtu.dk' + img_n = img.astype(float)/255 #normalise + img_conv_n = black2white_background(img_n) + img_conv = (img_conv_n*255).astype('uint8') #denormalise + return 255-img_conv + +#%%Functions for feature analysis and visualisation + +# computes the max projection image and the features into a list +def compute_max_img_feat(in_dir, ext, base_name, sigma, sc_fac, save = False, abs_intensity = True): + dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir,dI)) and dI[0:len(base_name)]==base_name)] + dir_list.sort() + max_img_list = [] + for d in dir_list: + image_dir = in_dir + d + '/' + max_img = get_max_img(image_dir, ext) + + if save: + dir_name_out_img = in_dir.replace('data','maxProjImages') + os.makedirs(dir_name_out_img, exist_ok = True) + + io.imsave( dir_name_out_img + d + '.png', max_img.astype('uint8')) + + max_img_list += [skimage.transform.rescale(max_img, sc_fac, multichannel=True)] + + feat_list = [] + for max_img in max_img_list: + r,c = max_img.shape[:2] + feat = np.zeros((r*c,45)) + for i in range(0,3): + feat_tmp = lf.get_gauss_feat_im(max_img[:,:,i], sigma) + feat[:,i*15:(i+1)*15] = feat_tmp.reshape(-1,feat_tmp.shape[2]) + if not(abs_intensity): + feat_list += [np.concatenate((feat[:,1:15],feat[:,16:30],feat[:,31:45]),axis = 1)] + else: + feat_list += [feat] + return max_img_list, feat_list + + +# computes a histogram of features from a kmeans object +def compute_assignment_hist(feat_list, kmeans, background_feat = None): + assignment_list = [] + for feat in feat_list: + assignment_list += [kmeans.predict(feat)] + edges = np.arange(kmeans.n_clusters+1)-0.5 # histogram edges halfway between integers + hist = np.zeros(kmeans.n_clusters) + for a in assignment_list: + hist += np.histogram(a,bins=edges)[0] + + sum_hist = np.sum(hist) + hist/= sum_hist + + if background_feat is not None: + assignment_back = kmeans.predict(background_feat) + hist_back = np.histogram(assignment_back,bins=edges)[0] + return hist, assignment_list, hist_back, assignment_back + else: + return hist, assignment_list + + +# image to array of patches +def im2col(A, BSZ, stepsize=1, norm=False): + # Parameters + m,n = A.shape + s0, s1 = A.strides + nrows = m-BSZ[0]+1 + ncols = n-BSZ[1]+1 + shp = BSZ[0],BSZ[1],nrows,ncols + strd = s0,s1,s0,s1 + + out_view = np.lib.stride_tricks.as_strided(A, shape=shp, strides=strd) + out_view_shaped = out_view.reshape(BSZ[0]*BSZ[1],-1)[:,::stepsize] + if norm: + one_norm = np.sum(out_view_shaped,axis=0) + ind_zero_norm = np.where(one_norm !=0) + out_view_shaped[:,ind_zero_norm] = 255*out_view_shaped[:,ind_zero_norm]/one_norm[ind_zero_norm] + return out_view_shaped + +# nd image to array of patches +def ndim2col(A, BSZ, stepsize=1, norm=False): + if(A.ndim == 2): + return im2col(A, BSZ, stepsize, norm) + else: + r,c,l = A.shape + patches = np.zeros((l*BSZ[0]*BSZ[1],(r-BSZ[0]+1)*(c-BSZ[1]+1)),dtype=A.dtype) + for i in range(l): + patches[i*BSZ[0]*BSZ[1]:(i+1)*BSZ[0]*BSZ[1],:] = im2col(A[:,:,i],BSZ,stepsize,norm) + return patches + +# nd image to array of patches with mirror padding along boundaries +def ndim2col_pad(A, BSZ, stepsize=1, norm=False): + r,c = A.shape[:2] + if (A.ndim == 2): + l = 1 + else: + l = A.shape[2] + tmp = np.zeros((r+BSZ[0]-1,c+BSZ[1]-1,l),dtype = A.dtype) + fhr = int(np.floor(BSZ[0]/2)) + fhc = int(np.floor(BSZ[1]/2)) + thr = int(BSZ[0]-fhr-1) + thc = int(BSZ[1]-fhc-1) + + tmp[fhr:fhr+r,fhc:fhc+c,:] = A.reshape((r,c,l)) + tmp[:fhr,:] = np.flip(tmp[fhr:fhr*2,:], axis=0) + tmp[fhr+r:,:] = np.flip(tmp[r:r+thr,:], axis=0) + tmp[:,:fhc] = np.flip(tmp[:,fhc:fhc*2], axis=1) + tmp[:,fhc+c:] = np.flip(tmp[:,c:c+thc], axis=1) + tmp = np.squeeze(tmp) + return ndim2col(tmp,BSZ,stepsize,norm) + + +def patch2featvector(patches, colour_mode = 'bnw', colour_weight = 1): + patch_size = int((patches.shape[1]/3)**0.5) + patches_reshaped = patches.reshape(patches.shape[0],3,patch_size,patch_size) + patches_bnw = np.mean(patches_reshaped,axis = 1) + + if colour_mode == 'bnw': + features = patches_bnw.reshape(patches.shape[0],patch_size**2) + else: + int_perchannel = np.mean(np.mean(patches_reshaped,axis = 2),axis = 2) + features = [patches_bnw.reshape(patches.shape[0],patch_size**2), colour_weight*int_perchannel] + return features + +def patch2featvector_list(list_patches, colour_mode = 'bnw', colour_weight = 1): + list_features = [] + for patches in list_patches: + features = patch2featvector(patches, colour_mode, colour_weight) + list_features += [features] + + return list_features + +def comp_prob_imgs(prob_control,assignment_list_condition, prob_img_shape): + #Compute probability images for condition + r,c = prob_img_shape + prob_img_list_condition = [] + for assignment in assignment_list_condition: + prob = prob_control[assignment.astype(int)] + prob_img_list_condition += [prob.reshape(r,c)] + return prob_img_list_condition + +#%% Functions for visualisation of learnt features +def plot_grid_cluster_centers(cluster_centers, cluster_order, patch_size, protein_mode = 'triple', colour_mode = 'rgb', channel = 'all', titles = False, occurrence = ''): + #grid dimensions + size_x = round(len(cluster_order)**(1/2)) + size_y = ceil(len(cluster_order)/size_x) + + #figure format + overhead = 0 + if titles: + overhead = 1 + w, h = plt.figaspect(size_x/size_y) + fig, axs = plt.subplots(size_x,size_y, figsize=(1.3*w,1.3*h*(1+overhead/2)), sharex=True, sharey=True) + #print('Grid size: ', grid_size[1], grid_size[2], 'Figure size: ', w, h) + + int_sum_list = [] + ax_list = axs.ravel() + for ind, cluster in enumerate(cluster_order): + #print(ind) + if colour_mode =='bnw': + cluster_centre = np.reshape(cluster_centers[cluster,:],(patch_size,patch_size)) + ax_list[ind].imshow(cluster_centre.astype('uint8'),cmap='gray') + else: + if protein_mode == 'single': #in bnw + colour give the clusters a uniform colour + cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype) + cluster_centre[:,:,channel] = np.reshape(cluster_centers[cluster,:],(patch_size,patch_size)) + + elif protein_mode == 'dropout': + cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype) + keep_channels = [x for x in range(3) if x!=channel] + cluster_centre[:,:,keep_channels] = np.transpose(np.reshape(cluster_centers[cluster,:],(2,patch_size,patch_size)),(1,2,0)) + + else: + cluster_centre = np.transpose(np.reshape(cluster_centers[cluster,:],(3,patch_size,patch_size)),(1,2,0)) + if channel!='all': + remove_channels = [x for x in range(3) if x!=channel] + cluster_centre[:,:,remove_channels] = np.zeros((patch_size,patch_size,2),dtype = cluster_centers.dtype) + + if colour_mode == 'cmy': + cluster_centre = rgb2cmy(cluster_centre) + ax_list[ind].imshow(cluster_centre.astype('uint8')) + + if titles: + if occurrence !='': + ax_list[ind].set_title(round(occurrence[ind],2)) + else: + ax_list[ind].set_title(cluster) + + if colour_mode =='bnw': + int_sum_list += [sum((cluster_centre).ravel())] + else: + int_sum_list += [sum((np.max(cluster_centre,2)).ravel())] + + plt.setp(axs, xticks=[], yticks=[]) + #plt.show() + + return int_sum_list + +#%% Functions for channel statistics +def smooth(y, box_pts): + box = np.ones(box_pts)/box_pts + y_smooth = np.convolve(y, box, mode='same') + return y_smooth + +def hist_outline(density, bins, axs, color = 'k', w = 1): + y,binEdges=np.histogram(density,bins=bins, density = True) + bincenters = 0.5*(binEdges[1:]+binEdges[:-1]) + axs.plot(bincenters,smooth(y,w), color = color) + +def channel_corr_scatter(pixel_probs, axs, bins, lims = (0,1), w = 1, edgecolors='b', s=5): + + hist_outline(pixel_probs[0],bins,axs[0][0], color = edgecolors, w = w) + axs[0][0].set_xlim(lims[0],lims[1]) + axs[0][0].xaxis.set_label_position('top') + axs[0][0].set_xlabel('Colagen 4'),axs[0][0].set_ylabel('Colagen 4'), + axs[1][0].scatter(pixel_probs[0],pixel_probs[1], s=s, facecolors='none', edgecolors=edgecolors) + axs[1][0].set_xlim(lims[0],lims[1]), axs[1][0].set_ylim(lims[0],lims[1]) + axs[1][0].set_ylabel('Fibrilar colagen') + axs[2][0].scatter(pixel_probs[0],pixel_probs[2], s=s, facecolors='none', edgecolors=edgecolors) + axs[2][0].set_xlim(lims[0],lims[1]), axs[2][0].set_ylim(lims[0],lims[1]) + axs[2][0].set_ylabel('Elastin') + + axs[0][1].scatter(pixel_probs[1],pixel_probs[0], s=s, facecolors='none', edgecolors=edgecolors) + axs[0][1].set_xlim(lims[0],lims[1]), axs[0][1].set_ylim(lims[0],lims[1]) + axs[0][1].xaxis.set_label_position('top'), axs[0][1].set_xlabel('Fibrilar Colagen') + hist_outline(pixel_probs[1],bins,axs[1][1], color = edgecolors, w = w) + axs[1][1].set_xlim(lims[0],lims[1]) + axs[2][1].scatter(pixel_probs[1],pixel_probs[2], s=s, facecolors='none', edgecolors=edgecolors) + axs[2][1].set_xlim(lims[0],lims[1]), axs[2][1].set_ylim(lims[0],lims[1]) + + + axs[0][2].scatter(pixel_probs[2],pixel_probs[0], s=s, facecolors='none', edgecolors=edgecolors) + axs[0][2].set_xlim(lims[0],lims[1]), axs[0][2].set_ylim(lims[0],lims[1]) + axs[0][2].xaxis.set_label_position('top'), axs[0][2].set_xlabel('Elastin') + axs[1][2].scatter(pixel_probs[2],pixel_probs[1], s=s, facecolors='none', edgecolors=edgecolors) + axs[1][2].set_xlim(lims[0],lims[1]), axs[1][2].set_ylim(lims[0],lims[1]) + hist_outline(pixel_probs[2],bins,axs[2][2], color = edgecolors, w = w) + axs[2][2].set_xlim(lims[0],lims[1]) + +def channel_corr_density(pixel_probs, lims = (0,1)): + fig, axs = plt.subplots(3,3, figsize = (6,6)) + + axs[0][0].hist(pixel_probs[0],density = True), + axs[0][0].set_xlim(lims[0],lims[1]) + axs[0][0].xaxis.set_label_position('top') + axs[0][0].set_xlabel('Colagen 4'),axs[0][0].set_ylabel('Colagen 4'), + axs[1][0].hist2d(pixel_probs[0],pixel_probs[1]) + axs[1][0].set_xlim(lims[0],lims[1]), axs[1][0].set_ylim(lims[0],lims[1]) + axs[1][0].set_ylabel('Fibrilar colagen') + axs[2][0].hist2d(pixel_probs[0],pixel_probs[2]) + axs[2][0].set_xlim(lims[0],lims[1]), axs[2][0].set_ylim(lims[0],lims[1]) + axs[2][0].set_ylabel('Elastin') + + axs[0][1].hist2d(pixel_probs[1],pixel_probs[0]) + axs[0][1].set_xlim(lims[0],lims[1]), axs[0][1].set_ylim(lims[0],lims[1]) + axs[0][1].xaxis.set_label_position('top'), axs[0][1].set_xlabel('Fibrilar Colagen') + axs[1][1].hist(pixel_probs[1],density = True) + axs[1][1].set_xlim(lims[0],lims[1]) + axs[2][1].hist2d(pixel_probs[1],pixel_probs[2]) + axs[2][1].set_xlim(lims[0],lims[1]), axs[2][1].set_ylim(lims[0],lims[1]) + + axs[0][2].hist2d(pixel_probs[2],pixel_probs[0]) + axs[0][2].set_xlim(lims[0],lims[1]), axs[0][2].set_ylim(lims[0],lims[1]) + axs[0][2].xaxis.set_label_position('top'), axs[0][2].set_xlabel('Elastin') + axs[1][2].hist2d(pixel_probs[2],pixel_probs[1]) + axs[1][2].set_xlim(lims[0],lims[1]), axs[1][2].set_ylim(lims[0],lims[1]) + axs[2][2].hist(pixel_probs[2],density = True) + axs[2][2].set_xlim(lims[0],lims[1]) + +def get_patient_probs(img_probs): + nr_frames_patient = 10 + patient_probs_allchannels= [] + for img_channel in img_probs: + nr_patients = len(img_channel)//nr_frames_patient + patient_probs = [] + for patient in range(nr_patients): + patient_img_probs = img_channel[patient*nr_frames_patient: (patient+1)*nr_frames_patient] + patient_probs += [sum(patient_img_probs)/len(patient_img_probs)] + patient_probs_allchannels += [patient_probs] + + return patient_probs_allchannels + +# def plot_mapsAndimages(dir_condition, directory_list, map_img, max_img_list, base_name = 'frame', r = 1024, c = 1024): +# nr_list = 0 +# for directory in directory_list: +# in_dir = dir_condition + directory + '/' +# dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir ,dI)) and dI[0:len(base_name)]==base_name)] +# print(dir_list) +# img_all_control = [] +# img_all_control += map_img[nr_list:nr_list+len(dir_list)] +# img_all_control += max_img_list[nr_list:nr_list+len(dir_list)] +# fig, axs = plt.subplots(2,len(dir_list), sharex=True, sharey=True) +# nr = 0 +# prob_sample = np.zeros((len(dir_list),1)) +# for ax, img in zip(axs.ravel(), img_all_control): +# print(nr) +# if nr<len(dir_list): +# prob_img = sum(sum(img))/(r*c) +# ax.set_title('Pc '+str(round(prob_img,2))) +# prob_sample[nr] = prob_img +# #print(prob_sample) +# nr += 1 +# if ( img.ndimg == 2 ): +# ax.imshow(skimage.transform.resize(img, (1024,1024)), cmap=plt.cm.bwr, vmin = 0, vmax = 1) +# else: +# ax.imshow(skimage.transform.resize(img.astype(np.uint8), (1024,1024))) +# plt.suptitle('Probability control of sample '+str(directory)+', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2))) +# plt.savefig(dir_probs + '/control/' + '/probImControl_'+str(directory)+'_%dclusters_%dsigma%ddownscale_absInt%s.png'%(nr_clusters,sigma,1/sc_fac,abs_intensity), dpi=1000) +# ##plt.show() +# nr_list += len(dir_list) + + + + + + + + + + + -- GitLab