From 5ab7a51f86c2c75e6cedbecc8101598847014c3b Mon Sep 17 00:00:00 2001
From: Monica J Emerson <monj@dtu.dk>
Date: Wed, 29 Sep 2021 15:47:42 +0200
Subject: [PATCH] added code for sharing

---
 code/image_preprocessing.py                   | 156 ++++
 code/local_features.py                        | 117 +++
 ...feature_patch_allPatients_singleProtein.py | 645 ++++++++++++++++
 ..._feature_patch_allPatients_threeProtein.py | 689 ++++++++++++++++++
 ...ng_feature_patch_allPatients_twoProtein.py | 551 ++++++++++++++
 code/lung_lif_to_png.py                       |  51 ++
 code/microscopy_analysis.py                   | 624 ++++++++++++++++
 7 files changed, 2833 insertions(+)
 create mode 100755 code/image_preprocessing.py
 create mode 100755 code/local_features.py
 create mode 100755 code/lung_feature_patch_allPatients_singleProtein.py
 create mode 100755 code/lung_feature_patch_allPatients_threeProtein.py
 create mode 100755 code/lung_feature_patch_allPatients_twoProtein.py
 create mode 100644 code/lung_lif_to_png.py
 create mode 100755 code/microscopy_analysis.py

diff --git a/code/image_preprocessing.py b/code/image_preprocessing.py
new file mode 100755
index 0000000..7c50445
--- /dev/null
+++ b/code/image_preprocessing.py
@@ -0,0 +1,156 @@
+#%% #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Preprocessing of the images
+
+1. Compute maximum projections.
+2. Explore the intensities of the maximum projection images.
+3. Correct the intensities. Normalisation across channels and images, 
+to equalise the standard deviation for intensity values > th (just considering 
+the pixels that contain protein, and not those represinting air, so as to avoid
+enhancing the background noise)
+
+Created on Wed Oct  7 10:34:52 2020
+
+@author: monj
+"""
+import os
+import microscopy_analysis as ma
+import matplotlib.pyplot as plt
+from datetime import datetime
+import numpy as np
+import skimage.io as io
+import copy
+
+# Turn interactive plotting off
+plt.ioff()
+
+plt.close('all')
+
+start_time_global = datetime.now()
+
+#%% I/O directories
+
+diseases = ['emphysema','uip','sarcoidosis','covid'] #I don't include sarcoidosis and uip, I manually create the group later, just so those don't count twice for the std estimation
+conditions = diseases[:]
+conditions.insert(0,'control')
+
+# Input directories - images are stored in folders starting with the name 'frame'
+base_dir = '../input_data/'
+dir_in = base_dir + '/raw/'
+base_name = 'frame'
+
+# Output directories: max projections and postprocessed
+dir_maxProj_raw = base_dir + 'maximum_projections/' 
+ma.make_output_dirs(dir_maxProj_raw,conditions)
+
+dir_postprocessed = base_dir + 'postprocessed_maxProjs/'
+
+dir_maxProj_processed = dir_postprocessed + 'rgb/' 
+ma.make_output_dirs(dir_maxProj_processed,conditions)
+    
+dir_maxProj_processed_cmy = dir_postprocessed + 'cmy/' 
+ma.make_output_dirs(dir_maxProj_processed_cmy,conditions)
+
+dir_results_intinspect = dir_postprocessed + 'intensity_inspection/' 
+ma.make_output_dirs(dir_results_intinspect,conditions)
+
+#%% Get maximum projection images
+
+# Automatically decide whether to recompute or read the max. projections images
+read_flag = True
+for disease in diseases:
+    disease_flag = len(os.listdir(dir_maxProj_raw + disease + '/'))>2
+    #print(disease)
+    #print(disease_flag)
+    read_flag = disease_flag&read_flag
+     
+
+# Get maximum projection images
+if not(read_flag):
+    # Compute maximum projection images (runtime ~= 135 s)
+    print('Computing maximum projection images')
+    startTime = datetime.now()
+    max_img_list = []
+    for condition in conditions:
+        max_img_list += [ma.comp_max_imgs(dir_in + condition, base_name, dir_maxProj_raw + condition)]
+    endTime = datetime.now() - startTime
+    print(endTime)
+else:
+    # Read maximum projection images (runtime ~= 20 s )
+    print('Reading maximum projection images')
+    startTime = datetime.now()
+    max_img_list = []
+    for condition in conditions:
+        max_img_list += [ma.read_max_imgs(dir_maxProj_raw + condition, base_name)]
+    endTime = datetime.now() - startTime
+    print(endTime)
+
+#%% Plot histograms of raw images (approx. 29s per condition)
+for condition, max_img_list_condition in zip(conditions, max_img_list):
+    ma.plotHist_perChannel_imgset_list(max_img_list_condition, dir_in + condition, dir_results_intinspect + condition, name_tag = 'original')
+
+#%% Measure the spread of intensities in each channel
+"""The spread of intensities is quantified by the standard deviation of the 
+intensities that are over a threshold. The threshold value is set manually 
+and it defines the minimum intensity associated with the presence of protein.
+Image regions with intensity values under the threshold are considered to 
+image air, i.e. absence of protein."""
+
+# Protein presence/absence (air) intensity threshold
+th_int = 15 #could it be set automatically based on the data?
+
+# Compute stds per image channel and, for each sample, plot intensity histograms per channel.
+std_list = []
+for condition, max_img_list_condition in zip(conditions, max_img_list):
+    std_list += [ma.comp_std_perChannel(max_img_list_condition, th_int, dir_in + condition)] 
+
+# Histogram of standard deviations (std) per image channel 
+fig, axs = plt.subplots(len(conditions),4, figsize = (4*2,len(conditions)*2), sharex = True, sharey = True)
+plt.suptitle('Distribution of intensity spread across images\n Rows: ' +  ', '.join(conditions))
+for ind_c,std_list_condition in enumerate(std_list):  
+    for ind,channel_list in enumerate(std_list_condition):
+        axs[0][ind+1].title.set_text('Channel ' + str(ind+1))
+        axs[ind_c][ind+1].hist(channel_list, bins = 50, density = True)
+  
+# Histogram of standard deviations per image  
+axs[0][0].title.set_text('All channels')
+std_list_flat = []
+for ind_c,std_list_condition in enumerate(std_list):
+    std_list_condition_flat = ma.flatten_list(std_list_condition) 
+    axs[ind_c][0].hist(std_list_condition_flat, bins = 50, density = True)    
+    std_list_flat += std_list_condition_flat
+plt.show()    
+    
+# Average intesity-spread (std) across all image channels
+mean_std = sum(std_list_flat)/len(std_list_flat)
+print('The average spread of an image channel is ' + str(round(mean_std)))
+ 
+#%% Correct intensities (34s per condition)
+max_img_corr_list = []
+for condition, max_img_list_condition in zip(conditions, max_img_list):
+    max_img_corr_list += [ma.intensity_spread_normalisation(copy.deepcopy(max_img_list_condition), th_int, mean_std, dir_in + condition, base_name, dir_maxProj_processed + condition)]
+
+#%% Plot histograms of corrected images (29s per condition)
+for condition, max_img_corr_list_condition in zip(conditions, max_img_corr_list):
+   ma.plotHist_perChannel_imgset_list(max_img_corr_list_condition, dir_in + condition, dir_results_intinspect + condition, name_tag = 'corrected')
+
+#%% Visualise raw vs. corrected images
+for max_img_list_condition, max_img_corr_list_condition, condition in zip(max_img_list, max_img_corr_list, conditions):
+    ma.compare_imgpairs_list(max_img_list_condition, max_img_corr_list_condition , dir_in + condition, dir_results_intinspect + condition)  
+
+ #%% Exchange channels: fibrillar collagen with elastin
+for max_img_list_condition in max_img_corr_list:
+    print(len(max_img_list_condition))
+    for max_img_list_sample in max_img_list_condition:
+        print(len(max_img_list_sample))
+        for nr_frame,max_img in enumerate(max_img_list_sample):
+            max_img_list_sample[nr_frame]= max_img[:,:,[0,2,1]]
+            
+#%% Save images in cmy colourspace
+for max_img_corr_list_condition, condition in zip(max_img_corr_list, conditions):
+    ma.rgb2cmy_list(max_img_corr_list_condition, dir_in + condition, base_name, dir_maxProj_processed_cmy + condition)  
+
+#%%% Print total computational time
+end_time_global = datetime.now() - startTime
+print(end_time_global)
\ No newline at end of file
diff --git a/code/local_features.py b/code/local_features.py
new file mode 100755
index 0000000..11b5b3b
--- /dev/null
+++ b/code/local_features.py
@@ -0,0 +1,117 @@
+"""
+Helping functions for extracting features from images
+"""
+
+import skimage.io
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.ndimage
+
+def get_gauss_feat_im(im, sigma=1, normalize=False):
+    """Gauss derivative feaures for every image pixel.
+    Arguments:
+        image: a 2D image, shape (r,c).
+        sigma: standard deviation for Gaussian derivatives.
+        normalize: flag indicating normalization of features.
+    Returns:
+        imfeat: a 3D array of size (r,c,15) with a 15-dimentional feature
+            vector for every image pixel.
+    Author: vand@dtu.dk, 2020
+    """
+      
+    r,c = im.shape
+    imfeat = np.zeros((r,c,15))
+    imfeat[:,:,0] = scipy.ndimage.gaussian_filter(im,sigma,order=0)
+    imfeat[:,:,1] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,1])
+    imfeat[:,:,2] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,0])
+    imfeat[:,:,3] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,2])
+    imfeat[:,:,4] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,1])
+    imfeat[:,:,5] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,0])
+    imfeat[:,:,6] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,3])
+    imfeat[:,:,7] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,2])
+    imfeat[:,:,8] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,1])
+    imfeat[:,:,9] = scipy.ndimage.gaussian_filter(im,sigma,order=[3,0])
+    imfeat[:,:,10] = scipy.ndimage.gaussian_filter(im,sigma,order=[0,4])
+    imfeat[:,:,11] = scipy.ndimage.gaussian_filter(im,sigma,order=[1,3])
+    imfeat[:,:,12] = scipy.ndimage.gaussian_filter(im,sigma,order=[2,2])
+    imfeat[:,:,13] = scipy.ndimage.gaussian_filter(im,sigma,order=[3,1])
+    imfeat[:,:,14] = scipy.ndimage.gaussian_filter(im,sigma,order=[4,0])
+    
+    if normalize:
+        imfeat -= np.mean(imfeat, axis=(0,1))
+        imfeat *= (1/np.std(imfeat, axis=(0,1)))
+    
+    return imfeat
+
+def im2col(im, patch_size=[3,3], stepsize=1):
+    """Rearrange image patches into columns
+    Arguments:
+        image: a 2D image, shape (r,c).
+        patch size: size of extracted paches.
+        stepsize: patch step size.
+    Returns:
+        patches: a 2D array which in every column has a patch associated 
+            with one image pixel. For stepsize 1, number of returned column 
+            is (r-patch_size[0]+1)*(c-patch_size[0]+1) due to bounary. The 
+            length of columns is pathc_size[0]*patch_size[1].
+    """
+    
+    r,c = im.shape
+    s0, s1 = im.strides    
+    nrows =r-patch_size[0]+1
+    ncols = c-patch_size[1]+1
+    shp = patch_size[0],patch_size[1],nrows,ncols
+    strd = s0,s1,s0,s1
+
+    out_view = np.lib.stride_tricks.as_strided(im, shape=shp, strides=strd)
+    return out_view.reshape(patch_size[0]*patch_size[1],-1)[:,::stepsize]
+
+
+def ndim2col(im, block_size=[3,3], stepsize=1):
+    """Rearrange image blocks into columns for N-D image (e.g. RGB image)"""""
+    if(im.ndim == 2):
+        return im2col(im, block_size, stepsize)
+    else:
+        r,c,l = im.shape
+        patches = np.zeros((l*block_size[0]*block_size[1],
+                            (r-block_size[0]+1)*(c-block_size[1]+1)))
+        for i in range(l):
+            patches[i*block_size[0]*block_size[1]:(i+1)*block_size[0]*block_size[1],
+                    :] = im2col(im[:,:,i],block_size,stepsize)
+        return patches
+
+#%%
+
+if __name__ == '__main__':
+    
+    filename = '../example_data/training_image.png'
+    I = skimage.io.imread(filename)
+    I = I.astype(np.float)
+    
+    
+    # features based on gaussian derivatives
+    sigma = 1;
+    gf = get_gauss_feat_im(I, sigma)
+    
+    fig,ax = plt.subplots(3,5)
+    for j in range(5):
+        for i in range(3):
+            ax[i][j].imshow(gf[:,:,5*i+j])
+            ax[i][j].set_title(f'layer {5*i+j}')
+            
+            
+    # features based on image patches
+    I = I[I.shape[0]//2:I.shape[0]//2+20,I.shape[1]//2:I.shape[1]//2+20] # smaller image such that we can see the difference in layers
+    pf = im2col(I,[3,3])
+    pf = pf.reshape((9,I.shape[0]-2,I.shape[1]-2))
+    
+    fig, ax = plt.subplots()
+    ax.imshow(I)
+            
+    fig,ax = plt.subplots(3,3)
+    for j in range(3):
+        for i in range(3):
+            ax[i][j].imshow(pf[3*i+j])
+            ax[i][j].set_title(f'layer {3*i+j}')
+   
+    
diff --git a/code/lung_feature_patch_allPatients_singleProtein.py b/code/lung_feature_patch_allPatients_singleProtein.py
new file mode 100755
index 0000000..a43234d
--- /dev/null
+++ b/code/lung_feature_patch_allPatients_singleProtein.py
@@ -0,0 +1,645 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Explorative analysis of protein composition and structure in lung ECM to 
+study disease-induced remodeling of the three proteins that provide structural 
+integrity to the lung: collagen IV, elastin and fibrillar collagen.
+
+The single-protein analyses provide the following:
+    - A 'Data Model' per protein, a dictionary of image patches representative 
+    of the local protein structure found in the data (both 
+    healthy and disease group) at the scale defined by the patch size.
+    - A 'Sorted Data Model' or disease signature per protein, a sorted version
+    of the dictionary above, based on how often a dictionary element (called 
+    cluster centre in the publication) is found in the healthy vs. the disease.
+    - A 'Probability Map' for every image based on each protein, with pixel 
+    correspondance to the input images (maximum projections), highlighting 
+    regions characteristic of health or disease. 
+    - An 'Agreement Map' for every image, which combines the corresponding
+    probability maps from the three proteins to visualise when structural 
+    remodelling of proteins coexists in space.
+    - A 'Disease Effect' per protein, to quantify how well the control and 
+    disease groups are separated after assigning a health probability score to 
+    each image, computed as the average of its probability map values.
+    
+by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk
+
+For more information, read our article (add link and citation)"""
+
+#%% Load packages and initialise parameters
+
+import numpy as np
+import matplotlib.pyplot as plt
+import sklearn.cluster
+import microscopy_analysis as ma
+import skimage.io 
+import skimage.transform
+
+import os
+from datetime import datetime
+import sys
+from matplotlib.offsetbox import OffsetImage, AnnotationBbox
+from math import ceil
+
+startTime = datetime.now()
+
+plt.close('all')
+
+sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image
+patch_size = 17
+nr_clusters = 100 # number of clusters
+protein_mode = 'single'
+colour_mode = 'cmy' #for image visualisation
+th_back = 15
+nr_runs = 10
+    
+#%% Define and create directories
+
+disease = 'emphysema' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid'
+control = 'control'
+conditions = [disease, control]
+    
+# input directories - images start with the name 'frame'
+base_dir_in = '../input_data/'
+#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation
+dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy
+base_name = 'frame'
+
+#output directory
+dir_results = '../results/' #rerun just to make sure not to muck up the results
+os.makedirs(dir_results, exist_ok = True)  
+os.makedirs(dir_results + '/statistics', exist_ok = True)  
+
+ #%% Read (preprocessed) maximum corrected images
+#Read maximum projection images (runtime ~= 20 s )
+print('Reading maximum projection images')
+
+max_img_list_control = ma.read_max_imgs(dir_in + control, base_name)
+max_img_list_disease = ma.read_max_imgs(dir_in + disease, base_name)
+
+#%% Exchange channels: fibrillar collagen with elastin
+for max_img_sample_control in max_img_list_control:
+    for nr_frame, max_img_control in enumerate(max_img_sample_control):
+        max_img_sample_control[nr_frame] = max_img_control[:,:,[0,2,1]]
+
+for max_img_sample_disease in max_img_list_disease:
+    for nr_frame, max_img_disease in enumerate(max_img_sample_disease):
+        max_img_sample_disease[nr_frame] = max_img_disease[:,:,[0,2,1]]
+
+#%% Prepare maximum projection images for feature extraction
+
+#Flatten and merge conditions
+max_img_list_control_flat = ma.flatten_list(max_img_list_control) 
+max_img_list_disease_flat = ma.flatten_list(max_img_list_disease)
+
+#Rescale
+max_img_list_control_processed = []
+for max_img in ma.flatten_list(max_img_list_control):
+    max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)]
+
+max_img_list_disease_processed = []
+for max_img in ma.flatten_list(max_img_list_disease) :
+    max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')]
+
+#%% Compute patches
+patch_feat_list_control = [[],[],[]]
+for max_img in max_img_list_control_processed:
+    for channel in range(max_img.shape[2]):
+        patch_feat_list_control[channel] += [ma.ndim2col_pad(max_img[...,channel], (patch_size, patch_size),norm=False).transpose()]
+    
+patch_feat_list_disease = [[],[],[]]
+for max_img in max_img_list_disease_processed:
+    for channel in range(max_img.shape[2]):
+        patch_feat_list_disease[channel]  += [ma.ndim2col_pad(max_img[...,channel], (patch_size, patch_size),norm=False).transpose()]
+
+for run in range(nr_runs):
+    #directories for results from the different runs
+    dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size)
+    dir_probs += 'run'+str(run)+'/'
+    ma.make_output_dirs(dir_probs, conditions)
+    
+    #%% Run the whole analysis for the 3 channels independently
+    prob_img_list_control_allchannels = [[],[],[]]
+    prob_img_list_disease_allchannels = [[],[],[]]
+    
+    for channel in range(len(patch_feat_list_control)):
+    
+    #k-means clustering
+        startTime_clustering = datetime.now()
+        
+        batch_size = 10000
+        th_nr_pathesINcluster = 20
+        th_nr_weakClusters = 0 #added for covid and ild
+        
+        #If cluster centers were already computed, use those
+        if os.path.exists(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy'):
+              cluster_centers = np.load(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy')
+              kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size)
+              kmeans.cluster_centers_ = cluster_centers
+        
+        #Compute cluster centers
+        else: 
+            #Get features to cluster
+            patch_feat_total = patch_feat_list_control[channel] + patch_feat_list_disease[channel]
+        
+            #features for clustering
+            nr_keep = 10000 # number of features/image randomly picked for clustering 
+            n_im = len(patch_feat_total)
+            feat_dim = patch_feat_total[0].shape[1]
+            patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim))
+            
+            f = 0
+            nr_keep_control = nr_keep//(2*len(patch_feat_list_control[channel])//len(patch_feat_list_disease[channel]))
+            nr_keep_disease = nr_keep//2
+            for patch_feat in patch_feat_list_control[channel]:
+                keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control]
+                patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:]
+                f += nr_keep
+            
+            for patch_feat in patch_feat_list_disease[channel]:
+                keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease]
+                patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:]
+                f += nr_keep
+            
+            #Get feature vectors
+            feat_to_cluster = patch_feat_to_cluster
+            
+            #Compute cluster centers and save if there is no weak clusters
+            nr_weakClusters = th_nr_weakClusters+1
+            repetition = 0
+            nr_weakClusters_list = []
+            while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute
+                kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip
+                kmeans.fit(feat_to_cluster)
+                
+                #Nr. patches that have contributed to each cluster
+                nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)]
+                
+                weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster]
+                print('Weak cluster IDs channel '+ str(channel) + ': ',weak_cluster_IDs)
+                    
+                #Weak clusters are those composed by very few patches
+                nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster])
+                
+                #If there are weak clusters, the clustering should be recomputed
+                if nr_weakClusters>th_nr_weakClusters:
+                    print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") 
+                else:
+                    np.save(dir_probs + 'array_cluster_centers_channel'+str(channel)+'.npy', kmeans.cluster_centers_)    # .npy extension is added if not given
+            
+                print(repetition)
+                repetition += 1
+                nr_weakClusters_list += [nr_weakClusters]
+                
+         
+        endTime_clustering = datetime.now() - startTime_clustering
+        print(endTime_clustering)
+    
+        #%% Plot all cluster centers (grid view)
+        cluster_centers = kmeans.cluster_centers_
+        int_sum_list= ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, protein_mode = protein_mode, colour_mode = colour_mode, channel = channel)
+        plt.savefig(dir_probs + 'clustercenters_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+         
+        #%% Histograms for background, healthy and disease
+        hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control[channel], kmeans)
+        hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease[channel], kmeans)
+    
+        #Show histograms (bar plot) of healthy and disease
+        fig, ax = plt.subplots(1,1)
+        ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b')
+        ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r')
+        ax.legend()
+        plt.savefig(dir_probs + 'assignmentHistograms_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    
+        #%% Compute populated, occurrence and importance cluster measures from the histogram
+        #grid dimensions
+
+        #OCCURRENCE RATIO: How more often does it occur in one of the conditions
+        occurrence_ratio = np.empty((nr_clusters))
+        ind_control = hist_control>hist_disease
+        ind_disease = ~ind_control
+        control_cond = (hist_disease!=0) & ind_control
+        occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond]
+        disease_cond = (hist_control!=0) & ind_disease
+        occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond]
+        
+        #POPULATED: How trustworthy is the cluster? 
+        populated = hist_control+hist_disease
+        weights = populated/populated.max()
+        
+        #Cluster IMPORTANCE: occurrence weighed by POPULATED
+        def sigmoid(x,k):
+            return 2 / (1 + np.exp(-k*x)) - 1
+        
+        importance = occurrence_ratio * sigmoid(weights,100)
+    
+        #%% Plot cluster centers in the 2d populated/occurrence space
+        fig_control, ax_control = plt.subplots(figsize=(15,15))
+        ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) 
+        ax_control.set_ylim(0.8,7)
+        ax_control.set_xlim(0,0.08)
+        
+        fig_disease, ax_disease = plt.subplots(figsize=(15,15))
+        ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) 
+        ax_disease.set_ylim(0.8,7)
+        ax_disease.set_xlim(0,0.08)
+        
+        for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)):
+            cluster_centre = np.reshape(kmeans.cluster_centers_[cluster_nr,:],(patch_size,patch_size))
+            if y0>0:
+                ab = AnnotationBbox(OffsetImage(cluster_centre.astype('uint8'),cmap='gray'), (x0, y0), frameon=False)
+                ax_control.add_artist(ab)
+            else:
+                ab = AnnotationBbox(OffsetImage(cluster_centre.astype('uint8'),cmap='gray'), (x0, -y0), frameon=False)
+                ax_disease.add_artist(ab)
+        plt.figure(fig_control.number)
+        plt.savefig(dir_probs + 'controlClustercenters_2Dspace_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300) 
+        plt.figure(fig_disease.number)
+        plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    
+        #%% Plot all clusters sorted by IMPORTANCE
+        clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0]
+        clusters_control_occW_sorted.reverse()
+        
+        clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0]
+        
+        clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())]
+        clusters_occW_sorted.reverse()
+        
+        occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())]
+        occurrence_ratio_sorted.reverse()
+        
+        ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = colour_mode, channel = channel) #contrast = 'equal',    
+        plt.suptitle('Cluster centers channel '+str(channel)+' \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted))  + ' in between')
+        plt.savefig(dir_probs + 'clustercenters_occW_sorted_channel'+str(channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+         
+
+        #%% Compute probabilities 
+        #per cluster
+        prob_control= hist_control/(hist_control+hist_disease)
+        prob_disease = hist_disease/(hist_control+hist_disease)
+        
+        #per image region
+        r,c,_ = max_img_list_control_flat[0].shape
+        img_shape = (int(r*sc_fac),int(c*sc_fac))
+        prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape)
+        prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) 
+        prob_img_list_control_allchannels[channel] = prob_img_list_control
+        prob_img_list_disease_allchannels[channel] = prob_img_list_disease   
+        
+        #per image and sample  
+        #control
+        sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, )
+        
+        nr_list = 0
+        prob_list_control= []
+        for max_img_list_sample,sample_name in zip(max_img_list_control, sample_names_control): #in general,  in max_img_list_condition
+            #print(sample_name)
+            nr_frames = len(max_img_list_sample)
+            prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames]
+            nr_list += nr_frames
+            
+            prob_sample = np.zeros((nr_frames,1))
+            for prob_img, frame_nr in zip(prob_img_list_sample,range(nr_frames)):
+                #compute accumulated probabilities
+                prob_frame = sum(sum(prob_img))/prob_img.size
+                #print(prob_frame)
+                prob_sample[frame_nr] = prob_frame
+            prob_list_control += [prob_sample]
+            
+        #disease
+        sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True)
+        
+        nr_list = 0
+        prob_list_disease = []
+        for max_img_list_sample,sample_name in zip(max_img_list_disease, sample_names_disease): #in general,  in max_img_list_condition
+            #print(sample_name)
+            nr_frames = len(max_img_list_sample)
+            prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames]
+            nr_list += nr_frames
+    
+            prob_sample = np.zeros((nr_frames,1))
+            for prob_img, frame_nr in zip(prob_img_list_sample,range(nr_frames)):
+                #compute accumulated probabilities
+                prob_frame = sum(sum(prob_img))/prob_img.size
+                #print(prob_frame)
+                prob_sample[frame_nr] = prob_frame
+            prob_list_disease += [prob_sample]
+            
+        prob_list_samples = prob_list_control + prob_list_disease   
+         
+
+        #%% Box plots
+        fig, ax = plt.subplots(figsize = (15,5))
+        ax.set_title('Image health scores')
+        
+        sample_sep = 0.85
+        patient_sep = 2.35
+        condition_sep = 2.5 
+        positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2)
+        positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2)
+        array_probs = np.squeeze(np.asarray(prob_list_samples))
+        bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True)
+        bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True)
+        
+        for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_control[element], color='blue')
+            plt.setp(bp_control[element], linewidth = 2)
+            
+        for patch in bp_control['boxes']:
+            patch.set(facecolor='white')
+            patch.set(linewidth = 2)
+            
+        for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_disease[element], color='red')
+            plt.setp(bp_disease[element], linewidth = 2)
+            
+        for patch in bp_disease['boxes']:
+            patch.set(facecolor= 'white')  
+            patch.set(linewidth = 2)
+            #patch.set(alpha=0.2) 
+            
+        ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+        ax.set_ylim(0.3,0.7)
+        ax.set_ylabel('Probability health')
+        #ax.set_ylim(0,1)
+        
+        ax.tick_params(
+            axis='both',          # changes apply to the x-axis
+            which='both',      # both major and minor ticks are affected
+            #labelleft = False,      # ticks along the top edge are off
+            #labelbottom=False
+            )
+        
+        #plot points
+        positions = list(positions_control) + list(positions_disease)
+        for prob_sample, position in zip(prob_list_samples, positions):
+            print(prob_sample)
+            print(5*[position])
+            # Add some random "jitter" to the x-axis
+            x = np.random.normal(5*[position], 0)
+            #plot(x, y, 'r.', alpha=0.2)
+            ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3)
+           
+        #horizontal line
+        x = np.linspace(0,position.max())    
+        y = 0.5*np.ones(x.shape)
+        ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+        
+        plt.savefig(dir_probs + 'scatterBoxplots_channel'+str(channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+        fig2, ax2 = plt.subplots(figsize = (15,5))
+        #plot points
+        for prob_sample, position in zip(prob_list_control, positions_control):
+            ax2.plot(5*[position], prob_sample,'b.', markersize = 4)
+            
+        for prob_sample, position in zip(prob_list_disease, positions_disease):
+            ax2.plot(5*[position], prob_sample,'r.', markersize = 4)
+        
+        #horizontal line
+        x = np.linspace(0,position.max())    
+        y = 0.5*np.ones(x.shape)
+        ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+        
+        ax2.set_xticks(ax.get_xticks())
+        ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+        ax2.set_ylim(0.3,0.7)
+        ax2.set_ylabel('Probability health')
+    
+        plt.savefig(dir_probs + 'samplewiseScatterPlots_channel'+str(channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+
+    #%% Create agreement maps
+    prob_img_rgb_list_control = []
+    for prob_img_c0, prob_img_c1, prob_img_c2 in zip(prob_img_list_control_allchannels[0],prob_img_list_control_allchannels[1],prob_img_list_control_allchannels[2]):
+        prob_img_rgb = np.stack((prob_img_c0,prob_img_c1, prob_img_c2), axis = 2)
+        prob_img_rgb_list_control += [prob_img_rgb]
+    
+    prob_img_rgb_list_disease = []
+    for prob_img_c0, prob_img_c1, prob_img_c2 in zip(prob_img_list_disease_allchannels[0],prob_img_list_disease_allchannels[1],prob_img_list_disease_allchannels[2]):
+        prob_img_rgb = np.stack((prob_img_c0,prob_img_c1, prob_img_c2), axis = 2)
+        prob_img_rgb_list_disease += [prob_img_rgb]    
+        
+    #%% Plot per channel prob images, and agreement maps
+    nr_frames = len(max_img_list_control[0])
+    
+    min_prob_img = []
+    max_prob_img = []
+    
+    #control
+    sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True)
+    nr_list = 0
+    fig_amaps, ax_amaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True)
+    fig_maximgs, ax_maximgs = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True)
+    for max_img_list_sample,sample_name, sample_nr in zip(max_img_list_control, sample_names_control,range(len(sample_names_control))): #in general,  in max_img_list_condition
+        print(sample_name)
+        prob_img_list_sample = prob_img_rgb_list_control[nr_list:nr_list+nr_frames]
+        nr_list += nr_frames
+        
+        fig, axs = plt.subplots(5,nr_frames, figsize = (nr_frames*2,5*2), sharex=True, sharey=True)
+        for max_img,prob_img, frame_nr in zip(max_img_list_sample,prob_img_list_sample,range(nr_frames)):
+            max_img_ds = skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')
+            
+            #set probs as titles
+            for channel in range(prob_img.shape[2]):
+                prob_frame = sum(sum(prob_img[...,channel]))/prob_img[...,channel].size
+                axs[channel+2][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+            
+            #remove background
+            ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back)
+            prob_img[ind_back] = 0.5
+            
+            #plot
+            max_img_ds = ma.rgb2cmy(max_img_ds)
+            axs[0][frame_nr].imshow(max_img_ds)
+            axs[0][frame_nr].xaxis.set_ticklabels([])
+            axs[0][frame_nr].xaxis.set_ticks_position('none')
+            axs[0][frame_nr].yaxis.set_ticklabels([])
+            axs[0][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[2][frame_nr].imshow(1-prob_img[...,0],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[2][frame_nr].xaxis.set_ticklabels([])
+            axs[2][frame_nr].xaxis.set_ticks_position('none')
+            axs[2][frame_nr].yaxis.set_ticklabels([])
+            axs[2][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[3][frame_nr].imshow(1-prob_img[...,1],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[3][frame_nr].xaxis.set_ticklabels([])
+            axs[3][frame_nr].xaxis.set_ticks_position('none')
+            axs[3][frame_nr].yaxis.set_ticklabels([])
+            axs[3][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[4][frame_nr].imshow(1-prob_img[...,2],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[4][frame_nr].xaxis.set_ticklabels([])
+            axs[4][frame_nr].xaxis.set_ticks_position('none')
+            axs[4][frame_nr].yaxis.set_ticklabels([])
+            axs[4][frame_nr].yaxis.set_ticks_position('none')
+            
+            #normalise range of agreement map so that global max and min values are 0 and 1
+            prob_img_norm = (prob_img-0.2)/0.6
+            min_prob_img += [np.min(prob_img)]
+            max_prob_img += [np.max(prob_img)]
+            
+            axs[1][frame_nr].imshow(prob_img_norm)
+            axs[1][frame_nr].xaxis.set_ticklabels([])
+            axs[1][frame_nr].xaxis.set_ticks_position('none')
+            axs[1][frame_nr].yaxis.set_ticklabels([])
+            axs[1][frame_nr].yaxis.set_ticks_position('none')
+            
+            ax_amaps[frame_nr][sample_nr].imshow(prob_img_norm)
+            ax_amaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+            ax_amaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+            ax_amaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+            ax_amaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+            
+            ax_maximgs[frame_nr][sample_nr].imshow(max_img_ds)
+            ax_maximgs[frame_nr][sample_nr].xaxis.set_ticklabels([])
+            ax_maximgs[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+            ax_maximgs[frame_nr][sample_nr].yaxis.set_ticklabels([])
+            ax_maximgs[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+                    
+        plt.suptitle('Per-channel health probabilities for sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+        plt.figure(fig.number),plt.savefig(dir_probs + '/' + control + '/' + 'channelWise_probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=1000)
+        #plt.show() 
+    plt.figure(fig_amaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'agreementMaps_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show() 
+    plt.figure(fig_maximgs.number),plt.savefig(dir_probs + '/' + control + '/' + 'maxIMGs_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show()
+    
+    #disease
+    sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True)
+    
+    nr_list = 0
+    fig_amaps, ax_amaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True)
+    fig_maximgs, ax_maximgs = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True)
+    for max_img_list_sample,sample_name, sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general,  in max_img_list_condition
+        print(sample_name)
+        prob_img_list_sample = prob_img_rgb_list_disease[nr_list:nr_list+nr_frames]
+        nr_list += nr_frames
+        
+        fig, axs = plt.subplots(5,nr_frames, figsize = (nr_frames*2,5*2), sharex=True, sharey=True)
+        for max_img, prob_img, frame_nr in zip(max_img_list_sample, prob_img_list_sample,range(nr_frames)):
+            max_img_ds = skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')
+            
+            #set probs as titles
+            for channel in range(prob_img.shape[2]):
+                prob_frame = sum(sum(prob_img[...,channel]))/prob_img[...,channel].size
+                axs[channel+2][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+            
+            #remove background
+            ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back)
+            prob_img[ind_back] = 0.5
+            
+            #plot
+            max_img_ds = ma.rgb2cmy(max_img_ds)
+            axs[0][frame_nr].imshow(max_img_ds)
+            axs[0][frame_nr].xaxis.set_ticklabels([])
+            axs[0][frame_nr].xaxis.set_ticks_position('none')
+            axs[0][frame_nr].yaxis.set_ticklabels([])
+            axs[0][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[2][frame_nr].imshow(1-prob_img[...,0],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[2][frame_nr].xaxis.set_ticklabels([])
+            axs[2][frame_nr].xaxis.set_ticks_position('none')
+            axs[2][frame_nr].yaxis.set_ticklabels([])
+            axs[2][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[3][frame_nr].imshow(1-prob_img[...,1],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[3][frame_nr].xaxis.set_ticklabels([])
+            axs[3][frame_nr].xaxis.set_ticks_position('none')
+            axs[3][frame_nr].yaxis.set_ticklabels([])
+            axs[3][frame_nr].yaxis.set_ticks_position('none')
+            
+            axs[4][frame_nr].imshow(1-prob_img[...,2],cmap = plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[4][frame_nr].xaxis.set_ticklabels([])
+            axs[4][frame_nr].xaxis.set_ticks_position('none')
+            axs[4][frame_nr].yaxis.set_ticklabels([])
+            axs[4][frame_nr].yaxis.set_ticks_position('none')
+            
+            #normalise range of agreement map so that global max and min values are 0 and 1
+            prob_img_norm = (prob_img-0.2)/0.6
+            min_prob_img += [np.min(prob_img)]
+            max_prob_img += [np.max(prob_img)]
+            
+            axs[1][frame_nr].imshow(prob_img_norm)
+            axs[1][frame_nr].xaxis.set_ticklabels([])
+            axs[1][frame_nr].xaxis.set_ticks_position('none')
+            axs[1][frame_nr].yaxis.set_ticklabels([])
+            axs[1][frame_nr].yaxis.set_ticks_position('none')
+            
+            ax_amaps[frame_nr][sample_nr].imshow(prob_img_norm)
+            ax_amaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+            ax_amaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+            ax_amaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+            ax_amaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+            
+            ax_maximgs[frame_nr][sample_nr].imshow(max_img_ds)
+            ax_maximgs[frame_nr][sample_nr].xaxis.set_ticklabels([])
+            ax_maximgs[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+            ax_maximgs[frame_nr][sample_nr].yaxis.set_ticklabels([])
+            ax_maximgs[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+            
+        plt.suptitle('Per-channel health probabilities for sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+        plt.figure(fig.number), plt.savefig(dir_probs + '/' + disease + '/' + 'channelWise_probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=1000)
+        #plt.show() 
+    plt.figure(fig_amaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'agreementMaps_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show()
+    plt.figure(fig_maximgs.number),plt.savefig(dir_probs + '/' + disease + '/' + 'maxIMGs_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show()
+    
+    print('Agreement map minimum ',sum(min_prob_img)/len(min_prob_img))
+    print('Agreement map maximum ',sum(max_prob_img)/len(max_prob_img))
+    
+    #%% Load packages to Testt the statistical significance of the mean differences
+    from statsmodels.multivariate.manova import MANOVA
+    from statsmodels.formula.api import ols
+    from statsmodels.stats.anova import anova_lm
+    import pandas as pd
+    from patsy import ModelDesc
+    
+    nr_frames_patient = 10
+    
+    #%%Image probabilities
+    img_probs_control = []
+    for prob_img_list in prob_img_list_control_allchannels:
+        img_probs_control += [[np.mean(prob_img) for prob_img in prob_img_list]]
+    
+    img_probs_disease = []
+    for prob_img_list in prob_img_list_disease_allchannels:
+        img_probs_disease += [np.array([np.mean(prob_img) for prob_img in prob_img_list])]
+        
+    img_probs = [np.concatenate((probs_control,probs_disease)) for probs_control,probs_disease in zip(img_probs_control,img_probs_disease)]
+    
+    #%%ANOVA SETUP
+    
+    img_nr = np.tile(np.arange(nr_frames),img_probs[0].size//nr_frames)
+    sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),img_probs[0].size//nr_frames_patient)
+    patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)])
+    condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))])
+
+    for channel in range(len(img_probs)):
+        dep_var = (img_probs[channel]).reshape(img_probs[0].size,1)
+        
+        print(np.mean(dep_var))
+        print(np.mean(dep_var[:len(dep_var)//2]))
+        print(np.mean(dep_var[len(dep_var)//2:]))
+        
+        data_dict = {'Image':img_nr,
+                     'Patient':patient_nr,
+                     'Sample':sample_nr,
+                     'Condition':condition_nr,
+                     'S':dep_var}
+        
+        formula = 'S ~ C(Condition)/C(Patient)/C(Sample)'
+
+        desc = ModelDesc.from_formula(formula)
+        print(desc.describe())
+        
+        lm = ols(formula, data_dict).fit()
+        print(lm.summary())
+        
+        anova_table = anova_lm(lm)
+        print(anova_table)
+        print(disease + ' on channel ' + str(channel))
+        writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_' + protein_mode + '_channel' + str(channel) + '_run' + str(run) + '.xlsx', engine='xlsxwriter')
+        anova_table.to_excel(writer_anova,sheet_name = disease + ' on channel ' + str(channel))
+        writer_anova.save()
+    
\ No newline at end of file
diff --git a/code/lung_feature_patch_allPatients_threeProtein.py b/code/lung_feature_patch_allPatients_threeProtein.py
new file mode 100755
index 0000000..0e7ae7a
--- /dev/null
+++ b/code/lung_feature_patch_allPatients_threeProtein.py
@@ -0,0 +1,689 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Explorative analysis of protein composition and structure in lung ECM to 
+study disease-induced remodeling of the three proteins that provide structural 
+integrity to the lung: collagen IV, elastin and fibrillar collagen.
+
+The three-protein analysis provides the following:
+    - A 'Data Model', a dictionary of image patches representative of the local 
+    protein structure and composition found in the data (both healthy and 
+    disease group) at the scale defined by the patch size.
+    - A 'Sorted Data Model' or disease signature, a sorted version of the 
+    dictionary above, based on how often a dictionary element (called 
+    cluster centre in the publication) is found in the healthy vs. the disease.
+    - A 'Probability Map' for every image based on the three proteins, with 
+    pixel correspondance to the input images (maximum projections), 
+    highlighting regions characteristic of health or disease. 
+    - A 'Disease Effect', to quantify how well the control and 
+    disease groups are separated after assigning a health probability score to 
+    each image, computed as the average of its probability map values.
+    
+by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk
+
+For more information, read our article (add link and citation)"""
+
+#%% Load packages and initialise parameters
+
+import numpy as np
+import matplotlib.pyplot as plt
+import sklearn.cluster
+import microscopy_analysis as ma
+import skimage.io 
+import skimage.transform
+ 
+import os
+from datetime import datetime
+import sys
+from matplotlib.offsetbox import OffsetImage, AnnotationBbox
+from math import ceil
+
+startTime = datetime.now()
+
+plt.close('all')
+
+sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image
+patch_size = 17
+nr_clusters = 100 #49#100 # number of clusters, choose a value with an integer as a square root
+protein_mode = 'triple'
+colour_mode = 'cmy' #'bnw' #colour
+th_back = 15
+nr_runs = 10 #5
+ 
+#%% Define and create directories
+
+disease = 'covid' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid'
+control = 'control'
+conditions = [disease, control]
+    
+# input directories - images start with the name 'frame'
+base_dir_in = '../input_data/'
+#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation
+dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy
+base_name = 'frame'
+
+#output directory
+dir_results = '../results/' #rerun just to make sure not to muck up the results
+os.makedirs(dir_results, exist_ok = True)  
+os.makedirs(dir_results + '/statistics', exist_ok = True)  
+
+#%% Read (preprocessed) maximum corrected images
+#Read maximum projection images (runtime ~= 20 s )
+print('Reading maximum projection images')
+
+max_img_list_control = ma.read_max_imgs(dir_in + control, base_name)
+max_img_list_disease = ma.read_max_imgs(dir_in + disease, base_name)
+
+#%% Exchange channels: fibrillar collagen with elastin
+for max_img_sample_control in max_img_list_control:
+    for nr_frame, max_img_control in enumerate(max_img_sample_control):
+        max_img_sample_control[nr_frame] = max_img_control[:,:,[0,2,1]]
+
+for max_img_sample_disease in max_img_list_disease:
+    for nr_frame, max_img_disease in enumerate(max_img_sample_disease):
+        max_img_sample_disease[nr_frame] = max_img_disease[:,:,[0,2,1]]
+
+#%% Prepare maximum projection images for feature extraction
+#Flatten and merge conditions
+max_img_list_control_flat = ma.flatten_list(max_img_list_control) 
+max_img_list_disease_flat = ma.flatten_list(max_img_list_disease)
+
+#Rescale
+max_img_list_control_processed = []
+for max_img in ma.flatten_list(max_img_list_control) :
+    max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)]
+
+max_img_list_disease_processed = []
+for max_img in ma.flatten_list(max_img_list_disease) :
+    max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)]
+
+#%% Compute patches
+patch_feat_list_control = []
+for max_img in max_img_list_control_processed:
+    patch_feat_list_control += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()]
+
+patch_feat_list_disease = []
+for max_img in max_img_list_disease_processed:
+    patch_feat_list_disease += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()]
+
+patch_feat_total = patch_feat_list_control + patch_feat_list_disease
+    
+for run in range(nr_runs):
+    
+    #directories for results from the different runs
+    dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size)
+    dir_probs += 'run'+str(run)+'/'
+    ma.make_output_dirs(dir_probs, conditions)
+    dir_probs_noBack = dir_probs + 'noBackground/'
+    os.makedirs(dir_probs_noBack, exist_ok = True) 
+    
+    #%% k-means clustering
+    startTime_clustering = datetime.now()
+    
+    batch_size = 10000
+    th_nr_pathesINcluster = 20
+    th_nr_weakClusters = 0 #added for covid and ild
+    
+    #If cluster centers were already computed, use those
+    if os.path.exists(dir_probs + 'array_cluster_centers_colour.npy'):
+         cluster_centers = np.load(dir_probs + 'array_cluster_centers_colour.npy')
+         kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size)
+         kmeans.cluster_centers_ = cluster_centers
+    
+    #Compute cluster centres
+    else: 
+        #features for clustering
+        nr_keep = 10000 # number of features/image randomly picked for clustering 
+        n_im = len(patch_feat_total)
+        feat_dim = patch_feat_total[0].shape[1]
+        patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim),dtype = max_img.dtype)
+        
+        f = 0
+        nr_keep_control = nr_keep//(2*len(patch_feat_list_control)//len(patch_feat_list_disease))
+        nr_keep_disease = nr_keep//2
+        for patch_feat in patch_feat_list_control:
+            keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control]
+            patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:]
+            f += nr_keep
+        
+        for patch_feat in patch_feat_list_disease:
+            keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease]
+            patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:]
+            f += nr_keep
+            
+        #Get feature vectors
+        feat_to_cluster = patch_feat_to_cluster
+        
+        #Compute cluster centers and save if there is no weak clusters
+        nr_weakClusters = th_nr_weakClusters+1
+        repetition = 0
+        nr_weakClusters_list = []
+        while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute
+            kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip
+            kmeans.fit(feat_to_cluster)
+            
+            #Nr. patches that have contributed to each cluster
+            nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)]
+            
+            weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster]
+            print('Weak cluster IDs: ',weak_cluster_IDs)
+                
+            #Weak clusters are those composed by very few patches
+            nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster])
+            
+            #If there are weak clusters, the clustering should be recomputed
+            if nr_weakClusters>th_nr_weakClusters:
+                print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") 
+            else:
+                np.save(dir_probs + 'array_cluster_centers_colour.npy', kmeans.cluster_centers_)    # .npy extension is added if not given
+                
+            print(repetition)
+            repetition += 1
+            nr_weakClusters_list += [nr_weakClusters]
+            
+     
+    endTime_clustering = datetime.now() - startTime_clustering
+    print(endTime_clustering)
+    
+    #%% Plot all cluster centers (grid view)
+    cluster_centers = kmeans.cluster_centers_
+    int_sum_list = ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, colour_mode = colour_mode)
+    plt.savefig(dir_probs + 'clustercenters_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #%% Histograms for background, control and disease
+    hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control, kmeans)
+    hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease, kmeans)
+    
+    #%% Show histograms (bar plot) of control and disease
+    fig, ax = plt.subplots(1,1)
+    ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b')
+    ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r')
+    ax.legend()
+    plt.savefig(dir_probs + 'assignmentHistograms_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    
+    #%% Compute populated, occurrence and importance cluster measures from the histogram
+
+    #OCCURRENCE RATIO: How more often does it occur in one of the conditions
+    occurrence_ratio = np.empty((nr_clusters))
+    ind_control = hist_control>hist_disease
+    ind_disease = ~ind_control
+    control_cond = (hist_disease!=0) & ind_control
+    occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond]
+    disease_cond = (hist_control!=0) & ind_disease
+    occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond]
+    
+    #POPULATED: How trustworthy is the cluster? 
+    populated = hist_control+hist_disease 
+    weights = populated/populated.max()
+    
+    #Cluster IMPORTANCE: occurrence weighed by POPULATED
+    def sigmoid(x,k):
+        return 2 / (1 + np.exp(-k*x)) - 1
+    
+    x = np.linspace(0,1,100)
+    plt.plot(x,sigmoid(x,100))
+    
+    importance = occurrence_ratio * sigmoid(weights,100)
+    
+    #%% Plot all control and disease cluster centers in the 2d populated/occurrence space
+    fig_control, ax_control = plt.subplots(figsize=(15,15))
+    ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) 
+    ax_control.set_ylim(0.8,7)
+    ax_control.set_xlim(0,0.08)
+    
+    fig_disease, ax_disease = plt.subplots(figsize=(15,15))
+    ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) 
+    ax_disease.set_ylim(0.8,7)
+    ax_disease.set_xlim(0,0.08)
+    
+    for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)):
+        cluster_centre = np.transpose((np.reshape(kmeans.cluster_centers_[cluster_nr,:],(3,patch_size,patch_size))),(1,2,0))
+        cluster_centre = ma.rgb2cmy(cluster_centre)
+        if y0>0:
+            ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, y0), frameon=False)
+            ax_control.add_artist(ab)
+        else:
+            ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, -y0), frameon=False)
+            ax_disease.add_artist(ab)
+    
+    plt.figure(fig_control.number)
+    plt.savefig(dir_probs + 'controlClustercenters_2Dspace_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+    plt.figure(fig_disease.number)
+    plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+       
+    #%% Plot all clusters sorted by IMPORTANCE
+    
+    #According to occurence
+    clusters_control_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten()) if (occurrence_ratio.flatten())[i]>0]
+    clusters_control_occ_sorted.reverse()
+    
+    clusters_disease_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten()) if (occurrence_ratio.flatten())[i]<0]
+    
+    clusters_occ_sorted = [np.arange(0,100)[i] for i in np.argsort(occurrence_ratio.flatten())]
+    clusters_occ_sorted.reverse()
+    
+    occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(occurrence_ratio.flatten())]
+    occurrence_ratio_sorted.reverse()
+    
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occ_sorted, patch_size, colour_mode = colour_mode)
+    plt.suptitle('Cluster centers \n sorted by occ from control to disease with cluster ' + str(len(clusters_control_occ_sorted))  + ' in between')
+    plt.savefig(dir_probs + 'clustercenters_occ_sorted_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #According to occurence weighed by populated
+    clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0]
+    clusters_control_occW_sorted.reverse()
+    
+    clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0]
+    
+    clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())]
+    clusters_occW_sorted.reverse()
+    
+    occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())]
+    occurrence_ratio_sorted.reverse()
+    
+    cluster_nums = np.arange(0,nr_clusters)
+    cluster_nums_sorted = [cluster_nums[i] for i in np.argsort(importance.flatten())]
+    cluster_nums_sorted.reverse()
+    
+    prob_control = hist_control/(hist_control+hist_disease)
+    prob_control_sorted = [prob_control[i] for i in np.argsort(importance.flatten())]
+    prob_control_sorted.reverse()
+    
+    
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size,  colour_mode = colour_mode)
+    plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted))  + ' in between')
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #with probabilities
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size,  colour_mode = colour_mode, titles = True, occurrence = prob_control_sorted)
+    plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted))  + ' in between')
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_wProbs_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #with cluster numbers
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size,  colour_mode = colour_mode, titles = True, occurrence = cluster_nums_sorted)
+    plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted))  + ' in between')
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_wClusternums_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #%% Plot all clusters sorted - split channelwise view
+    ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 0)
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel1_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 1)
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel2_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    ma.plot_grid_cluster_centers(np.copy(kmeans.cluster_centers_), clusters_occW_sorted, patch_size, protein_mode = protein_mode, colour_mode = 'cmy', channel = 2)
+    plt.savefig(dir_probs + 'clustercenters_occW_sorted_split_channel3_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+
+    
+    #%% Find background clusters
+    clusters_background_intBased = [i for i in range(len(int_sum_list)) if int_sum_list[i] < th_back*patch_size**2]
+
+    clusters_background = clusters_background_intBased
+    print('Background clusters'+str(clusters_background))
+    
+    #%% Plot grid of ordered, non-background clusters that are populated over a thresh
+    th_populated = 0.5/nr_clusters
+    th_occurr = 1.25
+    
+    #control
+    clusters_control_occW_sorted_noBackground = [i for i in clusters_control_occW_sorted if (i not in clusters_background)&(populated[i]>th_populated)&(occurrence_ratio[i]>th_occurr)]
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_control_occW_sorted_noBackground, patch_size, colour_mode = colour_mode)
+    #plt.suptitle('Characteristic clusters for control, excluding background, \n from most important to least')
+    plt.savefig(dir_probs_noBack + 'clustercenters_control_occW_sorted_noBack%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #disease
+    clusters_disease_occW_sorted_noBackground = [i for i in clusters_disease_occW_sorted if (i not in clusters_background)&(populated[i]>th_populated)&(abs(occurrence_ratio[i])>th_occurr)]
+    ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_disease_occW_sorted_noBackground, patch_size, colour_mode = colour_mode)
+    #plt.suptitle('Charactertistic clusters for ' + disease + ', excluding background, \n from most important to least')
+    plt.savefig(dir_probs_noBack + 'clustercenters_disease_occW_sorted_noBack%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+     
+    #%% Plot channelwise split characteristic clusters 
+    nr_clusters_display = 10
+    clusters_control = clusters_control_occW_sorted_noBackground[:nr_clusters_display] 
+    clusters_disease = clusters_disease_occW_sorted_noBackground[:nr_clusters_display] 
+    
+    #control clusters 
+    cluster_centers_control = kmeans.cluster_centers_[clusters_control]
+    fig, axs = plt.subplots(1,len(clusters_control), figsize=(len(clusters_control)*3,3), sharex=True, sharey=True)
+    fig.suptitle('Cluster centers for control')
+ 
+    fig_split, axs_split = plt.subplots(3,len(clusters_control), figsize=(len(clusters_control)*3,9), sharex=True, sharey=True)
+    fig_split.suptitle('Cluster centers for control - split')
+    for l in np.arange(0,len(clusters_control)):
+   
+        cluster_centre = np.transpose((np.reshape(cluster_centers_control[l,:],(3,patch_size,patch_size))),(1,2,0))
+    
+        #channel 1 - collagen IV
+        im = np.zeros(cluster_centre.shape)
+        im[...,0] = cluster_centre[...,0]
+        im = ma.rgb2cmy(im)
+        axs_split[0][l].imshow(im.astype(np.uint8))
+        axs_split[0][l].axis('off')
+        axs_split[0][l].set_title(clusters_control[l])
+         
+        #channel 2 - elastin
+        im = np.zeros(cluster_centre.shape)
+        im[...,1] = cluster_centre[...,1]
+        im = ma.rgb2cmy(im)
+        axs_split[1][l].imshow(im.astype(np.uint8))
+        axs_split[1][l].axis('off')
+         
+        #channel 3 - SHG
+        im = np.zeros(cluster_centre.shape)
+        im[...,2] = cluster_centre[...,2]
+        im = ma.rgb2cmy(im)
+        axs_split[2][l].imshow(im.astype(np.uint8))
+        axs_split[2][l].axis('off')
+        
+        plt.savefig(dir_probs_noBack + 'clustercenters_control_split_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+            
+        #cmy merge
+        cluster_centre = ma.rgb2cmy(cluster_centre)
+        axs[l].imshow(cluster_centre.astype(np.uint8))
+        axs[l].axis('off')
+        axs[l].set_title(clusters_control[l])
+        
+        plt.figure(fig.number),
+        plt.savefig(dir_probs_noBack + 'clustercenters_control_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+    #disease clusters 
+    cluster_centers_disease = kmeans.cluster_centers_[clusters_disease]
+    fig, axs = plt.subplots(1,len(clusters_disease), figsize=(len(clusters_disease)*3,3), sharex=True, sharey=True)
+    fig.suptitle('Cluster centers for disease')
+ 
+    fig_split, axs_split = plt.subplots(3,len(clusters_disease), figsize=(len(clusters_disease)*3,9), sharex=True, sharey=True)
+    fig_split.suptitle('Cluster centers for disease - split')
+    for l in np.arange(0,len(clusters_disease)):
+   
+        cluster_centre = np.transpose((np.reshape(cluster_centers_disease[l,:],(3,patch_size,patch_size))),(1,2,0))
+    
+        #channel 1 - collagen IV
+        im = np.zeros(cluster_centre.shape)
+        im[...,0] = cluster_centre[...,0]
+        im = ma.rgb2cmy(im)
+        axs_split[0][l].imshow(im.astype(np.uint8))
+        axs_split[0][l].axis('off')
+        axs_split[0][l].set_title(clusters_disease[l])
+         
+        #channel 2 - elastin
+        im = np.zeros(cluster_centre.shape)
+        im[...,1] = cluster_centre[...,1]
+        im = ma.rgb2cmy(im)
+        axs_split[1][l].imshow(im.astype(np.uint8))
+        axs_split[1][l].axis('off')
+         
+        #channel 3 - SHG
+        im = np.zeros(cluster_centre.shape)
+        im[...,2] = cluster_centre[...,2]
+        im = ma.rgb2cmy(im)
+        axs_split[2][l].imshow(im.astype(np.uint8))
+        axs_split[2][l].axis('off')
+        
+        plt.savefig(dir_probs_noBack + 'clustercenters_disease_split_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+            
+        #cmy merge
+        cluster_centre = ma.rgb2cmy(cluster_centre)
+        axs[l].imshow(cluster_centre.astype(np.uint8))
+        axs[l].axis('off')
+        axs[l].set_title(clusters_disease[l])
+        
+        plt.figure(fig.number),
+        plt.savefig(dir_probs_noBack + 'clustercenters_disease_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+    #%% Compute probability images                
+    startTime_probs = datetime.now()
+
+    #Assign probabilities of control and disease to clusters
+    prob_control= hist_control/(hist_control+hist_disease)
+    prob_disease = hist_disease/(hist_control+hist_disease)
+    
+    #Compute probability images
+    r,c,_ = max_img_list_control_flat[0].shape
+    img_shape = (int(r*sc_fac),int(c*sc_fac))
+    prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape)
+    prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) 
+  
+    #%% Display probability images and accumulate probs per frame and sample
+    nr_frames = len(max_img_list_control[0])
+    
+    #control
+    sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, )
+    
+    nr_list = 0
+    prob_list_control= []
+    fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True)
+    for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_control, sample_names_control, range(len(sample_names_control))): #in general,  in max_img_list_condition
+        print(sample_name)
+        nr_frames = len(max_img_list_sample)
+        prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames]
+        nr_list += nr_frames
+        
+        prob_sample = np.zeros((nr_frames,1))
+        fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True)
+        for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)):
+            max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype)
+            
+            #compute image probability
+            prob_frame = sum(sum(prob_img))/prob_img.size
+            
+            #remove background from prob imgs
+            ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back)
+            prob_img[ind_back] = 0.5
+            
+            #plot
+            ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+            ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+            ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+            ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+            ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+             
+            axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+            axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+            axs[1][frame_nr].xaxis.set_ticklabels([])
+            axs[1][frame_nr].xaxis.set_ticks_position('none')
+            axs[1][frame_nr].yaxis.set_ticklabels([])
+            axs[1][frame_nr].yaxis.set_ticks_position('none')
+             
+            if colour_mode == 'cmy':
+                 max_img_ds = ma.rgb2cmy(max_img_ds)
+            axs[0][frame_nr].imshow(max_img_ds)
+            axs[0][frame_nr].xaxis.set_ticklabels([])
+            axs[0][frame_nr].xaxis.set_ticks_position('none')
+            axs[0][frame_nr].yaxis.set_ticklabels([])
+            axs[0][frame_nr].yaxis.set_ticks_position('none')
+            
+            #save img probs
+            prob_sample[frame_nr] = prob_frame 
+        prob_list_control += [prob_sample]
+        
+        plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+        plt.savefig(dir_probs + '/' + control + '/' + 'probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        #plt.show() 
+    plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'probMaps_'+control+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show()
+        
+    #disease
+    sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True)
+    
+    nr_list = 0
+    prob_list_disease = []
+    fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True)
+    for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general,  in max_img_list_condition
+        print(sample_name)
+        nr_frames = len(max_img_list_sample)
+        prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames]
+        nr_list += nr_frames
+        
+        prob_sample = np.zeros((nr_frames,1))
+        fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True)
+        for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)):
+             max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype)
+            
+             #compute image probability
+             prob_frame = sum(sum(prob_img))/prob_img.size
+            
+             #remove background from prob imgs
+             ind_back = (max_img_ds[:,:,0]<th_back)&(max_img_ds[:,:,1]<th_back)&(max_img_ds[:,:,2]<th_back)
+             prob_img[ind_back] = 0.5
+            
+             #plot
+             ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+             ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+             ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+             ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+             ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+             
+             axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+             axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+             axs[1][frame_nr].xaxis.set_ticklabels([])
+             axs[1][frame_nr].xaxis.set_ticks_position('none')
+             axs[1][frame_nr].yaxis.set_ticklabels([])
+             axs[1][frame_nr].yaxis.set_ticks_position('none')
+             
+             if colour_mode == 'cmy':
+                  max_img_ds = ma.rgb2cmy(max_img_ds)
+             axs[0][frame_nr].imshow(max_img_ds)
+             axs[0][frame_nr].xaxis.set_ticklabels([])
+             axs[0][frame_nr].xaxis.set_ticks_position('none')
+             axs[0][frame_nr].yaxis.set_ticklabels([])
+             axs[0][frame_nr].yaxis.set_ticks_position('none')
+            
+             #save img probs
+             prob_sample[frame_nr] = prob_frame 
+        prob_list_disease += [prob_sample]
+        
+        plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+        plt.savefig(dir_probs + '/' + disease + '/' + 'probImHealth_'+sample_name+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        #plt.show() 
+    plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'probMaps_'+disease+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    #plt.show()
+    
+    prob_list_samples = prob_list_control + prob_list_disease   
+         
+    #%% Box plots
+    fig, ax = plt.subplots(figsize = (15,5))
+    ax.set_title('Image health scores ')
+    
+    sample_sep = 0.85
+    patient_sep = 2.35
+    condition_sep = 2.5 
+    positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2)
+    positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2)
+    array_probs = np.squeeze(np.asarray(prob_list_samples))
+    bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True)
+    bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True)
+    
+    for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_control[element], color='blue')
+            plt.setp(bp_control[element], linewidth = 2)
+            
+    for patch in bp_control['boxes']:
+            patch.set(facecolor='white')
+            patch.set(linewidth = 2)
+            
+    for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_disease[element], color='red')
+            plt.setp(bp_disease[element], linewidth = 2)
+            
+    for patch in bp_disease['boxes']:
+            patch.set(facecolor= 'white')  
+            patch.set(linewidth = 2) 
+        #patch.set(alpha=0.2) 
+        
+    ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+    ax.set_ylim(0.3,0.7)
+    ax.set_ylabel('Probability health')
+    #ax.set_ylim(0,1)
+    
+    ax.tick_params(
+        axis='both',          # changes apply to the x-axis
+        which='both',      # both major and minor ticks are affected
+        #labelleft = False,      # ticks along the top edge are off
+        #labelbottom=False
+        )
+    
+    #plot points
+    positions = list(positions_control) + list(positions_disease)
+    for prob_sample, position in zip(prob_list_samples, positions):
+        print(prob_sample)
+        print(5*[position])
+        # Add some random "jitter" to the x-axis
+        x = np.random.normal(5*[position], 0)
+        #plot(x, y, 'r.', alpha=0.2)
+        ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3)
+       
+    #horizontal line
+    x = np.linspace(0,position.max())    
+    y = 0.5*np.ones(x.shape)
+    ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+    
+    plt.savefig(dir_probs + 'scatterBoxplots_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    
+    fig2, ax2 = plt.subplots(figsize = (15,5))
+    #plot points
+    for prob_sample, position in zip(prob_list_control, positions_control):
+        ax2.plot(5*[position], prob_sample,'b.', markersize = 4)
+        
+    for prob_sample, position in zip(prob_list_disease, positions_disease):
+        ax2.plot(5*[position], prob_sample,'r.', markersize = 4)
+    
+    #horizontal line
+    x = np.linspace(0,position.max())    
+    y = 0.5*np.ones(x.shape)
+    ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+    
+    ax2.set_xticks(ax.get_xticks())
+    ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+    ax2.set_ylim(0.3,0.7)
+    ax2.set_ylabel('Probability health')
+
+    plt.savefig(dir_probs + 'samplewiseScatterPlots_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+    
+       
+    #%% Print computational time
+    endTime = datetime.now() - startTime
+    print(endTime)
+      
+    
+    #%%ANOVA SETUP
+    os.makedirs(dir_results + 'statistics/', exist_ok = True)  
+    
+    from statsmodels.formula.api import ols
+    from statsmodels.stats.anova import anova_lm
+    import pandas as pd
+    from patsy import ModelDesc
+    
+    nr_frames_patient = 10
+    img_nr = np.tile(np.arange(nr_frames),array_probs.size//nr_frames)
+    sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),array_probs.size//nr_frames_patient)
+    #patient_nr = np.tile(np.repeat(np.arange(array_probs.size//nr_frames_patient//len(conditions)),nr_frames_patient),len(conditions))
+    patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)])
+    condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))])
+    
+    
+    dep_var = array_probs.flatten()
+        
+    data_dict = {'Image':img_nr,
+                 'Sample':sample_nr,
+                 'Patient':patient_nr,
+                 'Condition': condition_nr,
+                 'S':dep_var}
+    
+    formula = 'S ~ C(Condition)/C(Patient)/C(Sample)'
+    
+    desc = ModelDesc.from_formula(formula)
+    print(desc.describe())
+        
+    lm = ols(formula, data_dict).fit()
+    print(lm.summary())
+    
+    anova_table = anova_lm(lm)
+    print(anova_table)
+    writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_' + protein_mode + '_ds'+ str(1/sc_fac) +'_run' + str(run) +'.xlsx', engine='xlsxwriter')
+    anova_table.to_excel(writer_anova,sheet_name = disease)
+    writer_anova.save()
+    
+    
+
+
+
+
+
diff --git a/code/lung_feature_patch_allPatients_twoProtein.py b/code/lung_feature_patch_allPatients_twoProtein.py
new file mode 100755
index 0000000..debe53f
--- /dev/null
+++ b/code/lung_feature_patch_allPatients_twoProtein.py
@@ -0,0 +1,551 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Explorative analysis of protein composition and structure in lung ECM to 
+study disease-induced remodeling of the three proteins that provide structural 
+integrity to the lung: collagen IV, elastin and fibrillar collagen.
+
+The two-protein analyses provide the following:
+    - A 'Data Model' per combination of proteins, a dictionary of image patches
+    representative of the local protein structure and composition found in the
+    data (both healthy and disease group) at the scale defined by the patch
+    size.
+    - A 'Sorted Data Model' or disease signature per combination of proteins, 
+    a sorted version of the dictionary above, based on how often a dictionary 
+    element (called cluster centre in the publication) is found in the healthy 
+    vs. the disease.
+    - A 'Probability Map' for every image based on each combination of 
+    proteins, with pixel correspondance to the input images (maximum 
+    projections), highlighting regions characteristic of health or disease. 
+    - A 'Disease Effect' per combination of proteins, to quantify how well the
+    control and disease groups are separated after assigning a health 
+    probability score to each image, computed as the average of its probability
+    map values.
+    
+by Monica J. Emerson monj@dtu.dk and Anders B. Dahl abda@dtu.dk
+
+For more information, read our article (add link and citation)"""
+
+#%% Load packages and initialise parameters
+
+import numpy as np
+import matplotlib.pyplot as plt
+import sklearn.cluster
+import microscopy_analysis as ma
+import skimage.io 
+import skimage.transform
+ 
+import os
+from datetime import datetime
+import sys
+from matplotlib.offsetbox import OffsetImage, AnnotationBbox
+from math import ceil
+import copy
+
+startTime = datetime.now()
+
+plt.close('all')
+
+sc_fac = 0.5 #25 #25 #0.5 # scaling factor of the image
+patch_size = 17
+nr_clusters = 100 #49#100 # number of clusters, choose a value with an integer as a square root
+th_back = 15
+nr_runs = 10
+
+protein_mode = 'dropout' #'single', 'triple'
+colour_mode = 'cmy'
+
+# drop_channel = 0
+# keep_channels = [x for x in range(3) if x!=drop_channel]
+ 
+#%% Define and create directories
+
+disease = 'emphysema' #diseased (mix, 2 of each condition)' #''emphysema' 'sarcoidosis' 'ild' 'covid'
+control = 'control'
+conditions = [disease, control]
+    
+# input directories - images start with the name 'frame'
+base_dir_in = '../input_data/'
+#dir_in = base_dir_in + 'maximum_projections' #uncomment to read maximum projections without contrast equalisation
+dir_in = base_dir_in + 'postprocessed_maxProjs/rgb/' #Read rgb for correct protein channel ordering, visualisation will be in cmy
+base_name = 'frame'
+
+#output directory
+dir_results = '../results/' #rerun just to make sure not to muck up the results
+os.makedirs(dir_results, exist_ok = True)  
+os.makedirs(dir_results + '/statistics', exist_ok = True)  
+
+#%% Read (preprocessed) maximum corrected images
+#Read maximum projection images (runtime ~= 20 s )
+print('Reading maximum projection images')
+
+max_img_list_control_3c = ma.read_max_imgs(dir_in + control, base_name)
+max_img_list_disease_3c = ma.read_max_imgs(dir_in + disease, base_name)
+
+for drop_channel in range(3):
+    keep_channels = [x for x in range(3) if x!=drop_channel]
+    #%% Exchange channels: fibrillar collagen with elastin
+    max_img_list_control = copy.deepcopy(max_img_list_control_3c)
+    for max_img_sample_control in max_img_list_control:
+        for nr_frame, max_img_control in enumerate(max_img_sample_control):
+            img = max_img_control[:,:,[0,2,1]]
+            max_img_sample_control[nr_frame] = img[:,:,keep_channels]
+            
+    max_img_list_disease = copy.deepcopy(max_img_list_disease_3c)
+    for max_img_sample_disease in max_img_list_disease:
+        for nr_frame, max_img_disease in enumerate(max_img_sample_disease):
+            img = max_img_disease[:,:,[0,2,1]]
+            max_img_sample_disease[nr_frame] = img[:,:,keep_channels]
+    
+    #%% Prepare maximum projection images for feature extraction
+    #Flatten and merge conditions
+    max_img_list_control_flat = ma.flatten_list(max_img_list_control) 
+    max_img_list_disease_flat = ma.flatten_list(max_img_list_disease)
+    
+    #Rescale
+    max_img_list_control_processed = []
+    for max_img in ma.flatten_list(max_img_list_control) :
+        max_img_list_control_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)]
+    
+    max_img_list_disease_processed = []
+    for max_img in ma.flatten_list(max_img_list_disease) :
+        max_img_list_disease_processed += [skimage.transform.rescale(max_img, sc_fac, preserve_range = True, multichannel=True).astype(max_img.dtype)]
+    
+    #%% Compute patches
+    patch_feat_list_control = []
+    for max_img in max_img_list_control_processed:
+        patch_feat_list_control += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()]
+    
+    patch_feat_list_disease = []
+    for max_img in max_img_list_disease_processed:
+        patch_feat_list_disease += [ma.ndim2col_pad(max_img, (patch_size, patch_size),norm=False).transpose()]
+    
+    patch_feat_total = patch_feat_list_control + patch_feat_list_disease
+        
+    for run in range(nr_runs):
+        
+        #directories for results from the different runs
+        dir_probs = dir_results + disease + '_' + protein_mode + '_%dclusters_%ddownscale_%dpatchsize/'%(nr_clusters,1/sc_fac,patch_size)
+        dir_probs += 'run'+str(run)+'/'
+        ma.make_output_dirs(dir_probs, conditions)
+      
+        #%% k-means clustering
+        startTime_clustering = datetime.now()
+        
+        batch_size = 10000
+        th_nr_pathesINcluster = 20
+        th_nr_weakClusters = 0 #added for covid and ild
+        
+        #If cluster centers were already computed, use those
+        if os.path.exists(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy'):
+             cluster_centers = np.load(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy')
+             kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size)
+             kmeans.cluster_centers_ = cluster_centers
+        
+        #Compute cluster centres
+        else: 
+            #features for clustering
+            nr_keep = 10000 # number of features/image randomly picked for clustering 
+            n_im = len(patch_feat_total)
+            feat_dim = patch_feat_total[0].shape[1]
+            patch_feat_to_cluster = np.zeros((nr_keep*n_im,feat_dim),dtype = max_img.dtype)
+            
+            f = 0
+            nr_keep_control = nr_keep//(2*len(patch_feat_list_control)//len(patch_feat_list_disease))
+            nr_keep_disease = nr_keep//2
+            for patch_feat in patch_feat_list_control:
+                keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_control]
+                patch_feat_to_cluster[f:f+nr_keep_control,:] = patch_feat[keep_indices,:]
+                f += nr_keep
+            
+            for patch_feat in patch_feat_list_disease:
+                keep_indices = np.random.permutation(np.arange(patch_feat.shape[0]))[:nr_keep_disease]
+                patch_feat_to_cluster[f:f+nr_keep_disease,:] = patch_feat[keep_indices,:]
+                f += nr_keep
+                
+            #Get feature vectors
+            feat_to_cluster = patch_feat_to_cluster
+            
+            #Compute cluster centers and save if there is no weak clusters
+            nr_weakClusters = th_nr_weakClusters+1
+            repetition = 0
+            nr_weakClusters_list = []
+            while nr_weakClusters>th_nr_weakClusters: #While weak clusters is different from empty recompute
+                kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=nr_clusters, batch_size = batch_size, verbose = False, reassignment_ratio=0.0005, max_no_improvement = 100) #0.00075 was good for uip
+                kmeans.fit(feat_to_cluster)
+                
+                #Nr. patches that have contributed to each cluster
+                nr_feat_in_cluster = [sum(kmeans.labels_==cluster) for cluster in range(nr_clusters)]
+                
+                weak_cluster_IDs = [ind for ind,val in enumerate(nr_feat_in_cluster) if val<th_nr_pathesINcluster]
+                print('Weak cluster IDs: ',weak_cluster_IDs)
+                    
+                #Weak clusters are those composed by very few patches
+                nr_weakClusters = len([1 for i in nr_feat_in_cluster if i<th_nr_pathesINcluster])
+                
+                #If there are weak clusters, the clustering should be recomputed
+                if nr_weakClusters>th_nr_weakClusters:
+                    print(str(nr_weakClusters)+ " clusters composed of less than "+str(th_nr_pathesINcluster)+" images") 
+                else:
+                    np.save(dir_probs + 'array_cluster_centers_'+protein_mode+'_channel'+str(drop_channel)+'.npy', kmeans.cluster_centers_)    # .npy extension is added if not given
+                    
+                print(repetition)
+                repetition += 1
+                nr_weakClusters_list += [nr_weakClusters]
+                
+         
+        endTime_clustering = datetime.now() - startTime_clustering
+        print(endTime_clustering)
+        
+        #%% Plot all cluster centers (grid view)
+        cluster_centers = kmeans.cluster_centers_
+        int_sum_list = ma.plot_grid_cluster_centers(cluster_centers, range(nr_clusters), patch_size, protein_mode, colour_mode, channel = drop_channel)
+        plt.savefig(dir_probs + 'clustercenters_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+         
+        #%% Histograms for background, healthy and disease
+        hist_control, assignment_list_control= ma.compute_assignment_hist(patch_feat_list_control, kmeans)
+        hist_disease, assignment_list_disease = ma.compute_assignment_hist(patch_feat_list_disease, kmeans)
+        
+        # Show histograms (bar plot) of healthy and disease
+        fig, ax = plt.subplots(1,1)
+        ax.bar(np.array(range(0,nr_clusters))-0.25, hist_control, width = 0.5, label='Control', color = 'b')
+        ax.bar(np.array(range(0,nr_clusters))+0.25, hist_disease, width = 0.5, label='Disease', color = 'r')
+        ax.legend()
+        plt.savefig(dir_probs + 'assignmentHistograms_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+        #%% Compute populated, occurrence and importance cluster measures from the histogram
+        #grid dimensions
+        
+        #OCCURRENCE RATIO: How more often does it occur in one of the conditions
+        occurrence_ratio = np.empty((nr_clusters))
+        ind_control = hist_control>hist_disease
+        ind_disease = ~ind_control
+        control_cond = (hist_disease!=0) & ind_control
+        occurrence_ratio[control_cond] = hist_control[control_cond]/hist_disease[control_cond]
+        disease_cond = (hist_control!=0) & ind_disease
+        occurrence_ratio[disease_cond] = -hist_disease[disease_cond]/hist_control[disease_cond]
+        
+        #POPULATED How trustworthy is the cluster? 
+        populated = hist_control+hist_disease
+        weights = populated/populated.max()
+        
+        #Cluster IMPORTANCE: occurrence weighed by POPULATED
+        def sigmoid(x,k):
+            return 2 / (1 + np.exp(-k*x)) - 1
+        
+        importance = occurrence_ratio * sigmoid(weights,100)
+        
+         #%% Plot cluster centers in the 2d populated/occurrence space
+        fig_control, ax_control = plt.subplots(figsize=(15,15))
+        ax_control.scatter(populated[occurrence_ratio>1], occurrence_ratio[occurrence_ratio>1]) 
+        ax_control.set_ylim(0.8,7)
+        ax_control.set_xlim(0,0.08)
+        
+        fig_disease, ax_disease = plt.subplots(figsize=(15,15))
+        ax_disease.scatter(populated[occurrence_ratio<1], -occurrence_ratio[occurrence_ratio<1]) 
+        ax_disease.set_ylim(0.8,7)
+        ax_disease.set_xlim(0,0.08)
+        
+        for x0, y0, cluster_nr in zip(populated, occurrence_ratio, np.arange(0,nr_clusters)):
+            if protein_mode == 'dropout':
+                cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype)
+                cluster_centre[:,:,keep_channels] = np.transpose(np.reshape(kmeans.cluster_centers_[cluster_nr,:],(2,patch_size,patch_size)),(1,2,0)) 
+            else:
+                cluster_centre = np.transpose((np.reshape(kmeans.cluster_centers_[cluster_nr,:],(3,patch_size,patch_size))),(1,2,0))
+            cluster_centre = ma.rgb2cmy(cluster_centre)
+            if y0>0:
+                ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, y0), frameon=False)
+                ax_control.add_artist(ab)
+            else:
+                ab = AnnotationBbox(OffsetImage(cluster_centre.astype(max_img.dtype)), (x0, -y0), frameon=False)
+                ax_disease.add_artist(ab)
+        
+        plt.figure(fig_control.number)
+        plt.savefig(dir_probs + 'controlClustercenters_2Dspace_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+           
+            
+        plt.figure(fig_disease.number)
+        plt.savefig(dir_probs + 'diseaseClustercenters_2Dspace_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+        #%% Plot all clusters sorted by IMPORTANCE
+        clusters_control_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]>0]
+        clusters_control_occW_sorted.reverse()
+        
+        clusters_disease_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten()) if (importance.flatten())[i]<0]
+        
+        clusters_occW_sorted = [np.arange(0,100)[i] for i in np.argsort(importance.flatten())]
+        clusters_occW_sorted.reverse()
+        
+        occurrence_ratio_sorted = [occurrence_ratio[i] for i in np.argsort(importance.flatten())]
+        occurrence_ratio_sorted.reverse()
+        
+        ma.plot_grid_cluster_centers(kmeans.cluster_centers_, clusters_occW_sorted, patch_size, protein_mode, colour_mode, drop_channel)
+        plt.suptitle('Cluster centers \n sorted by occ*f(p) from control to disease with cluster ' + str(len(clusters_control_occW_sorted))  + ' in between')
+        plt.savefig(dir_probs + 'clustercenters_occW_sorted_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+          
+        #%% Compute probability images
+                         
+        startTime_probs = datetime.now()
+        
+        #Assign probabilities of control and disease to clusters
+        prob_control= hist_control/(hist_control+hist_disease)
+        prob_disease = hist_disease/(hist_control+hist_disease)
+
+        #Compute probability images
+        r,c,_ = max_img_list_control_flat[0].shape
+        img_shape = (int(r*sc_fac),int(c*sc_fac))
+        prob_img_list_control = ma.comp_prob_imgs(prob_control,assignment_list_control,img_shape)
+        prob_img_list_disease = ma.comp_prob_imgs(prob_control,assignment_list_disease,img_shape) 
+            
+        #%% Display probability images and accumulate probs per frame and sample
+        nr_frames = len(max_img_list_control[0])
+        
+        #control
+        sample_names_control = ma.listdir_custom(dir_in + control, string_start_pos = -4, dir_flag = True, )
+        
+        nr_list = 0
+        prob_list_control= []
+        fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_control),figsize = (len(sample_names_control),nr_frames), sharex=True, sharey=True)
+        for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_control, sample_names_control, range(len(sample_names_control))): #in general,  in max_img_list_condition
+            print(sample_name)
+            nr_frames = len(max_img_list_sample)
+            prob_img_list_sample = prob_img_list_control[nr_list:nr_list+nr_frames]
+            nr_list += nr_frames
+            
+            prob_sample = np.zeros((nr_frames,1))
+            fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True)
+            for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)):
+                max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype)
+                
+                #compute image probability
+                prob_frame = sum(sum(prob_img))/prob_img.size
+            
+                #background
+                ind_back = np.ones((max_img_ds.shape[0],max_img_ds.shape[1]), dtype = bool)
+                for channel in range(max_img_ds.shape[2]):
+                    ind_back = ind_back & (max_img_ds[:,:,channel]<th_back)
+                prob_img[ind_back] = 0.5
+                    
+                if protein_mode == 'dropout':    
+                    max_img_ds_3c = np.zeros((max_img_ds.shape[0],max_img_ds.shape[1],3))
+                    max_img_ds_3c[:,:,keep_channels] = max_img_ds
+                    max_img_ds = max_img_ds_3c
+                
+                #plot
+                ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+                ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+                ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+                ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+                ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+            
+                axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+                axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+                axs[1][frame_nr].xaxis.set_ticklabels([])
+                axs[1][frame_nr].xaxis.set_ticks_position('none')
+                axs[1][frame_nr].yaxis.set_ticklabels([])
+                axs[1][frame_nr].yaxis.set_ticks_position('none')
+            
+                if colour_mode == 'cmy':
+                    max_img_ds = ma.rgb2cmy(max_img_ds)
+                axs[0][frame_nr].imshow(max_img_ds)
+                axs[0][frame_nr].xaxis.set_ticklabels([])
+                axs[0][frame_nr].xaxis.set_ticks_position('none')
+                axs[0][frame_nr].yaxis.set_ticklabels([])
+                axs[0][frame_nr].yaxis.set_ticks_position('none')
+                
+                #compute accumulated probabilities:part 2
+                prob_sample[frame_nr] = prob_frame 
+            prob_list_control += [prob_sample]
+            
+            plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+            plt.savefig(dir_probs + '/' + control + '/' + 'probImHealth_'+sample_name+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+            #plt.show() 
+        plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + control + '/' + 'probMaps_'+control+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        #plt.show()
+            
+        #disease
+        sample_names_disease = ma.listdir_custom(dir_in + disease, string_start_pos = -4, dir_flag = True)
+        
+        nr_list = 0
+        prob_list_disease = []
+        fig_probmaps, ax_probmaps = plt.subplots(nr_frames, len(sample_names_disease),figsize = (len(sample_names_disease),nr_frames), sharex=True, sharey=True)
+        for max_img_list_sample,sample_name,sample_nr in zip(max_img_list_disease, sample_names_disease, range(len(sample_names_disease))): #in general,  in max_img_list_condition
+            print(sample_name)
+            nr_frames = len(max_img_list_sample)
+            prob_img_list_sample = prob_img_list_disease[nr_list:nr_list+nr_frames]
+            nr_list += nr_frames
+            
+            prob_sample = np.zeros((nr_frames,1))
+            fig, axs = plt.subplots(2,nr_frames, figsize = (nr_frames*2,2*2), sharex=True, sharey=True)
+            for prob_img, max_img, frame_nr in zip(prob_img_list_sample,max_img_list_sample,range(nr_frames)):
+                max_img_ds = skimage.transform.rescale(max_img, sc_fac, multichannel = True, preserve_range = True).astype(max_img.dtype)
+                
+                #compute image probability
+                prob_frame = sum(sum(prob_img))/prob_img.size
+            
+                #background
+                ind_back = np.ones((max_img_ds.shape[0],max_img_ds.shape[1]), dtype = bool)
+                for channel in range(max_img_ds.shape[2]):
+                    ind_back = ind_back & (max_img_ds[:,:,channel]<th_back)
+                prob_img[ind_back] = 0.5
+                    
+                if protein_mode == 'dropout':    
+                    max_img_ds_3c = np.zeros((max_img_ds.shape[0],max_img_ds.shape[1],3))
+                    max_img_ds_3c[:,:,keep_channels] = max_img_ds
+                    max_img_ds = max_img_ds_3c
+                
+                #plot
+                ax_probmaps[frame_nr][sample_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+                ax_probmaps[frame_nr][sample_nr].xaxis.set_ticklabels([])
+                ax_probmaps[frame_nr][sample_nr].xaxis.set_ticks_position('none')
+                ax_probmaps[frame_nr][sample_nr].yaxis.set_ticklabels([])
+                ax_probmaps[frame_nr][sample_nr].yaxis.set_ticks_position('none')
+            
+                axs[1][frame_nr].imshow(1-prob_img, cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+                axs[1][frame_nr].set_title("%.2f" % np.round(prob_frame,2))
+                axs[1][frame_nr].xaxis.set_ticklabels([])
+                axs[1][frame_nr].xaxis.set_ticks_position('none')
+                axs[1][frame_nr].yaxis.set_ticklabels([])
+                axs[1][frame_nr].yaxis.set_ticks_position('none')
+            
+                if colour_mode == 'cmy':
+                    max_img_ds = ma.rgb2cmy(max_img_ds)
+                axs[0][frame_nr].imshow(max_img_ds)
+                axs[0][frame_nr].xaxis.set_ticklabels([])
+                axs[0][frame_nr].xaxis.set_ticks_position('none')
+                axs[0][frame_nr].yaxis.set_ticklabels([])
+                axs[0][frame_nr].yaxis.set_ticks_position('none')
+                
+                #compute accumulated probabilities:part 2
+                prob_sample[frame_nr] = prob_frame 
+            prob_list_disease += [prob_sample]
+            
+            plt.suptitle('Health probability of sample '+ sample_name +', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+            plt.savefig(dir_probs + '/' + disease + '/' + 'probImHealth_'+sample_name+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+            #plt.show() 
+        plt.figure(fig_probmaps.number),plt.savefig(dir_probs + '/' + disease + '/' + 'probMaps_'+disease+'_'+protein_mode+'_channel'+str(drop_channel)+'_%dclusters_%ddownscale_%dpatchsize.png'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        #plt.show()
+        
+        prob_list_samples = prob_list_control + prob_list_disease   
+             
+        #%% Box plots
+        fig, ax = plt.subplots(figsize = (15,5))
+        ax.set_title('Image health scores ')
+        
+        sample_sep = 0.85
+        patient_sep = 2.35
+        condition_sep = 2.5 
+        positions_control = np.tile([0,0+sample_sep],int(len(max_img_list_control)/2)) + np.repeat(np.arange(0,len(max_img_list_control)/2*patient_sep,patient_sep),2)
+        positions_disease = np.tile([len(max_img_list_control)+condition_sep,len(max_img_list_control)+condition_sep+sample_sep],int(len(max_img_list_disease)/2)) + np.repeat(np.arange(0,len(max_img_list_disease)/2*patient_sep,patient_sep),2)
+        array_probs = np.squeeze(np.asarray(prob_list_samples))
+        bp_control = ax.boxplot((array_probs[:len(max_img_list_control),...]).T, positions = positions_control, patch_artist = True)
+        bp_disease = ax.boxplot(array_probs[len(max_img_list_control):,...].T, positions = positions_disease, patch_artist = True)
+        
+        for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_control[element], color='blue')
+            plt.setp(bp_control[element], linewidth = 2)
+            
+        for patch in bp_control['boxes']:
+            patch.set(facecolor='white')
+            patch.set(linewidth = 2)
+            
+        for element in ['boxes','whiskers', 'fliers', 'means', 'medians', 'caps']:
+            plt.setp(bp_disease[element], color='red')
+            plt.setp(bp_disease[element], linewidth = 2)
+            
+        for patch in bp_disease['boxes']:
+            patch.set(facecolor= 'white')  
+            patch.set(linewidth = 2)
+            #patch.set(alpha=0.2) 
+            
+        ax.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+        ax.set_ylim(0.3,0.7)
+        ax.set_ylabel('Probability health')
+        #ax.set_ylim(0,1)
+        
+        ax.tick_params(
+            axis='both',          # changes apply to the x-axis
+            which='both',      # both major and minor ticks are affected
+            #labelleft = False,      # ticks along the top edge are off
+            #labelbottom=False
+            )
+        
+        #plot points
+        positions = list(positions_control) + list(positions_disease)
+        for prob_sample, position in zip(prob_list_samples, positions):
+            print(prob_sample)
+            print(5*[position])
+            # Add some random "jitter" to the x-axis
+            x = np.random.normal(5*[position], 0)
+            #plot(x, y, 'r.', alpha=0.2)
+            ax.plot(x, prob_sample,'k.', markersize = 6, zorder = 3)
+           
+        #horizontal line
+        x = np.linspace(0,position.max())    
+        y = 0.5*np.ones(x.shape)
+        ax.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+        
+        plt.savefig(dir_probs + 'scatterBoxplots_'+protein_mode+'_channel'+str(drop_channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+        
+        fig2, ax2 = plt.subplots(figsize = (15,5))
+        #plot points
+        for prob_sample, position in zip(prob_list_control, positions_control):
+            ax2.plot(5*[position], prob_sample,'b.', markersize = 5)
+            
+        for prob_sample, position in zip(prob_list_disease, positions_disease):
+            ax2.plot(5*[position], prob_sample,'r.', markersize = 5)
+        
+        #horizontal line
+        x = np.linspace(0,position.max())    
+        y = 0.5*np.ones(x.shape)
+        ax2.plot(x,y,'k',linewidth = 1, linestyle = 'dashed')
+        
+        ax2.set_xticks(ax.get_xticks())
+        ax2.set_xticklabels([i[-4:] for i in sample_names_control]+[i[-4:] for i in sample_names_disease])
+        ax2.set_ylim(0.3,0.7)
+        ax2.set_ylabel('Probability health')
+    
+        plt.savefig(dir_probs + 'samplewiseScatterPlots_'+protein_mode+'_channel'+str(drop_channel) +'_%dclusters_%ddownscale_%dpatchsize.pdf'%(nr_clusters,1/sc_fac,patch_size), dpi=300)
+ 
+        #%% Print computational time
+        endTime = datetime.now() - startTime
+        print(endTime)
+          
+        
+        #%%ANOVA SETUP
+        os.makedirs(dir_results + 'statistics/', exist_ok = True)  
+        
+        from statsmodels.formula.api import ols
+        from statsmodels.stats.anova import anova_lm
+        import pandas as pd
+        from patsy import ModelDesc
+        
+        nr_frames_patient = 10
+        img_nr = np.tile(np.arange(nr_frames),array_probs.size//nr_frames)
+        sample_nr = np.tile(np.repeat(np.arange(nr_frames_patient//nr_frames),nr_frames),array_probs.size//nr_frames_patient)
+        patient_nr = np.concatenate([np.repeat(np.arange(len(max_img_list_control_flat)//nr_frames_patient),nr_frames_patient),np.repeat(np.arange(len(max_img_list_disease_flat)//nr_frames_patient),nr_frames_patient)])
+        condition_nr = np.concatenate([np.zeros(len(max_img_list_control_flat)), np.ones(len(max_img_list_disease_flat))])
+        
+        
+        dep_var = array_probs.flatten()
+            
+        data_dict = {'Image':img_nr,
+                     'Sample':sample_nr,
+                     'Patient':patient_nr,
+                     'Condition': condition_nr,
+                     'S':dep_var}
+        
+        formula = 'S ~ C(Condition)/C(Patient)/C(Sample)'
+        
+        desc = ModelDesc.from_formula(formula)
+        print(desc.describe())
+            
+        lm = ols(formula, data_dict).fit()
+        print(lm.summary())
+        
+        anova_table = anova_lm(lm)
+        print(anova_table)
+        writer_anova = pd.ExcelWriter(dir_results + 'statistics/statistics_anova_' + disease + '_'+protein_mode+'_channel'+str(drop_channel)+'_ds'+ str(1/sc_fac) +'_run' + str(run) +'.xlsx', engine='xlsxwriter')
+        anova_table.to_excel(writer_anova,sheet_name = disease)
+        writer_anova.save()
+        
+        
diff --git a/code/lung_lif_to_png.py b/code/lung_lif_to_png.py
new file mode 100644
index 0000000..d61120c
--- /dev/null
+++ b/code/lung_lif_to_png.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Transform lif files to png for easy access and visualization
+"""
+
+
+import numpy as np
+from readlif.reader import LifFile
+import skimage.io
+import os
+import microscopy_analysis as ma
+
+#%%
+# folder containing lif files
+in_dir = '../../../covidPatients/scans/' #data101220/'# data_October2020/sarcoidosis/' #'../../5April2020_control/' #healthy #'../../24December19_emphysema/' #sick 
+# output folder
+out_dir = '../data_corrected_bis/covid/' #control/ #/emphysema/
+file_names = ma.listdir_custom(in_dir, string_start_pos = -7, ext = 'lif')
+
+# make output directory if it does not exist
+if not os.path.exists(out_dir):
+    os.mkdir(out_dir)
+
+# run through all lif files and save as png for easy access and visualization
+for file in file_names:
+    print(file)
+    lif_in = LifFile(in_dir + file)
+    n_im = lif_in.num_images
+    dir_name_out = (out_dir + (file.split('.')[0]).replace(' ', '_') + '/')
+    if not os.path.exists(dir_name_out):
+        os.mkdir(dir_name_out)
+    for n in range(0,n_im):
+        print(n)
+        im_in = lif_in.get_image(n)
+        r,c,n_frames = im_in.dims[:3]
+        n_ch = im_in.channels
+        if (n_ch == 3)|(n_ch==4): #TODO: Fix support for removing channel 1 in covids. Could be generalised to select channels of interest, but we don't need this functionality now
+            dir_name_out_im = dir_name_out + ('frame_%02d' % n)
+            if not os.path.exists(dir_name_out_im):
+                os.mkdir(dir_name_out_im)
+            im_out = np.zeros((r,c,3), dtype='uint8')
+            for i in range(0,n_frames):
+                im_out[:,:,0] = np.array(im_in.get_frame(i,0,0)).transpose()
+                for j in range(1,3):
+                    if n_ch == 4: #support for removing channel 1 in covids.
+                        im_out[:,:,j] = np.array(im_in.get_frame(i,0,j+1)).transpose()
+                    else:
+                        im_out[:,:,j] = np.array(im_in.get_frame(i,0,j)).transpose()
+                skimage.io.imsave(dir_name_out_im + '/image_%02d.png' % i, im_out)
+        
diff --git a/code/microscopy_analysis.py b/code/microscopy_analysis.py
new file mode 100755
index 0000000..394ea5f
--- /dev/null
+++ b/code/microscopy_analysis.py
@@ -0,0 +1,624 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Helping functions for analysis of lif microscopy images
+
+Developed by monj@dtu.dk and abda@dtu.dk
+"""
+
+import os
+import skimage.io as io
+import skimage.transform
+import numpy as np
+import local_features as lf
+
+import matplotlib.pyplot as plt
+from math import ceil
+
+#%% General functions
+
+# sort list of directories from selected string position
+def sort_strings_customStart(strings, string_start_pos = 0):
+    """Sorts list of strings
+    
+    Input(s)
+    strings: list of strings
+    string_start_pos (default 0): defines the section of the string on which the ordering is based. 
+    
+    Output(s)
+    strings_sortered: list of sorted strings
+    
+    Author: Monica Jane Emerson (monj@dtu.dk)"""
+    
+    strings_roi = [i[string_start_pos:] for i in strings]
+    #key indicates that the indices (k) are sorted according to subjects_list[k]
+    ind = np.array(sorted(range(len(strings_roi)), key=lambda k: strings_roi[k])) 
+    strings_sorted = [strings[i] for i in ind]
+    
+    return strings_sorted
+
+#Provide the contents of a directory sorted
+def listdir_custom(directory, string_start_pos = 0, ext = '', dir_flag = False, base_name = False):
+    'monj@dtu.dk'
+    
+    if dir_flag:
+        if base_name:
+            list_dirs = [dI for dI in os.listdir(directory) if (os.path.isdir(os.path.join(directory,dI)) & dI[0:len(base_name)]==base_name)]
+        else:   
+            list_dirs = [dI for dI in os.listdir(directory) if os.path.isdir(os.path.join(directory,dI))]
+    else:
+        if base_name:
+            list_dirs = [dI for dI in os.listdir(directory)if dI[0:len(base_name)]==base_name] 
+        else:
+            if ext == '':
+                list_dirs = [f for f in os.listdir(directory) if f.endswith(ext)]
+            else:
+                list_dirs = [dI for dI in os.listdir(directory)] 
+    
+    listdirs_sorted = sort_strings_customStart(list_dirs,string_start_pos)
+    
+    return listdirs_sorted
+
+def flatten_list(list):
+    'monj@dtu.dk'
+    
+    list_flat = [item for sublist in list for item in sublist]
+    
+    return list_flat
+
+#%% IO functions
+
+#make directory, and subdirectories within, if they don't exist
+def make_output_dirs(directory,subdirectories = False):
+    'monj@dtu.dk'
+    
+    os.makedirs(directory, exist_ok = True)
+
+    if subdirectories:
+        for subdir in subdirectories:
+            os.makedirs(directory + subdir + '/', exist_ok = True)
+  
+    # def make_output_dirs(directory,subdirectories):
+    # 'monj@dtu.dk'
+    
+    # os.makedirs(directory, exist_ok = True)
+    # os.makedirs(directory + 'control/', exist_ok = True)
+    
+    # for disease in diseases:
+    #     os.makedirs(directory + disease + '/', exist_ok = True)
+  
+    
+#Reads images starting with base_name from the subdirectories of the input directory.
+#Option for reading scaled down versions and in bnw or colour
+def read_max_imgs(dir_condition, base_name, sc_fac = 1, colour_mode = 'colour'):
+    'monj@dtu.dk'
+    
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    #print(sample_list)
+    
+    max_img_list = []
+    for sample in sample_list:
+        sample_dir = dir_condition + '/' + sample + '/'
+        frame_list = listdir_custom(sample_dir, base_name = base_name)
+        #print(frame_list)
+        
+        frame_img_list = []
+        for frame in frame_list:
+            frame_path = sample_dir + frame
+            
+            #Option to load in bnw or colour
+            if colour_mode == 'bnw':
+                img = io.imread(frame_path, as_gray = True).astype('uint8')
+                if sc_fac ==1:
+                    frame_img_list += [img]
+                else:
+                    frame_img_list += [skimage.transform.rescale(img, sc_fac, preserve_range = True).astype('uint8')] 
+            else:
+                img = io.imread(frame_path).astype('uint8')
+                if sc_fac == 1:
+                    frame_img_list += [img]
+                else:
+                    frame_img_list += [skimage.transform.rescale(img, sc_fac, preserve_range = True, multichannel=True).astype('uint8')]
+        
+        max_img_list += [frame_img_list] 
+        #print(frame_img_list[0].dtype)
+        
+    return max_img_list
+
+#%% Functions for intensity inspection and image preprocessing
+
+# computes the maximum projection image from a directory of images
+def get_max_img(in_dir, ext = '.png', n_img = 0):    
+    file_names = [f for f in os.listdir(in_dir) if f.endswith(ext)]
+    file_names.sort()
+    if ( n_img < 1 ):
+        n_img = len(file_names)
+    img_in = io.imread(in_dir + file_names[0])
+    for i in range(1,n_img):
+        img_in = np.maximum(img_in, io.imread(in_dir + file_names[i]))
+    return img_in
+
+# computes a list of maximum projection images from a list of directories
+def compute_max_img_list(in_dir, ext, base_name, dir_out = ''):
+    """by abda
+    Modified by monj"""
+    
+    dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir,dI)) and dI[0:len(base_name)]==base_name)]
+    dir_list.sort()
+    
+    max_img_list = []
+    for d in dir_list:
+        image_dir_in = in_dir + d + '/'
+        max_img = get_max_img(image_dir_in, ext)
+        if dir_out!='':
+            os.makedirs(dir_out, exist_ok = True)
+            io.imsave(dir_out + d + '.png', max_img.astype('uint8')) 
+            
+        max_img_list += [max_img]
+        
+    return max_img_list
+
+# One more level up from compute_max_img_list. Computes a list of lists of maximum
+#projection images from a list of directories, each containing a list of directories
+#with the set of images that should be combined into a maximum projection.
+def comp_max_imgs(dir_condition, base_name, dir_out = ''):
+    'monj@dtu.dk'
+    
+    dir_list_condition = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    
+    max_img_list_condition = []
+    for directory in dir_list_condition:
+        dir_in = dir_condition + '/' + directory + '/'
+        
+        if dir_out!= '':
+            subdir_out = dir_in.replace(dir_condition,dir_out)
+        else:
+            subdir_out = ''
+        
+        max_img_condition = compute_max_img_list(dir_in, '.png', base_name, subdir_out)
+        max_img_list_condition+= [max_img_condition]
+    
+    return max_img_list_condition
+
+
+def comp_std_perChannel(max_im_list, th_int, dir_condition):   
+    'monj@dtu.dk'
+    
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    
+    std_list = [[],[],[]] 
+    for sample, frame_img_list in zip(sample_list, max_im_list):
+
+        for ind,img in enumerate(frame_img_list):
+            h, w, channels = img.shape 
+            
+            for channel in range(0,channels):
+                intensities = img[:,:,channel].ravel()
+                std_list[channel] += [(intensities[intensities>th_int]).std()]
+                
+    return std_list
+
+def intensity_spread_normalisation(img_list, th_int, mean_std, dir_condition, base_name, dir_results):
+    'monj@dtu.dk'
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    
+    img_corr_list = []
+    for sample, sample_imgs in zip(sample_list, img_list):
+        sample_dir = dir_condition + '/' + sample + '/'
+        #print(sample_dir)
+        frame_list = listdir_custom(sample_dir, base_name = base_name)
+        #print(frame_list)
+        frame_img_corr_list = []
+        for frame,img in zip(frame_list,sample_imgs):
+            h, w, channels = img.shape        
+            img_corr = img
+            for channel in range(0,channels):
+                img_channel = img_corr[:,:,channel]
+                img_channel[img_channel>th_int] = img_channel[img_channel>th_int]*(mean_std/img_channel.std())
+                img_corr[:,:,channel] = img_channel
+    
+            frame_img_corr_list += [img_corr]
+            os.makedirs(dir_results + '/' + sample, exist_ok = True)
+            io.imsave(dir_results + '/' + sample + '/' + frame +'.png', img_corr)
+            
+        img_corr_list += [frame_img_corr_list]
+        
+    return img_corr_list
+
+def rgb2cmy_list(img_list, dir_condition, base_name, dir_results):
+    'monj@dtu.dk'
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+
+    for sample, sample_imgs in zip(sample_list, img_list):
+        sample_dir = dir_condition + '/' + sample + '/'
+        #print(sample_dir)
+        frame_list = listdir_custom(sample_dir, base_name = base_name)
+        #print(frame_list)
+        for frame,img in zip(frame_list,sample_imgs):
+            img_cmy = rgb2cmy(img)
+
+            os.makedirs(dir_results + '/' + sample, exist_ok = True)
+            io.imsave(dir_results + '/' + sample + '/' + frame +'.png', img_cmy)
+            
+    
+#def plotHist_perChannel_imgset
+def plotHist_perChannel_imgset_list(max_img_list, dir_condition, dir_results = '', name_tag = 'original'):   
+    'monj@dtu.dk'
+    
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    
+    for sample, frame_img_list in zip(sample_list, max_img_list):
+        fig, axs = plt.subplots(4,len(frame_img_list), figsize = (len(frame_img_list)*2,4*2))
+        plt.suptitle('Sample '+sample[-4:] + ', acq. date: ' + sample[:6])
+            
+        for ind,img in enumerate(frame_img_list):
+            h, w, channels = img.shape 
+            axs[0][ind].imshow(img)
+            
+            for channel in range(0,channels):
+                intensities = img[:,:,channel].ravel()
+                axs[channel+1][ind].hist(intensities, bins = 50)
+                axs[channel+1][ind].set_aspect(1.0/axs[channel+1][ind].get_data_ratio())
+    
+        if dir_results!= '':
+            plt.savefig(dir_results + '/' + sample[-4:] + '_' + name_tag + '_perChannelHistograms.png', dpi = 300)
+            plt.close(fig)
+            
+#def compare_imgpairs   
+def compare_imgpairs_list(list1_imgsets, list2_imgsets, dir_condition, dir_results = ''):   
+    'monj@dtu.dk'
+    
+    sample_list = listdir_custom(dir_condition, string_start_pos = -4, dir_flag = True)
+    
+    for imgset1, imgset2, sample in zip(list1_imgsets, list2_imgsets, sample_list):
+        fig, axs = plt.subplots(2,len(imgset1), figsize = (2*len(imgset1),2*2),sharex=True,sharey=True)
+        plt.suptitle('Sample '+sample[-4:] + ', acq. date: ' + sample[:6])
+            
+        for ind, img1, img2 in zip(range(0,len(imgset1)), imgset1, imgset2):
+            axs[0][ind].imshow(img1)
+            axs[1][ind].imshow(img2)
+               
+        if dir_results!= '':
+            plt.savefig(dir_results + '/' + sample[-4:] + '_originalVScorrected.png', dpi=300)
+            plt.close(fig)
+        
+def black2white_background(im_n):
+    """
+    
+    Parameters
+    ----------
+    im_n : Normalized RGB image (intensities in range [0,1] of type float)
+
+    Returns
+    -------
+    im_conv : Normalized RGB image where black has been converted to white (or visa versa of type float)
+
+    abda@dtu.dk
+    """
+    im_conv = np.zeros((im_n.shape[:2]+(3,)))
+    im_conv[:,:,0] = (1-im_n[:,:,1])*(1-im_n[:,:,2]) + im_n[:,:,0]*np.maximum(im_n[:,:,1],im_n[:,:,2])
+    im_conv[:,:,1] = (1-im_n[:,:,0])*(1-im_n[:,:,2]) + im_n[:,:,1]*np.maximum(im_n[:,:,0],im_n[:,:,2])
+    im_conv[:,:,2] = (1-im_n[:,:,0])*(1-im_n[:,:,1]) + im_n[:,:,2]*np.maximum(im_n[:,:,0],im_n[:,:,1])
+    return im_conv
+
+def rgb2cmy(img):
+    'monj@dtu.dk'
+    img_n = img.astype(float)/255 #normalise
+    img_conv_n = black2white_background(img_n)
+    img_conv = (img_conv_n*255).astype('uint8') #denormalise
+    return 255-img_conv
+
+#%%Functions for feature analysis and visualisation
+
+# computes the max projection image and the features into a list
+def compute_max_img_feat(in_dir, ext, base_name, sigma, sc_fac, save = False, abs_intensity = True):
+    dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir,dI)) and dI[0:len(base_name)]==base_name)]
+    dir_list.sort()
+    max_img_list = []
+    for d in dir_list:
+        image_dir = in_dir + d + '/'
+        max_img = get_max_img(image_dir, ext)
+        
+        if save:
+            dir_name_out_img = in_dir.replace('data','maxProjImages')
+            os.makedirs(dir_name_out_img, exist_ok = True)
+
+            io.imsave( dir_name_out_img + d + '.png', max_img.astype('uint8'))
+            
+        max_img_list += [skimage.transform.rescale(max_img, sc_fac, multichannel=True)]
+    
+    feat_list = []
+    for max_img in max_img_list:
+        r,c = max_img.shape[:2]
+        feat = np.zeros((r*c,45))
+        for i in range(0,3):
+            feat_tmp = lf.get_gauss_feat_im(max_img[:,:,i], sigma)
+            feat[:,i*15:(i+1)*15] = feat_tmp.reshape(-1,feat_tmp.shape[2])
+        if not(abs_intensity):
+            feat_list += [np.concatenate((feat[:,1:15],feat[:,16:30],feat[:,31:45]),axis = 1)]
+        else:    
+            feat_list += [feat]
+    return max_img_list, feat_list
+
+
+# computes a histogram of features from a kmeans object
+def compute_assignment_hist(feat_list, kmeans, background_feat = None):
+    assignment_list = []
+    for feat in feat_list:
+        assignment_list += [kmeans.predict(feat)] 
+    edges = np.arange(kmeans.n_clusters+1)-0.5 # histogram edges halfway between integers
+    hist = np.zeros(kmeans.n_clusters)
+    for a in assignment_list:
+        hist += np.histogram(a,bins=edges)[0]
+        
+    sum_hist = np.sum(hist)
+    hist/= sum_hist
+    
+    if background_feat is not None:
+        assignment_back = kmeans.predict(background_feat)
+        hist_back = np.histogram(assignment_back,bins=edges)[0]
+        return hist, assignment_list, hist_back, assignment_back
+    else: 
+        return hist, assignment_list
+
+
+# image to array of patches
+def im2col(A, BSZ, stepsize=1, norm=False):
+    # Parameters
+    m,n = A.shape
+    s0, s1 = A.strides    
+    nrows = m-BSZ[0]+1
+    ncols = n-BSZ[1]+1
+    shp = BSZ[0],BSZ[1],nrows,ncols
+    strd = s0,s1,s0,s1
+
+    out_view = np.lib.stride_tricks.as_strided(A, shape=shp, strides=strd)
+    out_view_shaped = out_view.reshape(BSZ[0]*BSZ[1],-1)[:,::stepsize]
+    if norm:
+        one_norm = np.sum(out_view_shaped,axis=0)
+        ind_zero_norm = np.where(one_norm !=0)
+        out_view_shaped[:,ind_zero_norm] = 255*out_view_shaped[:,ind_zero_norm]/one_norm[ind_zero_norm]
+    return out_view_shaped
+
+# nd image to array of patches
+def ndim2col(A, BSZ, stepsize=1, norm=False):
+    if(A.ndim == 2):
+        return im2col(A, BSZ, stepsize, norm)
+    else:
+        r,c,l = A.shape
+        patches = np.zeros((l*BSZ[0]*BSZ[1],(r-BSZ[0]+1)*(c-BSZ[1]+1)),dtype=A.dtype)
+        for i in range(l):
+            patches[i*BSZ[0]*BSZ[1]:(i+1)*BSZ[0]*BSZ[1],:] = im2col(A[:,:,i],BSZ,stepsize,norm)
+        return patches
+    
+# nd image to array of patches with mirror padding along boundaries
+def ndim2col_pad(A, BSZ, stepsize=1, norm=False):
+    r,c = A.shape[:2]
+    if (A.ndim == 2):
+        l = 1
+    else:
+        l = A.shape[2]
+    tmp = np.zeros((r+BSZ[0]-1,c+BSZ[1]-1,l),dtype = A.dtype)
+    fhr = int(np.floor(BSZ[0]/2))
+    fhc = int(np.floor(BSZ[1]/2))
+    thr = int(BSZ[0]-fhr-1)
+    thc = int(BSZ[1]-fhc-1)
+    
+    tmp[fhr:fhr+r,fhc:fhc+c,:] = A.reshape((r,c,l))
+    tmp[:fhr,:] = np.flip(tmp[fhr:fhr*2,:], axis=0)
+    tmp[fhr+r:,:] = np.flip(tmp[r:r+thr,:], axis=0)
+    tmp[:,:fhc] = np.flip(tmp[:,fhc:fhc*2], axis=1)
+    tmp[:,fhc+c:] = np.flip(tmp[:,c:c+thc], axis=1)
+    tmp = np.squeeze(tmp)
+    return ndim2col(tmp,BSZ,stepsize,norm)
+
+
+def patch2featvector(patches, colour_mode = 'bnw', colour_weight = 1):
+    patch_size = int((patches.shape[1]/3)**0.5)
+    patches_reshaped = patches.reshape(patches.shape[0],3,patch_size,patch_size)
+    patches_bnw = np.mean(patches_reshaped,axis = 1)
+    
+    if colour_mode == 'bnw':
+        features = patches_bnw.reshape(patches.shape[0],patch_size**2)
+    else:
+        int_perchannel = np.mean(np.mean(patches_reshaped,axis = 2),axis = 2)
+        features = [patches_bnw.reshape(patches.shape[0],patch_size**2), colour_weight*int_perchannel]
+    return features
+
+def patch2featvector_list(list_patches, colour_mode = 'bnw', colour_weight = 1):
+    list_features = []
+    for patches in list_patches:
+        features = patch2featvector(patches, colour_mode, colour_weight)
+        list_features += [features]
+        
+    return list_features
+
+def comp_prob_imgs(prob_control,assignment_list_condition, prob_img_shape):
+    #Compute probability images for condition
+    r,c = prob_img_shape
+    prob_img_list_condition = []
+    for assignment in assignment_list_condition:
+        prob = prob_control[assignment.astype(int)]
+        prob_img_list_condition += [prob.reshape(r,c)]
+    return prob_img_list_condition
+    
+#%% Functions for visualisation of learnt features
+def plot_grid_cluster_centers(cluster_centers, cluster_order, patch_size, protein_mode = 'triple', colour_mode = 'rgb', channel = 'all', titles = False, occurrence = ''):
+    #grid dimensions
+    size_x = round(len(cluster_order)**(1/2))
+    size_y = ceil(len(cluster_order)/size_x)
+
+    #figure format
+    overhead = 0
+    if titles:
+        overhead = 1
+    w, h = plt.figaspect(size_x/size_y)
+    fig, axs = plt.subplots(size_x,size_y, figsize=(1.3*w,1.3*h*(1+overhead/2)), sharex=True, sharey=True)
+    #print('Grid size: ', grid_size[1], grid_size[2], 'Figure size: ', w, h)
+    
+    int_sum_list = []
+    ax_list = axs.ravel()
+    for ind, cluster in enumerate(cluster_order):
+        #print(ind)
+        if colour_mode =='bnw':
+            cluster_centre = np.reshape(cluster_centers[cluster,:],(patch_size,patch_size))
+            ax_list[ind].imshow(cluster_centre.astype('uint8'),cmap='gray')
+        else:
+            if protein_mode == 'single': #in bnw + colour give the clusters a uniform colour
+                cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype)
+                cluster_centre[:,:,channel] = np.reshape(cluster_centers[cluster,:],(patch_size,patch_size))
+                
+            elif protein_mode == 'dropout':
+                cluster_centre = np.zeros((patch_size,patch_size,3),dtype = cluster_centers.dtype)
+                keep_channels = [x for x in range(3) if x!=channel]
+                cluster_centre[:,:,keep_channels] = np.transpose(np.reshape(cluster_centers[cluster,:],(2,patch_size,patch_size)),(1,2,0)) 
+
+            else:
+                cluster_centre = np.transpose(np.reshape(cluster_centers[cluster,:],(3,patch_size,patch_size)),(1,2,0))
+                if channel!='all':
+                    remove_channels = [x for x in range(3) if x!=channel]
+                    cluster_centre[:,:,remove_channels] = np.zeros((patch_size,patch_size,2),dtype = cluster_centers.dtype)
+                    
+            if colour_mode == 'cmy':
+                cluster_centre = rgb2cmy(cluster_centre)
+            ax_list[ind].imshow(cluster_centre.astype('uint8'))
+        
+        if titles:
+            if occurrence !='':
+                ax_list[ind].set_title(round(occurrence[ind],2))
+            else:
+                ax_list[ind].set_title(cluster)
+        
+        if colour_mode =='bnw':
+            int_sum_list += [sum((cluster_centre).ravel())] 
+        else:
+            int_sum_list += [sum((np.max(cluster_centre,2)).ravel())] 
+        
+    plt.setp(axs, xticks=[], yticks=[])
+    #plt.show()
+    
+    return int_sum_list
+ 
+#%% Functions for channel statistics  
+def smooth(y, box_pts):
+    box = np.ones(box_pts)/box_pts
+    y_smooth = np.convolve(y, box, mode='same')
+    return y_smooth
+
+def hist_outline(density, bins, axs, color = 'k', w = 1):
+    y,binEdges=np.histogram(density,bins=bins, density = True)
+    bincenters = 0.5*(binEdges[1:]+binEdges[:-1])
+    axs.plot(bincenters,smooth(y,w), color = color)
+    
+def channel_corr_scatter(pixel_probs, axs, bins, lims = (0,1), w = 1, edgecolors='b', s=5):
+
+    hist_outline(pixel_probs[0],bins,axs[0][0], color = edgecolors, w = w)
+    axs[0][0].set_xlim(lims[0],lims[1])
+    axs[0][0].xaxis.set_label_position('top') 
+    axs[0][0].set_xlabel('Colagen 4'),axs[0][0].set_ylabel('Colagen 4'),  
+    axs[1][0].scatter(pixel_probs[0],pixel_probs[1], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[1][0].set_xlim(lims[0],lims[1]), axs[1][0].set_ylim(lims[0],lims[1])
+    axs[1][0].set_ylabel('Fibrilar colagen')
+    axs[2][0].scatter(pixel_probs[0],pixel_probs[2], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[2][0].set_xlim(lims[0],lims[1]), axs[2][0].set_ylim(lims[0],lims[1])
+    axs[2][0].set_ylabel('Elastin')
+    
+    axs[0][1].scatter(pixel_probs[1],pixel_probs[0], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[0][1].set_xlim(lims[0],lims[1]), axs[0][1].set_ylim(lims[0],lims[1])
+    axs[0][1].xaxis.set_label_position('top'), axs[0][1].set_xlabel('Fibrilar Colagen') 
+    hist_outline(pixel_probs[1],bins,axs[1][1], color = edgecolors, w = w)
+    axs[1][1].set_xlim(lims[0],lims[1])
+    axs[2][1].scatter(pixel_probs[1],pixel_probs[2], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[2][1].set_xlim(lims[0],lims[1]), axs[2][1].set_ylim(lims[0],lims[1])
+    
+    
+    axs[0][2].scatter(pixel_probs[2],pixel_probs[0], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[0][2].set_xlim(lims[0],lims[1]), axs[0][2].set_ylim(lims[0],lims[1])
+    axs[0][2].xaxis.set_label_position('top'), axs[0][2].set_xlabel('Elastin') 
+    axs[1][2].scatter(pixel_probs[2],pixel_probs[1], s=s, facecolors='none', edgecolors=edgecolors)
+    axs[1][2].set_xlim(lims[0],lims[1]), axs[1][2].set_ylim(lims[0],lims[1])
+    hist_outline(pixel_probs[2],bins,axs[2][2], color = edgecolors, w = w)
+    axs[2][2].set_xlim(lims[0],lims[1])
+
+def channel_corr_density(pixel_probs, lims = (0,1)): 
+    fig, axs = plt.subplots(3,3, figsize = (6,6)) 
+    
+    axs[0][0].hist(pixel_probs[0],density = True),
+    axs[0][0].set_xlim(lims[0],lims[1])
+    axs[0][0].xaxis.set_label_position('top') 
+    axs[0][0].set_xlabel('Colagen 4'),axs[0][0].set_ylabel('Colagen 4'),  
+    axs[1][0].hist2d(pixel_probs[0],pixel_probs[1])
+    axs[1][0].set_xlim(lims[0],lims[1]), axs[1][0].set_ylim(lims[0],lims[1])
+    axs[1][0].set_ylabel('Fibrilar colagen')
+    axs[2][0].hist2d(pixel_probs[0],pixel_probs[2])
+    axs[2][0].set_xlim(lims[0],lims[1]), axs[2][0].set_ylim(lims[0],lims[1])
+    axs[2][0].set_ylabel('Elastin')
+    
+    axs[0][1].hist2d(pixel_probs[1],pixel_probs[0])
+    axs[0][1].set_xlim(lims[0],lims[1]), axs[0][1].set_ylim(lims[0],lims[1])
+    axs[0][1].xaxis.set_label_position('top'), axs[0][1].set_xlabel('Fibrilar Colagen') 
+    axs[1][1].hist(pixel_probs[1],density = True)
+    axs[1][1].set_xlim(lims[0],lims[1])
+    axs[2][1].hist2d(pixel_probs[1],pixel_probs[2])
+    axs[2][1].set_xlim(lims[0],lims[1]), axs[2][1].set_ylim(lims[0],lims[1])
+    
+    axs[0][2].hist2d(pixel_probs[2],pixel_probs[0])
+    axs[0][2].set_xlim(lims[0],lims[1]), axs[0][2].set_ylim(lims[0],lims[1])
+    axs[0][2].xaxis.set_label_position('top'), axs[0][2].set_xlabel('Elastin') 
+    axs[1][2].hist2d(pixel_probs[2],pixel_probs[1])
+    axs[1][2].set_xlim(lims[0],lims[1]), axs[1][2].set_ylim(lims[0],lims[1])
+    axs[2][2].hist(pixel_probs[2],density = True)
+    axs[2][2].set_xlim(lims[0],lims[1])
+    
+def get_patient_probs(img_probs):
+    nr_frames_patient = 10
+    patient_probs_allchannels= []
+    for img_channel in img_probs:
+        nr_patients = len(img_channel)//nr_frames_patient
+        patient_probs = []
+        for patient in range(nr_patients):
+            patient_img_probs = img_channel[patient*nr_frames_patient: (patient+1)*nr_frames_patient]
+            patient_probs += [sum(patient_img_probs)/len(patient_img_probs)]
+        patient_probs_allchannels += [patient_probs] 
+        
+    return patient_probs_allchannels
+    
+# def plot_mapsAndimages(dir_condition, directory_list, map_img, max_img_list, base_name = 'frame', r = 1024, c = 1024):
+#     nr_list = 0
+#     for directory in directory_list:
+#         in_dir = dir_condition + directory + '/'
+#         dir_list = [dI for dI in os.listdir(in_dir) if (os.path.isdir(os.path.join(in_dir ,dI)) and dI[0:len(base_name)]==base_name)]
+#         print(dir_list)
+#         img_all_control = []
+#         img_all_control += map_img[nr_list:nr_list+len(dir_list)]
+#         img_all_control += max_img_list[nr_list:nr_list+len(dir_list)]
+#         fig, axs = plt.subplots(2,len(dir_list), sharex=True, sharey=True)
+#         nr = 0
+#         prob_sample = np.zeros((len(dir_list),1))
+#         for ax, img in zip(axs.ravel(), img_all_control):
+#             print(nr)
+#             if nr<len(dir_list):
+#                 prob_img = sum(sum(img))/(r*c)
+#                 ax.set_title('Pc '+str(round(prob_img,2)))
+#                 prob_sample[nr] = prob_img
+#                 #print(prob_sample)
+#             nr += 1
+#             if ( img.ndimg == 2 ):
+#                 ax.imshow(skimage.transform.resize(img, (1024,1024)), cmap=plt.cm.bwr, vmin = 0, vmax = 1)
+#             else:
+#                 ax.imshow(skimage.transform.resize(img.astype(np.uint8), (1024,1024)))
+#         plt.suptitle('Probability control of sample '+str(directory)+', avg:' + str(round(prob_sample.mean(),2))+ ' std:' + str(round(prob_sample.std(),2)))
+#         plt.savefig(dir_probs + '/control/' + '/probImControl_'+str(directory)+'_%dclusters_%dsigma%ddownscale_absInt%s.png'%(nr_clusters,sigma,1/sc_fac,abs_intensity), dpi=1000)
+#         ##plt.show()
+#         nr_list += len(dir_list)
+
+
+
+
+
+
+
+
+
+
+
-- 
GitLab