    # -*- coding: utf-8 -*-
    """
    Created on Wed May 16 14:27:21 2018
    This script prepares the input for active learning: training data, pool data and validation data.
    @author: s161488
    """
    import numpy as np
    import tensorflow as tf  # TensorFlow 1.x API (tf.contrib, tf.random_crop) is used below
    import cv2
    import matplotlib.pyplot as plt
    
    
    path_mom = "DATA/"  # NOTE: needs to be defined manually
    
    
    def prepare_train_data(path, select_benign_train, select_mali_train):
        """
        Args:
            path: the path where the data is saved
            select_benign_train: a list of indices of the benign images selected for training
            select_mali_train: a list of indices of the malignant images selected for training
        Ops:
            First, the images, labels, edges, im_index and cls_index are extracted from the np.load file:
            images: shape [85, im_h, im_w, 3]
            labels: shape [85, im_h, im_w, 1]
            im_index: shape [85]
            cls_index: shape [85]
            select_benign_train and select_mali_train determine which images initialize the training set
        Output:
            data_train, data_pl, data_val: each is a tuple (images, labels, edges, im_index, cls_index)
            for the training, pool and validation split, respectively
        """
        data_set = np.load(path, allow_pickle=True).item()
        images = data_set['image']
        labels = data_set['label']
        edges = data_set['edge']
        imageindex = data_set['ImageIndex']
        classindex = data_set['ClassIndex']
        benign_index = np.where(np.array(classindex) == 1)
        mali_index = np.where(np.array(classindex) == 2)
    
        choose_index_tr = np.concatenate([benign_index[0][select_benign_train], mali_index[0][select_mali_train]], axis=0)
        benign_index_left = np.delete(range(np.shape(benign_index[0])[0]), select_benign_train)
        mali_index_left = np.delete(range(np.shape(mali_index[0])[0]), select_mali_train)
    
        choose_index_pl = np.concatenate([benign_index[0][benign_index_left[:27]], mali_index[0][mali_index_left[:38]]],
                                         axis=0)
        choose_index_val = np.concatenate([benign_index[0][benign_index_left[-5:]], mali_index[0][mali_index_left[-5:]]],
                                          axis=0)
        data_train = extract_diff_data(images, labels, edges, imageindex, classindex, choose_index_tr)
        data_pl = extract_diff_data(images, labels, edges, imageindex, classindex, choose_index_pl)
        data_val = extract_diff_data(images, labels, edges, imageindex, classindex, choose_index_val)
    
        return data_train, data_pl, data_val
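
    # Example usage (illustrative sketch; the .npy file name below is an assumption, not a path
    # fixed by this script, while the two index lists mirror the defaults in prepare_pool_data):
    #
    #     data_train, data_pool, data_val = prepare_train_data(
    #         path_mom + "Data/glanddata.npy",
    #         select_benign_train=[0, 1, 2, 3, 4],
    #         select_mali_train=[2, 4, 5, 6, 7])
    #     x_train, y_train, edge_train = data_train[0], data_train[1], data_train[2]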
    
    
    def prepare_pool_data(path, aug=False):
        """Extract the pool data, i.e. the images left over after the fixed training selection.
        If aug is True, the pool images are zero-padded to a common size so that they can be batched.
        """
        data_set = np.load(path, allow_pickle=True).item()
        images = data_set['image']
        labels = data_set['label']
        edges = data_set['edge']
        imageindex = data_set['ImageIndex']
        classindex = data_set['ClassIndex']
        select_benign_train = [0, 1, 2, 3, 4]
        select_mali_train = [2, 4, 5, 6, 7]
    
        benign_index = np.where(np.array(classindex) == 1)
        mali_index = np.where(np.array(classindex) == 2)
    
        benign_index_left = np.delete(range(np.shape(benign_index[0])[0]), select_benign_train)
        mali_index_left = np.delete(range(np.shape(mali_index[0])[0]), select_mali_train)
    
        choose_index_pl = np.concatenate([benign_index[0][benign_index_left[:27]], mali_index[0][mali_index_left[:38]]],
                                         axis=0)
        data_pl = extract_diff_data(images, labels, edges, imageindex, classindex, choose_index_pl)
        if aug is True:
            targ_height_npy = 528  # this is for padding images
            targ_width_npy = 784  # this is for padding images
            x_image_val, y_label_val, y_edge_val = padding_training_data(data_pl[0], data_pl[1],
                                                                         data_pl[2], targ_height_npy,
                                                                         targ_width_npy)
            data_pl = [x_image_val, y_label_val, y_edge_val]
    
        return data_pl
    
    
    def prepare_skin_data(path, num_tr, combine=True):
        """
        choose_index_tr: the worst num_tr + the best num_tr images (combine=True),
        or the middle 2 * num_tr images (combine=False), ranked by the initial segmentation score.
        num_tr should be half of the total number of training images at the initial step
        (32 has been tried; 16 is the next setting to check).
        """
        val_num_im = 96
        tot_numeric_index = np.arange(900)
        if combine is True:
            tr_select_numeric_index = np.concatenate([tot_numeric_index[:num_tr], tot_numeric_index[-num_tr:]], axis=0)
        else:
            tr_select_numeric_index = tot_numeric_index[340:(340 + num_tr * 2)]
        val_select_numeric_index = tot_numeric_index[500:(500 + val_num_im)]
        pool_numeric_index = np.delete(tot_numeric_index,
                                       np.concatenate([tr_select_numeric_index, val_select_numeric_index], axis=0))
        im_seg_score = np.load('/home/s161488/Exp_Stat/Skin_Lesion/init_segment_score.npy')  # NOTE: hard-coded path, needs to be adapted manually
        sorted_index = np.argsort(im_seg_score)
    
        data_set = np.load(path, encoding='latin1', allow_pickle=True).item()
        images = np.array(data_set['image'])
        labels = np.array(data_set['label'])
        edges = np.array(data_set['edge'])
    
        labels = np.expand_dims(labels, axis=-1)
        edges = np.expand_dims(edges, axis=-1)
        tr_select_image_index = np.sort(sorted_index[tr_select_numeric_index])
        val_select_image_index = np.sort(sorted_index[val_select_numeric_index])
        pl_select_image_index = np.sort(sorted_index[pool_numeric_index])
    
        imindex = np.arange(np.shape(images)[0])
        clsindex = np.ones(np.shape(images)[0])
    
        data_tr = extract_diff_data(images, labels, edges, imindex, clsindex, tr_select_image_index)
        data_pl = extract_diff_data(images, labels, edges, imindex, clsindex, pl_select_image_index)
        data_val = extract_diff_data(images, labels, edges, imindex, clsindex, val_select_image_index)
    
        return data_tr[:3], data_pl[:3], data_val[:3]
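
    # Example usage (illustrative sketch; the .npy file name and num_tr = 16 are assumptions,
    # only the hard-coded segmentation-score path above is prescribed by this script):
    #
    #     data_tr, data_pl, data_val = prepare_skin_data(path_mom + "Data/skin_lesion.npy", num_tr=16, combine=True)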
    
    
    def prepare_test_data(path):
        """Load the test data from path. If the path does not point to a test file, fall back to the pool data."""
        if "_test" not in path:
            print("-------I am loading the data from the pool set------")
            return prepare_pool_data(path)
        data_set = np.load(path, allow_pickle=True).item()
        images = data_set['image']
        labels = data_set['label']
        edges = data_set['edge']
        imageindex = data_set['ImageIndex']
        classindex = data_set['ClassIndex']
        return images, labels, edges, imageindex, classindex
    
    
    def generate_batch(x_image_tr, y_label_tr, y_edge_tr, y_binary_mask_tr, batch_index, batch_size):
        """Slice the next batch out of the four training arrays and return it together with the advanced batch index."""
        im_group = [x_image_tr, y_label_tr, y_edge_tr, y_binary_mask_tr]
        im_batch = []
        for single_im in im_group:
            _im_batch = single_im[batch_index:(batch_size + batch_index)]
            im_batch.append(_im_batch)
        batch_index = batch_index + batch_size
        return im_batch[0], im_batch[1], im_batch[2], im_batch[3], batch_index
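
    # Example usage (illustrative sketch; batch_size = 2 is an arbitrary choice and the *_tr
    # arrays are assumed to come from padding_training_data, nothing here is fixed by this script):
    #
    #     batch_index = 0
    #     while batch_index + 2 <= np.shape(x_image_tr)[0]:
    #         x_b, y_b, e_b, m_b, batch_index = generate_batch(
    #             x_image_tr, y_label_tr, y_edge_tr, y_binary_mask_tr, batch_index, 2)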
    
    
    def padding_training_data(x_image, y_label, y_edge, target_height, target_width):
        """Each image has different size, so I need to pad it with zeros to make sure each image has the same size.
           Random crop, rotation and other augmentations can then be performed per batch instead of per image.
        """
        x_im_pad, y_la_pad, y_ed_pad = [], [], []
        num_image = np.shape(x_image)[0]
        for i in range(num_image):
            image_pad, label_pad, edge_pad = padding_zeros(x_image[i], y_label[i], y_edge[i], target_height, target_width)
            x_im_pad.append(image_pad)
            y_la_pad.append(label_pad)
            y_ed_pad.append(edge_pad)
        x_im_pad = np.reshape(x_im_pad, [num_image, target_height, target_width, 3])
        y_la_pad = np.reshape(y_la_pad, [num_image, target_height, target_width, 1])
        y_ed_pad = np.reshape(y_ed_pad, [num_image, target_height, target_width, 1])
        return x_im_pad, y_la_pad, y_ed_pad
    
    
    def padding_zeros(image, label, edge, target_height, target_width):
        im_h, im_w, _ = np.shape(image)
        delta_w = target_width - im_w
        delta_h = target_height - im_h
        top, bottom = delta_h // 2, delta_h - delta_h // 2
        left, right = delta_w // 2, delta_w - delta_w // 2
        image_pad = np.pad(image, ((top, bottom), (left, right), (0, 0)), mode='constant')
        label_pad = np.pad(label, ((top, bottom), (left, right)), mode='constant')
        edge_pad = np.pad(edge, ((top, bottom), (left, right)), mode='constant')
        return image_pad, label_pad, edge_pad
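
    # Worked example (sizes assumed for illustration): padding a 522 x 775 image to 528 x 784 gives
    # delta_h = 6 and delta_w = 9, so 3 rows of zeros go on top, 3 on the bottom, 4 columns on the
    # left and 5 on the right, which keeps the original content roughly centered.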
    
    
    def extract_diff_data(image, label, edge, im_index, cls_index, choose_index):
        new_data = [[] for _ in range(5)]
        old_data = [image, label, edge, im_index, cls_index]
        for i in choose_index:
            for single_new, single_old in zip(new_data, old_data):
                single_new.append(single_old[i])
        return new_data[0], new_data[1], new_data[2], new_data[3], new_data[4]
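
    # For example (toy call, purely illustrative): with choose_index = [0, 2] this returns plain
    # Python lists holding only the first and third image, label, edge, image index and class index.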
    
    
    def aug_train_data(image, label, edge, binary_mask, batch_size, aug, imshape):
        """This function is used for performing data augmentation. 
        image: placeholder. shape: [Batch_Size, im_h, im_w, 3], tf.float32
        label: placeholder. shape: [Batch_Size, im_h, im_w, 1], tf.int64
        edge: placeholder. shape: [Batch_Size, im_h, im_w, 1], tf.int64
        binary_mask: placeholder. shape: [Batch_Size, im_h, im_w, 1], tf.int64
        aug: bool
        imshape: [targ_h, targ_w, ch]
        Outputs:
        image: [Batch_Size, targ_h, targ_w, 3]
        label: [Batch_Size, targ_h, targ_w, 1]
        edge: [Batch_Size, targ_h, targ_w, 1]
        binary_mask: [Batch_Size, targ_h, targ_w, 1]
        
        """
        image = tf.cast(image, tf.int64)
        bigmatrix = tf.concat([image, label, edge, binary_mask], axis=3)
        target_height = imshape[0].astype('int32')
        target_width = imshape[1].astype('int32')
        if aug is True:
            bigmatrix_crop = tf.random_crop(bigmatrix, size=[batch_size, target_height, target_width, 6])
            bigmatrix_crop = tf.cond(tf.less_equal(tf.reduce_sum(bigmatrix_crop[:, :, :, 5]), 10),
                                     lambda: tf.image.resize_image_with_crop_or_pad(bigmatrix, target_height, target_width),
                                     lambda: bigmatrix_crop)
            # if the random crop contains almost no annotated pixels (binary-mask sum <= 10),
            # the tf.cond above falls back to a center crop/pad of the full matrix
            k = tf.random_uniform(shape=[batch_size], minval=0, maxval=6.5, dtype=tf.float32)  # rotation angles in radians
            bigmatrix_rot = tf.contrib.image.rotate(bigmatrix_crop, angles=k)
            image_aug = tf.cast(bigmatrix_rot[:, :, :, 0:3], tf.float32)
            label_aug = bigmatrix_rot[:, :, :, 3]
            edge_aug = bigmatrix_rot[:, :, :, 4]
            binary_mask_aug = bigmatrix_rot[:, :, :, 5]
        else:
            bigmatrix_rot = tf.image.resize_image_with_crop_or_pad(bigmatrix, target_height, target_width)
            image_aug = tf.cast(tf.cast(bigmatrix_rot[:, :, :, 0:3], tf.uint8), tf.float32)
            label_aug = tf.cast(bigmatrix_rot[:, :, :, 3], tf.int64)
            edge_aug = tf.cast(bigmatrix_rot[:, :, :, 4], tf.int64)
            binary_mask_aug = tf.cast(bigmatrix_rot[:, :, :, 5], tf.int64)
        return image_aug, tf.expand_dims(label_aug, -1), tf.expand_dims(edge_aug, -1), tf.expand_dims(binary_mask_aug, -1)
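
    # Example usage (illustrative TF1 graph sketch; the placeholder shapes and the 480 x 480 crop
    # size are assumptions, not values fixed by this script):
    #
    #     image_pl = tf.placeholder(tf.float32, [5, None, None, 3])
    #     label_pl = tf.placeholder(tf.int64, [5, None, None, 1])
    #     edge_pl = tf.placeholder(tf.int64, [5, None, None, 1])
    #     mask_pl = tf.placeholder(tf.int64, [5, None, None, 1])
    #     im_aug, la_aug, ed_aug, bm_aug = aug_train_data(
    #         image_pl, label_pl, edge_pl, mask_pl, batch_size=5,
    #         aug=True, imshape=np.array([480, 480, 3]))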
    
    
    def collect_test_data(resize=True):
        test_a_path = path_mom + "/Data/glanddata_testa.npy"
        test_b_path = path_mom + "/Data/glanddata_testb.npy"
        image_tot, label_tot = [], []
        target_height, target_width = 528, 784
        for single_path in [test_a_path, test_b_path]:
            data_set = np.load(single_path, allow_pickle=True).item()
            images = data_set['image']
            y_label_pl = data_set['label']
            y_edge_pl = data_set['edge']
            x_image_val = []
            y_label_val = []
            if resize is True:
                for single_im, single_label in zip(images, y_label_pl):
                    for _im_, _path_ in zip([single_im, single_label], [x_image_val, y_label_val]):
                        _im_ = cv2.resize(_im_, dsize=(target_width, target_height), interpolation=cv2.INTER_CUBIC)
                        _path_.append(_im_)
            else:
                x_image_val, y_label_val, y_edge_val = padding_training_data(images, y_label_pl, y_edge_pl, target_height,
                                                                             target_width)
            image_tot.append(x_image_val)
            label_tot.append(y_label_val)
        image_tot = np.concatenate([image_tot[0], image_tot[1]], axis=0)
        label_tot = np.concatenate([label_tot[0], label_tot[1]], axis=0)
        print("The shape of the test images", np.shape(image_tot))
        return image_tot, label_tot
    
    
    def save_im():
        im_tot, la_tot = collect_test_data()
        rand_value = np.random.choice(np.arange(len(im_tot)), 3, replace=False)
        for i in rand_value:
            fig = plt.figure(figsize=(10, 4))
            im_ = im_tot[i]
            la_ = la_tot[i]
            la_judge = (la_ != 0)
            for iterr, single_im in enumerate([im_, la_, la_judge]):
                ax = fig.add_subplot(1, 3, iterr + 1)
                ax.imshow(single_im)
            plt.savefig('/home/blia/im_%d.pdf' % i, pad_inches=0, bbox_inches='tight')
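

    if __name__ == "__main__":
        # Minimal smoke test (a sketch; it assumes the two gland test .npy files referenced in
        # collect_test_data exist under path_mom and that the output directory is writable).
        save_im()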