Skip to content
Snippets Groups Projects
ex2_4_1.py 3.72 KiB
Newer Older
  • Learn to ignore specific revisions
  • bjje's avatar
    bjje committed
    """
    Note: This is a long script. You may want to use breakpoints.
    """
    
    Stas Syrota's avatar
    Stas Syrota committed
    import importlib_resources
    
    bjje's avatar
    bjje committed
    import numpy as np
    
    bjje's avatar
    bjje committed
    import matplotlib.pyplot as plt
    
    bjje's avatar
    bjje committed
    from scipy.io import loadmat
    from scipy.stats import zscore
    
    
    Stas Syrota's avatar
    Stas Syrota committed
    # Resource path of data/wine.mat inside the dtuimldmtools package.
    filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")
    
    
    bjje's avatar
    bjje committed
    # Load the Matlab data file and pull out the variables used by this script.
    mat_data = loadmat(filename)
    X = mat_data["X"]
    y = mat_data["y"].squeeze()
    # C, M and N are each read from element [0, 0] of their stored arrays.
    C, M, N = (mat_data[key][0, 0] for key in ("C", "M", "N"))
    # Every name entry is nested; [0][0] extracts the innermost value.
    attributeNames = [entry[0][0] for entry in mat_data["attributeNames"]]
    classNames = [entry[0][0] for entry in mat_data["classNames"]]

    print("Data loaded")
    
    
    bjje's avatar
    bjje committed
    # We start with a box plot of each attribute, drawn on the raw data and
    # with the x-axis ticks labelled by attribute name.
    plt.figure()
    plt.boxplot(X)
    plt.title("Wine: Boxplot")
    plt.xticks(ticks=range(1, M + 1), labels=attributeNames, rotation=45)
    
    bjje's avatar
    bjje committed
    
    # From this it is clear that there are some outliers in the Alcohol
    # attribute (10x10^14 is clearly not a proper value for alcohol content)
    # However, it is impossible to see the distribution of the data, because
    # the axis is dominated by these extreme outliers. To avoid this, we plot a
    # box plot of standardized data (using the zscore function).
    
    bjje's avatar
    bjje committed
    # Box plot of the standardized data (z-scores computed with ddof=1).
    plt.figure(figsize=(12, 6))
    plt.title("Wine: Boxplot (standarized)")
    # Only the data is passed to boxplot: its second positional parameter is
    # `notch`, not the tick labels, so handing it attributeNames would
    # unintentionally switch on notched boxes. The labels are set by the
    # plt.xticks call below instead.
    plt.boxplot(zscore(X, ddof=1))
    plt.xticks(range(1, M + 1), attributeNames, rotation=45)

    # This plot reveals that there are clearly some outliers in the Volatile
    # acidity, Density, and Alcohol attributes, i.e. attribute number 2, 8,
    # and 11.
    plt.show()
    
    bjje's avatar
    bjje committed
    
    # Next, we plot histograms of all attributes.
    plt.figure(figsize=(14, 9))

    # Subplot grid with u rows and v columns, chosen so that u * v >= M.
    u = np.floor(np.sqrt(M))
    v = np.ceil(float(M) / u)
    for i in range(M):
        plt.subplot(int(u), int(v), i + 1)
        plt.hist(X[:, i])
        plt.xlabel(attributeNames[i])
        # Make the y-axes equal for improved readability
        plt.ylim(bottom=0, top=N)
        if i == 0:
            # Only the first subplot carries the title.
            plt.title("Wine: Histogram")
        elif i % v != 0:
            # Keep y tick marks only on the first subplot of each row.
            plt.yticks([])

    plt.show()
    
    bjje's avatar
    bjje committed
    
    # This confirms our belief about outliers in attributes 2, 8, and 11.
    # To take a closer look at this, we next plot histograms of the
    # attributes we suspect contain outliers.
    plt.figure(figsize=(14, 9))

    m = [1, 7, 10]  # zero-based column indices of attributes 2, 8 and 11
    for i, column in enumerate(m):
        plt.subplot(1, len(m), i + 1)
        plt.hist(X[:, column], bins=50)
        plt.xlabel(attributeNames[column])
        # Make the y-axes equal for improved readability
        plt.ylim(bottom=0, top=N)
        if i == 0:
            plt.title("Wine: Histogram (selected attributes)")
        else:
            # Only the leftmost subplot keeps its y tick marks.
            plt.yticks([])

    plt.show()
    
    bjje's avatar
    bjje committed
    
    # The histograms show that there are a few very extreme values in these
    # three attributes. To identify these values as outliers, we must use our
    # knowledge about the data set and the attributes. Say we expect volatile
    # acidity to be around 0-2 g/dm^3, density to be close to 1 g/cm^3, and
    # alcohol percentage to be somewhere between 5-20 % vol. Then we can safely
    # identify the following outliers, which are a factor of 10 greater than
    # the largest we expect.
    outlier_mask = np.logical_or.reduce(
        [X[:, 1] > 20, X[:, 7] > 10, X[:, 10] > 200]
    )
    valid_mask = ~outlier_mask

    # Finally we will remove these from the data set, keeping X and y aligned
    # and updating the observation count N to match.
    X, y = X[valid_mask, :], y[valid_mask]
    N = len(y)
    
    # Now, we can repeat the process to see if there are any more outliers
    # present in the data. We take a look at a histogram of all attributes:
    plt.figure(figsize=(14, 9))

    # Subplot grid with u rows and v columns, chosen so that u * v >= M.
    u = np.floor(np.sqrt(M))
    v = np.ceil(float(M) / u)
    for i in range(M):
        plt.subplot(int(u), int(v), i + 1)
        plt.hist(X[:, i])
        plt.xlabel(attributeNames[i])
        # Make the y-axes equal for improved readability
        plt.ylim(bottom=0, top=N)
        if i == 0:
            # Only the first subplot carries the title.
            plt.title("Wine: Histogram (after outlier detection)")
        elif i % v != 0:
            # Keep y tick marks only on the first subplot of each row.
            plt.yticks([])

    # This reveals no further outliers, and we conclude that all outliers have
    # been detected and removed.
    plt.show()
    
    bjje's avatar
    bjje committed