minor restructuring of ex4 and new ex2_4_3

a0cca827 · bjje · 538f8e8d · a0cca827 · a0cca827 · a0cca827
Commit a0cca827 authored 5 months ago by bjje
--- a/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
 # exercise 2.4.1
 """
-Note: This is a long script. You may want to use breakpoint 
+Note: This is a long script. You may want to use breakpoints 
 """
 import importlib_resources
 import numpy as np

--- a/exercises/02450Toolbox_Python/Scripts/ex2_4_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_4_3.py
+# exercise 2.4.3
+#%%
+## Intro
+"""
+Note: This is a long script. We suggest you run it using the #%% feature 
+in VS Code, which allows you to easily run parts of it at a time in interactive mode 
+(similar to a Jupyter notebook).
+"""
+import importlib_resources
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.io import loadmat
+from scipy.stats import zscore
+
+#%%
+## TASK A: Load the Wine dataset
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")
+
+# Load data file and extract variables of interest
+# Note that the number of instances is: red wine (0) - 1599; white wine (1) - 4898. 
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+C = mat_data["C"][0, 0]
+M = mat_data["M"][0, 0]
+N = mat_data["N"][0, 0]
+attribute_names = [name[0][0] for name in mat_data["attributeNames"]]
+attribute_names = [f"{a1}" for a1 in attribute_names[:]]
+class_names = [cls[0][0] for cls in mat_data["classNames"]]
+wine_id = np.arange(0, N)
+
+#%%
+## TASK B: Remove the outliers (as detected in a previous exercise)
+if True: # try setting this to False once you are done and see the effect on the distances
+    outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200)
+    valid_mask = np.logical_not(outlier_mask)
+
+    # Finally we will remove these from the data set
+    X = X[valid_mask, :]
+    y = y[valid_mask]
+    wine_id = wine_id[valid_mask]
+    N = len(y)
+
+#%%
+## TASK C: Randomly select row indices to make the analysis simpler
+# You can change this if you want (the default is 100)
+N_wines_to_consider = 100
+
+np.random.seed(123) # we seed the random number generator to get the same random sample every time
+subsample_mask = np.random.choice(N, N_wines_to_consider, replace=False)
+X = X[subsample_mask, :]
+y = y[subsample_mask]
+wine_id = wine_id[subsample_mask] # this is simply so we can identify the original wine if need be
+N = len(y)
+
+sorted_indices = np.argsort(y) # sort rows in X according to whether they are red or white
+X = X[sorted_indices]
+y = y[sorted_indices]
+wine_id = wine_id[sorted_indices]
+N = len(y)
+
+idx = np.arange(0,N)
+wine_id_type = [f"{a3} (id={a1} type={a2})" for a1,a2,a3 in zip(wine_id, y , idx)]
+wine_id_type_vert = [f"(id={a1} type={a2}) {a3}" for a1,a2,a3 in zip(wine_id, y , idx)]
+
+
+#%%
+## TASK D: Optionally, standardize the attributes
+# Try, once you have completed the script, to change this and see the effect on
+# the associated distances in TASK H and I
+if True:
+    X = zscore(X, ddof=1)
+
+#%%
+## TASK E: Show the attributes for insights
+print("This is X:")
+print(X)
+
+fig = plt.figure(figsize=(10, 8))
+plt.imshow(X, aspect='auto', cmap='jet')
+plt.colorbar(label='Feature Values')
+plt.title('Heatmap Data Matrix')
+plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
+plt.xticks(ticks=np.arange(len(attribute_names)), labels=attribute_names, rotation="vertical")
+#plt.xticks(ticks=np.arange(len(attribute_names)), labels=wine_id_type, fontsize=4)
+plt.xlabel('Attributes/features')
+plt.ylabel('Observations')
+plt.show()
+
+print("Data loaded (both standardized and not standardized versions)")
+
+#%%
+## TASK F: Extract two wines and compute distances between a white and a red wine (warm-up exercise)
+#
+# Experiment with the various scaling factors and the attributes being scaled to see how the 
+# scaling affects the Lp distances (default L2)
+#
+x_red = np.copy(X[0,:]) # note we make a copy to avoid messing with X in case we change x_white and x_red
+x_white = np.copy(X[-1,:])
+print("x_red: %s" % x_red)
+print("x_white: %s" % x_white)
+dist_firstandlast = np.linalg.norm(x_red - x_white, 2)  # L_2
+print("Distance: %s  \n\n" % dist_firstandlast)
+
+# Try to change the scale of all attributes in both wines and see the effect on the distance
+sf = 1000
+x_red = sf*np.copy(X[0,:])
+x_white = sf*np.copy(X[-1,:])
+print("x_red: %s" % x_red)
+print("x_white: %s" % x_white)
+dist_firstandlast = np.linalg.norm(x_red - x_white, 2)  # L_2
+print(dist_firstandlast)
+print("Distance after scaling all attributes: %s \n\n" % dist_firstandlast)
+
+# Try to change the scale of one of the attributes in both wines and see the effect on the distance
+x_red = np.copy(X[0,:])
+x_white = np.copy(X[-1,:])
+sf = 1000
+x_white[1] = sf*x_white[1]
+x_red[1] = sf*x_red[1]
+print("x_red: %s" % x_red)
+print("x_white: %s" % x_white)
+dist_firstandlast = np.linalg.norm(x_red - x_white, 2)  # L_2
+print("Distance after scaling one attribute: %s  \n\n" % dist_firstandlast)
+
+
+#%% 
+## TASK G: Compute and visualize distances between a wine and all others 
+#
+x_red = np.copy(X[0,:]) # note we make a copy to avoid messing with X in case we change x_white and x_red
+x_white = np.copy(X[-1,:])
+
+# we must use axis=1 to get the right result, otherwise the matrix norm will be used
+# (the matrix norm is calculated across the whole matrix, rather than across each row vector!)
+red_L1 = np.linalg.norm(X - x_red, 1, axis=1)  # L_1
+red_L2 = np.linalg.norm(X - x_red, 2, axis=1)  # L_2
+red_Linf = np.linalg.norm(X - x_red, np.inf, axis=1)  # L_inf
+
+# This is not important 
+def list_in_order(alist, order):
+    """Reorder 'alist' so that its items follow the index sequence 'order'.
+
+    Returns a new list whose k-th item is alist[order[k]]."""
+    reordered = []
+    for position in order:
+        reordered.append(alist[position])
+    return reordered
+
+def rank_plot(distances):  # this is not important
+    """Draw a bar plot of 'distances' sorted in ascending order.
+
+    The bars are drawn on the module-level axes 'ax', and the x-tick labels
+    are taken from the module-level list 'wine_id_type', reordered so that
+    each label stays with its bar.
+    """
+    distances = np.asarray(distances)  # accept plain lists as well as arrays
+    order = np.argsort(distances)  # find the ordering of the distances
+    positions = np.arange(len(distances))
+    ax.bar(positions, distances[order])  # bar plot them
+    ax.set_xlabel("Wines / type", fontsize=12)
+    ax.set_ylabel("Distance to the first red wine", fontsize=12)
+    # One tick per bar: derived from the input rather than the global N so the
+    # number of ticks always matches the number of labels set below.
+    ax.set_xticks(positions)
+    #ax.set_frame_on(False) # remove frame
+    # make sure the correct order is used for the labels!
+    ax.set_xticklabels(
+        list_in_order(wine_id_type, order), rotation="vertical", fontsize=7
+    )
+
+# Make the plots (not important how this happens)
+# One subplot per norm. rank_plot draws on the module-level 'ax', so 'ax' is
+# rebound to the new subplot before each call.
+fig = plt.figure(figsize=(15, 22.5))
+ax = fig.add_subplot(3, 1, 1)
+ax.set_title("$L_1$ norm", fontsize=16)  # title matches the distances plotted below
+rank_plot(red_L1)
+ax = fig.add_subplot(3, 1, 2)
+ax.set_title("$L_2$ norm", fontsize=16)
+rank_plot(red_L2)
+ax = fig.add_subplot(3, 1, 3)
+# raw string so the backslash in \infty is not read as an escape sequence
+ax.set_title(r"$L_\infty$ norm", fontsize=16)
+rank_plot(red_Linf)
+plt.tight_layout()
+# show the figure here: the next plt.show() comes after the TASK H placeholder
+plt.show()
+
+
+
+#%% 
+## TASK H: Plot distances among all wines
+# Compute pairwise distances between rows and save in the following variables:
+#
+# ´pairwise_distances_L1´: An NxN matrix with distances between row i and row j using L1
+# ´pairwise_distances_L2´: An NxN matrix with distances between row i and row j using L2
+# ´pairwise_distances_Linf´: An NxN matrix with distances between row i and row j using Linf
+#
+
+pairwise_distances_L1 = np.zeros((N, N))
+pairwise_distances_L2 = np.zeros((N, N))
+pairwise_distances_Linf = np.zeros((N, N))
+
+
+# TASK: INSERT YOUR CODE HERE
+raise NotImplementedError()
+
+
+# Plot the pairwise distances as an image (not critical to understand the specific plotting code)
+fig = plt.figure(figsize=(15, 22.5))
+ax = fig.add_subplot(3, 1, 1)
+cax=plt.imshow(pairwise_distances_L1, aspect='auto', cmap='jet')
+plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical")
+plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
+plt.title("Heatmap of Pairwise L1 Distances Between Observations")
+plt.colorbar(cax, label="Distance")
+ax.set_aspect('equal', 'box')
+
+ax = fig.add_subplot(3, 1, 2)
+cax=plt.imshow(pairwise_distances_L2, aspect='auto', cmap='jet')
+plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical")
+plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
+plt.title("Heatmap of Pairwise L2 Distances Between Observations")
+plt.colorbar(cax, label="Distance")
+ax.set_aspect('equal', 'box')
+
+ax = fig.add_subplot(3, 1, 3)
+cax=plt.imshow(pairwise_distances_Linf, aspect='auto', cmap='jet')
+plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical")
+plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
+plt.title("Heatmap of Pairwise Linf Distances Between Observations")
+plt.colorbar(cax, label="Distance")
+ax.set_aspect('equal', 'box')
+plt.tight_layout()
+
+plt.show()
+
+#%%
+## TASK I (i.e. i): Compute the following distances and store them in the appropriate variables: 
+#
+# ´avg_interdist_white´: Average distance between all white wines based on the L1 norm (excluding distances to the same wine, i.e. 0)
+# ´avg_interdist_red´: Average distance between all red wines based on the L1 norm (excluding distances to the same wine, i.e. 0)
+# ´avg_intradist_red2white´: Average distance between red and white wines based on the L1 norm
+# 
+# Hint: You can obtain the required information from the ´pairwise_distances´ variables
+# above
+#
+# Question: Describe how the information about average inter and intra distances 
+# can be used to (automatically) discriminate between white and red wines.
+#
+# Question: Does it make a difference if you use the L1, L2 or Linf norm? Consider the
+# relative difference between the inter and intra wine distances (p.s. it does...). 
+#
+
+avg_interdist_white = np.nan # replace np.nan with your result
+avg_interdist_red = np.nan # replace np.nan with your result
+avg_intradist_red2white = np.nan # replace np.nan with your result
+
+
+# TASK: INSERT YOUR CODE HERE
+raise NotImplementedError()
+
+
+#%%
+print("You are now done with this exercise. ASk your TA to look over your solutions and discuss your findings with them.")#%%
+# %%
--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py
 # exercise 4.1.1
-import numpy as np
-import matplotlib.pyplot as plt 
-
-# Number of samples
-N = 200
-
-# Mean
-mu = 17
-
-# Standard deviation
-s = 2
-
-# Number of bins in histogram
-nbins = 20
-
-# Generate samples from the Normal distribution
-X = np.random.normal(mu, s, N).T
-# or equally:
-X = np.random.randn(N).T * s + mu
-
-# Plot the samples and histogram
-plt.figure(figsize=(12, 4))
-plt.title("Normal distribution")
-plt.subplot(1, 2, 1)
-plt.plot(X, ".")
-plt.subplot(1, 3, 3)
-plt.hist(X, bins=nbins)
-plt.show()
+# Content to be added on discrete probability
+# Will be distributed as hotfix (and via git)
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
+# exercise 4.2.1
+import numpy as np
+import matplotlib.pyplot as plt 
+
+# Number of samples
+N = 200
+
+# Mean
+mu = 17
+
+# Standard deviation
+s = 2
+
+# Number of bins in histogram
+nbins = 20
+
+# Generate samples from the Normal distribution
+X = np.random.normal(mu, s, N).T
+# or equally:
+X = np.random.randn(N).T * s + mu
+
+# Plot the samples and histogram
+plt.figure(figsize=(12, 4))
+plt.title("Normal distribution")
+plt.subplot(1, 2, 1)
+plt.plot(X, ".")
+plt.subplot(1, 3, 3)
+plt.hist(X, bins=nbins)
+plt.show()
--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py
-# exercise 4.1.2
+# exercise 4.2.2

 import numpy as np
 import matplotlib.pyplot as plt 

--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py
-# exercise 4.1.3
+# exercise 4.2.3
 import numpy as np
 import matplotlib.pyplot as plt 
 from scipy import stats

--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py
-# exercise 4.1.4
+# exercise 4.2.4

 import numpy as np

@@ -14,4 +14,4 @@ S = np.array([[4, 3], [3, 9]])
 # Generate samples from the Normal distribution
 X = np.random.multivariate_normal(mu, S, N)

-print("Ran Exercise 4.1.4")
+print("Ran Exercise 4.2.4")
--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py
-# exercise 4.1.5
+# exercise 4.2.5

 import numpy as np
 import matplotlib.pyplot as plt 

--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py
-# exercise 4.1.6
+# exercise 4.2.6
 import importlib_resources
 import numpy as np
 import scipy.linalg as linalg

--- a/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py