diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py index 3b88f6b2ea4b724f696b52a31f37b729ed923b2d..ca70203be28b3bceb266c1f73993023159a75cc0 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py @@ -1,6 +1,6 @@ # exercise 2.4.1 """ -Note: This is a long script. You may want to use breakpoint +Note: This is a long script. You may want to use breakpoints """ import importlib_resources import numpy as np diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_4_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_3.py new file mode 100644 index 0000000000000000000000000000000000000000..f1a15f3caf70807d29520c277b3f98633cdb872a --- /dev/null +++ b/exercises/02450Toolbox_Python/Scripts/ex2_4_3.py @@ -0,0 +1,247 @@ +# exercise 2.4.3 +#%% +## Intro +""" +Note: This is a long script. We suggest you run it using the #%% feature +in VS Code, which allows you to easily run one part at a time in interactive mode +(similar to a Jupyter notebook) +""" +import importlib_resources +import numpy as np +import matplotlib.pyplot as plt +from scipy.io import loadmat +from scipy.stats import zscore + +#%% +## TASK A: Load the Wine dataset +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat") + +# Load data file and extract variables of interest +# Note: the instance counts are: red wine (0) - 1599; white wine (1) - 4898. 
+mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +C = mat_data["C"][0, 0] +M = mat_data["M"][0, 0] +N = mat_data["N"][0, 0] +attribute_names = [name[0][0] for name in mat_data["attributeNames"]] +attribute_names = [f"{a1}" for a1 in attribute_names[:]] +class_names = [cls[0][0] for cls in mat_data["classNames"]] +wine_id = np.arange(0, N) + +#%% +## TASK B: Remove the outliers (as detected in a previous exercise) +if True: # once you have completed the script, try setting this to False and see the effect on the distances + outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200) + valid_mask = np.logical_not(outlier_mask) + + # Finally we will remove these from the data set + X = X[valid_mask, :] + y = y[valid_mask] + wine_id = wine_id[valid_mask] + N = len(y) + +#%% +## TASK C: Randomly select row indices to make the analysis simpler +# You can change this if you want (the default is 100) +N_wines_to_consider = 100 + +np.random.seed(123) # we seed the random number generator to get the same random sample every time +subsample_mask = np.random.choice(N, N_wines_to_consider, replace=False) +X = X[subsample_mask, :] +y = y[subsample_mask] +wine_id = wine_id[subsample_mask] # this is simply so we can identify the original wine if need be +N = len(y) + +sorted_indices = np.argsort(y) # sort rows in X according to whether they are red or white +X = X[sorted_indices] +y = y[sorted_indices] +wine_id = wine_id[sorted_indices] +N = len(y) + +idx = np.arange(0,N) +wine_id_type = [f"{a3} (id={a1} type={a2})" for a1,a2,a3 in zip(wine_id, y , idx)] +wine_id_type_vert = [f"(id={a1} type={a2}) {a3}" for a1,a2,a3 in zip(wine_id, y , idx)] + + +#%% +## TASK D: Optionally, standardize the attributes +# Try, once you have completed the script, to change this and see the effect on +# the associated distance in TASK H and I +if True: + X = zscore(X, ddof=1) + +#%% +## TASK E: Show the attributes for insights +print("This is X:") +print(X) + +fig = plt.figure(figsize=(10, 8)) +plt.imshow(X, 
aspect='auto', cmap='jet') +plt.colorbar(label='Feature Values') +plt.title('Heatmap Data Matrix') +plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4) +plt.xticks(ticks=np.arange(len(attribute_names)), labels=attribute_names, rotation="vertical") +#plt.xticks(ticks=np.arange(len(attribute_names)), labels=wine_id_type, fontsize=4) +plt.xlabel('Attributes/features') +plt.ylabel('Observations') +plt.show() + +print("Data loaded (both standardized and not standardized versions)") + +#%% +## TASK F: Extract two wines and compute distances between a white and a red wine (warm up exercise) +# +# Experiment with the various scaling factors and attributes being scaled to see how the +# scaling affects the Lp distances (default L2) +# +x_red = np.copy(X[0,:]) # note we make a copy to avoid messing with X in case we change x_white and x_red +x_white = np.copy(X[-1,:]) +print("x_red: %s" % x_red) +print("x_white: %s" % x_white) +dist_firstandlast = np.linalg.norm(x_red - x_white, 2) # L_2 +print("Distance: %s \n\n" % dist_firstandlast) + +# Try to change the scale of all attributes in both wines and see the effect on the distance +sf = 1000 +x_red = sf*np.copy(X[0,:]) +x_white = sf*np.copy(X[-1,:]) +print("x_red: %s" % x_red) +print("x_white: %s" % x_white) +dist_firstandlast = np.linalg.norm(x_red - x_white, 2) # L_2 +print(dist_firstandlast) +print("Distance after scaling all attributes: %s \n\n" % dist_firstandlast) + +# Try to change the scale of one of the attributes in both wines and see the effect on the distance +x_red = np.copy(X[0,:]) +x_white = np.copy(X[-1,:]) +sf = 1000 +x_white[1] = sf*x_white[1] +x_red[1] = sf*x_red[1] +print("x_red: %s" % x_red) +print("x_white: %s" % x_white) +dist_firstandlast = np.linalg.norm(x_red - x_white, 2) # L_2 +print("Distance after scaling one attribute: %s \n\n" % dist_firstandlast) + + +#%% +## TASK G: Compute and visualize distances between a wine and all others +# +x_red = np.copy(X[0,:]) # note we make a copy to avoid 
messing with X in case we change x_white and x_red +x_white = np.copy(X[-1,:]) + +# we must use axis=1 to get the right result, otherwise the matrix norm will be used +# (the matrix norm is calculated across the whole matrix, rather than across each row vector!) +red_L1 = np.linalg.norm(X - x_red, 1, axis=1) # L_1 +red_L2 = np.linalg.norm(X - x_red, 2, axis=1) # L_2 +red_Linf = np.linalg.norm(X - x_red, np.inf, axis=1) # L_inf + +# This is not important +def list_in_order(alist, order): + """Given a list 'alist' and a list of indices 'order' + returns the list in the order given by the indices""" + return [alist[i] for i in order] + +def rank_plot(distances): # helper: sorted bar plot drawn on the module-level `ax` (details not important) + order = np.argsort(distances) # find the ordering of the distances + ax.bar(np.arange(len(distances)), distances[order]) # bar plot them + ax.set_xlabel("Wines / type", fontsize=12) + ax.set_ylabel("Distance to the first red wine", fontsize=12) + ax.set_xticks(np.arange(N)) + #ax.set_frame_on(False) # remove frame + # make sure the correct order is used for the labels! 
+ ax.set_xticklabels( + list_in_order(wine_id_type, order), rotation="vertical", fontsize=7 + ) + +# Make the plots; each title must name the norm that is passed to rank_plot +fig = plt.figure(figsize=(15, 22.5)) +ax = fig.add_subplot(3, 1, 1) +ax.set_title("$L_1$ norm", fontsize=16) +rank_plot(red_L1) +ax = fig.add_subplot(3, 1, 2) +ax.set_title("$L_2$ norm", fontsize=16) +rank_plot(red_L2) +ax = fig.add_subplot(3, 1, 3) +ax.set_title(r"$L_\infty$ norm", fontsize=16) +rank_plot(red_Linf) +plt.tight_layout() + + + +#%% +## TASK H: Plot distances among all wines +# Compute pairwise distances between rows and save in the following variables: +# +# ´pairwise_distances_L1´: An NxN matrix with distances between row i and row j using L1 +# ´pairwise_distances_L2´: An NxN matrix with distances between row i and row j using L2 +# ´pairwise_distances_Linf´: An NxN matrix with distances between row i and row j using Linf +# + +pairwise_distances_L1 = np.zeros((N, N)) +pairwise_distances_L2 = np.zeros((N, N)) +pairwise_distances_Linf = np.zeros((N, N)) + + +# TASK: INSERT YOUR CODE HERE +raise NotImplementedError() + + +# Plot the pairwise distances as an image (not critical to understand the specific plotting code) +fig = plt.figure(figsize=(15, 22.5)) +ax = fig.add_subplot(3, 1, 1) +cax=plt.imshow(pairwise_distances_L1, aspect='auto', cmap='jet') +plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical") +plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4) +plt.title("Heatmap of Pairwise L1 Distances Between Observations") +plt.colorbar(cax, label="Distance") +ax.set_aspect('equal', 'box') + +ax = fig.add_subplot(3, 1, 2) +cax=plt.imshow(pairwise_distances_L2, aspect='auto', cmap='jet') +plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical") +plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4) +plt.title("Heatmap of Pairwise L2 Distances Between Observations") +plt.colorbar(cax, 
label="Distance") +ax.set_aspect('equal', 'box') + +ax = fig.add_subplot(3, 1, 3) +cax=plt.imshow(pairwise_distances_Linf, aspect='auto', cmap='jet') +plt.xticks(ticks=np.arange(len(y)), labels=wine_id_type_vert, fontsize=4, rotation="vertical") +plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4) +plt.title("Heatmap of Pairwise Linf Distances Between Observations") +plt.colorbar(cax, label="Distance") +ax.set_aspect('equal', 'box') +plt.tight_layout() + +plt.show() + +#%% +## TASK I (i.e. i): Compute the following distances and store them in the appropriate variables: +# +# ´avg_interdist_white´: Average distance between all white wines based on the L1 norm (excluding distances to the same wine, i.e. 0) +# ´avg_interdist_red´: Average distance between all red wines based on the L1 norm (excluding distances to the same wine, i.e. 0) +# ´avg_intradist_red2white´: Average distance between red and white wines based on the L1 norm +# +# Hint: You can obtain the required information from the ´pairwise_distances´ variables +# above +# +# Question: Describe how the information about average inter and intra distances +# can be used in (automatically) discriminating between white and red wines? +# +# Question: Does it make a difference if you use the L1, L2 or Linf norm? Consider the +# relative difference between the inter and intra wine distances (p.s. it does...). +# + +avg_interdist_white = np.nan # replace np.nan with your result +avg_interdist_red = np.nan # replace np.nan with your result +avg_intradist_red2white = np.nan # replace np.nan with your result + + +# TASK: INSERT YOUR CODE HERE +raise NotImplementedError() + + +#%% +print("You are now done with this exercise. 
ASk your TA to look over your solutions and discuss your findings with them.")#%% +# %% diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py index ec2f3488eeca721c9cc8a84dd0e2c22f61a151af..c8d1a84c068b076f9404930d77148657dc553a89 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py @@ -1,29 +1,3 @@ # exercise 4.1.1 -import numpy as np -import matplotlib.pyplot as plt - -# Number of samples -N = 200 - -# Mean -mu = 17 - -# Standard deviation -s = 2 - -# Number of bins in histogram -nbins = 20 - -# Generate samples from the Normal distribution -X = np.random.normal(mu, s, N).T -# or equally: -X = np.random.randn(N).T * s + mu - -# Plot the samples and histogram -plt.figure(figsize=(12, 4)) -plt.title("Normal distribution") -plt.subplot(1, 2, 1) -plt.plot(X, ".") -plt.subplot(1, 3, 3) -plt.hist(X, bins=nbins) -plt.show() +# Content to be added on discrete probability +# Will be distributed as a hotfix (and via git) diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py new file mode 100644 index 0000000000000000000000000000000000000000..38a3f194614ce59fd7ff4a3b966ecb16de9d8193 --- /dev/null +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py @@ -0,0 +1,29 @@ +# exercise 4.2.1 +import numpy as np +import matplotlib.pyplot as plt + +# Number of samples +N = 200 + +# Mean +mu = 17 + +# Standard deviation +s = 2 + +# Number of bins in histogram +nbins = 20 + +# Generate samples from the Normal distribution +X = np.random.normal(mu, s, N).T +# or equally: +X = np.random.randn(N).T * s + mu + +# Plot the samples and histogram +plt.figure(figsize=(12, 4)) +plt.title("Normal distribution") +plt.subplot(1, 2, 1) +plt.plot(X, ".") +plt.subplot(1, 3, 3) +plt.hist(X, bins=nbins) +plt.show() diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py 
similarity index 97% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_2.py rename to exercises/02450Toolbox_Python/Scripts/ex4_2_2.py index 9f613d9590de7e7543594708599b41a9fba0e2ea..39cf9bde37d77bc14b52cbcc95a9fde25ee3ff4f 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py @@ -1,4 +1,4 @@ -# exercise 4.1.2 +# exercise 4.2.2 import numpy as np import matplotlib.pyplot as plt diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py similarity index 98% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_3.py rename to exercises/02450Toolbox_Python/Scripts/ex4_2_3.py index 0a6c637b2aeda368d26b7470be3dbf9f44d47545..e61c3cf33efef1b4b3aaa2b457f2f8556b1afa9d 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py @@ -1,4 +1,4 @@ -# exercise 4.1.3 +# exercise 4.2.3 import numpy as np import matplotlib.pyplot as plt from scipy import stats diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py similarity index 83% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_4.py rename to exercises/02450Toolbox_Python/Scripts/ex4_2_4.py index 42e7588f9653eb0be39a2a696d8f80a3afb6ccf4..2db6dac00758edb1c945a8fefcb1e98e00172c08 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py @@ -1,4 +1,4 @@ -# exercise 4.1.4 +# exercise 4.2.4 import numpy as np @@ -14,4 +14,4 @@ S = np.array([[4, 3], [3, 9]]) # Generate samples from the Normal distribution X = np.random.multivariate_normal(mu, S, N) -print("Ran Exercise 4.1.4") +print("Ran Exercise 4.2.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py similarity index 98% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_5.py rename to 
exercises/02450Toolbox_Python/Scripts/ex4_2_5.py index d364cc8f672a4f9d674f67cf5d05453559ab953d..1d1bea4f9dfff7c59de9ab38446e67b305cb1c73 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py @@ -1,4 +1,4 @@ -# exercise 4.1.5 +# exercise 4.2.5 import numpy as np import matplotlib.pyplot as plt diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py similarity index 98% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_6.py rename to exercises/02450Toolbox_Python/Scripts/ex4_2_6.py index 11ce3af61ed99a4c271869455f2a8ee5c368958f..05f9319dbd350d4faaec2fe0d83dcc4364d16511 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py @@ -1,4 +1,4 @@ -# exercise 4.1.6 +# exercise 4.2.6 import importlib_resources import numpy as np import scipy.linalg as linalg diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py similarity index 100% rename from exercises/02450Toolbox_Python/Scripts/ex4_1_7.py rename to exercises/02450Toolbox_Python/Scripts/ex4_2_7.py