Newer
Older
This is a small experiment where the exercise has a slightly different format than usual.
The purpose is to explore the best format of Python exercise in the course.
It is a long script. We suggest you run it using the #%% feature
in VS Code, which allows you to easily run one part at a time in interactive mode
(similar to a Jupyter notebook, yet still having the full VS Code debugger available).
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
import importlib_resources
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy.stats import zscore
#%%
## TASK A: Load the Wine dataset
# Locate the data file that ships inside the dtuimldmtools package.
filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")

# Read the .mat file and pull out the variables of interest.
# Note the number of instances: red wine (0) - 1599; white wine (1) - 4898.
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()

# C, M and N are indexed with [0, 0] to unwrap each stored value into a scalar.
C, M, N = (mat_data[key][0, 0] for key in ("C", "M", "N"))

# Names are nested inside the loaded arrays; unwrap them (attribute names as plain str).
attribute_names = [f"{name[0][0]}" for name in mat_data["attributeNames"]]
class_names = [cls[0][0] for cls in mat_data["classNames"]]

# Running id per wine, so rows can still be traced after filtering/shuffling.
wine_id = np.arange(N)
#%%
## TASK B: Remove the outliers (as detected in a previous exercise)
if True:  # try setting this to False once you are done and see the effect on the distances
    # A row is flagged when any of these three attribute columns exceeds its threshold.
    outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200)
    valid_mask = ~outlier_mask

    # Finally, drop the flagged rows from the data set (and from the bookkeeping arrays).
    X = X[valid_mask]
    y = y[valid_mask]
    wine_id = wine_id[valid_mask]
    N = len(y)
#%%
## TASK C: Randomly select row indices to make the analysis simpler
# You can change this if you want (the default is 100)
N_wines_to_consider = 100

# Seed the random number generator so the same random sample is drawn every time.
np.random.seed(123)
subsample_mask = np.random.choice(N, N_wines_to_consider, replace=False)

# wine_id is carried along simply so the original wine can be identified if need be.
X, y, wine_id = X[subsample_mask], y[subsample_mask], wine_id[subsample_mask]

# Sort the rows according to whether they are red or white.
sorted_indices = np.argsort(y)
X, y, wine_id = X[sorted_indices], y[sorted_indices], wine_id[sorted_indices]
N = len(y)

# Create the lists of strings used as tick labels in the plots.
idx = np.arange(N)
wine_id_type = [
    f"{pos} (id={wid} type={kind})" for wid, kind, pos in zip(wine_id, y, idx)
]
wine_id_type_vert = [
    f"(id={wid} type={kind}) {pos}" for wid, kind, pos in zip(wine_id, y, idx)
]
#%%
## TASK D: Optionally, standardize the attributes
# Once you have completed the script, try changing this flag and see the effect on
# the associated distances in TASK H and I
standardize = True
if standardize:
    # z-score the data; ddof=1 selects the sample standard deviation.
    X = zscore(X, ddof=1)
#%%
## TASK E: Show the attributes for insights
print("This is X:")
print(X)

# Draw the data matrix as a heatmap: one row per observation, one column per attribute.
fig = plt.figure(figsize=(10, 8))
plt.imshow(X, cmap='jet', aspect='auto')
plt.colorbar(label='Feature Values')
plt.title('Heatmap Data Matrix')

# Label every row with its wine description and every column with its attribute name.
row_ticks = np.arange(len(y))
col_ticks = np.arange(len(attribute_names))
plt.yticks(ticks=row_ticks, labels=wine_id_type, fontsize=4)
plt.xticks(ticks=col_ticks, labels=attribute_names, rotation="vertical")

plt.xlabel('Attributes/features')
plt.ylabel('Observations')
plt.show()
## TASK F: Extract two wines and compute distances between a white and red wine (warm up exercise)
#
# Experiment with the various scaling factors and the attributes being scaled
# to see how the scaling affects the Lp distances (default L2)
# Note: you should think about `x_red` and `x_white` as vectors!
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#
# Work on copies, so that changing x_red / x_white never modifies X itself.
x_red = X[0, :].copy()
x_white = X[-1, :].copy()
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print("Distance: %s \n\n" % dist_firstandlast)

# Scale ALL attributes of both wines by the same factor and see the effect on the distance
sf = 1000
x_red = sf * X[0, :]  # the product is a new array, X is untouched
x_white = sf * X[-1, :]
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print(dist_firstandlast)
print("Distance after scaling all attributes: %s \n\n" % dist_firstandlast)

# Scale ONE attribute (index 1) in both wines and see the effect on the distance
x_red = X[0, :].copy()
x_white = X[-1, :].copy()
sf = 1000
x_white[1] *= sf
x_red[1] *= sf
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print("Distance after scaling one attribute: %s \n\n" % dist_firstandlast)
#%%
## TASK G: Compute and visualize distances between a wine and all others
#
# Copies again, so that edits to x_red / x_white cannot leak back into X.
x_red = X[0, :].copy()
x_white = X[-1, :].copy()

# Row-wise difference between every wine and the first one (broadcast over rows).
diff_to_red = X - x_red

# axis=1 is required to get one vector norm per row; without it a matrix norm
# would be computed across the whole matrix instead of across each row vector.
red_L1 = np.linalg.norm(diff_to_red, ord=1, axis=1)  # L_1
red_L2 = np.linalg.norm(diff_to_red, ord=2, axis=1)  # L_2
red_Linf = np.linalg.norm(diff_to_red, ord=np.inf, axis=1)  # L_inf
# This is not important
def list_in_order(alist, order):  # credit JHW
    """Return the elements of ``alist`` rearranged according to ``order``.

    Args:
        alist: Indexable sequence (e.g. a list of tick labels).
        order: Iterable of integer indices into ``alist``; may be a list or
            a NumPy integer array such as the result of ``np.argsort``.

    Returns:
        A new list whose k-th element is ``alist[order[k]]``.

    Credit: JHW
    """
    # The function previously had no return statement and so yielded None.
    return [alist[i] for i in order]
def rank_plot(distances):  # credit JHW
    """Bar-plot ``distances`` in ascending order on the module-level ``ax``.

    A helper function. Credit: JHW

    Reads the globals ``ax`` (the axes to draw on) and ``wine_id_type`` (one
    label per entry of ``distances``), and calls ``list_in_order`` to put the
    labels in the same order as the bars.

    Args:
        distances: 1-D array-like of distances, one per wine.
    """
    distances = np.asarray(distances)
    order = np.argsort(distances)  # find the ordering of the distances
    positions = np.arange(len(distances))

    ax.bar(positions, distances[order])  # bar plot them
    ax.set_xlabel("Wines / type", fontsize=12)
    ax.set_ylabel("Distance to the first red wine", fontsize=12)

    # One tick per bar. Derived from the input rather than the global N, so the
    # number of ticks always matches the number of bars and labels.
    ax.set_xticks(positions)

    # make sure the correct order is used for the labels!
    ax.set_xticklabels(
        list_in_order(wine_id_type, order), rotation="vertical", fontsize=7
    )
# Make the plots (not important how this happens)
# Each title is paired with the distances of the SAME norm. Previously the
# L_1 distances were drawn under the "$L_2$" title and vice versa.
# Titles are raw strings so that the mathtext "\infty" is not an invalid escape.
fig = plt.figure(figsize=(15, 22.5))
rank_panels = (
    (r"$L_1$ norm", red_L1),
    (r"$L_2$ norm", red_L2),
    (r"$L_\infty$ norm", red_Linf),
)
for panel, (panel_title, panel_distances) in enumerate(rank_panels, start=1):
    # rank_plot draws on the module-level ``ax``, so it must be rebound per panel.
    ax = fig.add_subplot(3, 1, panel)
    ax.set_title(panel_title, fontsize=16)
    rank_plot(panel_distances)
plt.tight_layout()
# Show the figure here, as the other plotting tasks do; otherwise a plain script
# run never displays it because TASK H raises before the next plt.show().
plt.show()
#%%
## TASK H: Plot distances between all wines.
# Compute all the possible pairwise distances between rows and save
# in the following variables:
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#
# ´pairwise_distances_L1´: An NxN matrix with distances between row i and row j using L1
# ´pairwise_distances_L2´: An NxN matrix with distances between row i and row j using L2
# ´pairwise_distances_Linf´: An NxN matrix with distances between row i and row j using Linf
#
# Pre-allocate the three N x N result matrices (all zeros until the task is solved).
pairwise_distances_L1 = np.zeros((N, N))
pairwise_distances_L2 = np.zeros((N, N))
pairwise_distances_Linf = np.zeros((N, N))

# TASK: INSERT YOUR CODE HERE
raise NotImplementedError()

# Plot the pairwise distances as images, one panel per norm
# (not critical to understand the specific plotting code)
fig = plt.figure(figsize=(15, 22.5))
tick_positions = np.arange(len(y))
# Built after the task code above, so it picks up the matrices you computed.
heatmaps = (
    ("L1", pairwise_distances_L1),
    ("L2", pairwise_distances_L2),
    ("Linf", pairwise_distances_Linf),
)
for panel, (norm_name, dist_matrix) in enumerate(heatmaps, start=1):
    ax = fig.add_subplot(3, 1, panel)
    cax = plt.imshow(dist_matrix, aspect='auto', cmap='jet')
    plt.xticks(ticks=tick_positions, labels=wine_id_type_vert, fontsize=4, rotation="vertical")
    plt.yticks(ticks=tick_positions, labels=wine_id_type, fontsize=4)
    plt.title(f"Heatmap of Pairwise {norm_name} Distances Between Observations")
    plt.colorbar(cax, label="Distance")
    # square cells: the matrix is N x N, so rows and columns get equal scale
    ax.set_aspect('equal', 'box')
plt.tight_layout()
plt.show()
#%%
## TASK I (i.e. i): Compute the following distances and store them in the appropriate variables:
#
# `avg_interdist_white`: Average distance between all white wines based on the L1 norm (excluding distances to the same wine, i.e. 0)
# `avg_interdist_red`: Average distance between all red wines based on the L1 norm (excluding distances to the same wine, i.e. 0)
# `avg_intradist_red2white`: Average distance between red and white wines based on the L1 norm
#
# NOTE(review): the variable names use "inter" for distances within one wine type and
# "intra" for distances between the two types, which is the reverse of the usual
# meaning of those prefixes. The names are kept as they are.
#
# Hint: You can obtain the required information from the `pairwise_distances` variables
# above
#
# Question: Describe how the information about average inter and intra distances
# can be used to (automatically) discriminate between white and red wines.
#
# Question: Does it make a difference if you use the L1, L2 or Linf norm? Consider the
# relative difference between the inter and intra wine distances (p.s. it does...).
#
avg_interdist_white = np.nan # replace np.nan with your estimate
avg_interdist_red = np.nan # replace np.nan with your estimate
avg_intradist_red2white = np.nan # replace np.nan with your estimate