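# Newer / Older -- column headers left over from a diff viewer; the bare
# integers on the following lines are that viewer's line numbers, not code.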
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# exercise 2.4.3
#%%
## Intro
"""
Note: This is a long script. We suggest you run it using the #%% feature
in VS Code, which allows you to easily run parts at a time in interactive mode
(similar to a Jupyter notebook).
"""
import importlib_resources
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy.stats import zscore
#%%
## TASK A: Load the Wine dataset
# Locate the wine data file inside the installed dtuimldmtools package.
filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat")

# Read the .mat file and pull out the variables of interest.
# Note the number of instances are: red wine (0) - 1599; white wine (1) - 4898.
mat_data = loadmat(filename)
X = mat_data["X"]
y = np.squeeze(mat_data["y"])

# Each of these is stored as a 1x1 array, so [0, 0] unwraps the scalar.
# NOTE(review): presumably #classes, #attributes and #observations -- confirm.
C, M, N = (mat_data[key][0, 0] for key in ("C", "M", "N"))

# The names are stored as nested arrays; [0][0] unwraps each entry, and the
# f-string turns the attribute names into plain Python strings.
attribute_names = [f"{entry[0][0]}" for entry in mat_data["attributeNames"]]
class_names = [entry[0][0] for entry in mat_data["classNames"]]

# Running index 0..N-1 so a row can still be identified after filtering.
wine_id = np.arange(0, N)
#%%
## TASK B: Remove the outliers (as detected in a previous exercise)
# Toggle this flag to skip the outlier removal and compare the effect on the
# distances computed further down.
remove_outliers = True
if remove_outliers:
    # A row is flagged as soon as any one of the three attribute columns
    # (1, 7, 10) exceeds its threshold.
    outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200)
    valid_mask = np.logical_not(outlier_mask)

    # Finally we will remove these from the data set; X, y and wine_id are
    # filtered with the same mask so their rows stay aligned.
    X = X[valid_mask, :]
    y = y[valid_mask]
    wine_id = wine_id[valid_mask]
    N = len(y)
#%%
## TASK C: Randomly select row indices to make the analysis simpler
# You can change this if you want (the default is 100)
N_wines_to_consider = 100

# Seed the random number generator so the same random sample is drawn every time.
np.random.seed(123)
# Despite the name this holds row *indices* (drawn without replacement), not a boolean mask.
subsample_mask = np.random.choice(N, size=N_wines_to_consider, replace=False)
# wine_id is carried along only so the original wine can be identified if need be.
X, y, wine_id = X[subsample_mask, :], y[subsample_mask], wine_id[subsample_mask]

# Reorder the rows by the value of y so wines of the same type end up next to each other.
sorted_indices = np.argsort(y)
X, y, wine_id = X[sorted_indices], y[sorted_indices], wine_id[sorted_indices]
N = len(y)

# Tick labels: row position plus original id and type. The "_vert" variant
# puts the row position last and is meant for vertically rotated labels.
idx = np.arange(0, N)
wine_id_type = [
    f"{row} (id={wid} type={kind})" for wid, kind, row in zip(wine_id, y, idx)
]
wine_id_type_vert = [
    f"(id={wid} type={kind}) {row}" for wid, kind, row in zip(wine_id, y, idx)
]
#%%
## TASK D: Optionally, standardize the attributes
# Try, once you have completed the script, to change this flag and see the
# effect on the associated distances in TASK H and I.
standardize_attributes = True
if standardize_attributes:
    # zscore works along axis 0 by default, i.e. every attribute column is
    # standardized on its own; ddof=1 selects the sample standard deviation.
    X = zscore(X, ddof=1)
#%%
## TASK E: Show the attributes for insights
# Dump the data matrix as text first ...
print("This is X:")
print(X)

# ... then render it as an image: one row per wine, one column per attribute.
fig = plt.figure(figsize=(10, 8))
plt.imshow(X, cmap="jet", aspect="auto")
plt.colorbar(label="Feature Values")
plt.title("Heatmap Data Matrix")
plt.xlabel("Attributes/features")
plt.ylabel("Observations")
plt.xticks(
    ticks=np.arange(len(attribute_names)),
    labels=attribute_names,
    rotation="vertical",
)
plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
plt.show()
print("Data loaded (both standardized and not standardized versions)")
#%%
## TASK F: Extract two wines and compute distances between a white and red wine (warm up exercise)
#
# Experiment with the various scaling factors and the attributes being scaled
# to see how the scaling affects the Lp distances (default L2)
#

# Baseline: distance between the first and the last row of X.
# NOTE(review): after the sort in TASK C these are presumably a red and a
# white wine -- this only holds if the subsample contains both types.
# Copies are taken so that later edits to x_red / x_white never touch X.
x_red = X[0, :].copy()
x_white = X[-1, :].copy()
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print("Distance: %s \n\n" % dist_firstandlast)

# Multiply every attribute of both wines by the same factor and see the effect on the distance.
sf = 1000
x_red = sf * X[0, :]
x_white = sf * X[-1, :]
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print(dist_firstandlast)
print("Distance after scaling all attributes: %s \n\n" % dist_firstandlast)

# Multiply a single attribute (column 1) in both wines and see the effect on the distance.
x_red = X[0, :].copy()
x_white = X[-1, :].copy()
x_red[1] *= sf
x_white[1] *= sf
print("x_red: %s" % x_red)
print("x_white: %s" % x_white)
dist_firstandlast = np.linalg.norm(x_red - x_white, ord=2)  # L_2
print("Distance after scaling one attribute: %s \n\n" % dist_firstandlast)
#%%
## TASK G: Compute and visualize distances between a wine and all others
#
# Copies are taken so that any later change to x_red / x_white leaves X untouched.
x_red = X[0, :].copy()
x_white = X[-1, :].copy()

# Difference between every row of X and the reference wine (broadcast over rows).
offsets_from_red = X - x_red

# axis=1 is required to get one norm per row; without it a matrix norm over
# the whole array would be computed instead.
red_L1 = np.linalg.norm(offsets_from_red, ord=1, axis=1)  # L_1
red_L2 = np.linalg.norm(offsets_from_red, ord=2, axis=1)  # L_2
red_Linf = np.linalg.norm(offsets_from_red, ord=np.inf, axis=1)  # L_inf
# This is not important
def list_in_order(alist, order):
    """Return the elements of ``alist`` rearranged according to ``order``.

    Args:
        alist: Indexable sequence to pick elements from.
        order: Iterable of indices into ``alist``; indices may repeat.

    Returns:
        A new list whose k-th element is ``alist[order[k]]``.
    """
    return [alist[i] for i in order]
def rank_plot(distances):  # this is not important
    """Draw ``distances`` as a bar chart sorted in ascending order.

    The bars are drawn on the module-level ``ax`` (whatever axes the caller
    assigned to that name last), and each bar is labelled with the matching
    entry of the module-level ``wine_id_type``.

    Args:
        distances: 1-D NumPy array of distances, in the same row order as
            ``wine_id_type``.
    """
    order = np.argsort(distances)  # find the ordering of the distances
    # One tick per bar; derived from the data itself rather than the global N
    # so ticks and labels always line up with what is plotted.
    positions = np.arange(len(distances))
    ax.bar(positions, distances[order])  # bar plot them
    ax.set_xlabel("Wines / type", fontsize=12)
    ax.set_ylabel("Distance to the first red whine", fontsize=12)
    ax.set_xticks(positions)
    # make sure the correct order is used for the labels!
    ax.set_xticklabels(
        list_in_order(wine_id_type, order), rotation="vertical", fontsize=7
    )
# Make the plots (not important how this happens)
# rank_plot draws on the module-level `ax`, so `ax` is re-assigned to each new
# subplot right before the corresponding call.
fig = plt.figure(figsize=(15, 22.5))
ax = fig.add_subplot(3, 1, 1)
ax.set_title("$L_1$ norm", fontsize=16)  # title now matches the data plotted below
rank_plot(red_L1)
ax = fig.add_subplot(3, 1, 2)
ax.set_title("$L_2$ norm", fontsize=16)
rank_plot(red_L2)
ax = fig.add_subplot(3, 1, 3)
# Raw string: "\i" is not a valid escape sequence in a normal string literal.
ax.set_title(r"$L_\infty$ norm", fontsize=16)
rank_plot(red_Linf)
plt.tight_layout()
#%%
## TASK H: Plot distances among all wines
# Compute pairwise distances between rows and save in the following variables:
#
# `pairwise_distances_L1`: An NxN matrix with distances between row i and row j using L1
# `pairwise_distances_L2`: An NxN matrix with distances between row i and row j using L2
# `pairwise_distances_Linf`: An NxN matrix with distances between row i and row j using Linf
#
pairwise_distances_L1 = np.zeros(shape=(N, N))
pairwise_distances_L2 = np.zeros(shape=(N, N))
pairwise_distances_Linf = np.zeros(shape=(N, N))
# TASK: INSERT YOUR CODE HERE
raise NotImplementedError()

# Plot the pairwise distances as images (not critical to understand the specific plotting code).
# The tuple is built here, after the task code above, so it picks up whatever
# the three variables refer to at that point.
fig = plt.figure(figsize=(15, 22.5))
heatmap_panels = (
    ("L1", pairwise_distances_L1),
    ("L2", pairwise_distances_L2),
    ("Linf", pairwise_distances_Linf),
)
for panel_number, (norm_name, distance_matrix) in enumerate(heatmap_panels, start=1):
    ax = fig.add_subplot(3, 1, panel_number)
    cax = plt.imshow(distance_matrix, aspect="auto", cmap="jet")
    plt.xticks(
        ticks=np.arange(len(y)),
        labels=wine_id_type_vert,
        fontsize=4,
        rotation="vertical",
    )
    plt.yticks(ticks=np.arange(len(y)), labels=wine_id_type, fontsize=4)
    plt.title(f"Heatmap of Pairwise {norm_name} Distances Between Observations")
    plt.colorbar(cax, label="Distance")
    # Applied last, so it overrides the aspect="auto" passed to imshow above.
    ax.set_aspect("equal", "box")
plt.tight_layout()
plt.show()
#%%
## TASK I (i.e. i): Compute the following distances and store them in the appropriate variables:
#
# `avg_interdist_white`: Average distance between all white wines based on the L1 norm
#                        (excluding distances to the same wine, i.e. 0)
# `avg_interdist_red`: Average distance between all red wines based on the L1 norm
#                      (excluding distances to the same wine, i.e. 0)
# `avg_intradist_red2white`: Average distance between red wines and white wines based on the L1 norm
#
# Hint: You can obtain the required information from the `pairwise_distances`
# variables above.
#
# Question: Describe how the information about average inter and intra distances
# can be used to (automatically) discriminate between white and red wines.
#
# Question: Does it make a difference if you use the L1, L2 or Linf norm? Consider the
# relative difference between the inter and intra wine distances (p.s. it does...).
#
avg_interdist_red = np.nan  # replace np.nan with your result
avg_interdist_white = np.nan  # replace np.nan with your result
avg_intradist_red2white = np.nan  # replace np.nan with your result
# TASK: INSERT YOUR CODE HERE
raise NotImplementedError()
#%%
print("You are now done with this exercise. ASk your TA to look over your solutions and discuss your findings with them.")
#%%
# %%