From d004d8adc46cd2b25b13d721ee6f30cbb39ee6b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Sand=20Jensen?= <bjje@dtu.dk>
Date: Thu, 30 Jan 2025 17:24:33 +0100
Subject: [PATCH] Updated structure of lecture plan and file names (needs to be
 double-checked)

---
 .../02450Toolbox_Matlab/Scripts/ex1_6_2.m     |   9 ++
 .../02450Toolbox_Matlab/Scripts/ex1_6_3.m     |  10 ++
 .../02450Toolbox_Matlab/Scripts/ex1_6_4.m     |  11 ++
 .../02450Toolbox_Matlab/Scripts/ex1_6_5.m     |  21 +++
 .../02450Toolbox_Matlab/Scripts/ex2_1_1.m     |  27 ++--
 .../02450Toolbox_Matlab/Scripts/ex2_1_2.m     |  31 ----
 .../02450Toolbox_Matlab/Scripts/ex2_1_3.m     |  28 ----
 .../02450Toolbox_Matlab/Scripts/ex2_1_4.m     |  24 ---
 .../02450Toolbox_Matlab/Scripts/ex2_1_5.m     |  40 -----
 .../02450Toolbox_Matlab/Scripts/ex2_2_1.m     |  87 ++++++++---
 .../02450Toolbox_Matlab/Scripts/ex2_2_2.m     | 105 +++----------
 .../02450Toolbox_Matlab/Scripts/ex2_3_1.m     |  69 ++++----
 .../Scripts/{ex4_2_2.m => ex2_3_2.m}          |   0
 .../Scripts/{ex4_2_3.m => ex2_3_3.m}          |   0
 .../Scripts/{ex4_2_4.m => ex2_3_4.m}          |   0
 .../Scripts/{ex4_2_5.m => ex2_3_5.m}          |   0
 .../Scripts/{ex4_2_6.m => ex2_3_6.m}          |   0
 .../Scripts/{ex4_2_7.m => ex2_3_7.m}          |   0
 .../Scripts/{ex4_3_1.m => ex2_4_1.m}          |   0
 .../Scripts/{ex4_3_2.m => ex2_4_2.m}          |   0
 .../02450Toolbox_Matlab/Scripts/ex3_1_1.m     |  19 +++
 .../02450Toolbox_Matlab/Scripts/ex3_1_2.m     |  38 ++++-
 .../02450Toolbox_Matlab/Scripts/ex3_1_3.m     |  38 +++---
 .../02450Toolbox_Matlab/Scripts/ex3_1_4.m     |  35 ++---
 .../02450Toolbox_Matlab/Scripts/ex3_1_5.m     |  51 ++++--
 .../Scripts/{ex2_1_6.m => ex3_1_6.m}          |   0
 .../02450Toolbox_Matlab/Scripts/ex3_2_1.m     |  36 +++--
 .../02450Toolbox_Matlab/Scripts/ex3_2_2.m     |  87 +++++++++++
 .../02450Toolbox_Matlab/Scripts/ex3_3_1.m     | 105 +++++--------
 .../02450Toolbox_Matlab/Scripts/ex3_3_2.m     |  18 ---
 .../02450Toolbox_Matlab/Scripts/ex4_2_1.m     |  31 ----
 .../02450Toolbox_Python/Scripts/ex1_6_2.py    |  57 +++++++
 .../02450Toolbox_Python/Scripts/ex1_6_3.py    |  42 +++++
 .../02450Toolbox_Python/Scripts/ex1_6_4.py    |  68 ++++++++
 .../02450Toolbox_Python/Scripts/ex1_6_5.py    |  40 +++++
 .../02450Toolbox_Python/Scripts/ex2_1_1.py    |  43 ++---
 .../02450Toolbox_Python/Scripts/ex2_1_2.py    |  37 -----
 .../02450Toolbox_Python/Scripts/ex2_1_3.py    |  30 ----
 .../02450Toolbox_Python/Scripts/ex2_1_4.py    |  38 -----
 .../02450Toolbox_Python/Scripts/ex2_1_5.py    |  53 -------
 .../02450Toolbox_Python/Scripts/ex2_2_1.py    |  89 ++++++++---
 .../02450Toolbox_Python/Scripts/ex2_2_2.py    | 147 +++++-------------
 .../02450Toolbox_Python/Scripts/ex2_3_1.py    |  85 ++++------
 .../Scripts/{ex4_2_2.py => ex2_3_2.py}        |   2 +-
 .../Scripts/{ex4_2_3.py => ex2_3_3.py}        |   2 +-
 .../Scripts/{ex4_2_4.py => ex2_3_4.py}        |   2 +-
 .../Scripts/{ex4_2_5.py => ex2_3_5.py}        |   2 +-
 .../Scripts/{ex4_2_6.py => ex2_3_6.py}        |   2 +-
 .../Scripts/{ex4_2_7.py => ex2_3_7.py}        |   2 +-
 .../Scripts/{ex4_3_1.py => ex2_4_1.py}        |   0
 .../Scripts/{ex4_3_2.py => ex2_4_2.py}        |   0
 .../02450Toolbox_Python/Scripts/ex3_1_1.py    |  32 ++++
 .../02450Toolbox_Python/Scripts/ex3_1_2.py    |  91 +++++------
 .../02450Toolbox_Python/Scripts/ex3_1_3.py    |  60 +++----
 .../02450Toolbox_Python/Scripts/ex3_1_4.py    | 104 +++++--------
 .../02450Toolbox_Python/Scripts/ex3_1_5.py    |  91 ++++++-----
 .../Scripts/{ex2_1_6.py => ex3_1_6.py}        |   2 +-
 .../02450Toolbox_Python/Scripts/ex3_2_1.py    |  45 ++++--
 .../02450Toolbox_Python/Scripts/ex3_2_2.py    | 117 ++++++++++++++
 .../02450Toolbox_Python/Scripts/ex3_3_1.py    | 125 +++++++--------
 .../02450Toolbox_Python/Scripts/ex3_3_2.py    |  42 -----
 .../02450Toolbox_Python/Scripts/ex4_2_1.py    |  33 ----
 exercises/02450Toolbox_R/Scripts/ex1_6_2.R    |  19 +++
 exercises/02450Toolbox_R/Scripts/ex1_6_3.R    |  23 +++
 exercises/02450Toolbox_R/Scripts/ex1_6_4.R    |  29 ++++
 exercises/02450Toolbox_R/Scripts/ex1_6_5.R    |  19 +++
 exercises/02450Toolbox_R/Scripts/ex2_1_1.R    |  39 ++---
 exercises/02450Toolbox_R/Scripts/ex2_1_2.R    |  20 ---
 exercises/02450Toolbox_R/Scripts/ex2_1_3.R    |  45 ------
 exercises/02450Toolbox_R/Scripts/ex2_1_4.R    |  20 ---
 exercises/02450Toolbox_R/Scripts/ex2_1_5.R    |  45 ------
 exercises/02450Toolbox_R/Scripts/ex2_2_1.R    |  79 ++++++----
 exercises/02450Toolbox_R/Scripts/ex2_2_2.R    | 115 ++------------
 exercises/02450Toolbox_R/Scripts/ex2_3_1.R    |  67 +++-----
 .../Scripts/{ex4_2_2.R => ex2_3_2.R}          |   2 +-
 .../Scripts/{ex4_2_3.R => ex2_3_3.R}          |   2 +-
 .../Scripts/{ex4_2_4.R => ex2_3_4.R}          |   2 +-
 .../Scripts/{ex4_2_5.R => ex2_3_5.R}          |   2 +-
 .../Scripts/{ex4_2_6.R => ex2_3_6.R}          |   2 +-
 .../Scripts/{ex4_2_7.R => ex2_3_7.R}          |   2 +-
 .../Scripts/{ex4_3_1.R => ex2_4_1.R}          |   0
 .../Scripts/{ex4_3_2.R => ex2_4_2.R}          |   2 +-
 exercises/02450Toolbox_R/Scripts/ex3_1_1.R    |  36 +++++
 exercises/02450Toolbox_R/Scripts/ex3_1_2.R    |  25 +--
 exercises/02450Toolbox_R/Scripts/ex3_1_3.R    |  50 ++++--
 exercises/02450Toolbox_R/Scripts/ex3_1_4.R    |  37 ++---
 exercises/02450Toolbox_R/Scripts/ex3_1_5.R    |  52 +++++--
 .../Scripts/{ex2_1_6.R => ex3_1_6.R}          |   2 +-
 exercises/02450Toolbox_R/Scripts/ex3_2_1.R    |  45 ++++--
 exercises/02450Toolbox_R/Scripts/ex3_2_2.R    | 105 +++++++++++++
 exercises/02450Toolbox_R/Scripts/ex3_3_1.R    | 112 +++++++------
 exercises/02450Toolbox_R/Scripts/ex3_3_2.R    |  22 ---
 exercises/02450Toolbox_R/Scripts/ex4_2_1.R    |  30 ----
 93 files changed, 1702 insertions(+), 1702 deletions(-)
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_2.m => ex2_3_2.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_3.m => ex2_3_3.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_4.m => ex2_3_4.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_5.m => ex2_3_5.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_6.m => ex2_3_6.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_2_7.m => ex2_3_7.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_3_1.m => ex2_4_1.m} (100%)
 rename exercises/02450Toolbox_Matlab/Scripts/{ex4_3_2.m => ex2_4_2.m} (100%)
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
 rename exercises/02450Toolbox_Matlab/Scripts/{ex2_1_6.m => ex3_1_6.m} (100%)
 create mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
 delete mode 100644 exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_2.py => ex2_3_2.py} (94%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_3.py => ex2_3_3.py} (92%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_4.py => ex2_3_4.py} (97%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_5.py => ex2_3_5.py} (97%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_6.py => ex2_3_6.py} (96%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_2_7.py => ex2_3_7.py} (95%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_3_1.py => ex2_4_1.py} (100%)
 rename exercises/02450Toolbox_Python/Scripts/{ex4_3_2.py => ex2_4_2.py} (100%)
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex3_1_1.py
 rename exercises/02450Toolbox_Python/Scripts/{ex2_1_6.py => ex3_1_6.py} (99%)
 create mode 100644 exercises/02450Toolbox_Python/Scripts/ex3_2_2.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex3_3_2.py
 delete mode 100644 exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex1_6_2.R
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex1_6_3.R
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex1_6_4.R
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex1_6_5.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex2_1_2.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex2_1_3.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex2_1_4.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex2_1_5.R
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_2.R => ex2_3_2.R} (96%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_3.R => ex2_3_3.R} (81%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_4.R => ex2_3_4.R} (96%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_5.R => ex2_3_5.R} (92%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_6.R => ex2_3_6.R} (97%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_2_7.R => ex2_3_7.R} (95%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_3_1.R => ex2_4_1.R} (100%)
 rename exercises/02450Toolbox_R/Scripts/{ex4_3_2.R => ex2_4_2.R} (94%)
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex3_1_1.R
 rename exercises/02450Toolbox_R/Scripts/{ex2_1_6.R => ex3_1_6.R} (99%)
 create mode 100644 exercises/02450Toolbox_R/Scripts/ex3_2_2.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex3_3_2.R
 delete mode 100644 exercises/02450Toolbox_R/Scripts/ex4_2_1.R

diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
new file mode 100644
index 0000000..b5391b7
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
@@ -0,0 +1,9 @@
+%% exercise 3.1.2
+cdir = fileparts(mfilename('fullpath')); 
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
+X = full(A)';
+attributeNames = cellstr(D); 
+
+%% Display the result
+display(attributeNames);
+display(X);  
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
new file mode 100644
index 0000000..fdd498e
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
@@ -0,0 +1,10 @@
+%% exercise 3.1.3
+cdir = fileparts(mfilename('fullpath')); 
+TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
+X = full(A)';
+attributeNames = cellstr(D);
+
+%% Display the result
+display(attributeNames);
+display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
new file mode 100644
index 0000000..0518cd4
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
@@ -0,0 +1,11 @@
+%% exercise 3.1.4
+cdir = fileparts(mfilename('fullpath')); 
+TMGOpts.stoplist = '../Data/stopWords.txt';
+TMGOpts.stemming = 1;
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
+X = full(A)';
+attributeNames = cellstr(D);
+
+%% Display the result
+display(attributeNames);
+display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
new file mode 100644
index 0000000..ffa4d21
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
@@ -0,0 +1,21 @@
+%% exercise 3.1.5
+
+% Query vector
+q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+
+%% Method #1 (a for loop)
+N = size(X,1); % Get the number of data objects
+sim = nan(N,1); % Allocate a vector for the similarity
+for i = 1:N
+    x = X(i,:); % Get the i'th data object
+    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
+end
+
+%% Method #2 (one compact line of code)
+sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+
+%% Method #3 (use the "similarity" function)
+sim = similarity(X, q, 'cos');
+
+%% Display the result
+display(sim);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
index c5f1186..f411501 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
@@ -1,19 +1,14 @@
-%% exercise 2.1.1
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath')); 
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+% exercise 3.2.1
 
-% Extract the rows and columns corresponding to the sensor data, and
-% transpose the matrix to have rows correspond to data items
-X = NUMERIC(:,3:10);
+x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
 
-% Extract attribute names from the first column
-attributeNames = RAW(1,4:end);
+mean_x = mean(x);
+std_x = std(x);
+median_x = median(x);
+range_x = range(x);
 
-% Extract unique class names from the first row
-classLabels = RAW(3:end,1);
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y-1;
- 
\ No newline at end of file
+%% Display results
+display(mean_x);
+display(std_x);
+display(median_x);
+display(range_x);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
deleted file mode 100644
index 5b6eee8..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
+++ /dev/null
@@ -1,31 +0,0 @@
-%% exercise 2.1.2
-% Data attributes to be plotted
-i = 1;
-j = 2;
-
-% Make a simple plot of the i'th attribute against the j'th attribute
-mfig('NanoNose: Data'); clf;
-plot(X(:,i), X(:,j),'o');
-axis tight
-
-% Make another more fancy plot that includes legend, class labels, 
-% attribute names, and a title
-mfig('NanoNose: Classes'); clf; hold all; 
-C = length(classNames);
-% Use a specific color for each class (easy to reuse across plots!):
-colors = get(gca, 'colororder'); 
-% Here we the standard colours from MATLAB, but you could define you own.
-for c = 0:C-1
-    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
-                'MarkerFaceColor', colors(c+1,:), ...
-                'MarkerEdgeAlpha', 0, ...
-                'MarkerFaceAlpha', .5);
-end
-% You can also avoid the loop by using e.g.: (but in this case, do not call legend(classNames) as it will overwrite the legend with wrong entries) 
-% gscatter(X(:,i), X(:,j), classLabels)
-legend(classNames);
-axis tight
-xlabel(attributeNames{i});
-ylabel(attributeNames{j});
-title('NanoNose data');
-a = 234
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
deleted file mode 100644
index 145ff5c..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
+++ /dev/null
@@ -1,28 +0,0 @@
-
-%% exercise 2.1.3
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y);
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-threshold = 0.90;
-
-% Plot variance explained
-mfig('NanoNose: Var. explained'); clf;
-hold on
-plot(rho, 'x-');
-plot(cumsum(rho), 'o-');
-plot([0,length(rho)], [threshold, threshold], 'k--');
-legend({'Individual','Cumulative','Threshold'}, ...
-        'Location','best');
-ylim([0, 1]);
-xlim([1, length(rho)]);
-grid minor
-xlabel('Principal component');
-ylabel('Variance explained value');
-title('Variance explained by principal components');
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
deleted file mode 100644
index cf767df..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
+++ /dev/null
@@ -1,24 +0,0 @@
-%% exercise 2.1.4
-
-% Index of the principal components
-i = 1;
-j = 2;
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Plot PCA of data
-mfig('NanoNose: PCA Projection'); clf; hold all; 
-C = length(classNames);
-colors = get(gca,'colororder');
-for c = 0:C-1
-    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
-                'MarkerFaceColor', colors(c+1,:), ...
-                'MarkerEdgeAlpha', 0, ...
-                'MarkerFaceAlpha', .5);
-end
-legend(classNames);
-axis tight
-xlabel(sprintf('PC %d', i));
-ylabel(sprintf('PC %d', j));
-title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
deleted file mode 100644
index 2475302..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
+++ /dev/null
@@ -1,40 +0,0 @@
-%% exercise 2.1.5
-% We saw in 2.1.3 that the first 3 components explaiend more than 90
-% percent of the variance. Let's look at their coefficients:
-pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
-mfig('NanoNose: PCA Component Coefficients');
-h = bar(V(:,pcs));
-legendCell = cellstr(num2str(pcs', 'PC%-d'));
-legend(legendCell, 'location','best');
-xticklabels(attributeNames);
-grid
-xlabel('Attributes');
-ylabel('Component coefficients');
-title('NanoNose: PCA Component Coefficients');
-
-% Inspecting the plot, we see that the 2nd principal component has large
-% (in magnitude) coefficients for attributes A, E and H. We can confirm
-% this by looking at it's numerical values directly, too:
-disp('PC2:')
-disp(V(:,2)') % notice the transpose for display in console 
-
-% How does this translate to the actual data and its projections?
-% Looking at the data for water:
-
-% Projection of water class onto the 2nd principal component.
-all_water_data = Y(y==4,:);
-
-disp('First water observation:')
-disp(all_water_data(1,:))
-
-% Based on the coefficients and the attribute values for the observation
-% displayed, would you expect the projection onto PC2 to be positive or
-% negative - why? Consider *both* the magnitude and sign of *both* the
-% coefficient and the attribute!
-
-% You can determine the projection by (remove comments):
-disp('...and its projection onto PC2:')
-disp(all_water_data(1,:) * V(:,2))
-% Try to explain why?
-
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
index 2f064b5..0098173 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
@@ -1,28 +1,65 @@
-%% exercise 2.2.1
+% exercise 3.3.1
 
-% Digit number to display
-i = 1; 
+% Image to use as query
+i = 1;
 
-% Load data
+% Similarity measure: 'smc', 'jac', 'ext' (extended Jaccard), 'cos', 'cor'
+SimilarityMeasure = 'cos';
+
+%% Load the digits database
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits data
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Visualize the i'th digit as a vector
-mfig('Digits: Data');
-subplot(4,1,4);
-imagesc(X(i,:));
-xlabel('Pixel number');
-title('Digit in vector format');
-set(gca, 'YTick', []);
-
-% Visualize the i'th digit as an image
-subplot(4,1,1:3);
-I = reshape(X(i,:), [16,16])';
-imagesc(I);
-colormap(1-gray);
-axis image off
-title('Digit as a image');
+load(fullfile(cdir,'../Data/digits.mat'));
+
+% You can try out the wildfaces database, too:
+%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
+
+transpose = true; % set to true if plotted images need to be transposed
+
+[N,M] = size(X);
+imageDim = [sqrt(M),sqrt(M)];
+%% Search the database for similar images
+
+% Index of all other images than i
+noti = [1:i-1 i+1:N]; 
+% Compute similarity between image i and all others
+sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
+% Sort similarities
+[val, j] = sort(sim, 'descend');
+
+%% Plot query and result
+mfig('Faces: Query'); clf;
+subplot(3,5,1:5);
+
+img = reshape(X(i,:),imageDim);
+if transpose; img = img'; end;
+imagesc(img);
+
+axis image
+set(gca, 'XTick', [], 'YTick', []);
+ylabel(sprintf('Image #%d', i));
+title('Query image','FontWeight','bold');
+for k = 1:5
+    subplot(3,5,k+5)
+    ii = noti(j(k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.2f', val(k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Most similar images','FontWeight','bold'); end;
+end
+for k = 1:5
+    subplot(3,5,k+10)
+    ii = noti(j(end+1-k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.3f', val(end+1-k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Least similar images','FontWeight','bold'); end;
+end
+colormap(gray);
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
index 0db965d..52eb65d 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
@@ -1,87 +1,18 @@
-%% exercise 2.2.2
-
-% Digits to include in analysis (to include all, n = 1:10);
-n = [0,1];
-
-% Number of principal components for reconstruction
-K = 22;
-
-% Digits to visualize
-nD = 1:5;
-
-%% Load data
-cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits
-X = traindata(:,2:end);
-y = traindata(:,1);
-classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'10'};
-classLabels = classNames(y+1);
-
-% Remove digits that are not to be inspected
-j = ismember(y, n);
-X = X(j,:);
-classLabels = classLabels(j);
-classNames = classNames(n+1);
-y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-
-%% Plot variance explained
-mfig('Digits: Var. explained'); clf;
-plot(rho, 'o-');
-title('Variance explained by principal components');
-xlabel('Principal component');
-ylabel('Variance explained value');
-
-%% Plot PCA of data
-mfig('Digits: PCA'); clf; hold all; 
-C = length(classNames);
-for c = 0:C-1
-    plot(Z(y==c,1), Z(y==c,2), 'o');
-end
-legend(classNames);
-xlabel('PC 1');
-ylabel('PC 2');
-title('PCA of digits data');
-
-%% Visualize the reconstructed data from the firts K principal components
-mfig('Digits: Reconstruction'); clf;
-W = Z(:,1:K)*V(:,1:K)';
-D = length(nD);
-for d = 1:D
-    subplot(2,D,d);
-    I = reshape(X(nD(d),:), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Original');
-    subplot(2,D,d+D);
-    I = reshape(W(nD(d),:)+mean(X), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Reconstructed');
-end
-colormap(1-gray);
-
-%% Visualize the principal components
-mfig('Digits: Principal components'); clf;
-for k = 1:K
-    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
-    subplot(N2, N1, k);
-    I = reshape(V(:,k), [16,16])';
-    imagesc(I);
-    colormap(hot);
-    axis image off
-    title(sprintf('PC %d',k));
-end
+% exercise 3.3.2
+
+% Generate two data objects with M random attributes
+M = 5;
+x = rand(1,M);
+y = rand(1,M);
+
+% Two constants
+a = 1.5;
+b = 1.5;
+
+% Check the statements in the exercise
+similarity(x,y,'cos') - similarity(a*x,y,'cos')
+similarity(x,y,'ext') - similarity(a*x,y,'ext')
+similarity(x,y,'cor') - similarity(a*x,y,'cor')
+similarity(x,y,'cos') - similarity(b+x,y,'cos')
+similarity(x,y,'ext') - similarity(b+x,y,'ext')
+similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
index 2a8135e..9a68ff4 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
@@ -1,44 +1,31 @@
-%% exercise 2.3.1
-%% Load data
+% exercise 4.2.1
+
+% Disable xlsread warning
+warning('off', 'MATLAB:xlsread:ActiveX'); 
+warning('off', 'MATLAB:xlsread:Mode'); 
+
+% Load the data into Matlab
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits (training set)
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Extract digits (test set)
-Xtest = testdata(:,2:end);
-ytest = testdata(:,1);
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-Ytest = bsxfun(@minus, Xtest, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Number of principal components to use, i.e. the reduced dimensionality
-Krange = [8,10,15,20,30,40,50,60,100,150];
-errorRate = zeros(length(Krange),1);
-for i = 1:length(Krange)
-    K=Krange(i);
-    % Compute the projection onto the principal components
-    Z = Y*V(:,1:K);
-    Ztest = Ytest*V(:,1:K);
-
-    % Classify digits using a K-nearest neighbour classifier
-    model=fitcknn(Z,y,'NumNeighbors',1);
-    yest = predict(model,Ztest);
-    
-    errorRate(i) = nnz(ytest~=yest)/length(ytest);
-
-    % Display results
-    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
+
+% Extract the rows and columns corresponding to the data
+if isnan(NUMERIC(1,1))
+	X = NUMERIC(2:end,:);
+else
+	X = NUMERIC;
 end
 
-%% Visualize error rates vs. number of principal components considered
-mfig('Variance explained by principal components'); clf;
-plot(Krange,errorRate, 'o-');
-xlabel('Number of principal components K')
-ylabel('Error rate [%]')
+% Extract attribute names from the first row
+attributeNames = RAW(1,1:4)';
+
+% Extract unique class names from the last column
+classLabels = RAW(2:end,5)';
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y'-1;
+
+% Get the number of data objects, attributes, and classes
+[N, M] = size(X);
+C = length(classNames);
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
new file mode 100644
index 0000000..c5f1186
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
@@ -0,0 +1,19 @@
+%% exercise 2.1.1
+% Load the data into Matlab
+cdir = fileparts(mfilename('fullpath')); 
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+
+% Extract the rows and columns corresponding to the sensor data, and
+% transpose the matrix to have rows correspond to data items
+X = NUMERIC(:,3:10);
+
+% Extract attribute names from the first column
+attributeNames = RAW(1,4:end);
+
+% Extract unique class names from the first row
+classLabels = RAW(3:end,1);
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y-1;
+ 
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
index b5391b7..5b6eee8 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
@@ -1,9 +1,30 @@
-%% exercise 3.1.2
-cdir = fileparts(mfilename('fullpath')); 
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
-X = full(A)';
-attributeNames = cellstr(D); 
+%% exercise 2.1.2
+% Data attributes to be plotted
+i = 1;
+j = 2;
 
-%% Display the result
-display(attributeNames);
-display(X);  
\ No newline at end of file
+% Make a simple plot of the i'th attribute against the j'th attribute
+mfig('NanoNose: Data'); clf;
+plot(X(:,i), X(:,j),'o');
+axis tight
+
+% Make another more fancy plot that includes legend, class labels, 
+% attribute names, and a title
+mfig('NanoNose: Classes'); clf; hold all; 
+C = length(classNames);
+% Use a specific color for each class (easy to reuse across plots!):
+colors = get(gca, 'colororder'); 
+% Here we use the standard colours from MATLAB, but you could define your own.
+for c = 0:C-1
+    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
+                'MarkerFaceColor', colors(c+1,:), ...
+                'MarkerEdgeAlpha', 0, ...
+                'MarkerFaceAlpha', .5);
+end
+% You can also avoid the loop by using e.g. the line below (but in that case, do not call legend(classNames), as it would overwrite the legend with wrong entries):
+% gscatter(X(:,i), X(:,j), classLabels)
+legend(classNames);
+axis tight
+xlabel(attributeNames{i});
+ylabel(attributeNames{j});
+title('NanoNose data');
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
index fdd498e..145ff5c 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
@@ -1,10 +1,28 @@
-%% exercise 3.1.3
-cdir = fileparts(mfilename('fullpath')); 
-TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
-X = full(A)';
-attributeNames = cellstr(D);
-
-%% Display the result
-display(attributeNames);
-display(X);
\ No newline at end of file
+
+%% exercise 2.1.3
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y);
+
+% Compute variance explained
+rho = diag(S).^2./sum(diag(S).^2);
+threshold = 0.90;
+
+% Plot variance explained
+mfig('NanoNose: Var. explained'); clf;
+hold on
+plot(rho, 'x-');
+plot(cumsum(rho), 'o-');
+plot([0,length(rho)], [threshold, threshold], 'k--');
+legend({'Individual','Cumulative','Threshold'}, ...
+        'Location','best');
+ylim([0, 1]);
+xlim([1, length(rho)]);
+grid minor
+xlabel('Principal component');
+ylabel('Variance explained value');
+title('Variance explained by principal components');
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
index 0518cd4..cf767df 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
@@ -1,11 +1,24 @@
-%% exercise 3.1.4
-cdir = fileparts(mfilename('fullpath')); 
-TMGOpts.stoplist = '../Data/stopWords.txt';
-TMGOpts.stemming = 1;
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
-X = full(A)';
-attributeNames = cellstr(D);
-
-%% Display the result
-display(attributeNames);
-display(X);
\ No newline at end of file
+%% exercise 2.1.4
+
+% Index of the principal components
+i = 1;
+j = 2;
+
+% Compute the projection onto the principal components
+Z = U*S;
+
+% Plot PCA of data
+mfig('NanoNose: PCA Projection'); clf; hold all; 
+C = length(classNames);
+colors = get(gca,'colororder');
+for c = 0:C-1
+    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
+                'MarkerFaceColor', colors(c+1,:), ...
+                'MarkerEdgeAlpha', 0, ...
+                'MarkerFaceAlpha', .5);
+end
+legend(classNames);
+axis tight
+xlabel(sprintf('PC %d', i));
+ylabel(sprintf('PC %d', j));
+title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
index ffa4d21..2475302 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
@@ -1,21 +1,40 @@
-%% exercise 3.1.5
+%% exercise 2.1.5
+% We saw in 2.1.3 that the first 3 components explained more than 90
+% percent of the variance. Let's look at their coefficients:
+pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
+mfig('NanoNose: PCA Component Coefficients');
+h = bar(V(:,pcs));
+legendCell = cellstr(num2str(pcs', 'PC%-d'));
+legend(legendCell, 'location','best');
+xticklabels(attributeNames);
+grid
+xlabel('Attributes');
+ylabel('Component coefficients');
+title('NanoNose: PCA Component Coefficients');
 
-% Query vector
-q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+% Inspecting the plot, we see that the 2nd principal component has large
+% (in magnitude) coefficients for attributes A, E and H. We can confirm
+% this by looking at its numerical values directly, too:
+disp('PC2:')
+disp(V(:,2)') % notice the transpose for display in console 
 
-%% Method #1 (a for loop)
-N = size(X,1); % Get the number of data objects
-sim = nan(N,1); % Allocate a vector for the similarity
-for i = 1:N
-    x = X(i,:); % Get the i'th data object
-    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
-end
+% How does this translate to the actual data and its projections?
+% Looking at the data for water:
 
-%% Method #2 (one compact line of code)
-sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+% Projection of water class onto the 2nd principal component.
+all_water_data = Y(y==4,:);
+
+disp('First water observation:')
+disp(all_water_data(1,:))
+
+% Based on the coefficients and the attribute values for the observation
+% displayed, would you expect the projection onto PC2 to be positive or
+% negative - why? Consider *both* the magnitude and sign of *both* the
+% coefficient and the attribute!
+
+% You can determine the projection directly:
+disp('...and its projection onto PC2:')
+disp(all_water_data(1,:) * V(:,2))
+% Can you explain why?
 
-%% Method #3 (use the "similarity" function)
-sim = similarity(X, q, 'cos');
 
-%% Display the result
-display(sim);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
index f411501..2f064b5 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
@@ -1,14 +1,28 @@
-% exercise 3.2.1
+%% exercise 2.2.1
 
-x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
+% Digit number to display
+i = 1; 
 
-mean_x = mean(x);
-std_x = std(x);
-median_x = median(x);
-range_x = range(x);
+% Load data
+cdir = fileparts(mfilename('fullpath')); 
+load(fullfile(cdir,'../Data/zipdata.mat'));
 
-%% Display results
-display(mean_x);
-display(std_x);
-display(median_x);
-display(range_x);
\ No newline at end of file
+% Extract digits data
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Visualize the i'th digit as a vector
+mfig('Digits: Data');
+subplot(4,1,4);
+imagesc(X(i,:));
+xlabel('Pixel number');
+title('Digit in vector format');
+set(gca, 'YTick', []);
+
+% Visualize the i'th digit as an image
+subplot(4,1,1:3);
+I = reshape(X(i,:), [16,16])';
+imagesc(I);
+colormap(1-gray);
+axis image off
+title('Digit as an image');
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
new file mode 100644
index 0000000..0db965d
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
@@ -0,0 +1,87 @@
+%% exercise 2.2.2
+
+% Digits to include in analysis (to include all, use n = 0:9)
+n = [0,1];
+
+% Number of principal components for reconstruction
+K = 22;
+
+% Digits to visualize
+nD = 1:5;
+
+%% Load data
+cdir = fileparts(mfilename('fullpath')); 
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits
+X = traindata(:,2:end);
+y = traindata(:,1);
+classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'10'};
+classLabels = classNames(y+1);
+
+% Remove digits that are not to be inspected
+j = ismember(y, n);
+X = X(j,:);
+classLabels = classLabels(j);
+classNames = classNames(n+1);
+y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Compute the projection onto the principal components
+Z = U*S;
+
+% Compute variance explained
+rho = diag(S).^2./sum(diag(S).^2);
+
+%% Plot variance explained
+mfig('Digits: Var. explained'); clf;
+plot(rho, 'o-');
+title('Variance explained by principal components');
+xlabel('Principal component');
+ylabel('Variance explained value');
+
+%% Plot PCA of data
+mfig('Digits: PCA'); clf; hold all; 
+C = length(classNames);
+for c = 0:C-1
+    plot(Z(y==c,1), Z(y==c,2), 'o');
+end
+legend(classNames);
+xlabel('PC 1');
+ylabel('PC 2');
+title('PCA of digits data');
+
+%% Visualize the reconstructed data from the first K principal components
+mfig('Digits: Reconstruction'); clf;
+W = Z(:,1:K)*V(:,1:K)';
+D = length(nD);
+for d = 1:D
+    subplot(2,D,d);
+    I = reshape(X(nD(d),:), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Original');
+    subplot(2,D,d+D);
+    I = reshape(W(nD(d),:)+mean(X), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Reconstructed');
+end
+colormap(1-gray);
+
+%% Visualize the principal components
+mfig('Digits: Principal components'); clf;
+for k = 1:K
+    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
+    subplot(N2, N1, k);
+    I = reshape(V(:,k), [16,16])';
+    imagesc(I);
+    colormap(hot);
+    axis image off
+    title(sprintf('PC %d',k));
+end
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
index 0098173..2a8135e 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
@@ -1,65 +1,44 @@
-% exercise 3.3.1
-
-% Image to use as query
-i = 1;
-
-% Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' 
-SimilarityMeasure = 'cos';
-
-%% Load the CBCL face database
+%% exercise 2.3.1
+%% Load data
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/digits.mat'));
-
-% You can try out the faces CBCL face database, too:
-%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
-
-transpose = true; % set to true if plotted images needs to be transposed
-
-[N,M] = size(X);
-imageDim = [sqrt(M),sqrt(M)];
-%% Search the face database for similar faces
-
-% Index of all other images than i
-noti = [1:i-1 i+1:N]; 
-% Compute similarity between image i and all others
-sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
-% Sort similarities
-[val, j] = sort(sim, 'descend');
-
-%% Plot query and result
-mfig('Faces: Query'); clf;
-subplot(3,5,1:5);
-
-img = reshape(X(i,:),imageDim);
-if transpose; img = img'; end;
-imagesc(img);
-
-axis image
-set(gca, 'XTick', [], 'YTick', []);
-ylabel(sprintf('Image #%d', i));
-title('Query image','FontWeight','bold');
-for k = 1:5
-    subplot(3,5,k+5)
-    ii = noti(j(k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.2f', val(k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Most similar images','FontWeight','bold'); end;
-end
-for k = 1:5
-    subplot(3,5,k+10)
-    ii = noti(j(end+1-k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.3f', val(end+1-k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Least similar images','FontWeight','bold'); end;
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits (training set)
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Extract digits (test set)
+Xtest = testdata(:,2:end);
+ytest = testdata(:,1);
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+Ytest = bsxfun(@minus, Xtest, mean(X));
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Number of principal components to use, i.e. the reduced dimensionality
+Krange = [8,10,15,20,30,40,50,60,100,150];
+errorRate = zeros(length(Krange),1);
+for i = 1:length(Krange)
+    K=Krange(i);
+    % Compute the projection onto the principal components
+    Z = Y*V(:,1:K);
+    Ztest = Ytest*V(:,1:K);
+
+    % Classify digits using a K-nearest neighbour classifier
+    model=fitcknn(Z,y,'NumNeighbors',1);
+    yest = predict(model,Ztest);
+    
+    errorRate(i) = nnz(ytest~=yest)/length(ytest);
+
+    % Display results
+    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
 end
-colormap(gray);
+
+%% Visualize error rates vs. number of principal components considered
+mfig('Error rate vs. number of principal components'); clf;
+plot(Krange,errorRate, 'o-');
+xlabel('Number of principal components K')
+ylabel('Error rate')
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
deleted file mode 100644
index 52eb65d..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
+++ /dev/null
@@ -1,18 +0,0 @@
-% exercise 3.3.2
-
-% Generate two data objects with M random attributes
-M = 5;
-x = rand(1,M);
-y = rand(1,M);
-
-% Two constants
-a = 1.5;
-b = 1.5;
-
-% Check the statements in the exercise
-similarity(x,y,'cos') - similarity(a*x,y,'cos')
-similarity(x,y,'ext') - similarity(a*x,y,'ext')
-similarity(x,y,'cor') - similarity(a*x,y,'cor')
-similarity(x,y,'cos') - similarity(b+x,y,'cos')
-similarity(x,y,'ext') - similarity(b+x,y,'ext')
-similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
deleted file mode 100644
index 9a68ff4..0000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
+++ /dev/null
@@ -1,31 +0,0 @@
-% exercise 4.2.1
-
-% Disable xlsread warning
-warning('off', 'MATLAB:xlsread:ActiveX'); 
-warning('off', 'MATLAB:xlsread:Mode'); 
-
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath')); 
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
-
-% Extract the rows and columns corresponding to the data
-if isnan(NUMERIC(1,1))
-	X = NUMERIC(2:end,:);
-else
-	X = NUMERIC;
-end
-
-% Extract attribute names from the first row
-attributeNames = RAW(1,1:4)';
-
-% Extract unique class names from the last column
-classLabels = RAW(2:end,5)';
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y'-1;
-
-% Get the number of data objects, attributes, and classes
-[N, M] = size(X);
-C = length(classNames);
-
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
new file mode 100644
index 0000000..60a39cf
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
@@ -0,0 +1,57 @@
+# exercise 3.1.2
+import importlib_resources
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+# Load the textDocs.txt as a long string into raw_file:
+with open(filename, "r") as f:
+    raw_file = f.read()
+# raw_file contains sentences separated by newline characters,
+# so we split by '\n':
+corpus = raw_file.split("\n")
+# corpus is now a list of "documents" (sentences), but some of them are empty
+# because textDocs.txt has many empty lines; we filter those out:
+corpus = list(filter(None, corpus))
+
+# Display the result
+print("Document-term matrix analysis")
+print()
+print("Corpus (5 documents/sentences):")
+print(np.asmatrix(corpus))
+print()
+
+
+# To automatically obtain the bag of words representation, we use sklearn's
+# feature_extraction.text module, which has a function CountVectorizer.
+# We make a CountVectorizer:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b")
+# The token pattern is a regular expression (marked by the r), which ensures
+# that the vectorizer ignores digit/non-word tokens - in this case, it ensures
+# the 10 in the last document is not recognized as a token. It is not important
+# that you understand the regexp itself.
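+# For intuition, a tiny hypothetical example (not the exercise data): fitting
+# this pattern on ["solve the matrix", "rank 10"] would keep the word tokens
+# ('matrix', 'rank', 'solve', 'the') and drop the purely numeric token '10'.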
+
+# The object vectorizer can now be used to first 'fit' the vectorizer to the
+# corpus, and subsequently transform the data. We start by fitting:
+vectorizer.fit(corpus)
+# The vectorizer has now determined the unique terms (or tokens) in the corpus
+# and we can extract them using:
+attributeNames = vectorizer.get_feature_names_out()
+print("Found terms:")
+print(attributeNames)
+print()
+
+# The next step is to count how many times each term is found in each document,
+# which we do using the transform function:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
new file mode 100644
index 0000000..c3ac96c
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
@@ -0,0 +1,42 @@
+# exercise 3.1.3
+import importlib_resources
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+# When making the CountVectorizer, we now input the stop words:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords)
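+# (Any term appearing in the stop word list, e.g. common words such as
+# 'the' or 'and', is simply excluded from the vocabulary.)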
+# Determine the terms in the corpus
+vectorizer.fit(corpus)
+# ... and count the frequency of each term within a document:
+X = vectorizer.transform(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+N, M = X.shape
+
+# Display the result
+print("Document-term matrix analysis (using stop words)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
new file mode 100644
index 0000000..e33ee98
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
@@ -0,0 +1,68 @@
+# exercise 3.1.4
+import importlib_resources
+
+# We'll use a widely used stemmer based on:
+# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
+# The stemmer is implemented in the most used natural language processing
+# package in Python, "Natural Language Toolkit" (NLTK):
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+
+# To enable stemming when using the sklearn module, we need to pass an
+# "analyzer" to the vectorizer we've been using.
+# First, we make an object based on the PorterStemmer class, and we also make
+# an analyzer object:
+stemmer = PorterStemmer()
+analyzer = CountVectorizer(
+    token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
+).build_analyzer()
+
+
+# Using these we'll make a function that can stem words:
+def stemmed_words(doc):
+    return (stemmer.stem(w) for w in analyzer(doc))
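+# (For example, the Porter stemmer maps 'running' -> 'run' and
+# 'connection' -> 'connect', so inflected word forms share one term.)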
+
+
+# ... and finally, we make a vectorizer just like we've done before:
+vectorizer = CountVectorizer(analyzer=stemmed_words)
+
+# Determine the terms:
+vectorizer.fit(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+
+# ... and count the occurrences:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+X = X.toarray()
+
+# Display the result
+print("Document-term matrix analysis (using stop words and stemming)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words, stemmed):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X)
+print()
+print("Ran Exercise 3.1.4")
+print()
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
new file mode 100644
index 0000000..eb3b3b3
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
@@ -0,0 +1,40 @@
+# exercise 3.1.5
+import numpy as np
+import scipy.linalg as linalg
+from ex1_6_4 import *
+
+from dtuimldmtools import similarity
+
+# Query vector
+q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
+# notice, that you could get the query vector using the vectorizer, too:
+# q = vectorizer.transform(['matrix rank solv'])
+# q = np.asarray(q.toarray())
+# or use any other string:
+# q = vectorizer.transform(['Can I Google how to fix my problem?'])
+# q = np.asarray(q.toarray())
+
+# Method 1 ('for' loop - slow)
+N = np.shape(X)[0]
+# get the number of data objects
+sim = np.zeros((N, 1))  # allocate a vector for the similarity
+for i in range(N):
+    x = X[i, :]  # Get the i'th data object (here: document)
+    sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x)  # Compute cosine similarity
+
+# Method 2 (one line of code with no iterations - faster)
+sim = (q @ X.T).T / (
+    np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum())
+)
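+# Sanity check on a tiny hypothetical pair: for u = [1, 1, 0] and
+# v = [1, 0, 0], cos(u, v) = 1 / (sqrt(2) * 1), i.e. about 0.71.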
+
+# Method 3 (use the "similarity" function)
+sim = similarity(X, q, "cos")
+
+
+# Display the result
+print("Query vector:\n {0}\n".format(q))
+print("Similarity results:\n {0}".format(sim))
+
+print("Ran Exercise 3.1.5")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
index b210d13..dbc1dc0 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
@@ -1,32 +1,21 @@
-# exercise 2.1.1
-import importlib_resources
+# exercise 3.2.1
 import numpy as np
-import xlrd
 
-# Load xls sheet with data
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls")
-doc = xlrd.open_workbook(filename).sheet_by_index(0)
+x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47])
 
-# Extract attribute names (1st row, column 4 to 12)
-attributeNames = doc.row_values(0, 3, 11)
+# Compute values
+mean_x = x.mean()
+std_x = x.std(ddof=1)
+median_x = np.median(x)
+range_x = x.max() - x.min()
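+# Note: ddof=1 above gives the sample standard deviation (divide by N-1),
+# matching MATLAB's std default; NumPy's default ddof=0 divides by N.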
 
-# Extract class names to python list,
-# then encode with integers (dict)
-classLabels = doc.col_values(0, 2, 92)
-classNames = sorted(set(classLabels))
-classDict = dict(zip(classNames, range(5)))
+# Display results
+print("Vector:", x)
+print("Mean:", mean_x)
+print("Standard Deviation:", std_x)
+print("Median:", median_x)
+print("Range:", range_x)
 
-# Extract vector y, convert to NumPy array
-y = np.asarray([classDict[value] for value in classLabels])
-
-# Preallocate memory, then extract excel data to matrix X
-X = np.empty((90, 8))
-for i, col_id in enumerate(range(3, 11)):
-    X[:, i] = np.asarray(doc.col_values(col_id, 2, 92))
-
-# Compute values of N, M and C.
-N = len(y)
-M = len(attributeNames)
-C = len(classNames)
-
-print("Ran Exercise 2.1.1")
+print("Ran Exercise 3.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
deleted file mode 100644
index 98847f6..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# exercise 2.1.2
-
-# Imports the numpy and xlrd package, then runs the ex2_1_1 code
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-
-# (requires data structures from ex. 2.1.1)
-
-
-# Data attributes to be plotted
-i = 0
-j = 1
-
-##
-# Make a simple plot of the i'th attribute against the j'th attribute
-# Notice that X is of matrix type (but it will also work with a numpy array)
-# X = np.array(X) #Try to uncomment this line
-plot(X[:, i], X[:, j], "o")
-
-# %%
-# Make another more fancy plot that includes legend, class labels,
-# attribute names, and a title.
-f = figure()
-title("NanoNose data")
-
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3)
-
-legend(classNames)
-xlabel(attributeNames[i])
-ylabel(attributeNames[j])
-
-# Output result to screen
-show()
-print("Ran Exercise 2.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
deleted file mode 100644
index c977a4c..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# exercise 2.1.3
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(axis=0)
-
-# PCA by computing SVD of Y
-U, S, V = svd(Y, full_matrices=False)
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-threshold = 0.9
-
-# Plot variance explained
-plt.figure()
-plt.plot(range(1, len(rho) + 1), rho, "x-")
-plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-")
-plt.plot([1, len(rho)], [threshold, threshold], "k--")
-plt.title("Variance explained by principal components")
-plt.xlabel("Principal component")
-plt.ylabel("Variance explained")
-plt.legend(["Individual", "Cumulative", "Threshold"])
-plt.grid()
-plt.show()
-
-print("Ran Exercise 2.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
deleted file mode 100644
index 4678868..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# exercise 2.1.4
-# (requires data structures from ex. 2.2.1 and 2.2.3)
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, Vh = svd(Y, full_matrices=False)
-# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
-# of the vector V. So, for us to obtain the correct V, we transpose:
-V = Vh.T
-
-# Project the centered data onto principal component space
-Z = Y @ V
-
-# Indices of the principal components to be plotted
-i = 0
-j = 1
-
-# Plot PCA of the data
-f = figure()
-title("NanoNose data: PCA")
-# Z = array(Z)
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5)
-legend(classNames)
-xlabel("PC{0}".format(i + 1))
-ylabel("PC{0}".format(j + 1))
-
-# Output result to screen
-show()
-
-print("Ran Exercise 2.1.4")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
deleted file mode 100644
index 364a201..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# exercise 2.2.4
-
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-Y = X - np.ones((N, 1)) * X.mean(0)
-U, S, Vh = svd(Y, full_matrices=False)
-V = Vh.T
-N, M = X.shape
-
-# We saw in 2.1.3 that the first 3 components explaiend more than 90
-# percent of the variance. Let's look at their coefficients:
-pcs = [0, 1, 2]
-legendStrs = ["PC" + str(e + 1) for e in pcs]
-c = ["r", "g", "b"]
-bw = 0.2
-r = np.arange(1, M + 1)
-for i in pcs:
-    plt.bar(r + i * bw, V[:, i], width=bw)
-plt.xticks(r + bw, attributeNames)
-plt.xlabel("Attributes")
-plt.ylabel("Component coefficients")
-plt.legend(legendStrs)
-plt.grid()
-plt.title("NanoNose: PCA Component Coefficients")
-plt.show()
-
-# Inspecting the plot, we see that the 2nd principal component has large
-# (in magnitude) coefficients for attributes A, E and H. We can confirm
-# this by looking at it's numerical values directly, too:
-print("PC2:")
-print(V[:, 1].T)
-
-# How does this translate to the actual data and its projections?
-# Looking at the data for water:
-
-# Projection of water class onto the 2nd principal component.
-all_water_data = Y[y == 4, :]
-
-print("First water observation")
-print(all_water_data[0, :])
-
-# Based on the coefficients and the attribute values for the observation
-# displayed, would you expect the projection onto PC2 to be positive or
-# negative - why? Consider *both* the magnitude and sign of *both* the
-# coefficient and the attribute!
-
-# You can determine the projection by (remove comments):
-print("...and its projection onto PC2")
-print(all_water_data[0, :] @ V[:, 1])
-# Try to explain why?
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
index 798bb5b..f5cfc46 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
@@ -1,36 +1,77 @@
+# exercise 2.2.1
+
 import importlib_resources
+import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks
 from scipy.io import loadmat
 
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Index of the digit to display
-i = 0
+from dtuimldmtools import similarity
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat")
+# Image to use as query
+i = 1
 
+# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+similarity_measure = "SMC"
+
+# Load the digits
 # Load Matlab data file to python dict structure
-mat_data = loadmat(filename)
+X = loadmat(filename)["X"]
+# You can also try the CBCL faces dataset (remember to change 'transpose')
+#X = loadmat('../Data/wildfaces_grayscale.mat')['X']
+N, M = X.shape
+transpose = False # should the plotted images be transposed? 
+
+
+# Search the face database for similar faces 
+# Index of all other images than i
+noti = list(range(0,i)) + list(range(i+1,N)) 
+# Compute similarity between image i and all others
+sim = similarity(X[i,:], X[noti,:], similarity_measure)
+sim = sim.tolist()[0]
+# Tuples of sorted similarities and their indices
+sim_to_index = sorted(zip(sim,noti))
+
+
+# Visualize query image and 5 most/least similar images
+plt.figure(figsize=(12,8))
+plt.subplot(3,1,1)
 
-# Extract variables of interest
-testdata = mat_data["testdata"]
-traindata = mat_data["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
+img_hw = int(np.sqrt(len(X[0])))
+img = np.reshape(X[i], (img_hw,img_hw))
+if transpose: img = img.T
+plt.imshow(img, cmap=plt.cm.gray)
+plt.xticks([]); plt.yticks([])
+plt.title('Query image')
+plt.ylabel('image #{0}'.format(i))
 
 
-# Visualize the i'th digit as a vector
-f = figure()
-subplot(4, 1, 4)
-imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r)
-xlabel("Pixel number")
-title("Digit in vector format")
-yticks([])
+for ms in range(5):
 
-# Visualize the i'th digit as an image
-subplot(2, 1, 1)
-I = np.reshape(X[i, :], (16, 16))
-imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r)
-title("Digit as an image")
+    # 5 most similar images found
+    plt.subplot(3,5,6+ms)
+    im_id = sim_to_index[-ms-1][1]
+    im_sim = sim_to_index[-ms-1][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Most similar images')
 
-show()
+    # 5 least similar images found
+    plt.subplot(3,5,11+ms)
+    im_id = sim_to_index[ms][1]
+    im_sim = sim_to_index[ms][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Least similar images')
+    
+plt.show()
 
-print("Ran Exercise 2.2.1")
+print('Ran Exercise 2.2.1')
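The script treats dtuimldmtools.similarity as a black box. As a point of reference, a minimal sketch of what the simple matching coefficient (SMC) measures on binarized vectors — an illustration of the definition, not necessarily the toolbox's exact implementation (the 0.5 binarization threshold is an assumption):

import numpy as np

def smc(a, b, threshold=0.5):
    # binarize, then count the fraction of attributes on which the vectors agree
    a_bin = np.asarray(a) > threshold
    b_bin = np.asarray(b) > threshold
    return np.mean(a_bin == b_bin)

a = np.array([0.9, 0.1, 0.8, 0.0])
b = np.array([1.0, 0.2, 0.1, 0.0])
print(smc(a, b))  # 0.75: agreement on 3 of 4 binarized attributes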
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
index 3d6104b..e7abec8 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
@@ -1,117 +1,42 @@
-# exercise 2.2.2
-import importlib_resources
-import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import (
-    cm,
-    figure,
-    imshow,
-    legend,
-    plot,
-    show,
-    subplot,
-    title,
-    xlabel,
-    ylabel,
-    yticks,
-)
-from scipy.io import loadmat
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-
-# Digits to include in analysis (to include all, n = range(10) )
-n = [0, 1]
-# Number of principal components for reconstruction
-K = 16
-# Digits to visualize
-nD = range(6)
-
-
-# Load Matlab data file to python dict structure
-# and extract variables of interest
-traindata = loadmat(filename)["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
-
-N, M = X.shape
-C = len(n)
-
-classValues = n
-classNames = [str(num) for num in n]
-classDict = dict(zip(classNames, classValues))
-
-
-# Select subset of digits classes to be inspected
-class_mask = np.zeros(N).astype(bool)
-for v in n:
-    cmsk = y == v
-    class_mask = class_mask | cmsk
-X = X[class_mask, :]
-y = y[class_mask]
-N = X.shape[0]
-
-# Center the data (subtract mean column values)
-Xc = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, V = linalg.svd(Xc, full_matrices=False)
-# U = mat(U)
-V = V.T
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-# Project data onto principal component space
-Z = Xc @ V
-
-# Plot variance explained
-figure()
-plot(rho, "o-")
-title("Variance explained by principal components")
-xlabel("Principal component")
-ylabel("Variance explained value")
-
-
-# Plot PCA of the data
-f = figure()
-title("pixel vectors of handwr. digits projected on PCs")
-for c in n:
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, 0], Z[class_mask, 1], "o")
-legend(classNames)
-xlabel("PC1")
-ylabel("PC2")
+# exercise 2.2.2
 
+import numpy as np
 
-# Visualize the reconstructed data from the first K principal components
-# Select randomly D digits.
-figure(figsize=(10, 3))
-W = Z[:, range(K)] @ V[:, range(K)].T
-D = len(nD)
-for d in range(D):
-    digit_ix = np.random.randint(0, N)
-    subplot(2, D, int(d + 1))
-    I = np.reshape(X[digit_ix, :], (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Original")
-    subplot(2, D, D + d + 1)
-    I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Reconstr.")
+from dtuimldmtools import similarity
 
+# Generate two data objects with M random attributes
+M = 5
+x = np.random.rand(1, M)
+y = np.random.rand(1, M)
 
-# Visualize the pricipal components
-figure(figsize=(8, 6))
-for k in range(K):
-    N1 = int(np.ceil(np.sqrt(K)))
-    N2 = int(np.ceil(K / N1))
-    subplot(N2, N1, int(k + 1))
-    I = np.reshape(V[:, k], (16, 16))
-    imshow(I, cmap=cm.hot)
-    title("PC{0}".format(k + 1))
+# Two constants
+a = 1.5
+b = 1.5
 
-# output to screen
-show()
+# Check the statements in the exercise
+print(
+    "Cosine scaling: %.4f "
+    % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard scaling: %.4f "
+    % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation scaling: %.4f "
+    % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
+)
+print(
+    "Cosine translation: %.4f "
+    % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard translation: %.4f "
+    % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation translation: %.4f "
+    % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
+)
 
-print("Ran Exercise 2.2.2")
+print("Ran Exercise 3.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
index 9f665b5..e6b46e8 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
@@ -1,58 +1,33 @@
-# exercise 2.3.1
+# exercise 2.3.1
 
 import importlib_resources
 import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import figure, plot, show, xlabel, ylabel
-from scipy.io import loadmat
-from sklearn.neighbors import KNeighborsClassifier
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Number of principal components to use for classification,
-# i.e. the reduced dimensionality
-K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150]
-
-# Load Matlab data file and extract training set and test set
-mat_data = loadmat(filename)
-X = mat_data["traindata"][:, 1:]
-y = mat_data["traindata"][:, 0]
-Xtest = mat_data["testdata"][:, 1:]
-ytest = mat_data["testdata"][:, 0]
-N, M = X.shape
-Ntest = Xtest.shape[0]  # or Xtest[:,0].shape
-
-# Subtract the mean from the data
-Y = X - np.ones((N, 1)) * X.mean(0)
-Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)
-
-# Obtain the PCA solution  by calculate the SVD of Y
-U, S, V = linalg.svd(Y, full_matrices=False)
-V = V.T
-
-
-# Repeat classification for different values of K
-error_rates = []
-for k in K:
-    # Project data onto principal component space,
-    Z = Y @ V[:, :k]
-    Ztest = Ytest @ V[:, :k]
-
-    # Classify data with knn classifier
-    knn_classifier = KNeighborsClassifier(n_neighbors=1)
-    knn_classifier.fit(Z, y.ravel())
-    y_estimated = knn_classifier.predict(Ztest)
-
-    # Compute classification error rates
-    y_estimated = y_estimated.T
-    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
-    error_rates.append(er)
-    print("K={0}: Error rate: {1:.1f}%".format(k, er))
-
-# Visualize error rates vs. number of principal components considered
-figure()
-plot(K, error_rates, "o-")
-xlabel("Number of principal components K")
-ylabel("Error rate [%]")
-show()
-
-print("Ran Exercise 2.3.1")
+import xlrd
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")
+# Load xls sheet with data
+doc = xlrd.open_workbook(filename).sheet_by_index(0)
+
+# Extract attribute names
+attributeNames = doc.row_values(0, 0, 4)
+
+# Extract class names to python list,
+# then encode with integers (dict)
+classLabels = doc.col_values(4, 1, 151)
+classNames = sorted(set(classLabels))
+classDict = dict(zip(classNames, range(len(classNames))))
+
+# Extract vector y, convert to NumPy matrix and transpose
+y = np.array([classDict[value] for value in classLabels])
+
+# Preallocate memory, then extract data to matrix X
+X = np.empty((150, 4))
+for i in range(4):
+    X[:, i] = np.array(doc.col_values(i, 1, 151)).T
+
+# Compute values of N, M and C.
+N = len(y)
+M = len(attributeNames)
+C = len(classNames)
+
+print("Ran Exercise 4.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
similarity index 94%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_2.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
index 4bef1c3..4066902 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import figure, hist, show, subplot, xlabel, ylim
 
 figure(figsize=(8, 7))
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
similarity index 92%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_3.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
index a642083..d19cace 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.3
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import boxplot, show, title, xticks, ylabel
 
 boxplot(X)
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
similarity index 97%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_4.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
index 4c394b1..d45db38 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
@@ -1,6 +1,6 @@
 # Exercise 4.2.4
 # requires data from exercise 4.1.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import boxplot, figure, show, subplot, title, xticks, ylim
 
 figure(figsize=(14, 7))
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
similarity index 97%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_5.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
index 67061cb..eaac941 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.5
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import (
     figure,
     legend,
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
similarity index 96%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_6.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
index 1f625d2..f881cc8 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.6
 
 # requires data from exercise 4.1.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import figure, show
 from mpl_toolkits.mplot3d import Axes3D
 
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
similarity index 95%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_7.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
index e34326e..08c9dbf 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.7
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import (
     cm,
     colorbar,
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
similarity index 100%
rename from exercises/02450Toolbox_Python/Scripts/ex4_3_1.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_2.py
similarity index 100%
rename from exercises/02450Toolbox_Python/Scripts/ex4_3_2.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_4_2.py
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py
new file mode 100644
index 0000000..b210d13
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py
@@ -0,0 +1,32 @@
+# exercise 3.1.1
+import importlib_resources
+import numpy as np
+import xlrd
+
+# Load xls sheet with data
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls")
+doc = xlrd.open_workbook(filename).sheet_by_index(0)
+
+# Extract attribute names (1st row, column 4 to 12)
+attributeNames = doc.row_values(0, 3, 11)
+
+# Extract class names to python list,
+# then encode with integers (dict)
+classLabels = doc.col_values(0, 2, 92)
+classNames = sorted(set(classLabels))
+classDict = dict(zip(classNames, range(5)))
+
+# Extract vector y, convert to NumPy array
+y = np.asarray([classDict[value] for value in classLabels])
+
+# Preallocate memory, then extract excel data to matrix X
+X = np.empty((90, 8))
+for i, col_id in enumerate(range(3, 11)):
+    X[:, i] = np.asarray(doc.col_values(col_id, 2, 92))
+
+# Compute values of N, M and C.
+N = len(y)
+M = len(attributeNames)
+C = len(classNames)
+
+print("Ran Exercise 2.1.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
index 60a39cf..6af386a 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
@@ -1,54 +1,37 @@
-# exercise 3.1.4
-import importlib_resources
-import numpy as np
-from sklearn.feature_extraction.text import CountVectorizer
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-# Load the textDocs.txt as a long string into raw_file:
-with open(filename, "r") as f:
-    raw_file = f.read()
-# raw_file contains sentences seperated by newline characters,
-# so we split by '\n':
-corpus = raw_file.split("\n")
-# corpus is now list of "documents" (sentences), but some of them are empty,
-# because textDocs.txt has a lot of empty lines, we filter/remove them:
-corpus = list(filter(None, corpus))
-
-# Display the result
-print("Document-term matrix analysis")
-print()
-print("Corpus (5 documents/sentences):")
-print(np.asmatrix(corpus))
-print()
-
-
-# To automatically obtain the bag of words representation, we use sklearn's
-# feature_extraction.text module, which has a function CountVectorizer.
-# We make a CounterVectorizer:
-vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b")
-# The token pattern is a regular expression (marked by the r), which ensures
-# that the vectorizer ignores digit/non-word tokens - in this case, it ensures
-# the 10 in the last document is not recognized as a token. It's not important
-# that you should understand it the regexp.
-
-# The object vectorizer can now be used to first 'fit' the vectorizer to the
-# corpus, and the subsequently transform the data. We start by fitting:
-vectorizer.fit(corpus)
-# The vectorizer has now determined the unique terms (or tokens) in the corpus
-# and we can extract them using:
-attributeNames = vectorizer.get_feature_names_out()
-print("Found terms:")
-print(attributeNames)
-print()
-
-# The next step is to count how many times each term is found in each document,
-# which we do using the transform function:
-X = vectorizer.transform(corpus)
-N, M = X.shape
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Document-term matrix:")
-print(X.toarray())
-print()
-print("Ran Exercise 3.1.2")
+# exercise 3.1.2
+
+# Runs the ex3_1_1 script (which also imports numpy and xlrd)
+from ex3_1_1 import *
+from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
+
+# (requires data structures from ex. 3.1.1)
+
+
+# Data attributes to be plotted
+i = 0
+j = 1
+
+##
+# Make a simple plot of the i'th attribute against the j'th attribute
+# Notice that X is of matrix type (but it will also work with a numpy array)
+# X = np.array(X) #Try to uncomment this line
+plot(X[:, i], X[:, j], "o")
+
+# %%
+# Make another more fancy plot that includes legend, class labels,
+# attribute names, and a title.
+f = figure()
+title("NanoNose data")
+
+for c in range(C):
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3)
+
+legend(classNames)
+xlabel(attributeNames[i])
+ylabel(attributeNames[j])
+
+# Output result to screen
+show()
+print("Ran Exercise 2.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
index c3ac96c..75b9c6b 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
@@ -1,40 +1,30 @@
-# exercise 3.1.4
-import importlib_resources
-from sklearn.feature_extraction.text import CountVectorizer
+# exercise 3.1.3
+# (requires data structures from ex. 3.1.1)
+import matplotlib.pyplot as plt
+from ex3_1_1 import *
+from scipy.linalg import svd
 
-filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+# Subtract mean value from data
+Y = X - np.ones((N, 1)) * X.mean(axis=0)
 
-# As before, load the corpus and preprocess:
-with open(filename_docs, "r") as f:
-    raw_file = f.read()
-corpus = raw_file.split("\n")
-corpus = list(filter(None, corpus))
+# PCA by computing SVD of Y
+U, S, V = svd(Y, full_matrices=False)
 
-# Load and process the stop words in a similar manner:
-with open(filename_stop, "r") as f:
-    raw_file = f.read()
-stopwords = raw_file.split("\n")
+# Compute variance explained by principal components
+rho = (S * S) / (S * S).sum()
 
-# When making the CountVectorizer, we now input the stop words:
-vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords)
-# Determine the terms in the corpus
-vectorizer.fit(corpus)
-# ... and count the frequency of each term within a document:
-X = vectorizer.transform(corpus)
-attributeNames = vectorizer.get_feature_names_out()
-N, M = X.shape
+threshold = 0.9
 
-# Display the result
-print("Document-term matrix analysis (using stop words)")
-print()
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Found terms (no stop words):")
-print(attributeNames)
-print()
-print("Document-term matrix:")
-print(X.toarray())
-print()
-print("Ran Exercise 3.1.3")
+# Plot variance explained
+plt.figure()
+plt.plot(range(1, len(rho) + 1), rho, "x-")
+plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-")
+plt.plot([1, len(rho)], [threshold, threshold], "k--")
+plt.title("Variance explained by principal components")
+plt.xlabel("Principal component")
+plt.ylabel("Variance explained")
+plt.legend(["Individual", "Cumulative", "Threshold"])
+plt.grid()
+plt.show()
+
+print("Ran Exercise 2.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
index e33ee98..87f40a7 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
@@ -1,66 +1,38 @@
-# exercise 3.1.4
-import importlib_resources
-
-# We'll use a widely used stemmer based:
-# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
-# The stemmer is implemented in the most used natural language processing
-# package in Python, "Natural Langauge Toolkit" (NLTK):
-from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import CountVectorizer
-
-filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
-
-# As before, load the corpus and preprocess:
-with open(filename_docs, "r") as f:
-    raw_file = f.read()
-corpus = raw_file.split("\n")
-corpus = list(filter(None, corpus))
-
-# Load and process the stop words in a similar manner:
-with open(filename_stop, "r") as f:
-    raw_file = f.read()
-stopwords = raw_file.split("\n")
-
-
-# To enable stemming when using the sklearn-module, we need to parse an
-# "analyzer" to the vectorizer we've been using.
-# First, we make an object based on the PorterStemmer class, and we also make
-# an analyzer object:
-stemmer = PorterStemmer()
-analyzer = CountVectorizer(
-    token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
-).build_analyzer()
-
-
-# Using these we'll make a function that can stem words:
-def stemmed_words(doc):
-    return (stemmer.stem(w) for w in analyzer(doc))
-
-
-# ... and finally, we make a vectorizer just like we've done before:
-vectorizer = CountVectorizer(analyzer=stemmed_words)
-
-# Determine the terms:
-vectorizer.fit(corpus)
-attributeNames = vectorizer.get_feature_names_out()
-
-# ... and count the occurences:
-X = vectorizer.transform(corpus)
-N, M = X.shape
-X = X.toarray()
-
-# Display the result
-print("Document-term matrix analysis (using stop words and stemming)")
-print()
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Found terms (no stop words, stemmed):")
-print(attributeNames)
-print()
-print("Document-term matrix:")
-print(X)
-print()
-print("Ran Exercise 3.1.4")
-print()
+# exercise 3.1.4
+# (requires data structures from ex. 3.1.1)
+from ex3_1_1 import *
+from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
+from scipy.linalg import svd
+
+# Subtract mean value from data
+Y = X - np.ones((N, 1)) * X.mean(0)
+
+# PCA by computing SVD of Y
+U, S, Vh = svd(Y, full_matrices=False)
+# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
+# of the vector V. So, for us to obtain the correct V, we transpose:
+V = Vh.T
+
+# Project the centered data onto principal component space
+Z = Y @ V
+
+# Indices of the principal components to be plotted
+i = 0
+j = 1
+
+# Plot PCA of the data
+f = figure()
+title("NanoNose data: PCA")
+# Z = array(Z)
+for c in range(C):
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5)
+legend(classNames)
+xlabel("PC{0}".format(i + 1))
+ylabel("PC{0}".format(j + 1))
+
+# Output result to screen
+show()
+
+print("Ran Exercise 2.1.4")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
index d28d50a..b17766d 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
@@ -1,38 +1,53 @@
-# exercise 3.1.5
-import numpy as np
-import scipy.linalg as linalg
-from ex3_1_4 import *
-
-from dtuimldmtools import similarity
-
-# Query vector
-q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
-# notice, that you could get the query vector using the vectorizer, too:
-# q = vectorizer.transform(['matrix rank solv'])
-# q = np.asarray(q.toarray())
-# or use any other string:
-# q = vectorizer.transform(['Can I Google how to fix my problem?'])
-# q = np.asarray(q.toarray())
-
-# Method 1 ('for' loop - slow)
-N = np.shape(X)[0]
-# get the number of data objects
-sim = np.zeros((N, 1))  # allocate a vector for the similarity
-for i in range(N):
-    x = X[i, :]  # Get the i'th data object (here: document)
-    sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x)  # Compute cosine similarity
-
-# Method 2 (one line of code with no iterations - faster)
-sim = (q @ X.T).T / (
-    np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum())
-)
-
-# Method 3 (use the "similarity" function)
-sim = similarity(X, q, "cos")
-
-
-# Display the result
-print("Query vector:\n {0}\n".format(q))
-print("Similarity results:\n {0}".format(sim))
-
-print("Ran Exercise 3.1.5")
+# exercise 3.1.5
+
+# (requires data structures from ex. 3.1.1)
+import matplotlib.pyplot as plt
+from ex3_1_1 import *
+from scipy.linalg import svd
+
+Y = X - np.ones((N, 1)) * X.mean(0)
+U, S, Vh = svd(Y, full_matrices=False)
+V = Vh.T
+N, M = X.shape
+
+# We saw in 3.1.3 that the first 3 components explained more than 90
+# percent of the variance. Let's look at their coefficients:
+pcs = [0, 1, 2]
+legendStrs = ["PC" + str(e + 1) for e in pcs]
+c = ["r", "g", "b"]
+bw = 0.2
+r = np.arange(1, M + 1)
+for i in pcs:
+    plt.bar(r + i * bw, V[:, i], width=bw)
+plt.xticks(r + bw, attributeNames)
+plt.xlabel("Attributes")
+plt.ylabel("Component coefficients")
+plt.legend(legendStrs)
+plt.grid()
+plt.title("NanoNose: PCA Component Coefficients")
+plt.show()
+
+# Inspecting the plot, we see that the 2nd principal component has large
+# (in magnitude) coefficients for attributes A, E and H. We can confirm
+# this by looking at its numerical values directly, too:
+print("PC2:")
+print(V[:, 1].T)
+
+# How does this translate to the actual data and its projections?
+# Looking at the data for water:
+
+# Projection of water class onto the 2nd principal component.
+all_water_data = Y[y == 4, :]
+
+print("First water observation")
+print(all_water_data[0, :])
+
+# Based on the coefficients and the attribute values for the observation
+# displayed, would you expect the projection onto PC2 to be positive or
+# negative - why? Consider *both* the magnitude and sign of *both* the
+# coefficient and the attribute!
+
+# You can determine the projection by (remove comments):
+print("...and its projection onto PC2")
+print(all_water_data[0, :] @ V[:, 1])
+# Try to explain why.
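The question comes down to the signs in a dot product: the projection onto PC2 is the sum over attributes of coefficient times (centered) attribute value, so a large attribute paired with a same-sign coefficient pushes the projection up, while an opposite-sign pair pulls it down. A toy illustration with made-up numbers (not the NanoNose values):

import numpy as np

v = np.array([0.7, -0.1, -0.7])  # hypothetical PC coefficients
obs = np.array([2.0, 0.5, 1.5])  # hypothetical centered observation
print(v @ obs)  # 0.7*2.0 - 0.1*0.5 - 0.7*1.5 = 0.3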
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
similarity index 99%
rename from exercises/02450Toolbox_Python/Scripts/ex2_1_6.py
rename to exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
index 78250ef..25cf520 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
@@ -1,6 +1,6 @@
 ## exercise 2.1.6
 import matplotlib.pyplot as plt
-from ex2_1_1 import *
+from ex3_1_1 import *
 from scipy.linalg import svd
 
 r = np.arange(1, X.shape[1] + 1)
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
index dbc1dc0..798bb5b 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
@@ -1,19 +1,36 @@
-# exercise 3.2.1
+import importlib_resources
 import numpy as np
+from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks
+from scipy.io import loadmat
 
-x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47])
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+# Index of the digit to display
+i = 0
 
-# Compute values
-mean_x = x.mean()
-std_x = x.std(ddof=1)
-median_x = np.median(x)
-range_x = x.max() - x.min()
+# Load Matlab data file to python dict structure
+mat_data = loadmat(filename)
 
-# Display results
-print("Vector:", x)
-print("Mean:", mean_x)
-print("Standard Deviation:", std_x)
-print("Median:", median_x)
-print("Range:", range_x)
+# Extract variables of interest
+testdata = mat_data["testdata"]
+traindata = mat_data["traindata"]
+X = traindata[:, 1:]
+y = traindata[:, 0]
 
-print("Ran Exercise 3.2.1")
+
+# Visualize the i'th digit as a vector
+f = figure()
+subplot(4, 1, 4)
+imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r)
+xlabel("Pixel number")
+title("Digit in vector format")
+yticks([])
+
+# Visualize the i'th digit as an image
+subplot(2, 1, 1)
+I = np.reshape(X[i, :], (16, 16))
+imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r)
+title("Digit as an image")
+
+show()
+
+print("Ran Exercise 2.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py
new file mode 100644
index 0000000..3d6104b
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py
@@ -0,0 +1,117 @@
+# exercise 3.2.2
+import importlib_resources
+import numpy as np
+import scipy.linalg as linalg
+from matplotlib.pyplot import (
+    cm,
+    figure,
+    imshow,
+    legend,
+    plot,
+    show,
+    subplot,
+    title,
+    xlabel,
+    ylabel,
+    yticks,
+)
+from scipy.io import loadmat
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+
+# Digits to include in analysis (to include all, n = range(10) )
+n = [0, 1]
+# Number of principal components for reconstruction
+K = 16
+# Digits to visualize
+nD = range(6)
+
+
+# Load Matlab data file to python dict structure
+# and extract variables of interest
+traindata = loadmat(filename)["traindata"]
+X = traindata[:, 1:]
+y = traindata[:, 0]
+
+N, M = X.shape
+C = len(n)
+
+classValues = n
+classNames = [str(num) for num in n]
+classDict = dict(zip(classNames, classValues))
+
+
+# Select subset of digits classes to be inspected
+class_mask = np.zeros(N).astype(bool)
+for v in n:
+    cmsk = y == v
+    class_mask = class_mask | cmsk
+X = X[class_mask, :]
+y = y[class_mask]
+N = X.shape[0]
+
+# Center the data (subtract mean column values)
+Xc = X - np.ones((N, 1)) * X.mean(0)
+
+# PCA by computing SVD of Y
+U, S, V = linalg.svd(Xc, full_matrices=False)
+# U = mat(U)
+V = V.T
+
+# Compute variance explained by principal components
+rho = (S * S) / (S * S).sum()
+
+# Project data onto principal component space
+Z = Xc @ V
+
+# Plot variance explained
+figure()
+plot(rho, "o-")
+title("Variance explained by principal components")
+xlabel("Principal component")
+ylabel("Variance explained value")
+
+
+# Plot PCA of the data
+f = figure()
+title("pixel vectors of handwr. digits projected on PCs")
+for c in n:
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(Z[class_mask, 0], Z[class_mask, 1], "o")
+legend(classNames)
+xlabel("PC1")
+ylabel("PC2")
+
+
+# Visualize the reconstructed data from the first K principal components
+# Select randomly D digits.
+figure(figsize=(10, 3))
+W = Z[:, range(K)] @ V[:, range(K)].T
+D = len(nD)
+for d in range(D):
+    digit_ix = np.random.randint(0, N)
+    subplot(2, D, int(d + 1))
+    I = np.reshape(X[digit_ix, :], (16, 16))
+    imshow(I, cmap=cm.gray_r)
+    title("Original")
+    subplot(2, D, D + d + 1)
+    I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16))
+    imshow(I, cmap=cm.gray_r)
+    title("Reconstr.")
+
+
+# Visualize the principal components
+figure(figsize=(8, 6))
+for k in range(K):
+    N1 = int(np.ceil(np.sqrt(K)))
+    N2 = int(np.ceil(K / N1))
+    subplot(N2, N1, int(k + 1))
+    I = np.reshape(V[:, k], (16, 16))
+    imshow(I, cmap=cm.hot)
+    title("PC{0}".format(k + 1))
+
+# output to screen
+show()
+
+print("Ran Exercise 2.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
index f5cfc46..9f665b5 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
@@ -1,77 +1,58 @@
-# exercise 3.3.1
+# exercise 3.3.1
 
 import importlib_resources
-import matplotlib.pyplot as plt
 import numpy as np
+import scipy.linalg as linalg
+from matplotlib.pyplot import figure, plot, show, xlabel, ylabel
 from scipy.io import loadmat
-
-from dtuimldmtools import similarity
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat")
-# Image to use as query
-i = 1
-
-# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
-similarity_measure = "SMC"
-
-# Load the digits
-# Load Matlab data file to python dict structure
-X = loadmat(filename)["X"]
-# You can also try the CBCL faces dataset (remember to change 'transpose')
-#X = loadmat('../Data/wildfaces_grayscale.mat')['X']
+from sklearn.neighbors import KNeighborsClassifier
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+# Number of principal components to use for classification,
+# i.e. the reduced dimensionality
+K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150]
+
+# Load Matlab data file and extract training set and test set
+mat_data = loadmat(filename)
+X = mat_data["traindata"][:, 1:]
+y = mat_data["traindata"][:, 0]
+Xtest = mat_data["testdata"][:, 1:]
+ytest = mat_data["testdata"][:, 0]
 N, M = X.shape
-transpose = False # should the plotted images be transposed? 
-
-
-# Search the face database for similar faces 
-# Index of all other images than i
-noti = list(range(0,i)) + list(range(i+1,N)) 
-# Compute similarity between image i and all others
-sim = similarity(X[i,:], X[noti,:], similarity_measure)
-sim = sim.tolist()[0]
-# Tuples of sorted similarities and their indices
-sim_to_index = sorted(zip(sim,noti))
-
-
-# Visualize query image and 5 most/least similar images
-plt.figure(figsize=(12,8))
-plt.subplot(3,1,1)
-
-img_hw = int(np.sqrt(len(X[0])))
-img = np.reshape(X[i], (img_hw,img_hw))
-if transpose: img = img.T
-plt.imshow(img, cmap=plt.cm.gray)
-plt.xticks([]); plt.yticks([])
-plt.title('Query image')
-plt.ylabel('image #{0}'.format(i))
-
-
-for ms in range(5):
-
-    # 5 most similar images found
-    plt.subplot(3,5,6+ms)
-    im_id = sim_to_index[-ms-1][1]
-    im_sim = sim_to_index[-ms-1][0]
-    img = np.reshape(X[im_id],(img_hw,img_hw))
-    if transpose: img = img.T
-    plt.imshow(img, cmap=plt.cm.gray)
-    plt.xlabel('sim={0:.3f}'.format(im_sim))
-    plt.ylabel('image #{0}'.format(im_id))
-    plt.xticks([]); plt.yticks([])
-    if ms==2: plt.title('Most similar images')
-
-    # 5 least similar images found
-    plt.subplot(3,5,11+ms)
-    im_id = sim_to_index[ms][1]
-    im_sim = sim_to_index[ms][0]
-    img = np.reshape(X[im_id],(img_hw,img_hw))
-    if transpose: img = img.T
-    plt.imshow(img, cmap=plt.cm.gray)
-    plt.xlabel('sim={0:.3f}'.format(im_sim))
-    plt.ylabel('image #{0}'.format(im_id))
-    plt.xticks([]); plt.yticks([])
-    if ms==2: plt.title('Least similar images')
-    
-plt.show()
-
-print('Ran Exercise 3.3.1')
+Ntest = Xtest.shape[0]  # or len(Xtest)
+
+# Subtract the mean from the data
+Y = X - np.ones((N, 1)) * X.mean(0)
+Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)
+
+# Obtain the PCA solution by calculating the SVD of Y
+U, S, V = linalg.svd(Y, full_matrices=False)
+V = V.T
+
+
+# Repeat classification for different values of K
+error_rates = []
+for k in K:
+    # Project data onto principal component space,
+    Z = Y @ V[:, :k]
+    Ztest = Ytest @ V[:, :k]
+
+    # Classify data with knn classifier
+    knn_classifier = KNeighborsClassifier(n_neighbors=1)
+    knn_classifier.fit(Z, y.ravel())
+    y_estimated = knn_classifier.predict(Ztest)
+
+    # Compute classification error rates
+    y_estimated = y_estimated.T
+    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
+    error_rates.append(er)
+    print("K={0}: Error rate: {1:.1f}%".format(k, er))
+
+# Visualize error rates vs. number of principal components considered
+figure()
+plot(K, error_rates, "o-")
+xlabel("Number of principal components K")
+ylabel("Error rate [%]")
+show()
+
+print("Ran Exercise 2.3.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py
deleted file mode 100644
index e7abec8..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# exercise 3.2.2
-
-import numpy as np
-
-from dtuimldmtools import similarity
-
-# Generate two data objects with M random attributes
-M = 5
-x = np.random.rand(1, M)
-y = np.random.rand(1, M)
-
-# Two constants
-a = 1.5
-b = 1.5
-
-# Check the statements in the exercise
-print(
-    "Cosine scaling: %.4f "
-    % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
-)
-print(
-    "ExtendedJaccard scaling: %.4f "
-    % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
-)
-print(
-    "Correlation scaling: %.4f "
-    % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
-)
-print(
-    "Cosine translation: %.4f "
-    % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
-)
-print(
-    "ExtendedJaccard translation: %.4f "
-    % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
-)
-print(
-    "Correlation translation: %.4f "
-    % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
-)
-
-print("Ran Exercise 3.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
deleted file mode 100644
index e6b46e8..0000000
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# exercise 4.2.1
-
-import importlib_resources
-import numpy as np
-import xlrd
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")
-# Load xls sheet with data
-doc = xlrd.open_workbook(filename).sheet_by_index(0)
-
-# Extract attribute names
-attributeNames = doc.row_values(0, 0, 4)
-
-# Extract class names to python list,
-# then encode with integers (dict)
-classLabels = doc.col_values(4, 1, 151)
-classNames = sorted(set(classLabels))
-classDict = dict(zip(classNames, range(len(classNames))))
-
-# Extract vector y, convert to NumPy matrix and transpose
-y = np.array([classDict[value] for value in classLabels])
-
-# Preallocate memory, then extract data to matrix X
-X = np.empty((150, 4))
-for i in range(4):
-    X[:, i] = np.array(doc.col_values(i, 1, 151)).T
-
-# Compute values of N, M and C.
-N = len(y)
-M = len(attributeNames)
-C = len(classNames)
-
-print("Ran Exercise 4.2.1")
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_2.R b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R
new file mode 100644
index 0000000..d3a6872
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R
@@ -0,0 +1,19 @@
+####################
+# Exercise 1.6.2
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm)
+textfolder = "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en")))
+
+inspect(docs)
+
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+inspect(dtm)
+
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_3.R b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R
new file mode 100644
index 0000000..4180e92
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R
@@ -0,0 +1,23 @@
+####################
+# Exercise 1.6.3
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm)
+textfolder <- "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+
+inspect(docs)
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+mystopwords <- scan("./Data/stopWords.txt", character(0))
+
+docs_nostopwords <- tm_map(docs, removeWords, mystopwords)
+inspect(docs_nostopwords)
+dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
+inspect(dtm_nostopwords)
+
+control <- list(stopwords = TRUE)
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_4.R b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R
new file mode 100644
index 0000000..5ad1d63
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R
@@ -0,0 +1,29 @@
+####################
+# Exercise 1.6.4
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm) # install.packages("tm")
+library(SnowballC) # install.packages("SnowballC")
+
+textfolder <- "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+
+inspect(docs)
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+mystopwords <- scan("./Data/stopWords.txt", character(0))
+
+docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords)
+dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
+inspect(dtm_nostopwords)
+
+docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english")
+
+
+inspect(docs_stemmed)
+dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE))
+inspect(dtm_stemmed)
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_5.R b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R
new file mode 100644
index 0000000..a9c4b7d
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R
@@ -0,0 +1,19 @@
+####################
+# Exercise 1.6.5
+####################
+
+source("Scripts/ex1_6_4.R")
+source("Tools/similarity.R")
+
+size <- dim(dtm_stemmed)
+cosmeas <- c()
+q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0)
+dtmat <- matrix(dtm_stemmed, dtm_stemmed$nrow, dtm_stemmed$ncol)
+
+for (irow in 1:size[1]) {
+  doc <- dtm_stemmed[irow, ]
+  cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), 'cos')
+}
+
+print("Cosine similarity from q to docs: ")
+print(cosmeas)
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
index 7861e15..725f756 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
@@ -1,36 +1,17 @@
 ####################
-# Exercise 2.1.1
+# Exercise 2.1.1
 ####################
 
 rm(list = ls()) # Clear work space
 
-# Read data into R
-data <- read.csv("./Data/nanonose.csv", check.names = FALSE)
+x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47)
 
-# Extract class labels of observations
-classLabels <- colnames(data)
-classLabels <- classLabels[-(1:2)]
+mean(x)
+sd(x)
+median(x)
+diff(range(x))
 
-# Extract attributes, i.e. sensor names
-attributeNames <- data[3:10, 1]
-
-# Remove first two rows and columns and transpose data matrix
-X <- t(data[-(1:2), -(1:2)])
-
-# Check that dimensions are as they should be (90 rows and 8 columns)
-N <- dim(X)[1]
-M <- dim(X)[2]
-
-# Assign the class labels as row names and the attributes as column names
-rownames(X) <- classLabels
-colnames(X) <- attributeNames
-
-# Extract the class names present in data
-classNames <- sort(unique(classLabels))
-C <- length(classNames)
-
-# Extract numeric class assignments
-y <- as.numeric(as.factor(classLabels)) - 1
-
-# Clean up a bit in the variables, since we'll call this script from later scripts:
-rm(data)
+# Range returns the minimum and maximum of the vector x.
+# To get the range, we must take the maximum minus the minimum.
+# We do this using the function diff, which finds differences
+# between consecutive elements in a vector.
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R b/exercises/02450Toolbox_R/Scripts/ex2_1_2.R
deleted file mode 100644
index d99b5eb..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R
+++ /dev/null
@@ -1,20 +0,0 @@
-####################
-# Exercise 2.1.2
-####################
-
-# Run ex2.1.1:
-source('Scripts/ex2_1_1.R')
-
-# Choose which sensors to plot
-i = 1
-j = 2
-
-# Make simple plot
-plot(X[ , i], X[ , j])
-
-## Make more fancy plot
-library(ggplot2)
-ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) +
-  theme(legend.position = c(0.88, 0.77), legend.title= element_blank()) +
-  labs(x = attributeNames[i], y = attributeNames[j])
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R b/exercises/02450Toolbox_R/Scripts/ex2_1_3.R
deleted file mode 100644
index 05b805d..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R
+++ /dev/null
@@ -1,45 +0,0 @@
-####################
-# Exercise 2.1.3
-####################
-
-source("Scripts/ex2_1_1.R")
-
-# Subtract the column means form columns of X
-Y <- t(apply(X, 1, "-", colMeans(X)))
-
-# You can check the column means using: colMeans(Y)
-colMeans(Y)
-
-# PCA by computing SVD of Y:
-s <- svd(Y)
-diagS <- s$d
-
-rho <- diagS^2 / sum(diagS^2)
-
-# PCA can also be done with the built-in function prcomp.
-# Confirm that you get the same result.
-rho2 <- summary(prcomp(X, center = TRUE))$importance[2,]
-
-
-threshold <- 0.9
-
-xlimits <- c(1, M)
-plot(rho,
-  type = "o",
-  main = "Variance explained by principal componenets",
-  xlab = "Principal components",
-  ylab = "Variance explained",
-  xlim = xlimits,
-  ylim = c(0, 1),
-  col = "blue"
-)
-
-lines(cumsum(rho), type = "o", col = "orange")
-lines(xlimits, c(threshold, threshold), lty = "dashed")
-
-legend("right", # Define position
-  legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend
-  col = c("orange", "blue", "black"), lty = c(1, 1, 2), # Match appereance of lines
-  cex = 1, bg = "lightblue"
-) # Setup how the box looks (cex controls size)
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R b/exercises/02450Toolbox_R/Scripts/ex2_1_4.R
deleted file mode 100644
index 3b43e47..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R
+++ /dev/null
@@ -1,20 +0,0 @@
-####################
-# Exercise 2.1.4
-####################
-source("Scripts/ex2_1_3.R")
-
-# Manual projecting data onto principal component.
-Z <- s$u %*% diag(s$d)
-
-# Doing it with the built-in function.
-Z <- prcomp(X)$x
-
-i <- 1
-j <- 3
-
-## Make fancy plot
-library(ggplot2)
-ggplot() +
-  geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) +
-  theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) +
-  labs(x = colnames(Z)[i], y = colnames(Z)[j])
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R b/exercises/02450Toolbox_R/Scripts/ex2_1_5.R
deleted file mode 100644
index 4441f4c..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R
+++ /dev/null
@@ -1,45 +0,0 @@
-####################
-# Exercise 2.1.5
-####################
-source("Scripts/ex2_1_1.R")
-
-Y <- t(apply(X, 1, "-", colMeans(X))) # subtract the column means form columns of X
-
-# Computing PCA:
-pca <- prcomp(X)
-V <- pca$rotation
-Z <- pca$x
-
-# We saw in 2.1.3 that the first 3 components explained more than 90
-# percent of the variance. Let's look at their coefficients:
-pcs <- 1:3
-test <- as.data.frame(melt(data.table(V[, pcs])))
-ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) +
-  geom_bar(position="dodge", stat = "identity") +
-  labs(fill="PC", x = "Attributes", y = "Component coefficients")
-
-# Inspecting the plot, we see that the 2nd principal component has large
-# (in magnitude) coefficients for attributes A, E and H. We can confirm
-# this by looking at it's numerical values directly, too:
-cat("PC2:", V[, 1])
-
-# How does this translate to the actual data and its projections?
-# Looking at the data for water:
-
-# Projection of water class onto the 2nd principal component.
-water <- Y[y == 4, ]
-cat("First water observation:", water[1, ])
-
-# Based on the coefficients and the attribute values for the observation
-# displayed, would you expect the projection onto PC2 to be positive or
-# negative - why? Consider *both* the magnitude and sign of *both* the
-# coefficient and the attribute!
-
-# You can determine the projection by (remove comments):
-print("...and its projection onto PC2")
-print(as.numeric(t(water[1, ]) %*% V[, 2]))
-
-# Try to explain why.
-# This is equivalent to:
-Z[1, 2]
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
index ff04967..91eccf7 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
@@ -1,42 +1,67 @@
 ####################
-# Exercise 2.2.1
+# Exercise 2.2.1
 ####################
+rm(list = ls()) # Clear work space
 
-rm(list = ls()) # Clear work space
+source("Tools/binarize.R")
+source("Tools/similarity.R")
 
-# Load the library R.matlab to enable the function readMat,
-# which allows R to read the matlab .mat format.
-# Install with "install.packages("R.matlab")"
-library(R.matlab)
-
-# The row of training data that we will look at
+# Image to use as query
 i <- 1
 
-# Read in the data
-data <- readMat(file.path("Data", "zipdata.mat"))
+# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+SimilarityMeasure <- "Jaccard"
+
+library(R.matlab)
+data <- readMat(file.path("./Data/digits.mat"))
+
+# You can also try it on the faces dataset:
+# data <- readMat(file.path("./Data/wildfaces_grayscale.mat"))
 
-# Check that the structure dat contains two matrices, testdata and traindata
 names(data)
+X <- data$X
+dimX <- dim(X)
+N <- dimX[1]
+M <- dimX[2]
+Q <- matrix(X[i, ], ncol = M) # The query image
+Y <- X[-i, ] # All images except the i'th
+
+# The next function is in setup.R; load it if you have not already:
+# source("setup.R")
+sim <- similarity(Q, Y, SimilarityMeasure)
+
+# Sort similarities
+sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE)
+val <- sort_result$x
+j <- sort_result$ix
+nj <- length(j)
 
-# The features, i.e. images of digits, are stored in the rows of traindata,
-# except for the first column. The first column contains the class of the row,
-# i.e. the digit identity
-X <- data$traindata[, 2:dim(data$traindata)[2]]
-y <- data$traindata[, 1]
+# Plot five most similar and dissimilar images
+npics <- 5
+ndigits <- 4
+mostsim <- j[1:npics]
+leastsim <- j[(nj - npics + 1):nj]
 
-par(mfrow = c(1, 2))
+imageDim <- c(sqrt(M), sqrt(M))
+dim(Q) <- imageDim # reshape Q
 
-# View the i'th row of X
-image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format")
+{
+  dev.new(width = 2.5, height = 3)
+  image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32))
+}
 
-# Make ticks corresponding to pixels, i.e. values in row i
-axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ]))
+{
+  dev.new(width = 2 * npics, height = 5)
+  layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE))
+  for (d in 1:npics) {
+    similarImage <- Y[mostsim[d], ]
+    dim(similarImage) <- imageDim
+    image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32))
 
-# Extract the i'th row of X and store it as the matrix I
-I <- X[i, ]
 
-# Make the matrix I have dimensions 16 by 16
-dim(I) <- c(16, 16)
+    dissimilarImage <- Y[leastsim[d], ]
+    dim(dissimilarImage) <- imageDim
 
-# view the digit in the i'th row of X as an image. The function image rotates the matrix, so we need to rearrange the columns in the matrix I in order to get the correct view of the digit.
-image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image")
+    image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32))
+  }
+}
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
index 997d298..3cdc43e 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
@@ -1,105 +1,22 @@
 ####################
-# Exercise 2.2.2
+# Exercise 2.2.2
 ####################
-source("Scripts/ex2_2_1.R")
+source("Tools/similarity.R")
 
+# Generate two data objects with M random (standard uniformly distributed) attributes
+M <- 5
+x <- as.matrix(runif(M))
+y <- as.matrix(runif(M))
 
-# Choose which digits to consider in the analysis
 
-# This is like in the exercise
-digits_to_inspect <- 0:1
-
-# If you want to try more digits
-# digits_to_inspect = 0:9
-
-# Find the observations that relate to the digits chosen in digits_to_inspect
-inds <- !is.na(match(y, digits_to_inspect))
-
-# extract the rows of X found above
-X <- X[inds, ]
-y <- y[inds]
-
-# Get the column means of X, subtract them from each row,
-# and perform and SVD on the resulting matrix
-means <- colMeans(X)
-Xzeromean <- t(apply(X, 1, "-", means))
-svdres <- svd(Xzeromean)
-
-# Extract the matrices containing the left and right singular vectors, respectively
-U <- svdres$u
-V <- svdres$v
-
-# ----------------------------------------------------
-# Calculate and plot the variance explained by the PCs
-# ----------------------------------------------------
-
-pcvariance <- svdres$d^2 / sum(svdres$d^2)
-
-{
-  par(mfrow=c(1,1))
-  plot(cumsum(pcvariance), main = "Data variance explained by PCs",
-       xlab = "Number of PCs included in variance sum",
-       ylab = "Proportion of variance explained")
-  
-  # First 22 PCs should explain 90%
-  abline(v = 22)
-}
-
-# ---------------------------------------------------------------------
-# Plot principal components 1 and 2 against each other in a scatterplot,
-# i.e. plot the projections of observations onto PCs 1 and 2
-# ---------------------------------------------------------------------
-
-# These two ways of calculating pc_projections are equivalent
-pc_projections <- Xzeromean %*% V
-pc_projections <- U %*% diag(svdres$d)
-
-pcproj1 <- pc_projections[, 1]
-pcproj2 <- pc_projections[, 2]
-
-{
-  par(mfrow=c(1,1))
-  plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)),
-        type = "n", xlab = "PC 1", ylab = "PC 2")
-  points(pcproj1[y == 0], pcproj2[y == 0], col = "red")
-  points(pcproj1[y == 1], pcproj2[y == 1], col = "green")
-  legend("topleft", legend = c("0", "1"), fill = c("red", "green"))
-}
-
-
-# ----------------------------------
-# Reconstruction of images of digits
-# ----------------------------------
-
-# Number of PCs to include in reconstruction of digits
-K <- 5
-
-# Digits to visualize
-nD <- 60:63
-
-reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K])
-
-par(mar = c(1, 1, 1, 1))
-layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE))
-for (d in 1:length(nD)) {
-  origImage <- X[nD[d], ]
-  dim(origImage) <- c(16, 16)
-  image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32))
-  reconstructedImage <- reconstructions[nD[d], ] + means
-  dim(reconstructedImage) <- c(16, 16)
-  image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32))
-}
-
-# -------------
-# Visualize PCs
-# -------------
-
-# visualize the first 12 PCs
-par(mfrow = c(3, 4), mar = c(2, 2, 2, 2))
-for (nth_pc in 1:12) {
-  pc <- t(V[, nth_pc])
-  dim(pc) <- c(16, 16)
-  # view image of PC
-  image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc))
-}
+# Two constants
+a <- 1.5
+b <- 1.5
 
+# Check the statements in the exercise
+similarity(x, y, "cos") - similarity(a * x, y, "cos")
+similarity(x, y, "ext") - similarity(a * x, y, "ext")
+similarity(x, y, "cor") - similarity(a * x, y, "cor")
+similarity(x, y, "cos") - similarity(b + x, y, "cos")
+similarity(x, y, "ext") - similarity(b + x, y, "ext")
+similarity(x, y, "cor") - similarity(b + x, y, "cor")
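+
+# Sketch of the expected outcome: cosine similarity is invariant to (positive)
+# scaling, and correlation is invariant to both scaling and translation, so
+# those differences should be ~0; extended Jaccard is invariant to neither.
+# A combined check, using the same similarity() as above:
+all.equal(as.numeric(similarity(x, y, "cor")),
+          as.numeric(similarity(a * x + b, y, "cor")))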
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
index 39726ea..98f0330 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
@@ -1,61 +1,30 @@
 ####################
-# Exercise 2.3.1
+# Exercise 2.3.1
 ####################
-
 rm(list = ls()) # Clear work space
 
-library(FNN)
-library(R.matlab)
-
-# Read in the data
-data <- readMat(file.path("Data", "zipdata.mat"))
+# Read data into R
+data <- read.csv("./Data/iris.csv")
 
-# Check that the structure dat contains two matrices, testdata and traindata
+# Inspect the contents of the variable "data".
 names(data)
+head(data)
 
-# The features, i.e. images of digits, are stored in the rows of traindata,
-# except for the first column. The first column contains the class of the row,
-# i.e. the digit identity
-X <- data$traindata[, 2:dim(data$traindata)[2]]
-y <- data$traindata[, 1]
-
-Xtest <- data$testdat[, 2:dim(data$testdat)[2]]
-ytest <- data$testdat[, 1]
-
-# This is like in the exercise
-digits_to_inspect <- 0:1
-
-# If you want to try more digits
-# digits_to_inspect = 0:9
-
-# Find the observations that relate to the digits chosen in digits_to_inspect
-inds <- !is.na(match(y, digits_to_inspect))
-indstest <- !is.na(match(ytest, digits_to_inspect))
-
-# Extract the rows of X found above
-X <- X[inds, ]
-y <- y[inds]
-Xtest <- Xtest[indstest, ]
-ytest <- ytest[indstest]
-
-
-# Get the column means of X, subtract them from each row,
-# and perform and SVD on the resulting matrix
-means <- colMeans(X)
-Xzeromean <- t(apply(X, 1, "-", means))
-Xtestzeromean <- t(apply(Xtest, 1, "-", means))
-svdres <- svd(Xzeromean)
+# Extract the rows and columns corresponding to the data, i.e. the attribute values
+X <- data[, 1:4]
 
-# Extract the matrices containing the left and right singular vectors, respectively
-U <- svdres$u
-V <- svdres$v
+# Extract the attribute names (all column names except the last)
+attributeNames <- colnames(data)[1:(dim(data)[2] - 1)]
 
-K <- 5 # Number of principal components to use
-pc_projections <- Xzeromean %*% V[, 1:K]
-pc_projectionstest <- Xtestzeromean %*% V[, 1:K]
+# Extract unique class names from the last column
+classLabels <- data[, 5]
+classNames <- unique(classLabels)
 
-preds <- knn(pc_projections, pc_projectionstest, cl = y)
+# Convert the class labels to numeric class indices (0-based)
+y <- match(classLabels, classNames) - 1
 
-error_rate <- mean(preds != ytest)
-print(error_rate)
+# Get the number of data objects, attributes, and classes
+N <- dim(X)[1]
+M <- dim(X)[2]
+C <- length(classNames)
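+
+# Sanity check (sketch): for the standard iris data these should be
+# N = 150, M = 4 and C = 3.
+cat("N =", N, " M =", M, " C =", C, "\n")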
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R
similarity index 96%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_2.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_2.R
index 0ae27d2..f9558da 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R
@@ -2,7 +2,7 @@
-# Exercise 4.2.2
+# Exercise 2.3.2
 ####################
 
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # ----------------
 # With base R plot
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R
similarity index 81%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_3.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_3.R
index a05fc0c..cceef6d 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R
@@ -2,7 +2,7 @@
-# Exercise 4.2.3
+# Exercise 2.3.3
 ####################
 
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 par(mfrow=c(1,1))
 boxplot(X, main="Boxplots of attribute values")
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R
similarity index 96%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_4.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_4.R
index 7cff927..dd45ae4 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R
@@ -1,7 +1,7 @@
 ####################
-# Exercise 4.2.4
+# Exercise 2.3.4
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Done with base R plot
 yvals <- c()
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R
similarity index 92%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_5.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_5.R
index bd6b45d..7007484 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R
@@ -1,7 +1,7 @@
 ####################
-# Exercise 4.2.5
+# Exercise 2.3.5
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 observationColors <- c("blue", "green3", "red")[unclass(y+1)]
 {
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R
similarity index 97%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_6.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_6.R
index 29079a7..15a729e 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R
@@ -1,7 +1,7 @@
 ####################
-# Exercise 4.2.6
+# Exercise 2.3.6
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Make a non-interactive 3d plot of the data. Install the package scatterplot3d
 library(scatterplot3d) # install.packages("scatterplot3d")
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R
similarity index 95%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_7.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_7.R
index c803793..f228241 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R
@@ -1,7 +1,7 @@
 ####################
-# Exercise 4.2.7
+# Exercise 2.3.7
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Standardize each column using the function scale (see ?scale)
 X_scaled <- as.data.frame(scale(X))  
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_4_1.R
similarity index 100%
rename from exercises/02450Toolbox_R/Scripts/ex4_3_1.R
rename to exercises/02450Toolbox_R/Scripts/ex2_4_1.R
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R
similarity index 94%
rename from exercises/02450Toolbox_R/Scripts/ex4_3_2.R
rename to exercises/02450Toolbox_R/Scripts/ex2_4_2.R
index cc9848e..cbec5c6 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R
@@ -2,7 +2,7 @@
-# Exercise 4.3.2
+# Exercise 2.4.2
 ####################
 
-source("Scripts/ex4_3_1.R")
+source("Scripts/ex2_4_1.R")
 library(ggplot2) 
 library(GGally) # install.packages("GGally")
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_1.R b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R
new file mode 100644
index 0000000..7861e15
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R
@@ -0,0 +1,36 @@
+####################
+# Exercise 3.1.1
+####################
+
+rm(list = ls()) # Clear work space
+
+# Read data into R
+data <- read.csv("./Data/nanonose.csv", check.names = FALSE)
+
+# Extract class labels of observations
+classLabels <- colnames(data)
+classLabels <- classLabels[-(1:2)]
+
+# Extract attributes, i.e. sensor names
+attributeNames <- data[3:10, 1]
+
+# Remove first two rows and columns and transpose data matrix
+X <- t(data[-(1:2), -(1:2)])
+
+# Check that dimensions are as they should be (90 rows and 8 columns)
+N <- dim(X)[1]
+M <- dim(X)[2]
+
+# Assign the class labels as row names and the attributes as column names
+rownames(X) <- classLabels
+colnames(X) <- attributeNames
+
+# Extract the class names present in data
+classNames <- sort(unique(classLabels))
+C <- length(classNames)
+
+# Extract numeric class assignments
+y <- as.numeric(as.factor(classLabels)) - 1
+
+# Clean up the variables a bit, since we'll call this script from later scripts:
+rm(data)
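+
+# Quick check (sketch): the dimensions noted above (90 rows, 8 columns).
+cat("N =", N, " M =", M, " C =", C, "\n")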
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
index d3a6872..d99b5eb 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
@@ -1,19 +1,20 @@
 ####################
-# Exercise 3.1.2
+# Exercise 3.1.2
 ####################
 
-rm(list = ls()) # Clear work space
+# Run ex3_1_1.R first to load the data:
+source("Scripts/ex3_1_1.R")
 
-library(tm)
-textfolder = "./Data/"
+# Choose which sensors to plot
+i <- 1
+j <- 2
 
-(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en")))
-
-inspect(docs)
-
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
-
-inspect(dtm)
+# Make simple plot
+plot(X[ , i], X[ , j])
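+
+# A slightly richer base-R version (sketch): label the axes and colour by class.
+plot(X[, i], X[, j], xlab = attributeNames[i], ylab = attributeNames[j],
+     col = y + 1, pch = 19)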
 
+## Make more fancy plot
+library(ggplot2)
+ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) +
+  theme(legend.position = c(0.88, 0.77), legend.title= element_blank()) +
+  labs(x = attributeNames[i], y = attributeNames[j])
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
index 4180e92..b38a522 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
@@ -1,23 +1,45 @@
 ####################
-# Exercise 3.1.3
+# Exercise 3.1.3
 ####################
 
-rm(list = ls()) # Clear work space
+source("Scripts/ex3_1_1.R")
 
-library(tm)
-textfolder <- "./Data/"
+# Subtract the column means from the columns of X
+Y <- t(apply(X, 1, "-", colMeans(X)))
 
-(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+# You can check the column means using: colMeans(Y)
+colMeans(Y)
 
-inspect(docs)
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
+# PCA by computing SVD of Y:
+s <- svd(Y)
+diagS <- s$d
 
-mystopwords <- scan("./Data/stopWords.txt", character(0))
+rho <- diagS^2 / sum(diagS^2)
 
-docs_nostopwords <- tm_map(docs, removeWords, mystopwords)
-inspect(docs_nostopwords)
-dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
-inspect(dtm_nostopwords)
+# PCA can also be done with the built-in function prcomp.
+# Confirm that you get the same result.
+rho2 <- summary(prcomp(X, center = TRUE))$importance[2,]
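+
+# Sketch: the SVD-based and prcomp-based variance fractions should agree
+# up to numerical precision.
+max(abs(rho - rho2)) # expect ~0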
+
+
+threshold <- 0.9
+
+xlimits <- c(1, M)
+plot(rho,
+  type = "o",
+  main = "Variance explained by principal componenets",
+  xlab = "Principal components",
+  ylab = "Variance explained",
+  xlim = xlimits,
+  ylim = c(0, 1),
+  col = "blue"
+)
+
+lines(cumsum(rho), type = "o", col = "orange")
+lines(xlimits, c(threshold, threshold), lty = "dashed")
+
+legend("right", # Define position
+  legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend
+  col = c("orange", "blue", "black"), lty = c(1, 1, 2), # Match appereance of lines
+  cex = 1, bg = "lightblue"
+) # Setup how the box looks (cex controls size)
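+
+# Sketch: read off the smallest number of components whose cumulative
+# variance exceeds the threshold.
+K <- which(cumsum(rho) >= threshold)[1]
+cat("Components needed for", threshold * 100, "% of the variance:", K, "\n")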
 
-control <- list(stopwords = TRUE)
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
index 5ad1d63..3b43e47 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
@@ -1,29 +1,20 @@
 ####################
-# Exercise 3.1.4
+# Exercise 3.1.4
 ####################
+source("Scripts/ex2_1_3.R")
 
-rm(list = ls()) # Clear work space
+# Manually project the data onto the principal components.
+Z <- s$u %*% diag(s$d)
 
-library(tm) # install.packages("tm")
-library(SnowballC) # install.packages("SnowballC")
+# The same projection using the built-in function:
+Z <- prcomp(X)$x
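+
+# Sketch: the manual and prcomp projections should agree up to a sign flip
+# per component, since singular-vector signs are not unique.
+Z_manual <- s$u %*% diag(s$d)
+max(abs(abs(Z_manual) - abs(Z))) # expect ~0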
 
-textfolder <- "./Data/"
+i <- 1
+j <- 3
 
-(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
-
-inspect(docs)
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
-
-mystopwords <- scan("./Data/stopWords.txt", character(0))
-
-docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords)
-dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
-inspect(dtm_nostopwords)
-
-docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english")
-
-
-inspect(docs_stemmed)
-dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE))
-inspect(dtm_stemmed)
+## Make fancy plot
+library(ggplot2)
+ggplot() +
+  geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) +
+  theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) +
+  labs(x = colnames(Z)[i], y = colnames(Z)[j])
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
index efe3e7b..bb02536 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
@@ -1,19 +1,45 @@
 ####################
-# Exercise 3.1.5
+# Exercise 3.1.5
 ####################
+source("Scripts/ex3_1_1.R")
 
-source("Scripts/ex3_1_4.R")
-source("Tools/similarity.R")
+Y <- t(apply(X, 1, "-", colMeans(X))) # subtract the column means from the columns of X
 
-size <- dim(dtm_stemmed)
-cosmeas <- c()
-q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0)
-dtmat <- matrix(dtm_stemmed, dtm_stemmed$nrow, dtm_stemmed$ncol)
+# Computing PCA:
+pca <- prcomp(X)
+V <- pca$rotation
+Z <- pca$x
 
-for (irow in 1:size[1]) {
-  doc <- dtm_stemmed[irow, ]
-  cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), 'cos')
-}
+# We saw in 3.1.3 that the first 3 components explained more than 90
+# percent of the variance. Let's look at their coefficients:
+pcs <- 1:3
+library(ggplot2)
+library(data.table) # provides data.table() and melt()
+test <- as.data.frame(melt(data.table(V[, pcs])))
+ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) +
+  geom_bar(position="dodge", stat = "identity") +
+  labs(fill="PC", x = "Attributes", y = "Component coefficients")
+
+# Inspecting the plot, we see that the 2nd principal component has large
+# (in magnitude) coefficients for attributes A, E and H. We can confirm
+# this by looking at its numerical values directly, too:
+cat("PC2:", V[, 2])
+
+# How does this translate to the actual data and its projections?
+# Looking at the data for water:
+
+# Projection of water class onto the 2nd principal component.
+water <- Y[y == 4, ]
+cat("First water observation:", water[1, ])
+
+# Based on the coefficients and the attribute values for the observation
+# displayed, would you expect the projection onto PC2 to be positive or
+# negative - why? Consider *both* the magnitude and sign of *both* the
+# coefficient and the attribute!
+
+# You can compute the projection directly:
+print("...and its projection onto PC2")
+print(as.numeric(t(water[1, ]) %*% V[, 2]))
+
+# This is equivalent to (try to explain why):
+Z[1, 2]
 
-print("Cosine similarity from q to docs: ")
-print(cosmeas)
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R
similarity index 99%
rename from exercises/02450Toolbox_R/Scripts/ex2_1_6.R
rename to exercises/02450Toolbox_R/Scripts/ex3_1_6.R
index f0a329d..83faf47 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R
@@ -1,7 +1,7 @@
 ####################
-# Exercise 2.1.6
+# Exercise 3.1.6
 ####################
-source("Scripts/ex2_1_1.R")
+source("Scripts/ex3_1_1.R")
 
 # Try this *later* (for last), and explain the effect X_s = X
-# Make a to be "scaled" version of X
+# Make a to-be-"scaled" version of X
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
index 725f756..ff04967 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
@@ -1,17 +1,42 @@
 ####################
-# Exercise 3.2.1
+# Exercise 3.2.1
 ####################
 
 rm(list = ls()) # Clear work space
 
-x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47)
+# Load the library R.matlab to enable the function readMat,
+# which allows R to read the MATLAB .mat format.
+# Install it with: install.packages("R.matlab")
+library(R.matlab)
 
-mean(x)
-sd(x)
-median(x)
-diff(range(x))
+# The row of training data that we will look at
+i <- 1
 
-# Range returns the minimum and maximum of the vector x.
-# To get the range, we must take the maximum minus the minimum.
-# We do this using the function diff, which finds differences
-# between consecutive elements in a vector.
+# Read in the data
+data <- readMat(file.path("Data", "zipdata.mat"))
+
+# Check that the structure data contains two matrices, testdata and traindata
+names(data)
+
+# The features, i.e. images of digits, are stored in the rows of traindata,
+# except for the first column. The first column contains the class of the row,
+# i.e. the digit identity
+X <- data$traindata[, 2:dim(data$traindata)[2]]
+y <- data$traindata[, 1]
+
+par(mfrow = c(1, 2))
+
+# View the i'th row of X
+image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format")
+
+# Make ticks corresponding to pixels, i.e. values in row i
+axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ]))
+
+# Extract the i'th row of X and store it as the matrix I
+I <- X[i, ]
+
+# Make the matrix I have dimensions 16 by 16
+dim(I) <- c(16, 16)
+
+# View the digit in the i'th row of X as an image. The function image() rotates the matrix, so we reverse the column order of I to display the digit correctly.
+image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image")
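+
+# Sketch: the same display for any other row, wrapped in a small helper
+# (hypothetical function name):
+showDigit <- function(row) {
+  I <- X[row, ]
+  dim(I) <- c(16, 16)
+  image(I[, ncol(I):1], col = gray((32:0) / 32), main = paste("Digit, row", row))
+}
+# showDigit(2)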
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_2.R b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R
new file mode 100644
index 0000000..d1d81de
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R
@@ -0,0 +1,105 @@
+####################
+# Exercise 3.2.2
+####################
+source("Scripts/ex3_2_1.R")
+
+
+# Choose which digits to consider in the analysis
+
+# This is like in the exercise
+digits_to_inspect <- 0:1
+
+# If you want to try more digits
+# digits_to_inspect = 0:9
+
+# Find the observations that relate to the digits chosen in digits_to_inspect
+inds <- !is.na(match(y, digits_to_inspect))
+
+# Extract the rows of X found above
+X <- X[inds, ]
+y <- y[inds]
+
+# Get the column means of X, subtract them from each row,
+# and perform an SVD on the resulting matrix
+means <- colMeans(X)
+Xzeromean <- t(apply(X, 1, "-", means))
+svdres <- svd(Xzeromean)
+
+# Extract the matrices containing the left and right singular vectors, respectively
+U <- svdres$u
+V <- svdres$v
+
+# ----------------------------------------------------
+# Calculate and plot the variance explained by the PCs
+# ----------------------------------------------------
+
+pcvariance <- svdres$d^2 / sum(svdres$d^2)
+
+{
+  par(mfrow=c(1,1))
+  plot(cumsum(pcvariance), main = "Data variance explained by PCs",
+       xlab = "Number of PCs included in variance sum",
+       ylab = "Proportion of variance explained")
+  
+  # First 22 PCs should explain 90%
+  abline(v = 22)
+}
+
+# ---------------------------------------------------------------------
+# Plot principal components 1 and 2 against each other in a scatterplot,
+# i.e. plot the projections of observations onto PCs 1 and 2
+# ---------------------------------------------------------------------
+
+# These two ways of calculating pc_projections are equivalent
+pc_projections <- Xzeromean %*% V
+pc_projections <- U %*% diag(svdres$d)
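+
+# Sketch: confirm the equivalence numerically (for the SVD, X V = U S).
+max(abs(Xzeromean %*% V - U %*% diag(svdres$d))) # expect ~0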
+
+pcproj1 <- pc_projections[, 1]
+pcproj2 <- pc_projections[, 2]
+
+{
+  par(mfrow=c(1,1))
+  plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)),
+        type = "n", xlab = "PC 1", ylab = "PC 2")
+  points(pcproj1[y == 0], pcproj2[y == 0], col = "red")
+  points(pcproj1[y == 1], pcproj2[y == 1], col = "green")
+  legend("topleft", legend = c("0", "1"), fill = c("red", "green"))
+}
+
+
+# ----------------------------------
+# Reconstruction of images of digits
+# ----------------------------------
+
+# Number of PCs to include in reconstruction of digits
+K <- 5
+
+# Digits to visualize
+nD <- 60:63
+
+reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K])
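+
+# Sketch: with all components included, the centered data are reconstructed
+# exactly, so this mean squared error should be near machine precision.
+mean((Xzeromean - pc_projections %*% t(V))^2)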
+
+par(mar = c(1, 1, 1, 1))
+layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE))
+for (d in 1:length(nD)) {
+  origImage <- X[nD[d], ]
+  dim(origImage) <- c(16, 16)
+  image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32))
+  reconstructedImage <- reconstructions[nD[d], ] + means
+  dim(reconstructedImage) <- c(16, 16)
+  image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32))
+}
+
+# -------------
+# Visualize PCs
+# -------------
+
+# visualize the first 12 PCs
+par(mfrow = c(3, 4), mar = c(2, 2, 2, 2))
+for (nth_pc in 1:12) {
+  pc <- t(V[, nth_pc])
+  dim(pc) <- c(16, 16)
+  # view image of PC
+  image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc))
+}
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
index 91eccf7..39726ea 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
@@ -1,67 +1,61 @@
 ####################
-# Exercise 3.3.1
+# Exercise 3.3.1
 ####################
-rm(list = ls()) # Clear work space)
 
-source("Tools/binarize.R")
-source("Tools/similarity.R")
-
-# Image to use as query
-i <- 1
-
-# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
-SimilarityMeasure <- "Jaccard"
+rm(list = ls()) # Clear work space
 
+library(FNN)
 library(R.matlab)
-data <- readMat(file.path("./Data/digits.mat"))
 
-# You can also try it on the faces dataset:
-# data <- readMat(file.path("./Data/wildfaces_grayscale.mat"))
+# Read in the data
+data <- readMat(file.path("Data", "zipdata.mat"))
 
+# Check that the structure data contains two matrices, testdata and traindata
 names(data)
-X <- data$X
-dimX <- dim(X)
-N <- dimX[1]
-M <- dimX[2]
-Q <- matrix(X[i, ], ncol = M) # The query image
-Y <- X[-i, ] # All images except the i'th
-
-# The next function is in the setup.R, load it if you have not already
-# source("setup.R")
-sim <- similarity(Q, Y, SimilarityMeasure)
-
-# Sort similarities
-sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE)
-val <- sort_result$x
-j <- sort_result$ix
-nj <- length(j)
-
-# Plot five most similar and dissimilar images
-npics <- 5
-ndigits <- 4
-mostsim <- j[1:npics]
-leastsim <- j[(nj - npics + 1):nj]
-
-imageDim <- c(sqrt(M), sqrt(M))
-dim(Q) <- imageDim # reshape Q
-
-{
-  dev.new(width = 2.5, height = 3)
-  image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32))
-}
-
-{
-  dev.new(width = 2 * npics, height = 5)
-  layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE))
-  for (d in 1:npics) {
-    similarImage <- Y[mostsim[d], ]
-    dim(similarImage) <- imageDim
-    image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32))
-
-
-    dissimilarImage <- Y[leastsim[d], ]
-    dim(dissimilarImage) <- imageDim
-
-    image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32))
-  }
-}
+
+# The features, i.e. images of digits, are stored in the rows of traindata,
+# except for the first column. The first column contains the class of the row,
+# i.e. the digit identity
+X <- data$traindata[, 2:dim(data$traindata)[2]]
+y <- data$traindata[, 1]
+
+Xtest <- data$testdata[, 2:dim(data$testdata)[2]]
+ytest <- data$testdata[, 1]
+
+# This is like in the exercise
+digits_to_inspect <- 0:1
+
+# If you want to try more digits
+# digits_to_inspect = 0:9
+
+# Find the observations that relate to the digits chosen in digits_to_inspect
+inds <- !is.na(match(y, digits_to_inspect))
+indstest <- !is.na(match(ytest, digits_to_inspect))
+
+# Extract the rows of X found above
+X <- X[inds, ]
+y <- y[inds]
+Xtest <- Xtest[indstest, ]
+ytest <- ytest[indstest]
+
+
+# Get the column means of X, subtract them from each row,
+# and perform an SVD on the resulting matrix
+means <- colMeans(X)
+Xzeromean <- t(apply(X, 1, "-", means))
+Xtestzeromean <- t(apply(Xtest, 1, "-", means))
+svdres <- svd(Xzeromean)
+
+# Extract the matrices containing the left and right singular vectors, respectively
+U <- svdres$u
+V <- svdres$v
+
+K <- 5 # Number of principal components to use
+pc_projections <- Xzeromean %*% V[, 1:K]
+pc_projectionstest <- Xtestzeromean %*% V[, 1:K]
+
+preds <- knn(pc_projections, pc_projectionstest, cl = y)
+
+error_rate <- mean(preds != ytest)
+print(error_rate)
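+
+# Sketch: see how the error rate varies with the number of components by
+# re-running the projection and classification for a few values of K.
+for (K in c(2, 5, 10)) {
+  preds <- knn(Xzeromean %*% V[, 1:K], Xtestzeromean %*% V[, 1:K], cl = y)
+  cat("K =", K, "error rate:", mean(preds != ytest), "\n")
+}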
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R b/exercises/02450Toolbox_R/Scripts/ex3_3_2.R
deleted file mode 100644
index 3cdc43e..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R
+++ /dev/null
@@ -1,22 +0,0 @@
-####################
-# Exercise 3.3.2
-####################
-source("Tools/similarity.R")
-
-# Generate two data objects with M random (standard uniformly distributed) attributes
-M = 5;
-x = as.matrix(runif(M));
-y = as.matrix(runif(M));
-
-
-# Two constants
-a = 1.5;
-b = 1.5;
-
-# Check the statements in the exercise
-similarity(x,y,'cos') - similarity(a*x,y,'cos')
-similarity(x,y,'ext') - similarity(a*x,y,'ext')
-similarity(x,y,'cor') - similarity(a*x,y,'cor')
-similarity(x,y,'cos') - similarity(b+x,y,'cos')
-similarity(x,y,'ext') - similarity(b+x,y,'ext')
-similarity(x,y,'cor') - similarity(b+x,y,'cor')
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R b/exercises/02450Toolbox_R/Scripts/ex4_2_1.R
deleted file mode 100644
index 98f0330..0000000
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R
+++ /dev/null
@@ -1,30 +0,0 @@
-####################
-# Exercise 4.2.1
-####################
-rm(list = ls()) # Clear work space
-
-# Read data into R
-data <- read.csv("./Data/iris.csv")
-
-# Inspect the contents of the variable "data".
-names(data)
-head(data)
-
-# Extract the rows and columns corresponding to the data, i.e. the attribute values
-X <- data[, 1:4]
-
-# Extract attribute names from the first row
-attributeNames <- colnames(data)[1:(dim(data)[2] - 1)]
-
-# Extract unique class names from the last column
-classLabels <- data[, 5]
-classNames <- unique(classLabels)
-
-# Extract class labels that match the class names
-y <- match(classLabels, classNames) - 1
-
-# Get the number of data objects, attributes, and classes
-N <- dim(X)[1]
-M <- dim(X)[2]
-C <- length(classNames)
-
-- 
GitLab