diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
new file mode 100644
index 0000000000000000000000000000000000000000..b5391b758cafba6e3c28f0beb465769334439718
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
@@ -0,0 +1,9 @@
+%% exercise 3.1.2
+cdir = fileparts(mfilename('fullpath')); 
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
+X = full(A)';
+attributeNames = cellstr(D); 
+
+%% Display the result
+display(attributeNames);
+display(X);  
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
new file mode 100644
index 0000000000000000000000000000000000000000..fdd498e1c67d51f512c47132bf4298325f5abec0
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
@@ -0,0 +1,10 @@
+%% exercise 3.1.3
+cdir = fileparts(mfilename('fullpath')); 
+TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
+X = full(A)';
+attributeNames = cellstr(D);
+
+%% Display the result
+display(attributeNames);
+display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
new file mode 100644
index 0000000000000000000000000000000000000000..0518cd4748fdea2dec5daf77635c2de330e3bd2d
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
@@ -0,0 +1,11 @@
+%% exercise 3.1.4
+cdir = fileparts(mfilename('fullpath')); 
+TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
+TMGOpts.stemming = 1;
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
+X = full(A)';
+attributeNames = cellstr(D);
+
+%% Display the result
+display(attributeNames);
+display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
new file mode 100644
index 0000000000000000000000000000000000000000..ffa4d21982d415f00f3b6d9424dd8cec08486e77
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
@@ -0,0 +1,21 @@
+%% exercise 3.1.5
+
+% Query vector
+q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+
+%% Method #1 (a for loop)
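+% Cosine similarity of two row vectors a and b: cos(a,b) = (a*b')/(norm(a)*norm(b))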
+N = size(X,1); % Get the number of data objects
+sim = nan(N,1); % Allocate a vector for the similarity
+for i = 1:N
+    x = X(i,:); % Get the i'th data object
+    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
+end
+
+%% Method #2 (one compact line of code)
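+% The same formula, vectorized: entry i of the result is cos(q, X(i,:)).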
+sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+
+%% Method #3 (use the "similarity" function)
+sim = similarity(X, q, 'cos');
+
+%% Display the result
+display(sim);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
index c5f1186acedaa71f6829e0072e5bdd871453be96..f4115017e2054a948671c99dc72cc037b0309382 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
@@ -1,19 +1,14 @@
-%% exercise 2.1.1
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath')); 
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+% exercise 3.2.1
 
-% Extract the rows and columns corresponding to the sensor data, and
-% transpose the matrix to have rows correspond to data items
-X = NUMERIC(:,3:10);
+x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
 
-% Extract attribute names from the first column
-attributeNames = RAW(1,4:end);
+mean_x = mean(x);
+std_x = std(x);
+median_x = median(x);
+range_x = range(x);
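+% (std normalizes by N-1 by default, i.e. it computes the sample standard
+% deviation; range is the difference between the largest and smallest value)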
 
-% Extract unique class names from the first row
-classLabels = RAW(3:end,1);
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y-1;
- 
\ No newline at end of file
+%% Display results
+display(mean_x);
+display(std_x);
+display(median_x);
+display(range_x);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
deleted file mode 100644
index 5b6eee846e1397ed2ed1b2a9ea795cd92523e84c..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
+++ /dev/null
@@ -1,31 +0,0 @@
-%% exercise 2.1.2
-% Data attributes to be plotted
-i = 1;
-j = 2;
-
-% Make a simple plot of the i'th attribute against the j'th attribute
-mfig('NanoNose: Data'); clf;
-plot(X(:,i), X(:,j),'o');
-axis tight
-
-% Make another more fancy plot that includes legend, class labels, 
-% attribute names, and a title
-mfig('NanoNose: Classes'); clf; hold all; 
-C = length(classNames);
-% Use a specific color for each class (easy to reuse across plots!):
-colors = get(gca, 'colororder'); 
-% Here we the standard colours from MATLAB, but you could define you own.
-for c = 0:C-1
-    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
-                'MarkerFaceColor', colors(c+1,:), ...
-                'MarkerEdgeAlpha', 0, ...
-                'MarkerFaceAlpha', .5);
-end
-% You can also avoid the loop by using e.g.: (but in this case, do not call legend(classNames) as it will overwrite the legend with wrong entries) 
-% gscatter(X(:,i), X(:,j), classLabels)
-legend(classNames);
-axis tight
-xlabel(attributeNames{i});
-ylabel(attributeNames{j});
-title('NanoNose data');
-a = 234
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
deleted file mode 100644
index 145ff5c7cdaf768b4b14906bc2c1bc014f08a37a..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
+++ /dev/null
@@ -1,28 +0,0 @@
-
-%% exercise 2.1.3
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y);
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-threshold = 0.90;
-
-% Plot variance explained
-mfig('NanoNose: Var. explained'); clf;
-hold on
-plot(rho, 'x-');
-plot(cumsum(rho), 'o-');
-plot([0,length(rho)], [threshold, threshold], 'k--');
-legend({'Individual','Cumulative','Threshold'}, ...
-        'Location','best');
-ylim([0, 1]);
-xlim([1, length(rho)]);
-grid minor
-xlabel('Principal component');
-ylabel('Variance explained value');
-title('Variance explained by principal components');
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
deleted file mode 100644
index cf767df44e4c6129faff311e92485815c1278a45..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
+++ /dev/null
@@ -1,24 +0,0 @@
-%% exercise 2.1.4
-
-% Index of the principal components
-i = 1;
-j = 2;
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Plot PCA of data
-mfig('NanoNose: PCA Projection'); clf; hold all; 
-C = length(classNames);
-colors = get(gca,'colororder');
-for c = 0:C-1
-    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
-                'MarkerFaceColor', colors(c+1,:), ...
-                'MarkerEdgeAlpha', 0, ...
-                'MarkerFaceAlpha', .5);
-end
-legend(classNames);
-axis tight
-xlabel(sprintf('PC %d', i));
-ylabel(sprintf('PC %d', j));
-title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
deleted file mode 100644
index 2475302c15fab8118605b132c2c4efd285b512eb..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
+++ /dev/null
@@ -1,40 +0,0 @@
-%% exercise 2.1.5
-% We saw in 2.1.3 that the first 3 components explaiend more than 90
-% percent of the variance. Let's look at their coefficients:
-pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
-mfig('NanoNose: PCA Component Coefficients');
-h = bar(V(:,pcs));
-legendCell = cellstr(num2str(pcs', 'PC%-d'));
-legend(legendCell, 'location','best');
-xticklabels(attributeNames);
-grid
-xlabel('Attributes');
-ylabel('Component coefficients');
-title('NanoNose: PCA Component Coefficients');
-
-% Inspecting the plot, we see that the 2nd principal component has large
-% (in magnitude) coefficients for attributes A, E and H. We can confirm
-% this by looking at it's numerical values directly, too:
-disp('PC2:')
-disp(V(:,2)') % notice the transpose for display in console 
-
-% How does this translate to the actual data and its projections?
-% Looking at the data for water:
-
-% Projection of water class onto the 2nd principal component.
-all_water_data = Y(y==4,:);
-
-disp('First water observation:')
-disp(all_water_data(1,:))
-
-% Based on the coefficients and the attribute values for the observation
-% displayed, would you expect the projection onto PC2 to be positive or
-% negative - why? Consider *both* the magnitude and sign of *both* the
-% coefficient and the attribute!
-
-% You can determine the projection by (remove comments):
-disp('...and its projection onto PC2:')
-disp(all_water_data(1,:) * V(:,2))
-% Try to explain why?
-
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
index 2f064b574ca36017f92dde65f45e22e78a017d1d..009817333dff41cc9d112663e65d03703f5c59ae 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
@@ -1,28 +1,65 @@
-%% exercise 2.2.1
+% exercise 3.3.1
 
-% Digit number to display
-i = 1; 
+% Image to use as query
+i = 1;
 
-% Load data
+% Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' 
+SimilarityMeasure = 'cos';
+
+%% Load the digits data set
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits data
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Visualize the i'th digit as a vector
-mfig('Digits: Data');
-subplot(4,1,4);
-imagesc(X(i,:));
-xlabel('Pixel number');
-title('Digit in vector format');
-set(gca, 'YTick', []);
-
-% Visualize the i'th digit as an image
-subplot(4,1,1:3);
-I = reshape(X(i,:), [16,16])';
-imagesc(I);
-colormap(1-gray);
-axis image off
-title('Digit as a image');
+load(fullfile(cdir,'../Data/digits.mat'));
+
+% You can try out the CBCL face database, too:
+%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
+
+transpose = true; % set to true if plotted images need to be transposed
+
+[N,M] = size(X);
+imageDim = [sqrt(M),sqrt(M)];
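+% Each image is stored as a row of M pixel values; the images are assumed
+% square, so each side has length sqrt(M).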
+%% Search the data set for similar images
+
+% Index of all other images than i
+noti = [1:i-1 i+1:N]; 
+% Compute similarity between image i and all others
+sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
+% Sort similarities
+[val, j] = sort(sim, 'descend');
+
+%% Plot query and result
+mfig('Faces: Query'); clf;
+subplot(3,5,1:5);
+
+img = reshape(X(i,:),imageDim);
+if transpose; img = img'; end;
+imagesc(img);
+
+axis image
+set(gca, 'XTick', [], 'YTick', []);
+ylabel(sprintf('Image #%d', i));
+title('Query image','FontWeight','bold');
+for k = 1:5
+    subplot(3,5,k+5)
+    ii = noti(j(k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.2f', val(k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Most similar images','FontWeight','bold'); end;
+end
+for k = 1:5
+    subplot(3,5,k+10)
+    ii = noti(j(end+1-k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.3f', val(end+1-k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Least similar images','FontWeight','bold'); end;
+end
+colormap(gray);
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
index 0db965d7614fface2e85ca12af9d2e08cdaf0d1b..52eb65d331a1a39824c0313a1f1db12600f9adcc 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
@@ -1,87 +1,18 @@
-%% exercise 2.2.2
-
-% Digits to include in analysis (to include all, n = 1:10);
-n = [0,1];
-
-% Number of principal components for reconstruction
-K = 22;
-
-% Digits to visualize
-nD = 1:5;
-
-%% Load data
-cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits
-X = traindata(:,2:end);
-y = traindata(:,1);
-classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'10'};
-classLabels = classNames(y+1);
-
-% Remove digits that are not to be inspected
-j = ismember(y, n);
-X = X(j,:);
-classLabels = classLabels(j);
-classNames = classNames(n+1);
-y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-
-%% Plot variance explained
-mfig('Digits: Var. explained'); clf;
-plot(rho, 'o-');
-title('Variance explained by principal components');
-xlabel('Principal component');
-ylabel('Variance explained value');
-
-%% Plot PCA of data
-mfig('Digits: PCA'); clf; hold all; 
-C = length(classNames);
-for c = 0:C-1
-    plot(Z(y==c,1), Z(y==c,2), 'o');
-end
-legend(classNames);
-xlabel('PC 1');
-ylabel('PC 2');
-title('PCA of digits data');
-
-%% Visualize the reconstructed data from the firts K principal components
-mfig('Digits: Reconstruction'); clf;
-W = Z(:,1:K)*V(:,1:K)';
-D = length(nD);
-for d = 1:D
-    subplot(2,D,d);
-    I = reshape(X(nD(d),:), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Original');
-    subplot(2,D,d+D);
-    I = reshape(W(nD(d),:)+mean(X), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Reconstructed');
-end
-colormap(1-gray);
-
-%% Visualize the principal components
-mfig('Digits: Principal components'); clf;
-for k = 1:K
-    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
-    subplot(N2, N1, k);
-    I = reshape(V(:,k), [16,16])';
-    imagesc(I);
-    colormap(hot);
-    axis image off
-    title(sprintf('PC %d',k));
-end
+% exercise 3.3.2
+
+% Generate two data objects with M random attributes
+M = 5;
+x = rand(1,M);
+y = rand(1,M);
+
+% Two constants
+a = 1.5;
+b = 1.5;
+
+% Check the statements in the exercise
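+% For a > 0, cosine similarity is invariant to scaling, and correlation is
+% invariant to both scaling and translation, so those differences should be
+% zero up to rounding; the extended Jaccard similarity is invariant to neither.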
+similarity(x,y,'cos') - similarity(a*x,y,'cos')
+similarity(x,y,'ext') - similarity(a*x,y,'ext')
+similarity(x,y,'cor') - similarity(a*x,y,'cor')
+similarity(x,y,'cos') - similarity(b+x,y,'cos')
+similarity(x,y,'ext') - similarity(b+x,y,'ext')
+similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
index 2a8135ef9d9ff5647fd5be3bfcb9acfa7884be85..9a68ff4cf91cea0a1d5abf841a627e46c4751eae 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
@@ -1,44 +1,31 @@
-%% exercise 2.3.1
-%% Load data
+% exercise 4.2.1
+
+% Disable xlsread warning
+warning('off', 'MATLAB:xlsread:ActiveX'); 
+warning('off', 'MATLAB:xlsread:Mode'); 
+
+% Load the data into Matlab
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits (training set)
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Extract digits (test set)
-Xtest = testdata(:,2:end);
-ytest = testdata(:,1);
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-Ytest = bsxfun(@minus, Xtest, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Number of principal components to use, i.e. the reduced dimensionality
-Krange = [8,10,15,20,30,40,50,60,100,150];
-errorRate = zeros(length(Krange),1);
-for i = 1:length(Krange)
-    K=Krange(i);
-    % Compute the projection onto the principal components
-    Z = Y*V(:,1:K);
-    Ztest = Ytest*V(:,1:K);
-
-    % Classify digits using a K-nearest neighbour classifier
-    model=fitcknn(Z,y,'NumNeighbors',1);
-    yest = predict(model,Ztest);
-    
-    errorRate(i) = nnz(ytest~=yest)/length(ytest);
-
-    % Display results
-    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
+
+% Extract the rows and columns corresponding to the data
+if isnan(NUMERIC(1,1))
+	X = NUMERIC(2:end,:);
+else
+	X = NUMERIC;
 end
 
-%% Visualize error rates vs. number of principal components considered
-mfig('Variance explained by principal components'); clf;
-plot(Krange,errorRate, 'o-');
-xlabel('Number of principal components K')
-ylabel('Error rate [%]')
+% Extract attribute names from the first row
+attributeNames = RAW(1,1:4)';
+
+% Extract unique class names from the last column
+classLabels = RAW(2:end,5)';
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y'-1;
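+% (ismember returns in y the index of each label in classNames;
+% subtracting one gives zero-based class indices 0,...,C-1)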
+
+% Get the number of data objects, attributes, and classes
+[N, M] = size(X);
+C = length(classNames);
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
new file mode 100644
index 0000000000000000000000000000000000000000..c5f1186acedaa71f6829e0072e5bdd871453be96
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
@@ -0,0 +1,19 @@
+%% exercise 2.1.1
+% Load the data into Matlab
+cdir = fileparts(mfilename('fullpath')); 
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+
+% Extract the rows and columns corresponding to the sensor data, and
+% transpose the matrix to have rows correspond to data items
+X = NUMERIC(:,3:10);
+
+% Extract attribute names from the first column
+attributeNames = RAW(1,4:end);
+
+% Extract unique class names from the first row
+classLabels = RAW(3:end,1);
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y-1;
+ 
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
index b5391b758cafba6e3c28f0beb465769334439718..5b6eee846e1397ed2ed1b2a9ea795cd92523e84c 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
@@ -1,9 +1,30 @@
-%% exercise 3.1.2
-cdir = fileparts(mfilename('fullpath')); 
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
-X = full(A)';
-attributeNames = cellstr(D); 
+%% exercise 2.1.2
+% Data attributes to be plotted
+i = 1;
+j = 2;
 
-%% Display the result
-display(attributeNames);
-display(X);  
\ No newline at end of file
+% Make a simple plot of the i'th attribute against the j'th attribute
+mfig('NanoNose: Data'); clf;
+plot(X(:,i), X(:,j),'o');
+axis tight
+
+% Make another more fancy plot that includes legend, class labels, 
+% attribute names, and a title
+mfig('NanoNose: Classes'); clf; hold all; 
+C = length(classNames);
+% Use a specific color for each class (easy to reuse across plots!):
+colors = get(gca, 'colororder'); 
+% Here we use the standard colours from MATLAB, but you could define your own.
+for c = 0:C-1
+    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
+                'MarkerFaceColor', colors(c+1,:), ...
+                'MarkerEdgeAlpha', 0, ...
+                'MarkerFaceAlpha', .5);
+end
+% You can also avoid the loop by using, e.g., gscatter (but in that case, do not
+% call legend(classNames), as it would overwrite the legend with wrong entries):
+% gscatter(X(:,i), X(:,j), classLabels)
+legend(classNames);
+axis tight
+xlabel(attributeNames{i});
+ylabel(attributeNames{j});
+title('NanoNose data');
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
index fdd498e1c67d51f512c47132bf4298325f5abec0..145ff5c7cdaf768b4b14906bc2c1bc014f08a37a 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
@@ -1,10 +1,28 @@
-%% exercise 3.1.3
-cdir = fileparts(mfilename('fullpath')); 
-TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
-X = full(A)';
-attributeNames = cellstr(D);
-
-%% Display the result
-display(attributeNames);
-display(X);
\ No newline at end of file
+
+%% exercise 2.1.3
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y);
+
+% Compute variance explained
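+% (the variance explained by PC i is sigma_i^2 / sum_j sigma_j^2,
+% where sigma_i = S(i,i) are the singular values of Y)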
+rho = diag(S).^2./sum(diag(S).^2);
+threshold = 0.90;
+
+% Plot variance explained
+mfig('NanoNose: Var. explained'); clf;
+hold on
+plot(rho, 'x-');
+plot(cumsum(rho), 'o-');
+plot([0,length(rho)], [threshold, threshold], 'k--');
+legend({'Individual','Cumulative','Threshold'}, ...
+        'Location','best');
+ylim([0, 1]);
+xlim([1, length(rho)]);
+grid minor
+xlabel('Principal component');
+ylabel('Variance explained value');
+title('Variance explained by principal components');
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
index 0518cd4748fdea2dec5daf77635c2de330e3bd2d..cf767df44e4c6129faff311e92485815c1278a45 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
@@ -1,11 +1,24 @@
-%% exercise 3.1.4
-cdir = fileparts(mfilename('fullpath')); 
-TMGOpts.stoplist = '../Data/stopWords.txt';
-TMGOpts.stemming = 1;
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
-X = full(A)';
-attributeNames = cellstr(D);
-
-%% Display the result
-display(attributeNames);
-display(X);
\ No newline at end of file
+%% exercise 2.1.4
+
+% Index of the principal components
+i = 1;
+j = 2;
+
+% Compute the projection onto the principal components
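+% (since Y = U*S*V' with orthonormal V, the projection Z = Y*V equals U*S)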
+Z = U*S;
+
+% Plot PCA of data
+mfig('NanoNose: PCA Projection'); clf; hold all; 
+C = length(classNames);
+colors = get(gca,'colororder');
+for c = 0:C-1
+    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
+                'MarkerFaceColor', colors(c+1,:), ...
+                'MarkerEdgeAlpha', 0, ...
+                'MarkerFaceAlpha', .5);
+end
+legend(classNames);
+axis tight
+xlabel(sprintf('PC %d', i));
+ylabel(sprintf('PC %d', j));
+title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
index ffa4d21982d415f00f3b6d9424dd8cec08486e77..2475302c15fab8118605b132c2c4efd285b512eb 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
@@ -1,21 +1,40 @@
-%% exercise 3.1.5
+%% exercise 2.1.5
+% We saw in 2.1.3 that the first 3 components explained more than 90
+% percent of the variance. Let's look at their coefficients:
+pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
+mfig('NanoNose: PCA Component Coefficients');
+h = bar(V(:,pcs));
+legendCell = cellstr(num2str(pcs', 'PC%-d'));
+legend(legendCell, 'location','best');
+xticklabels(attributeNames);
+grid
+xlabel('Attributes');
+ylabel('Component coefficients');
+title('NanoNose: PCA Component Coefficients');
 
-% Query vector
-q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+% Inspecting the plot, we see that the 2nd principal component has large
+% (in magnitude) coefficients for attributes A, E and H. We can confirm
+% this by looking at its numerical values directly, too:
+disp('PC2:')
+disp(V(:,2)') % notice the transpose for display in console 
 
-%% Method #1 (a for loop)
-N = size(X,1); % Get the number of data objects
-sim = nan(N,1); % Allocate a vector for the similarity
-for i = 1:N
-    x = X(i,:); % Get the i'th data object
-    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
-end
+% How does this translate to the actual data and its projections?
+% Looking at the data for water:
 
-%% Method #2 (one compact line of code)
-sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+% Projection of water class onto the 2nd principal component.
+all_water_data = Y(y==4,:);
+
+disp('First water observation:')
+disp(all_water_data(1,:))
+
+% Based on the coefficients and the attribute values for the observation
+% displayed, would you expect the projection onto PC2 to be positive or
+% negative - why? Consider *both* the magnitude and sign of *both* the
+% coefficient and the attribute!
+
+% You can determine the projection directly:
+disp('...and its projection onto PC2:')
+disp(all_water_data(1,:) * V(:,2))
+% Try to explain why.
 
-%% Method #3 (use the "similarity" function)
-sim = similarity(X, q, 'cos');
 
-%% Display the result
-display(sim);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
index f4115017e2054a948671c99dc72cc037b0309382..2f064b574ca36017f92dde65f45e22e78a017d1d 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
@@ -1,14 +1,28 @@
-% exercise 3.2.1
+%% exercise 2.2.1
 
-x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
+% Digit number to display
+i = 1; 
 
-mean_x = mean(x);
-std_x = std(x);
-median_x = median(x);
-range_x = range(x);
+% Load data
+cdir = fileparts(mfilename('fullpath')); 
+load(fullfile(cdir,'../Data/zipdata.mat'));
 
-%% Display results
-display(mean_x);
-display(std_x);
-display(median_x);
-display(range_x);
\ No newline at end of file
+% Extract digits data
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Visualize the i'th digit as a vector
+mfig('Digits: Data');
+subplot(4,1,4);
+imagesc(X(i,:));
+xlabel('Pixel number');
+title('Digit in vector format');
+set(gca, 'YTick', []);
+
+% Visualize the i'th digit as an image
+subplot(4,1,1:3);
+I = reshape(X(i,:), [16,16])';
+imagesc(I);
+colormap(1-gray);
+axis image off
+title('Digit as an image');
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
new file mode 100644
index 0000000000000000000000000000000000000000..0db965d7614fface2e85ca12af9d2e08cdaf0d1b
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
@@ -0,0 +1,87 @@
+%% exercise 2.2.2
+
+% Digits to include in analysis (to include all, n = 1:10);
+n = [0,1];
+
+% Number of principal components for reconstruction
+K = 22;
+
+% Digits to visualize
+nD = 1:5;
+
+%% Load data
+cdir = fileparts(mfilename('fullpath')); 
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits
+X = traindata(:,2:end);
+y = traindata(:,1);
+classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9'};
+classLabels = classNames(y+1);
+
+% Remove digits that are not to be inspected
+j = ismember(y, n);
+X = X(j,:);
+classLabels = classLabels(j);
+classNames = classNames(n+1);
+y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Compute the projection onto the principal components
+Z = U*S;
+
+% Compute variance explained
+rho = diag(S).^2./sum(diag(S).^2);
+
+%% Plot variance explained
+mfig('Digits: Var. explained'); clf;
+plot(rho, 'o-');
+title('Variance explained by principal components');
+xlabel('Principal component');
+ylabel('Variance explained value');
+
+%% Plot PCA of data
+mfig('Digits: PCA'); clf; hold all; 
+C = length(classNames);
+for c = 0:C-1
+    plot(Z(y==c,1), Z(y==c,2), 'o');
+end
+legend(classNames);
+xlabel('PC 1');
+ylabel('PC 2');
+title('PCA of digits data');
+
+%% Visualize the reconstructed data from the first K principal components
+mfig('Digits: Reconstruction'); clf;
+W = Z(:,1:K)*V(:,1:K)';
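+% W is the rank-K approximation of the centered data; the mean is added
+% back below before display.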
+D = length(nD);
+for d = 1:D
+    subplot(2,D,d);
+    I = reshape(X(nD(d),:), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Original');
+    subplot(2,D,d+D);
+    I = reshape(W(nD(d),:)+mean(X), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Reconstructed');
+end
+colormap(1-gray);
+
+%% Visualize the principal components
+mfig('Digits: Principal components'); clf;
+for k = 1:K
+    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
+    subplot(N2, N1, k);
+    I = reshape(V(:,k), [16,16])';
+    imagesc(I);
+    colormap(hot);
+    axis image off
+    title(sprintf('PC %d',k));
+end
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
index 009817333dff41cc9d112663e65d03703f5c59ae..2a8135ef9d9ff5647fd5be3bfcb9acfa7884be85 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
@@ -1,65 +1,44 @@
-% exercise 3.3.1
-
-% Image to use as query
-i = 1;
-
-% Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' 
-SimilarityMeasure = 'cos';
-
-%% Load the CBCL face database
+%% exercise 2.3.1
+%% Load data
 cdir = fileparts(mfilename('fullpath')); 
-load(fullfile(cdir,'../Data/digits.mat'));
-
-% You can try out the faces CBCL face database, too:
-%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
-
-transpose = true; % set to true if plotted images needs to be transposed
-
-[N,M] = size(X);
-imageDim = [sqrt(M),sqrt(M)];
-%% Search the face database for similar faces
-
-% Index of all other images than i
-noti = [1:i-1 i+1:N]; 
-% Compute similarity between image i and all others
-sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
-% Sort similarities
-[val, j] = sort(sim, 'descend');
-
-%% Plot query and result
-mfig('Faces: Query'); clf;
-subplot(3,5,1:5);
-
-img = reshape(X(i,:),imageDim);
-if transpose; img = img'; end;
-imagesc(img);
-
-axis image
-set(gca, 'XTick', [], 'YTick', []);
-ylabel(sprintf('Image #%d', i));
-title('Query image','FontWeight','bold');
-for k = 1:5
-    subplot(3,5,k+5)
-    ii = noti(j(k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.2f', val(k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Most similar images','FontWeight','bold'); end;
-end
-for k = 1:5
-    subplot(3,5,k+10)
-    ii = noti(j(end+1-k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.3f', val(end+1-k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Least similar images','FontWeight','bold'); end;
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits (training set)
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Extract digits (test set)
+Xtest = testdata(:,2:end);
+ytest = testdata(:,1);
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+Ytest = bsxfun(@minus, Xtest, mean(X));
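+% Note that the test set is centered using the training-set mean, so the
+% PCA basis V estimated on the training data applies consistently to both.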
+
+% Obtain the PCA solution by calculating the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Number of principal components to use, i.e. the reduced dimensionality
+Krange = [8,10,15,20,30,40,50,60,100,150];
+errorRate = zeros(length(Krange),1);
+for i = 1:length(Krange)
+    K=Krange(i);
+    % Compute the projection onto the principal components
+    Z = Y*V(:,1:K);
+    Ztest = Ytest*V(:,1:K);
+
+    % Classify digits using a K-nearest neighbour classifier
+    model=fitcknn(Z,y,'NumNeighbors',1);
+    yest = predict(model,Ztest);
+    
+    errorRate(i) = nnz(ytest~=yest)/length(ytest);
+
+    % Display results
+    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
 end
-colormap(gray);
+
+%% Visualize error rates vs. number of principal components considered
+mfig('Digits: Error rate vs. principal components'); clf;
+plot(Krange, errorRate*100, 'o-');
+xlabel('Number of principal components K')
+ylabel('Error rate [%]')
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
deleted file mode 100644
index 52eb65d331a1a39824c0313a1f1db12600f9adcc..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
+++ /dev/null
@@ -1,18 +0,0 @@
-% exercise 3.3.2
-
-% Generate two data objects with M random attributes
-M = 5;
-x = rand(1,M);
-y = rand(1,M);
-
-% Two constants
-a = 1.5;
-b = 1.5;
-
-% Check the statements in the exercise
-similarity(x,y,'cos') - similarity(a*x,y,'cos')
-similarity(x,y,'ext') - similarity(a*x,y,'ext')
-similarity(x,y,'cor') - similarity(a*x,y,'cor')
-similarity(x,y,'cos') - similarity(b+x,y,'cos')
-similarity(x,y,'ext') - similarity(b+x,y,'ext')
-similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
deleted file mode 100644
index 9a68ff4cf91cea0a1d5abf841a627e46c4751eae..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
+++ /dev/null
@@ -1,31 +0,0 @@
-% exercise 4.2.1
-
-% Disable xlsread warning
-warning('off', 'MATLAB:xlsread:ActiveX'); 
-warning('off', 'MATLAB:xlsread:Mode'); 
-
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath')); 
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
-
-% Extract the rows and columns corresponding to the data
-if isnan(NUMERIC(1,1))
-	X = NUMERIC(2:end,:);
-else
-	X = NUMERIC;
-end
-
-% Extract attribute names from the first row
-attributeNames = RAW(1,1:4)';
-
-% Extract unique class names from the last column
-classLabels = RAW(2:end,5)';
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y'-1;
-
-% Get the number of data objects, attributes, and classes
-[N, M] = size(X);
-C = length(classNames);
-
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..60a39cf53d0415f6573b00f633de451559a70dff
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
@@ -0,0 +1,54 @@
+# exercise 3.1.2
+import importlib_resources
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+# Load the textDocs.txt as a long string into raw_file:
+with open(filename, "r") as f:
+    raw_file = f.read()
+# raw_file contains sentences separated by newline characters,
+# so we split by '\n':
+corpus = raw_file.split("\n")
+# corpus is now a list of "documents" (sentences), but some of them are empty
+# because textDocs.txt has a lot of empty lines, so we filter/remove them:
+corpus = list(filter(None, corpus))
+
+# Display the result
+print("Document-term matrix analysis")
+print()
+print("Corpus (5 documents/sentences):")
+print(np.asmatrix(corpus))
+print()
+
+
+# To automatically obtain the bag of words representation, we use sklearn's
+# feature_extraction.text module, which has a function CountVectorizer.
+# We make a CountVectorizer:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b")
+# The token pattern is a regular expression (marked by the r), which ensures
+# that the vectorizer ignores digit/non-word tokens - in this case, it ensures
+# the 10 in the last document is not recognized as a token. It's not important
+# that you understand the regexp itself.
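+# For example, on the string "solve the matrix 10 times" this pattern keeps
+# the tokens 'solve', 'the', 'matrix' and 'times', and drops the digit token '10'.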
+
+# The object vectorizer can now be used to first 'fit' the vectorizer to the
+# corpus, and then subsequently transform the data. We start by fitting:
+vectorizer.fit(corpus)
+# The vectorizer has now determined the unique terms (or tokens) in the corpus
+# and we can extract them using:
+attributeNames = vectorizer.get_feature_names_out()
+print("Found terms:")
+print(attributeNames)
+print()
+
+# The next step is to count how many times each term is found in each document,
+# which we do using the transform function:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3ac96c2ad678edc8786ea59973292ad3b80a9e1
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
@@ -0,0 +1,40 @@
+# exercise 3.1.3
+import importlib_resources
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+# When making the CountVectorizer, we now input the stop words:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords)
+# Determine the terms in the corpus
+vectorizer.fit(corpus)
+# ... and count the frequency of each term within a document:
+X = vectorizer.transform(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+N, M = X.shape
+
+# Display the result
+print("Document-term matrix analysis (using stop words)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..e33ee98041ad4eddb1533e2553763c881c2e5e73
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
@@ -0,0 +1,66 @@
+# exercise 3.1.4
+import importlib_resources
+
+# We'll use a widely used stemmer, based on:
+# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
+# The stemmer is implemented in the most used natural language processing
+# package in Python, the "Natural Language Toolkit" (NLTK):
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+
+# To enable stemming when using the sklearn-module, we need to pass an
+# "analyzer" to the vectorizer we've been using.
+# First, we make an object based on the PorterStemmer class, and we also make
+# an analyzer object:
+stemmer = PorterStemmer()
+analyzer = CountVectorizer(
+    token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
+).build_analyzer()
+
+
+# Using these we'll make a function that can stem words:
+def stemmed_words(doc):
+    return (stemmer.stem(w) for w in analyzer(doc))
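+# (for example, the Porter stemmer maps both 'solving' and 'solve'
+# to the common stem 'solv')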
+
+
+# ... and finally, we make a vectorizer just like we've done before:
+vectorizer = CountVectorizer(analyzer=stemmed_words)
+
+# Determine the terms:
+vectorizer.fit(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+
+# ... and count the occurrences:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+X = X.toarray()
+
+# Display the result
+print("Document-term matrix analysis (using stop words and stemming)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words, stemmed):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X)
+print()
+print("Ran Exercise 3.1.4")
+print()
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb3b3b3e15930768ffd46963e05ae16d66268455
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
@@ -0,0 +1,38 @@
+# exercise 3.1.5
+import numpy as np
+import scipy.linalg as linalg
+from ex1_6_4 import *
+
+from dtuimldmtools import similarity
+
+# Query vector
+q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
+# notice, that you could get the query vector using the vectorizer, too:
+# q = vectorizer.transform(['matrix rank solv'])
+# q = np.asarray(q.toarray())
+# or use any other string:
+# q = vectorizer.transform(['Can I Google how to fix my problem?'])
+# q = np.asarray(q.toarray())
+
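+# Cosine similarity of two vectors a and b: cos(a,b) = (a . b)/(||a|| ||b||)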
+# Method 1 ('for' loop - slow)
+# get the number of data objects
+N = np.shape(X)[0]
+sim = np.zeros((N, 1))  # allocate a vector for the similarity
+for i in range(N):
+    x = X[i, :]  # Get the i'th data object (here: document)
+    sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x)  # Compute cosine similarity
+
+# Method 2 (one line of code with no iterations - faster)
+sim = (q @ X.T).T / (
+    np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum())
+)
+
+# Method 3 (use the "similarity" function)
+sim = similarity(X, q, "cos")
+
+
+# Display the result
+print("Query vector:\n {0}\n".format(q))
+print("Similarity results:\n {0}".format(sim))
+
+print("Ran Exercise 3.1.5")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
index b210d13bfaca05d29b799ca0f1cc103802dee09e..dbc1dc0af5914508cd15466c199cdb24c2fd3bc2 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
@@ -1,32 +1,19 @@
-# exercise 2.1.1
-import importlib_resources
+# exercise 3.2.1
 import numpy as np
-import xlrd
 
-# Load xls sheet with data
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls")
-doc = xlrd.open_workbook(filename).sheet_by_index(0)
+x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47])
 
-# Extract attribute names (1st row, column 4 to 12)
-attributeNames = doc.row_values(0, 3, 11)
+# Compute values
+mean_x = x.mean()
+std_x = x.std(ddof=1)
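+# (ddof=1 gives the sample standard deviation, i.e. division by N-1)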
+median_x = np.median(x)
+range_x = x.max() - x.min()
 
-# Extract class names to python list,
-# then encode with integers (dict)
-classLabels = doc.col_values(0, 2, 92)
-classNames = sorted(set(classLabels))
-classDict = dict(zip(classNames, range(5)))
+# Display results
+print("Vector:", x)
+print("Mean:", mean_x)
+print("Standard Deviation:", std_x)
+print("Median:", median_x)
+print("Range:", range_x)
 
-# Extract vector y, convert to NumPy array
-y = np.asarray([classDict[value] for value in classLabels])
-
-# Preallocate memory, then extract excel data to matrix X
-X = np.empty((90, 8))
-for i, col_id in enumerate(range(3, 11)):
-    X[:, i] = np.asarray(doc.col_values(col_id, 2, 92))
-
-# Compute values of N, M and C.
-N = len(y)
-M = len(attributeNames)
-C = len(classNames)
-
-print("Ran Exercise 2.1.1")
+print("Ran Exercise 3.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
deleted file mode 100644
index 98847f60502ada91cb0b8289234d68daeb6959ba..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# exercise 2.1.2
-
-# Imports the numpy and xlrd package, then runs the ex2_1_1 code
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-
-# (requires data structures from ex. 2.1.1)
-
-
-# Data attributes to be plotted
-i = 0
-j = 1
-
-##
-# Make a simple plot of the i'th attribute against the j'th attribute
-# Notice that X is of matrix type (but it will also work with a numpy array)
-# X = np.array(X) #Try to uncomment this line
-plot(X[:, i], X[:, j], "o")
-
-# %%
-# Make another more fancy plot that includes legend, class labels,
-# attribute names, and a title.
-f = figure()
-title("NanoNose data")
-
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3)
-
-legend(classNames)
-xlabel(attributeNames[i])
-ylabel(attributeNames[j])
-
-# Output result to screen
-show()
-print("Ran Exercise 2.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
deleted file mode 100644
index c977a4c69b0e1a4cc311470122cbdffe59a72ae6..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# exercise 2.1.3
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(axis=0)
-
-# PCA by computing SVD of Y
-U, S, V = svd(Y, full_matrices=False)
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-threshold = 0.9
-
-# Plot variance explained
-plt.figure()
-plt.plot(range(1, len(rho) + 1), rho, "x-")
-plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-")
-plt.plot([1, len(rho)], [threshold, threshold], "k--")
-plt.title("Variance explained by principal components")
-plt.xlabel("Principal component")
-plt.ylabel("Variance explained")
-plt.legend(["Individual", "Cumulative", "Threshold"])
-plt.grid()
-plt.show()
-
-print("Ran Exercise 2.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
deleted file mode 100644
index 46788689ab957d3409370102720978e17d89a453..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# exercise 2.1.4
-# (requires data structures from ex. 2.2.1 and 2.2.3)
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, Vh = svd(Y, full_matrices=False)
-# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
-# of the vector V. So, for us to obtain the correct V, we transpose:
-V = Vh.T
-
-# Project the centered data onto principal component space
-Z = Y @ V
-
-# Indices of the principal components to be plotted
-i = 0
-j = 1
-
-# Plot PCA of the data
-f = figure()
-title("NanoNose data: PCA")
-# Z = array(Z)
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5)
-legend(classNames)
-xlabel("PC{0}".format(i + 1))
-ylabel("PC{0}".format(j + 1))
-
-# Output result to screen
-show()
-
-print("Ran Exercise 2.1.4")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
deleted file mode 100644
index 364a20146ce59e5f312d1d59025b73dbc1a91d6f..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# exercise 2.2.4
-
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-Y = X - np.ones((N, 1)) * X.mean(0)
-U, S, Vh = svd(Y, full_matrices=False)
-V = Vh.T
-N, M = X.shape
-
-# We saw in 2.1.3 that the first 3 components explaiend more than 90
-# percent of the variance. Let's look at their coefficients:
-pcs = [0, 1, 2]
-legendStrs = ["PC" + str(e + 1) for e in pcs]
-c = ["r", "g", "b"]
-bw = 0.2
-r = np.arange(1, M + 1)
-for i in pcs:
-    plt.bar(r + i * bw, V[:, i], width=bw)
-plt.xticks(r + bw, attributeNames)
-plt.xlabel("Attributes")
-plt.ylabel("Component coefficients")
-plt.legend(legendStrs)
-plt.grid()
-plt.title("NanoNose: PCA Component Coefficients")
-plt.show()
-
-# Inspecting the plot, we see that the 2nd principal component has large
-# (in magnitude) coefficients for attributes A, E and H. We can confirm
-# this by looking at it's numerical values directly, too:
-print("PC2:")
-print(V[:, 1].T)
-
-# How does this translate to the actual data and its projections?
-# Looking at the data for water:
-
-# Projection of water class onto the 2nd principal component.
-all_water_data = Y[y == 4, :]
-
-print("First water observation")
-print(all_water_data[0, :])
-
-# Based on the coefficients and the attribute values for the observation
-# displayed, would you expect the projection onto PC2 to be positive or
-# negative - why? Consider *both* the magnitude and sign of *both* the
-# coefficient and the attribute!
-
-# You can determine the projection by (remove comments):
-print("...and its projection onto PC2")
-print(all_water_data[0, :] @ V[:, 1])
-# Try to explain why?
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
index 798bb5b46dba276b98ff5c3f0da86e13f66bfe15..f5cfc462ae073133074a692c8080b0706d084ee5 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
@@ -1,36 +1,77 @@
+# exercise 3.3.1
+
 import importlib_resources
+import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks
 from scipy.io import loadmat
 
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Index of the digit to display
-i = 0
+from dtuimldmtools import similarity
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat")
+# Image to use as query
+i = 1
 
+# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+similarity_measure = "SMC"
+
+# Load the digits
 # Load Matlab data file to python dict structure
-mat_data = loadmat(filename)
+X = loadmat(filename)["X"]
+# You can also try the CBCL faces dataset (remember to change 'transpose')
+#X = loadmat('../Data/wildfaces_grayscale.mat')['X']
+N, M = X.shape
+transpose = False # should the plotted images be transposed? 
+
+
+# Search the face database for similar faces 
+# Index of all other images than i
+noti = list(range(0,i)) + list(range(i+1,N)) 
+# Compute similarity between image i and all others
+sim = similarity(X[i,:], X[noti,:], similarity_measure)
+sim = sim.tolist()[0]
+# Tuples of sorted similarities and their indices
+sim_to_index = sorted(zip(sim,noti))
+
+
+# Visualize query image and 5 most/least similar images
+plt.figure(figsize=(12,8))
+plt.subplot(3,1,1)
 
-# Extract variables of interest
-testdata = mat_data["testdata"]
-traindata = mat_data["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
+img_hw = int(np.sqrt(len(X[0])))
+img = np.reshape(X[i], (img_hw,img_hw))
+if transpose: img = img.T
+plt.imshow(img, cmap=plt.cm.gray)
+plt.xticks([]); plt.yticks([])
+plt.title('Query image')
+plt.ylabel('image #{0}'.format(i))
 
 
-# Visualize the i'th digit as a vector
-f = figure()
-subplot(4, 1, 4)
-imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r)
-xlabel("Pixel number")
-title("Digit in vector format")
-yticks([])
+for ms in range(5):
 
-# Visualize the i'th digit as an image
-subplot(2, 1, 1)
-I = np.reshape(X[i, :], (16, 16))
-imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r)
-title("Digit as an image")
+    # 5 most similar images found
+    plt.subplot(3,5,6+ms)
+    im_id = sim_to_index[-ms-1][1]
+    im_sim = sim_to_index[-ms-1][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Most similar images')
 
-show()
+    # 5 least similar images found
+    plt.subplot(3,5,11+ms)
+    im_id = sim_to_index[ms][1]
+    im_sim = sim_to_index[ms][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Least similar images')
+    
+plt.show()
 
-print("Ran Exercise 2.2.1")
+print('Ran Exercise 3.3.1')
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
index 3d6104b2aae59b5bddd2d2afa9157d3d2e181485..e7abec8d3c302ff9414b0afb8925596c1dc69d3f 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
@@ -1,117 +1,42 @@
-# exercise 2.2.2
-import importlib_resources
-import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import (
-    cm,
-    figure,
-    imshow,
-    legend,
-    plot,
-    show,
-    subplot,
-    title,
-    xlabel,
-    ylabel,
-    yticks,
-)
-from scipy.io import loadmat
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-
-# Digits to include in analysis (to include all, n = range(10) )
-n = [0, 1]
-# Number of principal components for reconstruction
-K = 16
-# Digits to visualize
-nD = range(6)
-
-
-# Load Matlab data file to python dict structure
-# and extract variables of interest
-traindata = loadmat(filename)["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
-
-N, M = X.shape
-C = len(n)
-
-classValues = n
-classNames = [str(num) for num in n]
-classDict = dict(zip(classNames, classValues))
-
-
-# Select subset of digits classes to be inspected
-class_mask = np.zeros(N).astype(bool)
-for v in n:
-    cmsk = y == v
-    class_mask = class_mask | cmsk
-X = X[class_mask, :]
-y = y[class_mask]
-N = X.shape[0]
-
-# Center the data (subtract mean column values)
-Xc = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, V = linalg.svd(Xc, full_matrices=False)
-# U = mat(U)
-V = V.T
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-# Project data onto principal component space
-Z = Xc @ V
-
-# Plot variance explained
-figure()
-plot(rho, "o-")
-title("Variance explained by principal components")
-xlabel("Principal component")
-ylabel("Variance explained value")
-
-
-# Plot PCA of the data
-f = figure()
-title("pixel vectors of handwr. digits projected on PCs")
-for c in n:
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, 0], Z[class_mask, 1], "o")
-legend(classNames)
-xlabel("PC1")
-ylabel("PC2")
+# exercise 3.2.2
 
+import numpy as np
 
-# Visualize the reconstructed data from the first K principal components
-# Select randomly D digits.
-figure(figsize=(10, 3))
-W = Z[:, range(K)] @ V[:, range(K)].T
-D = len(nD)
-for d in range(D):
-    digit_ix = np.random.randint(0, N)
-    subplot(2, D, int(d + 1))
-    I = np.reshape(X[digit_ix, :], (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Original")
-    subplot(2, D, D + d + 1)
-    I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Reconstr.")
+from dtuimldmtools import similarity
 
+# Generate two data objects with M random attributes
+M = 5
+x = np.random.rand(1, M)
+y = np.random.rand(1, M)
 
-# Visualize the pricipal components
-figure(figsize=(8, 6))
-for k in range(K):
-    N1 = int(np.ceil(np.sqrt(K)))
-    N2 = int(np.ceil(K / N1))
-    subplot(N2, N1, int(k + 1))
-    I = np.reshape(V[:, k], (16, 16))
-    imshow(I, cmap=cm.hot)
-    title("PC{0}".format(k + 1))
+# Two constants
+a = 1.5
+b = 1.5
 
-# output to screen
-show()
+# Check the statements in the exercise
+print(
+    "Cosine scaling: %.4f "
+    % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard scaling: %.4f "
+    % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation scaling: %.4f "
+    % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
+)
+print(
+    "Cosine translation: %.4f "
+    % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard translation: %.4f "
+    % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation translation: %.4f "
+    % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
+)
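+
+# A difference of zero means the measure is unaffected by the transformation.
+# From the definitions, cosine is invariant to (positive) scaling, and
+# correlation is invariant to both scaling and translation; the extended
+# Jaccard measure is invariant to neither.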
 
-print("Ran Exercise 2.2.2")
+print("Ran Exercise 3.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
index 9f665b581d0c86e7f5ba07bf5e943e04ff0a273e..e6b46e8ca7e49caa7258fa553737a598d2dda417 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
@@ -1,58 +1,33 @@
-# exercise 2.3.1
+# exercise 4.2.1
 
 import importlib_resources
 import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import figure, plot, show, xlabel, ylabel
-from scipy.io import loadmat
-from sklearn.neighbors import KNeighborsClassifier
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Number of principal components to use for classification,
-# i.e. the reduced dimensionality
-K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150]
-
-# Load Matlab data file and extract training set and test set
-mat_data = loadmat(filename)
-X = mat_data["traindata"][:, 1:]
-y = mat_data["traindata"][:, 0]
-Xtest = mat_data["testdata"][:, 1:]
-ytest = mat_data["testdata"][:, 0]
-N, M = X.shape
-Ntest = Xtest.shape[0]  # or Xtest[:,0].shape
-
-# Subtract the mean from the data
-Y = X - np.ones((N, 1)) * X.mean(0)
-Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)
-
-# Obtain the PCA solution  by calculate the SVD of Y
-U, S, V = linalg.svd(Y, full_matrices=False)
-V = V.T
-
-
-# Repeat classification for different values of K
-error_rates = []
-for k in K:
-    # Project data onto principal component space,
-    Z = Y @ V[:, :k]
-    Ztest = Ytest @ V[:, :k]
-
-    # Classify data with knn classifier
-    knn_classifier = KNeighborsClassifier(n_neighbors=1)
-    knn_classifier.fit(Z, y.ravel())
-    y_estimated = knn_classifier.predict(Ztest)
-
-    # Compute classification error rates
-    y_estimated = y_estimated.T
-    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
-    error_rates.append(er)
-    print("K={0}: Error rate: {1:.1f}%".format(k, er))
-
-# Visualize error rates vs. number of principal components considered
-figure()
-plot(K, error_rates, "o-")
-xlabel("Number of principal components K")
-ylabel("Error rate [%]")
-show()
-
-print("Ran Exercise 2.3.1")
+import xlrd
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")
+# Load xls sheet with data
+doc = xlrd.open_workbook(filename).sheet_by_index(0)
+
+# Extract attribute names
+attributeNames = doc.row_values(0, 0, 4)
+
+# Extract class names to python list,
+# then encode with integers (dict)
+classLabels = doc.col_values(4, 1, 151)
+classNames = sorted(set(classLabels))
+classDict = dict(zip(classNames, range(len(classNames))))
+
+# Extract the class vector y as a NumPy array of integer class indices
+y = np.array([classDict[value] for value in classLabels])
+
+# Preallocate memory, then extract data to matrix X
+X = np.empty((150, 4))
+for i in range(4):
+    X[:, i] = np.array(doc.col_values(i, 1, 151)).T
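+# (xlrd's col_values(colx, start_rowx, end_rowx) is end-exclusive, so this
+# reads rows 1-150 and skips the header row.)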
+
+# Compute values of N, M and C.
+N = len(y)
+M = len(attributeNames)
+C = len(classNames)
+
+print("Ran Exercise 4.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
similarity index 94%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_2.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
index 4bef1c35cdf2df3450ee468be245671bfa4d2d72..40669020f57a582ca3cab76825569f7098c76b77 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import figure, hist, show, subplot, xlabel, ylim
 
 figure(figsize=(8, 7))
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
similarity index 92%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_3.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
index a6420832b94689016a5d776b2de688666e5e04e9..d19cace3629050ab15f7e48117bf0321c8b15b9a 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.3
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import boxplot, show, title, xticks, ylabel
 
 boxplot(X)
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
similarity index 97%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_4.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
index 4c394b19c505b2efd72030c49035f6dec7e4073a..d45db38b94e46011b94ad350dc76d17ce56c18c6 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py
@@ -1,6 +1,6 @@
 # Exercise 4.2.4
 # requires data from exercise 4.1.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import boxplot, figure, show, subplot, title, xticks, ylim
 
 figure(figsize=(14, 7))
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
similarity index 97%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_5.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
index 67061cbd96c896a6df5018d3288a1e271a15b09a..eaac9413f25c6611a22d0ab64691595a423e036b 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.5
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import (
     figure,
     legend,
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
similarity index 96%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_6.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
index 1f625d2d23ad211e32fb255e4f81f3ff246db9d1..f881cc83908231ee83c703d19405afd7a087b7c2 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.6
 
 # requires data from exercise 4.1.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import figure, show
 from mpl_toolkits.mplot3d import Axes3D
 
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
similarity index 95%
rename from exercises/02450Toolbox_Python/Scripts/ex4_2_7.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
index e34326e5e6d35f172de5329f12881cc9322dd1b1..08c9dbfb9114f731624b39a2bb99534a99c9adf5 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py
@@ -1,7 +1,7 @@
 # Exercise 4.2.7
 
 # requires data from exercise 4.2.1
-from ex4_2_1 import *
+from ex2_3_1 import *
 from matplotlib.pyplot import (
     cm,
     colorbar,
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
similarity index 100%
rename from exercises/02450Toolbox_Python/Scripts/ex4_3_1.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_4_1.py
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_2.py
similarity index 100%
rename from exercises/02450Toolbox_Python/Scripts/ex4_3_2.py
rename to exercises/02450Toolbox_Python/Scripts/ex2_4_2.py
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..b210d13bfaca05d29b799ca0f1cc103802dee09e
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py
@@ -0,0 +1,32 @@
+# exercise 2.1.1
+import importlib_resources
+import numpy as np
+import xlrd
+
+# Load xls sheet with data
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls")
+doc = xlrd.open_workbook(filename).sheet_by_index(0)
+
+# Extract attribute names (1st row, columns 4 to 11)
+attributeNames = doc.row_values(0, 3, 11)
+
+# Extract class names to python list,
+# then encode with integers (dict)
+classLabels = doc.col_values(0, 2, 92)
+classNames = sorted(set(classLabels))
+classDict = dict(zip(classNames, range(5)))
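+# classDict now maps each of the 5 class names to an integer 0-4, e.g.
+# {"Water": 0, ...} (the actual names depend on the sheet contents).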
+
+# Extract vector y, convert to NumPy array
+y = np.asarray([classDict[value] for value in classLabels])
+
+# Preallocate memory, then extract excel data to matrix X
+X = np.empty((90, 8))
+for i, col_id in enumerate(range(3, 11)):
+    X[:, i] = np.asarray(doc.col_values(col_id, 2, 92))
+
+# Compute values of N, M and C.
+N = len(y)
+M = len(attributeNames)
+C = len(classNames)
+
+print("Ran Exercise 2.1.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
index 60a39cf53d0415f6573b00f633de451559a70dff..6af386ac3591e7d180367857a81bc7e02770e29e 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py
@@ -1,54 +1,37 @@
-# exercise 3.1.4
-import importlib_resources
-import numpy as np
-from sklearn.feature_extraction.text import CountVectorizer
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-# Load the textDocs.txt as a long string into raw_file:
-with open(filename, "r") as f:
-    raw_file = f.read()
-# raw_file contains sentences seperated by newline characters,
-# so we split by '\n':
-corpus = raw_file.split("\n")
-# corpus is now list of "documents" (sentences), but some of them are empty,
-# because textDocs.txt has a lot of empty lines, we filter/remove them:
-corpus = list(filter(None, corpus))
-
-# Display the result
-print("Document-term matrix analysis")
-print()
-print("Corpus (5 documents/sentences):")
-print(np.asmatrix(corpus))
-print()
-
-
-# To automatically obtain the bag of words representation, we use sklearn's
-# feature_extraction.text module, which has a function CountVectorizer.
-# We make a CounterVectorizer:
-vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b")
-# The token pattern is a regular expression (marked by the r), which ensures
-# that the vectorizer ignores digit/non-word tokens - in this case, it ensures
-# the 10 in the last document is not recognized as a token. It's not important
-# that you should understand it the regexp.
-
-# The object vectorizer can now be used to first 'fit' the vectorizer to the
-# corpus, and the subsequently transform the data. We start by fitting:
-vectorizer.fit(corpus)
-# The vectorizer has now determined the unique terms (or tokens) in the corpus
-# and we can extract them using:
-attributeNames = vectorizer.get_feature_names_out()
-print("Found terms:")
-print(attributeNames)
-print()
-
-# The next step is to count how many times each term is found in each document,
-# which we do using the transform function:
-X = vectorizer.transform(corpus)
-N, M = X.shape
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Document-term matrix:")
-print(X.toarray())
-print()
-print("Ran Exercise 3.1.2")
+# exercise 2.1.2
+
+# Runs the ex3_1_1.py script (exercise 2.1.1), which also imports numpy and xlrd
+from ex3_1_1 import *
+from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
+
+# (requires data structures from ex. 2.1.1)
+
+
+# Data attributes to be plotted
+i = 0
+j = 1
+
+# %%
+# Make a simple plot of the i'th attribute against the j'th attribute
+# Notice that X is already a NumPy array here (ex3_1_1 builds it with
+# np.empty), so the conversion below is a no-op; it would only matter
+# if X were of matrix type.
+# X = np.array(X)  # Try to uncomment this line
+plot(X[:, i], X[:, j], "o")
+
+# %%
+# Make another more fancy plot that includes legend, class labels,
+# attribute names, and a title.
+f = figure()
+title("NanoNose data")
+
+for c in range(C):
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3)
+
+legend(classNames)
+xlabel(attributeNames[i])
+ylabel(attributeNames[j])
+
+# Output result to screen
+show()
+print("Ran Exercise 2.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
index c3ac96c2ad678edc8786ea59973292ad3b80a9e1..75b9c6b3817910a5ed34195e17c77655fab3595c 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py
@@ -1,40 +1,30 @@
-# exercise 3.1.4
-import importlib_resources
-from sklearn.feature_extraction.text import CountVectorizer
+# exercise 2.1.3
+# (requires data structures from ex. 2.1.1)
+import matplotlib.pyplot as plt
+from ex3_1_1 import *
+from scipy.linalg import svd
 
-filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+# Subtract mean value from data
+Y = X - np.ones((N, 1)) * X.mean(axis=0)
 
-# As before, load the corpus and preprocess:
-with open(filename_docs, "r") as f:
-    raw_file = f.read()
-corpus = raw_file.split("\n")
-corpus = list(filter(None, corpus))
+# PCA by computing SVD of Y
+U, S, V = svd(Y, full_matrices=False)
 
-# Load and process the stop words in a similar manner:
-with open(filename_stop, "r") as f:
-    raw_file = f.read()
-stopwords = raw_file.split("\n")
+# Compute variance explained by principal components
+rho = (S * S) / (S * S).sum()
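+# Squared singular values are proportional to the variance along each PC,
+# so rho sums to one and np.cumsum(rho) gives the cumulative curve below.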
 
-# When making the CountVectorizer, we now input the stop words:
-vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords)
-# Determine the terms in the corpus
-vectorizer.fit(corpus)
-# ... and count the frequency of each term within a document:
-X = vectorizer.transform(corpus)
-attributeNames = vectorizer.get_feature_names_out()
-N, M = X.shape
+threshold = 0.9
 
-# Display the result
-print("Document-term matrix analysis (using stop words)")
-print()
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Found terms (no stop words):")
-print(attributeNames)
-print()
-print("Document-term matrix:")
-print(X.toarray())
-print()
-print("Ran Exercise 3.1.3")
+# Plot variance explained
+plt.figure()
+plt.plot(range(1, len(rho) + 1), rho, "x-")
+plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-")
+plt.plot([1, len(rho)], [threshold, threshold], "k--")
+plt.title("Variance explained by principal components")
+plt.xlabel("Principal component")
+plt.ylabel("Variance explained")
+plt.legend(["Individual", "Cumulative", "Threshold"])
+plt.grid()
+plt.show()
+
+print("Ran Exercise 2.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
index e33ee98041ad4eddb1533e2553763c881c2e5e73..87f40a76e9e346233d92b48cfc02604b34d4b8d6 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py
@@ -1,66 +1,38 @@
-# exercise 3.1.4
-import importlib_resources
-
-# We'll use a widely used stemmer based:
-# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
-# The stemmer is implemented in the most used natural language processing
-# package in Python, "Natural Langauge Toolkit" (NLTK):
-from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import CountVectorizer
-
-filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
-filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
-
-# As before, load the corpus and preprocess:
-with open(filename_docs, "r") as f:
-    raw_file = f.read()
-corpus = raw_file.split("\n")
-corpus = list(filter(None, corpus))
-
-# Load and process the stop words in a similar manner:
-with open(filename_stop, "r") as f:
-    raw_file = f.read()
-stopwords = raw_file.split("\n")
-
-
-# To enable stemming when using the sklearn-module, we need to parse an
-# "analyzer" to the vectorizer we've been using.
-# First, we make an object based on the PorterStemmer class, and we also make
-# an analyzer object:
-stemmer = PorterStemmer()
-analyzer = CountVectorizer(
-    token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
-).build_analyzer()
-
-
-# Using these we'll make a function that can stem words:
-def stemmed_words(doc):
-    return (stemmer.stem(w) for w in analyzer(doc))
-
-
-# ... and finally, we make a vectorizer just like we've done before:
-vectorizer = CountVectorizer(analyzer=stemmed_words)
-
-# Determine the terms:
-vectorizer.fit(corpus)
-attributeNames = vectorizer.get_feature_names_out()
-
-# ... and count the occurences:
-X = vectorizer.transform(corpus)
-N, M = X.shape
-X = X.toarray()
-
-# Display the result
-print("Document-term matrix analysis (using stop words and stemming)")
-print()
-print("Number of documents (data objects, N):\t %i" % N)
-print("Number of terms (attributes, M):\t %i" % M)
-print()
-print("Found terms (no stop words, stemmed):")
-print(attributeNames)
-print()
-print("Document-term matrix:")
-print(X)
-print()
-print("Ran Exercise 3.1.4")
-print()
+# exercise 2.1.4
+# (requires data structures from ex. 2.1.1 and 2.1.3)
+from ex3_1_1 import *
+from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
+from scipy.linalg import svd
+
+# Subtract mean value from data
+Y = X - np.ones((N, 1)) * X.mean(0)
+
+# PCA by computing SVD of Y
+U, S, Vh = svd(Y, full_matrices=False)
+# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
+# of the matrix V. So, to obtain the correct V, we transpose:
+V = Vh.T
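+# (Optional sanity check: the columns of V are orthonormal, so
+# np.allclose(V.T @ V, np.eye(V.shape[1])) should evaluate to True.)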
+
+# Project the centered data onto principal component space
+Z = Y @ V
+
+# Indices of the principal components to be plotted
+i = 0
+j = 1
+
+# Plot PCA of the data
+f = figure()
+title("NanoNose data: PCA")
+# Z = array(Z)
+for c in range(C):
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5)
+legend(classNames)
+xlabel("PC{0}".format(i + 1))
+ylabel("PC{0}".format(j + 1))
+
+# Output result to screen
+show()
+
+print("Ran Exercise 2.1.4")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
index d28d50af788bc9a8e83f8db8e2461ce49ba0e7a8..b17766da22a5f049eef3be9a59ac2f91f91c995b 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py
@@ -1,38 +1,53 @@
-# exercise 3.1.5
-import numpy as np
-import scipy.linalg as linalg
-from ex3_1_4 import *
-
-from dtuimldmtools import similarity
-
-# Query vector
-q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
-# notice, that you could get the query vector using the vectorizer, too:
-# q = vectorizer.transform(['matrix rank solv'])
-# q = np.asarray(q.toarray())
-# or use any other string:
-# q = vectorizer.transform(['Can I Google how to fix my problem?'])
-# q = np.asarray(q.toarray())
-
-# Method 1 ('for' loop - slow)
-N = np.shape(X)[0]
-# get the number of data objects
-sim = np.zeros((N, 1))  # allocate a vector for the similarity
-for i in range(N):
-    x = X[i, :]  # Get the i'th data object (here: document)
-    sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x)  # Compute cosine similarity
-
-# Method 2 (one line of code with no iterations - faster)
-sim = (q @ X.T).T / (
-    np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum())
-)
-
-# Method 3 (use the "similarity" function)
-sim = similarity(X, q, "cos")
-
-
-# Display the result
-print("Query vector:\n {0}\n".format(q))
-print("Similarity results:\n {0}".format(sim))
-
-print("Ran Exercise 3.1.5")
+# exercise 2.1.5
+
+# (requires data structures from ex. 2.1.1)
+import matplotlib.pyplot as plt
+from ex3_1_1 import *
+from scipy.linalg import svd
+
+Y = X - np.ones((N, 1)) * X.mean(0)
+U, S, Vh = svd(Y, full_matrices=False)
+V = Vh.T
+N, M = X.shape
+
+# We saw in 2.1.3 that the first 3 components explained more than 90
+# percent of the variance. Let's look at their coefficients:
+pcs = [0, 1, 2]
+legendStrs = ["PC" + str(e + 1) for e in pcs]
+c = ["r", "g", "b"]
+bw = 0.2
+r = np.arange(1, M + 1)
+for i in pcs:
+    plt.bar(r + i * bw, V[:, i], width=bw)
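+# Each PC gets its own set of bars, offset horizontally by bw, so the three
+# coefficient profiles can be compared attribute by attribute.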
+plt.xticks(r + bw, attributeNames)
+plt.xlabel("Attributes")
+plt.ylabel("Component coefficients")
+plt.legend(legendStrs)
+plt.grid()
+plt.title("NanoNose: PCA Component Coefficients")
+plt.show()
+
+# Inspecting the plot, we see that the 2nd principal component has large
+# (in magnitude) coefficients for attributes A, E and H. We can confirm
+# this by looking at its numerical values directly, too:
+print("PC2:")
+print(V[:, 1].T)
+
+# How does this translate to the actual data and its projections?
+# Looking at the data for water:
+
+# Projection of water class onto the 2nd principal component.
+all_water_data = Y[y == 4, :]
+
+print("First water observation")
+print(all_water_data[0, :])
+
+# Based on the coefficients and the attribute values for the observation
+# displayed, would you expect the projection onto PC2 to be positive or
+# negative - why? Consider *both* the magnitude and sign of *both* the
+# coefficient and the attribute!
+
+# You can determine the projection directly:
+print("...and its projection onto PC2")
+print(all_water_data[0, :] @ V[:, 1])
+# Try to explain why.
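+# (Hint: the projection is the sum over attributes j of
+# all_water_data[0, j] * V[j, 1], so it is dominated by attributes where a
+# large coefficient meets a large attribute value.)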
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
similarity index 99%
rename from exercises/02450Toolbox_Python/Scripts/ex2_1_6.py
rename to exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
index 78250ef25ad0a9015e9c4ce7e8ab0268f48d89f2..25cf5208a741bd821c06df394ea4184fb1a05371 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py
@@ -1,6 +1,6 @@
 ## exercise 2.1.6
 import matplotlib.pyplot as plt
-from ex2_1_1 import *
+from ex3_1_1 import *
 from scipy.linalg import svd
 
 r = np.arange(1, X.shape[1] + 1)
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
index dbc1dc0af5914508cd15466c199cdb24c2fd3bc2..798bb5b46dba276b98ff5c3f0da86e13f66bfe15 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py
@@ -1,19 +1,36 @@
-# exercise 3.2.1
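+# exercise 2.2.1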
+import importlib_resources
 import numpy as np
+from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks
+from scipy.io import loadmat
 
-x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47])
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+# Index of the digit to display
+i = 0
 
-# Compute values
-mean_x = x.mean()
-std_x = x.std(ddof=1)
-median_x = np.median(x)
-range_x = x.max() - x.min()
+# Load Matlab data file to python dict structure
+mat_data = loadmat(filename)
 
-# Display results
-print("Vector:", x)
-print("Mean:", mean_x)
-print("Standard Deviation:", std_x)
-print("Median:", median_x)
-print("Range:", range_x)
+# Extract variables of interest
+testdata = mat_data["testdata"]
+traindata = mat_data["traindata"]
+X = traindata[:, 1:]
+y = traindata[:, 0]
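+# Each row of traindata holds one digit: column 0 is the class label and
+# the remaining 256 columns are the 16x16 pixel intensities.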
 
-print("Ran Exercise 3.2.1")
+
+# Visualize the i'th digit as a vector
+f = figure()
+subplot(4, 1, 4)
+imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r)
+xlabel("Pixel number")
+title("Digit in vector format")
+yticks([])
+
+# Visualize the i'th digit as an image
+subplot(2, 1, 1)
+I = np.reshape(X[i, :], (16, 16))
+imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r)
+title("Digit as an image")
+
+show()
+
+print("Ran Exercise 2.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d6104b2aae59b5bddd2d2afa9157d3d2e181485
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py
@@ -0,0 +1,117 @@
+# exercise 2.2.2
+import importlib_resources
+import numpy as np
+import scipy.linalg as linalg
+from matplotlib.pyplot import (
+    cm,
+    figure,
+    imshow,
+    legend,
+    plot,
+    show,
+    subplot,
+    title,
+    xlabel,
+    ylabel,
+    yticks,
+)
+from scipy.io import loadmat
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+
+# Digits to include in analysis (to include all, n = range(10) )
+n = [0, 1]
+# Number of principal components for reconstruction
+K = 16
+# Digits to visualize
+nD = range(6)
+
+
+# Load Matlab data file to python dict structure
+# and extract variables of interest
+traindata = loadmat(filename)["traindata"]
+X = traindata[:, 1:]
+y = traindata[:, 0]
+
+N, M = X.shape
+C = len(n)
+
+classValues = n
+classNames = [str(num) for num in n]
+classDict = dict(zip(classNames, classValues))
+
+
+# Select subset of digits classes to be inspected
+class_mask = np.zeros(N).astype(bool)
+for v in n:
+    cmsk = y == v
+    class_mask = class_mask | cmsk
+X = X[class_mask, :]
+y = y[class_mask]
+N = X.shape[0]
+
+# Center the data (subtract mean column values)
+Xc = X - np.ones((N, 1)) * X.mean(0)
+
+# PCA by computing the SVD of the centered data Xc
+U, S, V = linalg.svd(Xc, full_matrices=False)
+# U = mat(U)
+V = V.T
+
+# Compute variance explained by principal components
+rho = (S * S) / (S * S).sum()
+
+# Project data onto principal component space
+Z = Xc @ V
+
+# Plot variance explained
+figure()
+plot(rho, "o-")
+title("Variance explained by principal components")
+xlabel("Principal component")
+ylabel("Variance explained value")
+
+
+# Plot PCA of the data
+f = figure()
+title("pixel vectors of handwr. digits projected on PCs")
+for c in n:
+    # select indices belonging to class c:
+    class_mask = y == c
+    plot(Z[class_mask, 0], Z[class_mask, 1], "o")
+legend(classNames)
+xlabel("PC1")
+ylabel("PC2")
+
+
+# Visualize the reconstructed data from the first K principal components
+# Randomly select D digits.
+figure(figsize=(10, 3))
+W = Z[:, range(K)] @ V[:, range(K)].T
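+# W is the rank-K reconstruction of the centered data; the mean is added
+# back below before the digits are displayed.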
+D = len(nD)
+for d in range(D):
+    digit_ix = np.random.randint(0, N)
+    subplot(2, D, int(d + 1))
+    I = np.reshape(X[digit_ix, :], (16, 16))
+    imshow(I, cmap=cm.gray_r)
+    title("Original")
+    subplot(2, D, D + d + 1)
+    I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16))
+    imshow(I, cmap=cm.gray_r)
+    title("Reconstr.")
+
+
+# Visualize the principal components
+figure(figsize=(8, 6))
+for k in range(K):
+    N1 = int(np.ceil(np.sqrt(K)))
+    N2 = int(np.ceil(K / N1))
+    subplot(N2, N1, int(k + 1))
+    I = np.reshape(V[:, k], (16, 16))
+    imshow(I, cmap=cm.hot)
+    title("PC{0}".format(k + 1))
+
+# output to screen
+show()
+
+print("Ran Exercise 2.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
index f5cfc462ae073133074a692c8080b0706d084ee5..9f665b581d0c86e7f5ba07bf5e943e04ff0a273e 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py
@@ -1,77 +1,58 @@
-# exercise 3.3.1
+# exercise 2.3.1
 
 import importlib_resources
-import matplotlib.pyplot as plt
 import numpy as np
+import scipy.linalg as linalg
+from matplotlib.pyplot import figure, plot, show, xlabel, ylabel
 from scipy.io import loadmat
-
-from dtuimldmtools import similarity
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat")
-# Image to use as query
-i = 1
-
-# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
-similarity_measure = "SMC"
-
-# Load the digits
-# Load Matlab data file to python dict structure
-X = loadmat(filename)["X"]
-# You can also try the CBCL faces dataset (remember to change 'transpose')
-#X = loadmat('../Data/wildfaces_grayscale.mat')['X']
+from sklearn.neighbors import KNeighborsClassifier
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
+# Number of principal components to use for classification,
+# i.e. the reduced dimensionality
+K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150]
+
+# Load Matlab data file and extract training set and test set
+mat_data = loadmat(filename)
+X = mat_data["traindata"][:, 1:]
+y = mat_data["traindata"][:, 0]
+Xtest = mat_data["testdata"][:, 1:]
+ytest = mat_data["testdata"][:, 0]
 N, M = X.shape
-transpose = False # should the plotted images be transposed? 
-
-
-# Search the face database for similar faces 
-# Index of all other images than i
-noti = list(range(0,i)) + list(range(i+1,N)) 
-# Compute similarity between image i and all others
-sim = similarity(X[i,:], X[noti,:], similarity_measure)
-sim = sim.tolist()[0]
-# Tuples of sorted similarities and their indices
-sim_to_index = sorted(zip(sim,noti))
-
-
-# Visualize query image and 5 most/least similar images
-plt.figure(figsize=(12,8))
-plt.subplot(3,1,1)
-
-img_hw = int(np.sqrt(len(X[0])))
-img = np.reshape(X[i], (img_hw,img_hw))
-if transpose: img = img.T
-plt.imshow(img, cmap=plt.cm.gray)
-plt.xticks([]); plt.yticks([])
-plt.title('Query image')
-plt.ylabel('image #{0}'.format(i))
-
-
-for ms in range(5):
-
-    # 5 most similar images found
-    plt.subplot(3,5,6+ms)
-    im_id = sim_to_index[-ms-1][1]
-    im_sim = sim_to_index[-ms-1][0]
-    img = np.reshape(X[im_id],(img_hw,img_hw))
-    if transpose: img = img.T
-    plt.imshow(img, cmap=plt.cm.gray)
-    plt.xlabel('sim={0:.3f}'.format(im_sim))
-    plt.ylabel('image #{0}'.format(im_id))
-    plt.xticks([]); plt.yticks([])
-    if ms==2: plt.title('Most similar images')
-
-    # 5 least similar images found
-    plt.subplot(3,5,11+ms)
-    im_id = sim_to_index[ms][1]
-    im_sim = sim_to_index[ms][0]
-    img = np.reshape(X[im_id],(img_hw,img_hw))
-    if transpose: img = img.T
-    plt.imshow(img, cmap=plt.cm.gray)
-    plt.xlabel('sim={0:.3f}'.format(im_sim))
-    plt.ylabel('image #{0}'.format(im_id))
-    plt.xticks([]); plt.yticks([])
-    if ms==2: plt.title('Least similar images')
-    
-plt.show()
-
-print('Ran Exercise 3.3.1')
+Ntest = Xtest.shape[0]  # equivalently, len(Xtest)
+
+# Subtract the mean from the data
+Y = X - np.ones((N, 1)) * X.mean(0)
+Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)
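+# Note that the test data is centered using the training-set mean: the PCA
+# basis must be estimated from the training data only.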
+
+# Obtain the PCA solution by calculating the SVD of Y
+U, S, V = linalg.svd(Y, full_matrices=False)
+V = V.T
+
+
+# Repeat classification for different values of K
+error_rates = []
+for k in K:
+    # Project data onto principal component space,
+    Z = Y @ V[:, :k]
+    Ztest = Ytest @ V[:, :k]
+
+    # Classify data with knn classifier
+    knn_classifier = KNeighborsClassifier(n_neighbors=1)
+    knn_classifier.fit(Z, y.ravel())
+    y_estimated = knn_classifier.predict(Ztest)
+
+    # Compute classification error rates
+    y_estimated = y_estimated.T
+    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
+    error_rates.append(er)
+    print("K={0}: Error rate: {1:.1f}%".format(k, er))
+
+# Visualize error rates vs. number of principal components considered
+figure()
+plot(K, error_rates, "o-")
+xlabel("Number of principal components K")
+ylabel("Error rate [%]")
+show()
+
+print("Ran Exercise 2.3.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py
deleted file mode 100644
index e7abec8d3c302ff9414b0afb8925596c1dc69d3f..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# exercise 3.2.2
-
-import numpy as np
-
-from dtuimldmtools import similarity
-
-# Generate two data objects with M random attributes
-M = 5
-x = np.random.rand(1, M)
-y = np.random.rand(1, M)
-
-# Two constants
-a = 1.5
-b = 1.5
-
-# Check the statements in the exercise
-print(
-    "Cosine scaling: %.4f "
-    % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
-)
-print(
-    "ExtendedJaccard scaling: %.4f "
-    % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
-)
-print(
-    "Correlation scaling: %.4f "
-    % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
-)
-print(
-    "Cosine translation: %.4f "
-    % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
-)
-print(
-    "ExtendedJaccard translation: %.4f "
-    % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
-)
-print(
-    "Correlation translation: %.4f "
-    % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
-)
-
-print("Ran Exercise 3.2.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
deleted file mode 100644
index e6b46e8ca7e49caa7258fa553737a598d2dda417..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# exercise 4.2.1
-
-import importlib_resources
-import numpy as np
-import xlrd
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")
-# Load xls sheet with data
-doc = xlrd.open_workbook(filename).sheet_by_index(0)
-
-# Extract attribute names
-attributeNames = doc.row_values(0, 0, 4)
-
-# Extract class names to python list,
-# then encode with integers (dict)
-classLabels = doc.col_values(4, 1, 151)
-classNames = sorted(set(classLabels))
-classDict = dict(zip(classNames, range(len(classNames))))
-
-# Extract vector y, convert to NumPy matrix and transpose
-y = np.array([classDict[value] for value in classLabels])
-
-# Preallocate memory, then extract data to matrix X
-X = np.empty((150, 4))
-for i in range(4):
-    X[:, i] = np.array(doc.col_values(i, 1, 151)).T
-
-# Compute values of N, M and C.
-N = len(y)
-M = len(attributeNames)
-C = len(classNames)
-
-print("Ran Exercise 4.2.1")
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_2.R b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R
new file mode 100644
index 0000000000000000000000000000000000000000..d3a68728715b5ebcd46797215edaea83f389e448
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R
@@ -0,0 +1,19 @@
+####################
+# Exercise 3.1.2
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm)
+textfolder = "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en")))
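+# (The parentheses around the assignment make R print the resulting corpus.)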
+
+inspect(docs)
+
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+inspect(dtm)
+
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_3.R b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R
new file mode 100644
index 0000000000000000000000000000000000000000..4180e92904af380e5417c81b30f2044db6102ea0
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R
@@ -0,0 +1,23 @@
+####################
+# Exercise 3.1.3
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm)
+textfolder <- "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+
+inspect(docs)
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+mystopwords <- scan("./Data/stopWords.txt", character(0))
+
+docs_nostopwords <- tm_map(docs, removeWords, mystopwords)
+inspect(docs_nostopwords)
+dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
+inspect(dtm_nostopwords)
+
+control <- list(stopwords = TRUE)
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_4.R b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R
new file mode 100644
index 0000000000000000000000000000000000000000..5ad1d634acf03543105e49b1e1e8ee6a0ba61870
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R
@@ -0,0 +1,29 @@
+####################
+# Exercise 3.1.4
+####################
+
+rm(list = ls()) # Clear work space
+
+library(tm) # install.packages("tm")
+library(SnowballC) # install.packages("SnowballC")
+
+textfolder <- "./Data/"
+
+(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+
+inspect(docs)
+docs_nopunct <- tm_map(docs, removePunctuation)
+dtm <- DocumentTermMatrix(docs_nopunct)
+
+mystopwords <- scan("./Data/stopWords.txt", character(0))
+
+docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords)
+dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
+inspect(dtm_nostopwords)
+
+docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english")
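+# stemDocument applies the Porter stemmer (via the SnowballC package), so
+# inflected forms such as "solve"/"solving"/"solved" collapse to one term.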
+
+
+inspect(docs_stemmed)
+dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE))
+inspect(dtm_stemmed)
diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_5.R b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R
new file mode 100644
index 0000000000000000000000000000000000000000..a9c4b7d1d19f693fbde4fc96101ca4aab53ba0d8
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R
@@ -0,0 +1,19 @@
+####################
+# Exercise 3.1.5
+####################
+
+source("Scripts/ex1_6_4.R")
+source("Tools/similarity.R")
+
+size <- dim(dtm_stemmed)
+cosmeas <- c()
+q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0)
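+# q is a bag-of-words query vector; its length must match the number of
+# term columns in dtm_stemmed, with 1's marking the query terms.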
+dtmat <- as.matrix(dtm_stemmed) # convert the sparse document-term matrix to a dense matrix
+
+for (irow in 1:size[1]) {
+  # Cosine similarity between the irow'th document and the query q
+  cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), "cos")
+}
+
+print("Cosine similarity from q to docs: ")
+print(cosmeas)
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
index 7861e15a48b09ebbb0a87f27a7366b651ff048d0..725f7564a291d7ba29252e89ef5d0f0c1adc7679 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R
@@ -1,36 +1,17 @@
 ####################
-# Exercise 2.1.1
+# Exercise 3.2.1
 ####################
 
 rm(list = ls()) # Clear work space
 
-# Read data into R
-data <- read.csv("./Data/nanonose.csv", check.names = FALSE)
+x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47)
 
-# Extract class labels of observations
-classLabels <- colnames(data)
-classLabels <- classLabels[-(1:2)]
+mean(x)
+sd(x)
+median(x)
+diff(range(x))
 
-# Extract attributes, i.e. sensor names
-attributeNames <- data[3:10, 1]
-
-# Remove first two rows and columns and transpose data matrix
-X <- t(data[-(1:2), -(1:2)])
-
-# Check that dimensions are as they should be (90 rows and 8 columns)
-N <- dim(X)[1]
-M <- dim(X)[2]
-
-# Assign the class labels as row names and the attributes as column names
-rownames(X) <- classLabels
-colnames(X) <- attributeNames
-
-# Extract the class names present in data
-classNames <- sort(unique(classLabels))
-C <- length(classNames)
-
-# Extract numeric class assignments
-y <- as.numeric(as.factor(classLabels)) - 1
-
-# Clean up a bit in the variables, since we'll call this script from later scripts:
-rm(data)
+# Range returns the minimum and maximum of the vector x.
+# To get the range, we must take the maximum minus the minimum.
+# We do this using the function diff, which finds differences
+# between consecutive elements in a vector.
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R b/exercises/02450Toolbox_R/Scripts/ex2_1_2.R
deleted file mode 100644
index d99b5ebcc5c289c968bc0e73da451a2d85fbb7bb..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R
+++ /dev/null
@@ -1,20 +0,0 @@
-####################
-# Exercise 2.1.2
-####################
-
-# Run ex2.1.1:
-source('Scripts/ex2_1_1.R')
-
-# Choose which sensors to plot
-i = 1
-j = 2
-
-# Make simple plot
-plot(X[ , i], X[ , j])
-
-## Make more fancy plot
-library(ggplot2)
-ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) +
-  theme(legend.position = c(0.88, 0.77), legend.title= element_blank()) +
-  labs(x = attributeNames[i], y = attributeNames[j])
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R b/exercises/02450Toolbox_R/Scripts/ex2_1_3.R
deleted file mode 100644
index 05b805dcee60b003f1332fd99d127e29d0a09a99..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R
+++ /dev/null
@@ -1,45 +0,0 @@
-####################
-# Exercise 2.1.3
-####################
-
-source("Scripts/ex2_1_1.R")
-
-# Subtract the column means form columns of X
-Y <- t(apply(X, 1, "-", colMeans(X)))
-
-# You can check the column means using: colMeans(Y)
-colMeans(Y)
-
-# PCA by computing SVD of Y:
-s <- svd(Y)
-diagS <- s$d
-
-rho <- diagS^2 / sum(diagS^2)
-
-# PCA can also be done with the built-in function prcomp.
-# Confirm that you get the same result.
-rho2 <- summary(prcomp(X, center = TRUE))$importance[2,]
-
-
-threshold <- 0.9
-
-xlimits <- c(1, M)
-plot(rho,
-  type = "o",
-  main = "Variance explained by principal componenets",
-  xlab = "Principal components",
-  ylab = "Variance explained",
-  xlim = xlimits,
-  ylim = c(0, 1),
-  col = "blue"
-)
-
-lines(cumsum(rho), type = "o", col = "orange")
-lines(xlimits, c(threshold, threshold), lty = "dashed")
-
-legend("right", # Define position
-  legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend
-  col = c("orange", "blue", "black"), lty = c(1, 1, 2), # Match appereance of lines
-  cex = 1, bg = "lightblue"
-) # Setup how the box looks (cex controls size)
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R b/exercises/02450Toolbox_R/Scripts/ex2_1_4.R
deleted file mode 100644
index 3b43e4700e2a6c2d4f447166eee754a0f476a04a..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R
+++ /dev/null
@@ -1,20 +0,0 @@
-####################
-# Exercise 2.1.4
-####################
-source("Scripts/ex2_1_3.R")
-
-# Manual projecting data onto principal component.
-Z <- s$u %*% diag(s$d)
-
-# Doing it with the built-in function.
-Z <- prcomp(X)$x
-
-i <- 1
-j <- 3
-
-## Make fancy plot
-library(ggplot2)
-ggplot() +
-  geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) +
-  theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) +
-  labs(x = colnames(Z)[i], y = colnames(Z)[j])
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R b/exercises/02450Toolbox_R/Scripts/ex2_1_5.R
deleted file mode 100644
index 4441f4c2b54c833f1008794ce7799a4aff91738e..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R
+++ /dev/null
@@ -1,45 +0,0 @@
-####################
-# Exercise 2.1.5
-####################
-source("Scripts/ex2_1_1.R")
-
-Y <- t(apply(X, 1, "-", colMeans(X))) # subtract the column means form columns of X
-
-# Computing PCA:
-pca <- prcomp(X)
-V <- pca$rotation
-Z <- pca$x
-
-# We saw in 2.1.3 that the first 3 components explained more than 90
-# percent of the variance. Let's look at their coefficients:
-pcs <- 1:3
-test <- as.data.frame(melt(data.table(V[, pcs])))
-ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) +
-  geom_bar(position="dodge", stat = "identity") +
-  labs(fill="PC", x = "Attributes", y = "Component coefficients")
-
-# Inspecting the plot, we see that the 2nd principal component has large
-# (in magnitude) coefficients for attributes A, E and H. We can confirm
-# this by looking at it's numerical values directly, too:
-cat("PC2:", V[, 1])
-
-# How does this translate to the actual data and its projections?
-# Looking at the data for water:
-
-# Projection of water class onto the 2nd principal component.
-water <- Y[y == 4, ]
-cat("First water observation:", water[1, ])
-
-# Based on the coefficients and the attribute values for the observation
-# displayed, would you expect the projection onto PC2 to be positive or
-# negative - why? Consider *both* the magnitude and sign of *both* the
-# coefficient and the attribute!
-
-# You can determine the projection by (remove comments):
-print("...and its projection onto PC2")
-print(as.numeric(t(water[1, ]) %*% V[, 2]))
-
-# Try to explain why.
-# This is equivalent to:
-Z[1, 2]
-
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
index ff04967032ed9c6df0d9c33ec81aef0f5aeafdbc..91eccf7115949659685286b1285dced806f6039e 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R
@@ -1,42 +1,67 @@
 ####################
-# Exercise 2.2.1
+# Exercise 3.3.1
 ####################
+rm(list = ls()) # Clear work space
 
-rm(list = ls()) # Clear work space
+source("Tools/binarize.R")
+source("Tools/similarity.R")
 
-# Load the library R.matlab to enable the function readMat,
-# which allows R to read the matlab .mat format.
-# Install with "install.packages("R.matlab")"
-library(R.matlab)
-
-# The row of training data that we will look at
+# Image to use as query
 i <- 1
 
-# Read in the data
-data <- readMat(file.path("Data", "zipdata.mat"))
+# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+SimilarityMeasure <- "Jaccard"
+
+library(R.matlab)
+data <- readMat(file.path("./Data/digits.mat"))
+
+# You can also try it on the faces dataset:
+# data <- readMat(file.path("./Data/wildfaces_grayscale.mat"))
 
-# Check that the structure dat contains two matrices, testdata and traindata
 names(data)
+X <- data$X
+dimX <- dim(X)
+N <- dimX[1]
+M <- dimX[2]
+Q <- matrix(X[i, ], ncol = M) # The query image
+Y <- X[-i, ] # All images except the i'th
+
+# The similarity function was sourced from Tools/similarity.R above; it is
+# also loaded if you have run setup.R:
+# source("setup.R")
+sim <- similarity(Q, Y, SimilarityMeasure)
+
+# Sort similarities
+sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE)
+val <- sort_result$x
+j <- sort_result$ix
+nj <- length(j)
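+# sort(..., index.return = TRUE) returns both the sorted similarities
+# (sort_result$x) and their original row indices into Y (sort_result$ix).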
 
-# The features, i.e. images of digits, are stored in the rows of traindata,
-# except for the first column. The first column contains the class of the row,
-# i.e. the digit identity
-X <- data$traindata[, 2:dim(data$traindata)[2]]
-y <- data$traindata[, 1]
+# Plot five most similar and dissimilar images
+npics <- 5
+ndigits <- 4
+mostsim <- j[1:npics]
+leastsim <- j[(nj - npics + 1):nj]
 
-par(mfrow = c(1, 2))
+imageDim <- c(sqrt(M), sqrt(M))
+dim(Q) <- imageDim # reshape Q
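+# R's image() draws matrices rotated, hence the row reversal and transpose
+# (t(Q[nrow(Q):1, ])) used below to display the images upright.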
 
-# View the i'th row of X
-image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format")
+{
+  dev.new(width = 2.5, height = 3)
+  image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32))
+}
 
-# Make ticks corresponding to pixels, i.e. values in row i
-axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ]))
+{
+  dev.new(width = 2 * npics, height = 5)
+  layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE))
+  for (d in 1:npics) {
+    similarImage <- Y[mostsim[d], ]
+    dim(similarImage) <- imageDim
+    image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32))
 
-# Extract the i'th row of X and store it as the matrix I
-I <- X[i, ]
 
-# Make the matrix I have dimensions 16 by 16
-dim(I) <- c(16, 16)
+    dissimilarImage <- Y[leastsim[d], ]
+    dim(dissimilarImage) <- imageDim
 
-# view the digit in the i'th row of X as an image. The function image rotates the matrix, so we need to rearrange the columns in the matrix I in order to get the correct view of the digit.
-image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image")
+    image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32))
+  }
+}
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
index 997d298cafa7666e850bf4b6854cc7a548b20897..3cdc43ec18cec69145b8c9e47da735c2b53d1e34 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R
@@ -1,105 +1,22 @@
 ####################
-# Exercise 2.2.2
+# Exercise 3.3.2
 ####################
-source("Scripts/ex2_2_1.R")
+source("Tools/similarity.R")
 
+# Generate two data objects with M random (standard uniformly distributed) attributes
+M <- 5
+x <- as.matrix(runif(M))
+y <- as.matrix(runif(M))
 
-# Choose which digits to consider in the analysis
 
-# This is like in the exercise
-digits_to_inspect <- 0:1
-
-# If you want to try more digits
-# digits_to_inspect = 0:9
-
-# Find the observations that relate to the digits chosen in digits_to_inspect
-inds <- !is.na(match(y, digits_to_inspect))
-
-# extract the rows of X found above
-X <- X[inds, ]
-y <- y[inds]
-
-# Get the column means of X, subtract them from each row,
-# and perform and SVD on the resulting matrix
-means <- colMeans(X)
-Xzeromean <- t(apply(X, 1, "-", means))
-svdres <- svd(Xzeromean)
-
-# Extract the matrices containing the left and right singular vectors, respectively
-U <- svdres$u
-V <- svdres$v
-
-# ----------------------------------------------------
-# Calculate and plot the variance explained by the PCs
-# ----------------------------------------------------
-
-pcvariance <- svdres$d^2 / sum(svdres$d^2)
-
-{
-  par(mfrow=c(1,1))
-  plot(cumsum(pcvariance), main = "Data variance explained by PCs",
-       xlab = "Number of PCs included in variance sum",
-       ylab = "Proportion of variance explained")
-  
-  # First 22 PCs should explain 90%
-  abline(v = 22)
-}
-
-# ---------------------------------------------------------------------
-# Plot principal components 1 and 2 against each other in a scatterplot,
-# i.e. plot the projections of observations onto PCs 1 and 2
-# ---------------------------------------------------------------------
-
-# These two ways of calculating pc_projections are equivalent
-pc_projections <- Xzeromean %*% V
-pc_projections <- U %*% diag(svdres$d)
-
-pcproj1 <- pc_projections[, 1]
-pcproj2 <- pc_projections[, 2]
-
-{
-  par(mfrow=c(1,1))
-  plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)),
-        type = "n", xlab = "PC 1", ylab = "PC 2")
-  points(pcproj1[y == 0], pcproj2[y == 0], col = "red")
-  points(pcproj1[y == 1], pcproj2[y == 1], col = "green")
-  legend("topleft", legend = c("0", "1"), fill = c("red", "green"))
-}
-
-
-# ----------------------------------
-# Reconstruction of images of digits
-# ----------------------------------
-
-# Number of PCs to include in reconstruction of digits
-K <- 5
-
-# Digits to visualize
-nD <- 60:63
-
-reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K])
-
-par(mar = c(1, 1, 1, 1))
-layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE))
-for (d in 1:length(nD)) {
-  origImage <- X[nD[d], ]
-  dim(origImage) <- c(16, 16)
-  image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32))
-  reconstructedImage <- reconstructions[nD[d], ] + means
-  dim(reconstructedImage) <- c(16, 16)
-  image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32))
-}
-
-# -------------
-# Visualize PCs
-# -------------
-
-# visualize the first 12 PCs
-par(mfrow = c(3, 4), mar = c(2, 2, 2, 2))
-for (nth_pc in 1:12) {
-  pc <- t(V[, nth_pc])
-  dim(pc) <- c(16, 16)
-  # view image of PC
-  image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc))
-}
+# Two constants
+a <- 1.5
+b <- 1.5
 
+# Check the statements in the exercise
+similarity(x, y, "cos") - similarity(a * x, y, "cos")
+similarity(x, y, "ext") - similarity(a * x, y, "ext")
+similarity(x, y, "cor") - similarity(a * x, y, "cor")
+similarity(x, y, "cos") - similarity(b + x, y, "cos")
+similarity(x, y, "ext") - similarity(b + x, y, "ext")
+similarity(x, y, "cor") - similarity(b + x, y, "cor")
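+
+# Each expression prints the change in similarity under the transformation;
+# a (numerically) zero result indicates invariance to it.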
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
index 39726ea5f21ccd4bb41fc140f5df9f6542b34891..98f03304399f73d86970a8b9f0b324e750b99f96 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R
@@ -1,61 +1,30 @@
 ####################
-# Exercise 2.3.1
+# Exercise 4.2.1
 ####################
-
 rm(list = ls()) # Clear work space
 
-library(FNN)
-library(R.matlab)
-
-# Read in the data
-data <- readMat(file.path("Data", "zipdata.mat"))
+# Read data into R
+data <- read.csv("./Data/iris.csv")
 
-# Check that the structure dat contains two matrices, testdata and traindata
+# Inspect the contents of the variable "data".
 names(data)
+head(data)
 
-# The features, i.e. images of digits, are stored in the rows of traindata,
-# except for the first column. The first column contains the class of the row,
-# i.e. the digit identity
-X <- data$traindata[, 2:dim(data$traindata)[2]]
-y <- data$traindata[, 1]
-
-Xtest <- data$testdat[, 2:dim(data$testdat)[2]]
-ytest <- data$testdat[, 1]
-
-# This is like in the exercise
-digits_to_inspect <- 0:1
-
-# If you want to try more digits
-# digits_to_inspect = 0:9
-
-# Find the observations that relate to the digits chosen in digits_to_inspect
-inds <- !is.na(match(y, digits_to_inspect))
-indstest <- !is.na(match(ytest, digits_to_inspect))
-
-# Extract the rows of X found above
-X <- X[inds, ]
-y <- y[inds]
-Xtest <- Xtest[indstest, ]
-ytest <- ytest[indstest]
-
-
-# Get the column means of X, subtract them from each row,
-# and perform and SVD on the resulting matrix
-means <- colMeans(X)
-Xzeromean <- t(apply(X, 1, "-", means))
-Xtestzeromean <- t(apply(Xtest, 1, "-", means))
-svdres <- svd(Xzeromean)
+# Extract the rows and columns corresponding to the data, i.e. the attribute values
+X <- data[, 1:4]
 
-# Extract the matrices containing the left and right singular vectors, respectively
-U <- svdres$u
-V <- svdres$v
+# Extract attribute names from the header row (all columns but the class)
+attributeNames <- colnames(data)[1:(dim(data)[2] - 1)]
 
-K <- 5 # Number of principal components to use
-pc_projections <- Xzeromean %*% V[, 1:K]
-pc_projectionstest <- Xtestzeromean %*% V[, 1:K]
+# Extract unique class names from the last column
+classLabels <- data[, 5]
+classNames <- unique(classLabels)
 
-preds <- knn(pc_projections, pc_projectionstest, cl = y)
+# Extract class labels that match the class names
+y <- match(classLabels, classNames) - 1
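+# match() returns the 1-based position of each label in classNames;
+# subtracting 1 gives the 0-based class encoding used in these exercises.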
 
-error_rate <- mean(preds != ytest)
-print(error_rate)
+# Get the number of data objects, attributes, and classes
+N <- dim(X)[1]
+M <- dim(X)[2]
+C <- length(classNames)
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R
similarity index 96%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_2.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_2.R
index 0ae27d25a2c7ae714db4d34e71a791827fd4c6e8..f9558da2f3640027f3862bd0192f660b1a74d5fa 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R
@@ -2,7 +2,7 @@
 # Exercise 4.2.2
 ####################
 
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # ----------------
 # With base R plot
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R
similarity index 81%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_3.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_3.R
index a05fc0c826c900001527be00c3633efaa865cd04..cceef6d4af761fb61283e31a713095ee8aa6ced1 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R
@@ -2,7 +2,7 @@
 # Exercise 4.2.3
 ####################
 
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 par(mfrow=c(1,1))
 boxplot(X, main="Boxplots of attribute values")
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R
similarity index 96%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_4.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_4.R
index 7cff927fc7f748463e5b4b00194da1d1687c7662..dd45ae4f9e2ee192f4b7baf2bf132e7f33f67821 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R
@@ -1,7 +1,7 @@
 ####################
 # Exercise 4.2.4
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Done with base R plot
 yvals <- c()
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R
similarity index 92%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_5.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_5.R
index bd6b45df9d785a990b7bf03f83068a30f84403f2..700748406b8be01bc57fac91e77ad90ef89a4d42 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R
@@ -1,7 +1,7 @@
 ####################
 # Exercise 4.2.5
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 observationColors <- c("blue", "green3", "red")[unclass(y+1)]
 {
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R
similarity index 97%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_6.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_6.R
index 29079a731fe91884ce4451e5d626a8390b347272..15a729e371f07988a210c190dbf1b173f928a8db 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R
@@ -1,7 +1,7 @@
 ####################
 # Exercise 4.2.6
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Make a non-interactive 3d plot of the data. Install the package scatterplot3d
 library(scatterplot3d) # install.packages("scatterplot3d")
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R
similarity index 95%
rename from exercises/02450Toolbox_R/Scripts/ex4_2_7.R
rename to exercises/02450Toolbox_R/Scripts/ex2_3_7.R
index c80379396992eb071da6571bf292b6fa549f43f4..f22824150a68ef1dba88ee0c42e8923a8d27ff37 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R
@@ -1,7 +1,7 @@
 ####################
 # Exercise 4.2.7
 ####################
-source("Scripts/ex4_2_1.R")
+source("Scripts/ex2_3_1.R")
 
 # Standardize each column using the function scale (see ?scale)
 X_scaled <- as.data.frame(scale(X))  
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_4_1.R
similarity index 100%
rename from exercises/02450Toolbox_R/Scripts/ex4_3_1.R
rename to exercises/02450Toolbox_R/Scripts/ex2_4_1.R
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R
similarity index 94%
rename from exercises/02450Toolbox_R/Scripts/ex4_3_2.R
rename to exercises/02450Toolbox_R/Scripts/ex2_4_2.R
index cc9848e5f00676464d9b198d9f8101a0bbf16be7..cbec5c62842dffb3006b3844277c8aa0457476f9 100644
--- a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R
@@ -2,7 +2,7 @@
 # Exercise 4.3.2
 ####################
 
-source("Scripts/ex4_3_1.R")
+source("Scripts/ex2_4_1.R")
 library(ggplot2) 
 library(GGally) # install.packages("GGally")
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_1.R b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R
new file mode 100644
index 0000000000000000000000000000000000000000..7861e15a48b09ebbb0a87f27a7366b651ff048d0
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R
@@ -0,0 +1,36 @@
+####################
+# Exercise 2.1.1
+####################
+
+rm(list = ls()) # Clear work space
+
+# Read data into R
+data <- read.csv("./Data/nanonose.csv", check.names = FALSE)
+
+# Extract class labels of observations
+classLabels <- colnames(data)
+classLabels <- classLabels[-(1:2)]
+
+# Extract attributes, i.e. sensor names
+attributeNames <- data[3:10, 1]
+
+# Remove first two rows and columns and transpose data matrix
+X <- t(data[-(1:2), -(1:2)])
+
+# Check that dimensions are as they should be (90 rows and 8 columns)
+N <- dim(X)[1]
+M <- dim(X)[2]
+
+# Assign the class labels as row names and the attributes as column names
+rownames(X) <- classLabels
+colnames(X) <- attributeNames
+
+# Extract the class names present in data
+classNames <- sort(unique(classLabels))
+C <- length(classNames)
+
+# Extract numeric class assignments
+y <- as.numeric(as.factor(classLabels)) - 1
+
+# Clean up variables we no longer need, since later scripts will source this one:
+rm(data)
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
index d3a68728715b5ebcd46797215edaea83f389e448..d99b5ebcc5c289c968bc0e73da451a2d85fbb7bb 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R
@@ -1,19 +1,20 @@
 ####################
-# Exercise 3.1.2
+# Exercise 2.1.2
 ####################
 
-rm(list = ls()) # Clear work space
+# Run exercise 2.1.1 (Scripts/ex3_1_1.R):
+source("Scripts/ex3_1_1.R")
 
-library(tm)
-textfolder = "./Data/"
+# Choose which sensors to plot
+i <- 1
+j <- 2
 
-(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en")))
-
-inspect(docs)
-
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
-
-inspect(dtm)
+# Make simple plot
+plot(X[ , i], X[ , j])
 
+## Make a fancier plot with ggplot2
+library(ggplot2)
+ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) +
+  theme(legend.position = c(0.88, 0.77), legend.title= element_blank()) +
+  labs(x = attributeNames[i], y = attributeNames[j])
 
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
index 4180e92904af380e5417c81b30f2044db6102ea0..b38a52290d2988124ca034e4751e7e7375ccd926 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R
@@ -1,23 +1,45 @@
 ####################
-# Exercise 3.1.3
+# Exercise 2.1.3
 ####################
 
-rm(list = ls()) # Clear work space
+source("Scripts/ex3_1_1.R")
 
-library(tm)
-textfolder <- "./Data/"
+# Subtract the column means from the columns of X
+Y <- t(apply(X, 1, "-", colMeans(X)))
 
-(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
+# You can check the column means using: colMeans(Y)
+colMeans(Y)
 
-inspect(docs)
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
+# PCA by computing SVD of Y:
+s <- svd(Y)
+diagS <- s$d
 
-mystopwords <- scan("./Data/stopWords.txt", character(0))
+rho <- diagS^2 / sum(diagS^2)
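+
+# The squared singular values are proportional to the variance captured by
+# each principal component, so rho holds the fraction of the total variance
+# explained per PC.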
 
-docs_nostopwords <- tm_map(docs, removeWords, mystopwords)
-inspect(docs_nostopwords)
-dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
-inspect(dtm_nostopwords)
+# PCA can also be done with the built-in function prcomp.
+# Confirm that you get the same result.
+rho2 <- summary(prcomp(X, center = TRUE))$importance[2,]
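+
+# Quick sanity check (the two should agree up to floating-point error):
+# max(abs(rho - rho2))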
+
+
+threshold <- 0.9
+
+xlimits <- c(1, M)
+plot(rho,
+  type = "o",
+  main = "Variance explained by principal componenets",
+  xlab = "Principal components",
+  ylab = "Variance explained",
+  xlim = xlimits,
+  ylim = c(0, 1),
+  col = "blue"
+)
+
+lines(cumsum(rho), type = "o", col = "orange")
+lines(xlimits, c(threshold, threshold), lty = "dashed")
+
+legend("right", # Define position
+  legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend
+  col = c("orange", "blue", "black"), lty = c(1, 1, 2), # Match appereance of lines
+  cex = 1, bg = "lightblue"
+) # Setup how the box looks (cex controls size)
 
-control <- list(stopwords = TRUE)
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
index 5ad1d634acf03543105e49b1e1e8ee6a0ba61870..3b43e4700e2a6c2d4f447166eee754a0f476a04a 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R
@@ -1,29 +1,20 @@
 ####################
-# Exercise 3.1.4
+# Exercise 2.1.4
 ####################
+source("Scripts/ex2_1_3.R")
 
-rm(list = ls()) # Clear work space
+# Manually project the data onto the principal components.
+Z <- s$u %*% diag(s$d)
 
-library(tm) # install.packages("tm")
-library(SnowballC) # install.packages("SnowballC")
+# The same projection using the built-in function prcomp.
+Z <- prcomp(X)$x
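+
+# Note: the sign of each singular vector is arbitrary, so the two Z's may
+# differ by per-column sign flips; comparing magnitudes confirms they match:
+# max(abs(abs(s$u %*% diag(s$d)) - abs(Z)))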
 
-textfolder <- "./Data/"
+i <- 1
+j <- 3
 
-(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en")))
-
-inspect(docs)
-docs_nopunct <- tm_map(docs, removePunctuation)
-dtm <- DocumentTermMatrix(docs_nopunct)
-
-mystopwords <- scan("./Data/stopWords.txt", character(0))
-
-docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords)
-dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE))
-inspect(dtm_nostopwords)
-
-docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english")
-
-
-inspect(docs_stemmed)
-dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE))
-inspect(dtm_stemmed)
+## Make a fancier plot with ggplot2
+library(ggplot2)
+ggplot() +
+  geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) +
+  theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) +
+  labs(x = colnames(Z)[i], y = colnames(Z)[j])
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
index efe3e7b136de8bd693240e8f8168f4e460d629a9..bb02536666ef680aa67cfdf71b6e91504e2cf931 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R
@@ -1,19 +1,45 @@
 ####################
-# Exercise 3.1.5
+# Exercise 2.1.5
 ####################
+source("Scripts/ex3_1_1.R")
 
-source("Scripts/ex3_1_4.R")
-source("Tools/similarity.R")
+Y <- t(apply(X, 1, "-", colMeans(X))) # Subtract the column means from the columns of X
 
-size <- dim(dtm_stemmed)
-cosmeas <- c()
-q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0)
-dtmat <- matrix(dtm_stemmed, dtm_stemmed$nrow, dtm_stemmed$ncol)
+# Computing PCA:
+pca <- prcomp(X)
+V <- pca$rotation
+Z <- pca$x
 
-for (irow in 1:size[1]) {
-  doc <- dtm_stemmed[irow, ]
-  cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), 'cos')
-}
+# We saw in 2.1.3 that the first 3 components explained more than 90
+# percent of the variance. Let's look at their coefficients:
+pcs <- 1:3
+library(ggplot2)
+library(data.table) # provides data.table() and the melt() used below
+test <- as.data.frame(melt(data.table(V[, pcs])))
+ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) +
+  geom_bar(position="dodge", stat = "identity") +
+  labs(fill="PC", x = "Attributes", y = "Component coefficients")
+
+# Inspecting the plot, we see that the 2nd principal component has large
+# (in magnitude) coefficients for attributes A, E and H. We can confirm
+# this by looking at its numerical values directly, too:
+cat("PC2:", V[, 2])
+
+# How does this translate to the actual data and its projections?
+# Looking at the data for water:
+
+# Extract the centered observations of the water class (encoded as y == 4):
+water <- Y[y == 4, ]
+cat("First water observation:", water[1, ])
+
+# Based on the coefficients and the attribute values for the observation
+# displayed, would you expect the projection onto PC2 to be positive or
+# negative - why? Consider *both* the magnitude and sign of *both* the
+# coefficient and the attribute!
+
+# You can compute the projection directly:
+print("...and its projection onto PC2")
+print(as.numeric(t(water[1, ]) %*% V[, 2]))
+
+# Try to explain why.
+# This is equivalent to:
+Z[1, 2]
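+
+# Hint: the projection is just the dot product sum(water[1, ] * V[, 2]);
+# the large-magnitude PC2 coefficients (A, E and H) paired with the
+# corresponding attribute values dominate its sign.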
 
-print("Cosine similarity from q to docs: ")
-print(cosmeas)
diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R
similarity index 99%
rename from exercises/02450Toolbox_R/Scripts/ex2_1_6.R
rename to exercises/02450Toolbox_R/Scripts/ex3_1_6.R
index f0a329dd022b47e14de6af383acfedf4a86092b9..83faf47dbd9116ccf326b94e3344fe76f8a036fe 100644
--- a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R
@@ -1,7 +1,7 @@
 ####################
 # Exercise 2.1.6
 ####################
-source("Scripts/ex2_1_1.R")
+source("Scripts/ex3_1_1.R")
 
 # Try this *later* (for last), and explain the effect X_s = X
 # Make a to be "scaled" version of X
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
index 725f7564a291d7ba29252e89ef5d0f0c1adc7679..ff04967032ed9c6df0d9c33ec81aef0f5aeafdbc 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R
@@ -1,17 +1,42 @@
 ####################
-# Exercise 3.2.1
+# Exercise 2.2.1
 ####################
 
 rm(list = ls()) # Clear work space
 
-x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47)
+# Load the library R.matlab to enable the function readMat,
+# which allows R to read the matlab .mat format.
+# Install with "install.packages("R.matlab")"
+library(R.matlab)
 
-mean(x)
-sd(x)
-median(x)
-diff(range(x))
+# The row of training data that we will look at
+i <- 1
 
-# Range returns the minimum and maximum of the vector x.
-# To get the range, we must take the maximum minus the minimum.
-# We do this using the function diff, which finds differences
-# between consecutive elements in a vector.
+# Read in the data
+data <- readMat(file.path("Data", "zipdata.mat"))
+
+# Check that the list "data" contains the two matrices traindata and testdat
+names(data)
+
+# The features, i.e. images of digits, are stored in the rows of traindata,
+# except for the first column. The first column contains the class of the row,
+# i.e. the digit identity
+X <- data$traindata[, 2:dim(data$traindata)[2]]
+y <- data$traindata[, 1]
+
+par(mfrow = c(1, 2))
+
+# View the i'th row of X
+image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format")
+
+# Make ticks corresponding to pixels, i.e. values in row i
+axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ]))
+
+# Extract the i'th row of X and store it as the matrix I
+I <- X[i, ]
+
+# Make the matrix I have dimensions 16 by 16
+dim(I) <- c(16, 16)
+
+# View the digit in the i'th row of X as an image. The function image()
+# rotates the matrix, so we reverse the columns of I to get the correct
+# view of the digit.
+image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image")
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_2.R b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R
new file mode 100644
index 0000000000000000000000000000000000000000..d1d81de672b8e982cbdb3c01f531afaf6cdf7d56
--- /dev/null
+++ b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R
@@ -0,0 +1,105 @@
+####################
+# Exercise 2.2.2
+####################
+source("Scripts/ex3_2_1.R")
+
+
+# Choose which digits to consider in the analysis
+
+# This is like in the exercise
+digits_to_inspect <- 0:1
+
+# If you want to try more digits
+# digits_to_inspect = 0:9
+
+# Find the observations that relate to the digits chosen in digits_to_inspect
+inds <- !is.na(match(y, digits_to_inspect))
+
+# Extract the rows of X found above
+X <- X[inds, ]
+y <- y[inds]
+
+# Get the column means of X, subtract them from each row,
+# and perform an SVD on the resulting matrix
+means <- colMeans(X)
+Xzeromean <- t(apply(X, 1, "-", means))
+svdres <- svd(Xzeromean)
+
+# Extract the matrices containing the left and right singular vectors, respectively
+U <- svdres$u
+V <- svdres$v
+
+# ----------------------------------------------------
+# Calculate and plot the variance explained by the PCs
+# ----------------------------------------------------
+
+pcvariance <- svdres$d^2 / sum(svdres$d^2)
+
+{
+  par(mfrow=c(1,1))
+  plot(cumsum(pcvariance), main = "Data variance explained by PCs",
+       xlab = "Number of PCs included in variance sum",
+       ylab = "Proportion of variance explained")
+  
+  # First 22 PCs should explain 90%
+  abline(v = 22)
+}
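+
+# The number of PCs needed to reach 90% can also be computed directly,
+# instead of reading it off the plot:
+# which(cumsum(pcvariance) >= 0.9)[1]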
+
+# ---------------------------------------------------------------------
+# Plot principal components 1 and 2 against each other in a scatterplot,
+# i.e. plot the projections of observations onto PCs 1 and 2
+# ---------------------------------------------------------------------
+
+# These two ways of calculating pc_projections are equivalent
+pc_projections <- Xzeromean %*% V
+pc_projections <- U %*% diag(svdres$d)
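+
+# Quick check that the two are indeed equal up to floating-point error:
+# max(abs(Xzeromean %*% V - U %*% diag(svdres$d)))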
+
+pcproj1 <- pc_projections[, 1]
+pcproj2 <- pc_projections[, 2]
+
+{
+  par(mfrow=c(1,1))
+  plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)),
+        type = "n", xlab = "PC 1", ylab = "PC 2")
+  points(pcproj1[y == 0], pcproj2[y == 0], col = "red")
+  points(pcproj1[y == 1], pcproj2[y == 1], col = "green")
+  legend("topleft", legend = c("0", "1"), fill = c("red", "green"))
+}
+
+
+# ----------------------------------
+# Reconstruction of images of digits
+# ----------------------------------
+
+# Number of PCs to include in reconstruction of digits
+K <- 5
+
+# Digits to visualize
+nD <- 60:63
+
+reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K])
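+
+# Optional: mean squared reconstruction error for this K (it shrinks
+# toward zero as K approaches the number of PCs):
+# mean((Xzeromean - reconstructions)^2)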
+
+par(mar = c(1, 1, 1, 1))
+layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE))
+for (d in 1:length(nD)) {
+  origImage <- X[nD[d], ]
+  dim(origImage) <- c(16, 16)
+  image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32))
+  reconstructedImage <- reconstructions[nD[d], ] + means
+  dim(reconstructedImage) <- c(16, 16)
+  image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32))
+}
+
+# -------------
+# Visualize PCs
+# -------------
+
+# visualize the first 12 PCs
+par(mfrow = c(3, 4), mar = c(2, 2, 2, 2))
+for (nth_pc in 1:12) {
+  pc <- t(V[, nth_pc])
+  dim(pc) <- c(16, 16)
+  # view image of PC
+  image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc))
+}
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
index 91eccf7115949659685286b1285dced806f6039e..39726ea5f21ccd4bb41fc140f5df9f6542b34891 100644
--- a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
+++ b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R
@@ -1,67 +1,61 @@
 ####################
-# Exercise 3.3.1
+# Exercise 2.3.1
 ####################
-rm(list = ls()) # Clear work space)
 
-source("Tools/binarize.R")
-source("Tools/similarity.R")
-
-# Image to use as query
-i <- 1
-
-# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
-SimilarityMeasure <- "Jaccard"
+rm(list = ls()) # Clear work space
 
+library(FNN)
 library(R.matlab)
-data <- readMat(file.path("./Data/digits.mat"))
 
-# You can also try it on the faces dataset:
-# data <- readMat(file.path("./Data/wildfaces_grayscale.mat"))
+# Read in the data
+data <- readMat(file.path("Data", "zipdata.mat"))
 
+# Check that the list "data" contains the two matrices traindata and testdat
 names(data)
-X <- data$X
-dimX <- dim(X)
-N <- dimX[1]
-M <- dimX[2]
-Q <- matrix(X[i, ], ncol = M) # The query image
-Y <- X[-i, ] # All images except the i'th
-
-# The next function is in the setup.R, load it if you have not already
-# source("setup.R")
-sim <- similarity(Q, Y, SimilarityMeasure)
-
-# Sort similarities
-sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE)
-val <- sort_result$x
-j <- sort_result$ix
-nj <- length(j)
-
-# Plot five most similar and dissimilar images
-npics <- 5
-ndigits <- 4
-mostsim <- j[1:npics]
-leastsim <- j[(nj - npics + 1):nj]
-
-imageDim <- c(sqrt(M), sqrt(M))
-dim(Q) <- imageDim # reshape Q
-
-{
-  dev.new(width = 2.5, height = 3)
-  image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32))
-}
-
-{
-  dev.new(width = 2 * npics, height = 5)
-  layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE))
-  for (d in 1:npics) {
-    similarImage <- Y[mostsim[d], ]
-    dim(similarImage) <- imageDim
-    image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32))
-
-
-    dissimilarImage <- Y[leastsim[d], ]
-    dim(dissimilarImage) <- imageDim
-
-    image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32))
-  }
-}
+
+# The features, i.e. images of digits, are stored in the rows of traindata,
+# except for the first column. The first column contains the class of the row,
+# i.e. the digit identity
+X <- data$traindata[, 2:dim(data$traindata)[2]]
+y <- data$traindata[, 1]
+
+Xtest <- data$testdat[, 2:dim(data$testdat)[2]]
+ytest <- data$testdat[, 1]
+
+# This is like in the exercise
+digits_to_inspect <- 0:1
+
+# If you want to try more digits
+# digits_to_inspect = 0:9
+
+# Find the observations that relate to the digits chosen in digits_to_inspect
+inds <- !is.na(match(y, digits_to_inspect))
+indstest <- !is.na(match(ytest, digits_to_inspect))
+
+# Extract the rows of X found above
+X <- X[inds, ]
+y <- y[inds]
+Xtest <- Xtest[indstest, ]
+ytest <- ytest[indstest]
+
+
+# Get the column means of X, subtract them from each row,
+# and perform an SVD on the resulting matrix
+means <- colMeans(X)
+Xzeromean <- t(apply(X, 1, "-", means))
+Xtestzeromean <- t(apply(Xtest, 1, "-", means))
+svdres <- svd(Xzeromean)
+
+# Extract the matrices containing the left and right singular vectors, respectively
+U <- svdres$u
+V <- svdres$v
+
+K <- 5 # Number of principal components to use
+pc_projections <- Xzeromean %*% V[, 1:K]
+pc_projectionstest <- Xtestzeromean %*% V[, 1:K]
+
+preds <- knn(pc_projections, pc_projectionstest, cl = y)
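+
+# FNN::knn classifies with k = 1 neighbour by default; pass k explicitly
+# to use more, e.g.:
+# preds <- knn(pc_projections, pc_projectionstest, cl = y, k = 3)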
+
+error_rate <- mean(preds != ytest)
+print(error_rate)
+
diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R b/exercises/02450Toolbox_R/Scripts/ex3_3_2.R
deleted file mode 100644
index 3cdc43ec18cec69145b8c9e47da735c2b53d1e34..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R
+++ /dev/null
@@ -1,22 +0,0 @@
-####################
-# Exercise 3.3.2
-####################
-source("Tools/similarity.R")
-
-# Generate two data objects with M random (standard uniformly distributed) attributes
-M = 5;
-x = as.matrix(runif(M));
-y = as.matrix(runif(M));
-
-
-# Two constants
-a = 1.5;
-b = 1.5;
-
-# Check the statements in the exercise
-similarity(x,y,'cos') - similarity(a*x,y,'cos')
-similarity(x,y,'ext') - similarity(a*x,y,'ext')
-similarity(x,y,'cor') - similarity(a*x,y,'cor')
-similarity(x,y,'cos') - similarity(b+x,y,'cos')
-similarity(x,y,'ext') - similarity(b+x,y,'ext')
-similarity(x,y,'cor') - similarity(b+x,y,'cor')
diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R b/exercises/02450Toolbox_R/Scripts/ex4_2_1.R
deleted file mode 100644
index 98f03304399f73d86970a8b9f0b324e750b99f96..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R
+++ /dev/null
@@ -1,30 +0,0 @@
-####################
-# Exercise 4.2.1
-####################
-rm(list = ls()) # Clear work space
-
-# Read data into R
-data <- read.csv("./Data/iris.csv")
-
-# Inspect the contents of the variable "data".
-names(data)
-head(data)
-
-# Extract the rows and columns corresponding to the data, i.e. the attribute values
-X <- data[, 1:4]
-
-# Extract attribute names from the first row
-attributeNames <- colnames(data)[1:(dim(data)[2] - 1)]
-
-# Extract unique class names from the last column
-classLabels <- data[, 5]
-classNames <- unique(classLabels)
-
-# Extract class labels that match the class names
-y <- match(classLabels, classNames) - 1
-
-# Get the number of data objects, attributes, and classes
-N <- dim(X)[1]
-M <- dim(X)[2]
-C <- length(classNames)
-