diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
new file mode 100644
index 0000000000000000000000000000000000000000..b5391b758cafba6e3c28f0beb465769334439718
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_2.m
@@ -0,0 +1,9 @@
+%% exercise 3.1.2
+cdir = fileparts(mfilename('fullpath'));
+[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
+X = full(A)';
+attributeNames = cellstr(D);
+
+%% Display the result
+display(attributeNames);
+display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
new file mode 100644
index 0000000000000000000000000000000000000000..fdd498e1c67d51f512c47132bf4298325f5abec0
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_3.m
@@ -0,0 +1 @@
+%% exercise 3.1.3 cdir = fileparts(mfilename('fullpath')); TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt'); [A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts); X = full(A)'; attributeNames = cellstr(D); %% Display the result display(attributeNames); display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
new file mode 100644
index 0000000000000000000000000000000000000000..0518cd4748fdea2dec5daf77635c2de330e3bd2d
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_4.m
@@ -0,0 +1 @@
+%% exercise 3.1.4 cdir = fileparts(mfilename('fullpath')); TMGOpts.stoplist = '../Data/stopWords.txt'; TMGOpts.stemming = 1; [A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts); X = full(A)'; attributeNames = cellstr(D); %% Display the result display(attributeNames); display(X);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
new file mode 100644
index 0000000000000000000000000000000000000000..ffa4d21982d415f00f3b6d9424dd8cec08486e77
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex1_6_5.m
@@ -0,0 +1,21 @@
+%% exercise 3.1.5
+
+% Query vector
+q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+
+%% Method #1 (a for loop)
+N = size(X,1); % Get the number of data objects
+sim = nan(N,1); % Allocate a vector for the similarity
+for i = 1:N
+    x = X(i,:); % Get the i'th data object
+    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
+end
+
+%% Method #2 (one compact line of code)
+sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+
+%% Method #3 (use the "similarity" function)
+sim = similarity(X, q, 'cos');
+
+%% Display the result
+display(sim);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
index c5f1186acedaa71f6829e0072e5bdd871453be96..f4115017e2054a948671c99dc72cc037b0309382 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_1.m
@@ -1,19 +1,14 @@
-%% exercise 2.1.1
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath'));
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+% exercise 3.2.1
 
-% Extract the rows and columns corresponding to the sensor data, and
-% transpose the matrix to have rows correspond to data items
-X = NUMERIC(:,3:10);
+x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
 
-% Extract attribute names from the first column
-attributeNames = RAW(1,4:end);
+mean_x = mean(x);
+std_x = std(x);
+median_x = median(x);
+range_x = range(x);
 
-% Extract unique class names from the first row
-classLabels = RAW(3:end,1);
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y-1;
- 
\ No newline at end of file
+%% Display results
+display(mean_x);
+display(std_x);
+display(median_x);
+display(range_x);
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
deleted file mode 100644
index 5b6eee846e1397ed2ed1b2a9ea795cd92523e84c..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_2.m
+++ /dev/null
@@ -1,31 +0,0 @@
-%% exercise 2.1.2
-% Data attributes to be plotted
-i = 1;
-j = 2;
-
-% Make a simple plot of the i'th attribute against the j'th attribute
-mfig('NanoNose: Data'); clf;
-plot(X(:,i), X(:,j),'o');
-axis tight
-
-% Make another more fancy plot that includes legend, class labels,
-% attribute names, and a title
-mfig('NanoNose: Classes'); clf; hold all;
-C = length(classNames);
-% Use a specific color for each class (easy to reuse across plots!):
-colors = get(gca, 'colororder');
-% Here we the standard colours from MATLAB, but you could define you own.
-for c = 0:C-1
-    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
-        'MarkerFaceColor', colors(c+1,:), ...
-        'MarkerEdgeAlpha', 0, ...
-        'MarkerFaceAlpha', .5);
-end
-% You can also avoid the loop by using e.g.: (but in this case, do not call legend(classNames) as it will overwrite the legend with wrong entries)
-% gscatter(X(:,i), X(:,j), classLabels)
-legend(classNames);
-axis tight
-xlabel(attributeNames{i});
-ylabel(attributeNames{j});
-title('NanoNose data');
-a = 234
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
deleted file mode 100644
index 145ff5c7cdaf768b4b14906bc2c1bc014f08a37a..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_3.m
+++ /dev/null
@@ -1,28 +0,0 @@
-
-%% exercise 2.1.3
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y);
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-threshold = 0.90;
-
-% Plot variance explained
-mfig('NanoNose: Var. explained'); clf;
-hold on
-plot(rho, 'x-');
-plot(cumsum(rho), 'o-');
-plot([0,length(rho)], [threshold, threshold], 'k--');
-legend({'Individual','Cumulative','Threshold'}, ...
-    'Location','best');
-ylim([0, 1]);
-xlim([1, length(rho)]);
-grid minor
-xlabel('Principal component');
-ylabel('Variance explained value');
-title('Variance explained by principal components');
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
deleted file mode 100644
index cf767df44e4c6129faff311e92485815c1278a45..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_4.m
+++ /dev/null
@@ -1,24 +0,0 @@
-%% exercise 2.1.4
-
-% Index of the principal components
-i = 1;
-j = 2;
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Plot PCA of data
-mfig('NanoNose: PCA Projection'); clf; hold all;
-C = length(classNames);
-colors = get(gca,'colororder');
-for c = 0:C-1
-    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
-        'MarkerFaceColor', colors(c+1,:), ...
-        'MarkerEdgeAlpha', 0, ...
-        'MarkerFaceAlpha', .5);
-end
-legend(classNames);
-axis tight
-xlabel(sprintf('PC %d', i));
-ylabel(sprintf('PC %d', j));
-title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
deleted file mode 100644
index 2475302c15fab8118605b132c2c4efd285b512eb..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_5.m
+++ /dev/null
@@ -1,40 +0,0 @@
-%% exercise 2.1.5
-% We saw in 2.1.3 that the first 3 components explaiend more than 90
-% percent of the variance. Let's look at their coefficients:
-pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
-mfig('NanoNose: PCA Component Coefficients');
-h = bar(V(:,pcs));
-legendCell = cellstr(num2str(pcs', 'PC%-d'));
-legend(legendCell, 'location','best');
-xticklabels(attributeNames);
-grid
-xlabel('Attributes');
-ylabel('Component coefficients');
-title('NanoNose: PCA Component Coefficients');
-
-% Inspecting the plot, we see that the 2nd principal component has large
-% (in magnitude) coefficients for attributes A, E and H. We can confirm
-% this by looking at it's numerical values directly, too:
-disp('PC2:')
-disp(V(:,2)') % notice the transpose for display in console
-
-% How does this translate to the actual data and its projections?
-% Looking at the data for water:
-
-% Projection of water class onto the 2nd principal component.
-all_water_data = Y(y==4,:);
-
-disp('First water observation:')
-disp(all_water_data(1,:))
-
-% Based on the coefficients and the attribute values for the observation
-% displayed, would you expect the projection onto PC2 to be positive or
-% negative - why? Consider *both* the magnitude and sign of *both* the
-% coefficient and the attribute!
-
-% You can determine the projection by (remove comments):
-disp('...and its projection onto PC2:')
-disp(all_water_data(1,:) * V(:,2))
-% Try to explain why?
-
-
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
index 2f064b574ca36017f92dde65f45e22e78a017d1d..009817333dff41cc9d112663e65d03703f5c59ae 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_1.m
@@ -1,28 +1,65 @@
-%% exercise 2.2.1
+% exercise 3.3.1
 
-% Digit number to display
-i = 1;
+% Image to use as query
+i = 1;
 
-% Load data
+% Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+SimilarityMeasure = 'cos';
+
+%% Load the CBCL face database
 cdir = fileparts(mfilename('fullpath'));
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits data
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Visualize the i'th digit as a vector
-mfig('Digits: Data');
-subplot(4,1,4);
-imagesc(X(i,:));
-xlabel('Pixel number');
-title('Digit in vector format');
-set(gca, 'YTick', []);
-
-% Visualize the i'th digit as an image
-subplot(4,1,1:3);
-I = reshape(X(i,:), [16,16])';
-imagesc(I);
-colormap(1-gray);
-axis image off
-title('Digit as a image');
+load(fullfile(cdir,'../Data/digits.mat'));
+
+% You can try out the faces CBCL face database, too:
+%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
+
+transpose = true; % set to true if plotted images needs to be transposed
+
+[N,M] = size(X);
+imageDim = [sqrt(M),sqrt(M)];
+%% Search the face database for similar faces
+
+% Index of all other images than i
+noti = [1:i-1 i+1:N];
+% Compute similarity between image i and all others
+sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
+% Sort similarities
+[val, j] = sort(sim, 'descend');
+
+%% Plot query and result
+mfig('Faces: Query'); clf;
+subplot(3,5,1:5);
+
+img = reshape(X(i,:),imageDim);
+if transpose; img = img'; end;
+imagesc(img);
+
+axis image
+set(gca, 'XTick', [], 'YTick', []);
+ylabel(sprintf('Image #%d', i));
+title('Query image','FontWeight','bold');
+for k = 1:5
+    subplot(3,5,k+5)
+    ii = noti(j(k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.2f', val(k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Most similar images','FontWeight','bold'); end;
+end
+for k = 1:5
+    subplot(3,5,k+10)
+    ii = noti(j(end+1-k));
+    img = reshape(X(ii,:),imageDim);
+    if transpose; img = img'; end;
+    imagesc(img);
+    axis image
+    set(gca, 'XTick', [], 'YTick', []);
+    xlabel(sprintf('sim=%.3f', val(end+1-k)));
+    ylabel(sprintf('Image #%d', ii));
+    if k==3, title('Least similar images','FontWeight','bold'); end;
+end
+colormap(gray);
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
index 0db965d7614fface2e85ca12af9d2e08cdaf0d1b..52eb65d331a1a39824c0313a1f1db12600f9adcc 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_2_2.m
@@ -1,87 +1,18 @@
-%% exercise 2.2.2
-
-% Digits to include in analysis (to include all, n = 1:10);
-n = [0,1];
-
-% Number of principal components for reconstruction
-K = 22;
-
-% Digits to visualize
-nD = 1:5;
-
-%% Load data
-cdir = fileparts(mfilename('fullpath'));
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits
-X = traindata(:,2:end);
-y = traindata(:,1);
-classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'10'};
-classLabels = classNames(y+1);
-
-% Remove digits that are not to be inspected
-j = ismember(y, n);
-X = X(j,:);
-classLabels = classLabels(j);
-classNames = classNames(n+1);
-y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Compute the projection onto the principal components
-Z = U*S;
-
-% Compute variance explained
-rho = diag(S).^2./sum(diag(S).^2);
-
-%% Plot variance explained
-mfig('Digits: Var. explained'); clf;
-plot(rho, 'o-');
-title('Variance explained by principal components');
-xlabel('Principal component');
-ylabel('Variance explained value');
-
-%% Plot PCA of data
-mfig('Digits: PCA'); clf; hold all;
-C = length(classNames);
-for c = 0:C-1
-    plot(Z(y==c,1), Z(y==c,2), 'o');
-end
-legend(classNames);
-xlabel('PC 1');
-ylabel('PC 2');
-title('PCA of digits data');
-
-%% Visualize the reconstructed data from the firts K principal components
-mfig('Digits: Reconstruction'); clf;
-W = Z(:,1:K)*V(:,1:K)';
-D = length(nD);
-for d = 1:D
-    subplot(2,D,d);
-    I = reshape(X(nD(d),:), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Original');
-    subplot(2,D,d+D);
-    I = reshape(W(nD(d),:)+mean(X), [16,16])';
-    imagesc(I);
-    axis image off
-    title('Reconstructed');
-end
-colormap(1-gray);
-
-%% Visualize the principal components
-mfig('Digits: Principal components'); clf;
-for k = 1:K
-    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
-    subplot(N2, N1, k);
-    I = reshape(V(:,k), [16,16])';
-    imagesc(I);
-    colormap(hot);
-    axis image off
-    title(sprintf('PC %d',k));
-end
+% exercise 3.3.2
+
+% Generate two data objects with M random attributes
+M = 5;
+x = rand(1,M);
+y = rand(1,M);
+
+% Two constants
+a = 1.5;
+b = 1.5;
+
+% Check the statements in the exercise
+similarity(x,y,'cos') - similarity(a*x,y,'cos')
+similarity(x,y,'ext') - similarity(a*x,y,'ext')
+similarity(x,y,'cor') - similarity(a*x,y,'cor')
+similarity(x,y,'cos') - similarity(b+x,y,'cos')
+similarity(x,y,'ext') - similarity(b+x,y,'ext')
+similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
index 2a8135ef9d9ff5647fd5be3bfcb9acfa7884be85..9a68ff4cf91cea0a1d5abf841a627e46c4751eae 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_1.m
@@ -1,44 +1,31 @@
-%% exercise 2.3.1
-%% Load data
+% exercise 4.2.1
+
+% Disable xlsread warning
+warning('off', 'MATLAB:xlsread:ActiveX');
+warning('off', 'MATLAB:xlsread:Mode');
+
+% Load the data into Matlab
 cdir = fileparts(mfilename('fullpath'));
-load(fullfile(cdir,'../Data/zipdata.mat'));
-
-% Extract digits (training set)
-X = traindata(:,2:end);
-y = traindata(:,1);
-
-% Extract digits (test set)
-Xtest = testdata(:,2:end);
-ytest = testdata(:,1);
-
-% Subtract the mean from the data
-Y = bsxfun(@minus, X, mean(X));
-Ytest = bsxfun(@minus, Xtest, mean(X));
-
-% Obtain the PCA solution by calculate the SVD of Y
-[U, S, V] = svd(Y, 'econ');
-
-% Number of principal components to use, i.e. the reduced dimensionality
-Krange = [8,10,15,20,30,40,50,60,100,150];
-errorRate = zeros(length(Krange),1);
-for i = 1:length(Krange)
-    K=Krange(i);
-    % Compute the projection onto the principal components
-    Z = Y*V(:,1:K);
-    Ztest = Ytest*V(:,1:K);
-
-    % Classify digits using a K-nearest neighbour classifier
-    model=fitcknn(Z,y,'NumNeighbors',1);
-    yest = predict(model,Ztest);
-
-    errorRate(i) = nnz(ytest~=yest)/length(ytest);
-
-    % Display results
-    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
+
+% Extract the rows and columns corresponding to the data
+if isnan(NUMERIC(1,1))
+    X = NUMERIC(2:end,:);
+else
+    X = NUMERIC;
 end
-
-%% Visualize error rates vs. number of principal components considered
-mfig('Variance explained by principal components'); clf;
-plot(Krange,errorRate, 'o-');
-xlabel('Number of principal components K')
-ylabel('Error rate [%]')
+
+% Extract attribute names from the first row
+attributeNames = RAW(1,1:4)';
+
+% Extract unique class names from the last column
+classLabels = RAW(2:end,5)';
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y'-1;
+
+% Get the number of data objects, attributes, and classes
+[N, M] = size(X);
+C = length(classNames);
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_3.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_3.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_4.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_4.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_5.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_5.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_2_7.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_3_7.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_1.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_1.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex4_3_2.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex2_4_2.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
new file mode 100644
index 0000000000000000000000000000000000000000..c5f1186acedaa71f6829e0072e5bdd871453be96
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_1.m
@@ -0,0 +1,19 @@
+%% exercise 2.1.1
+% Load the data into Matlab
+cdir = fileparts(mfilename('fullpath'));
+[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
+
+% Extract the rows and columns corresponding to the sensor data, and
+% transpose the matrix to have rows correspond to data items
+X = NUMERIC(:,3:10);
+
+% Extract attribute names from the first column
+attributeNames = RAW(1,4:end);
+
+% Extract unique class names from the first row
+classLabels = RAW(3:end,1);
+classNames = unique(classLabels);
+
+% Extract class labels that match the class names
+[y_,y] = ismember(classLabels, classNames); y = y-1;
+ 
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
index b5391b758cafba6e3c28f0beb465769334439718..5b6eee846e1397ed2ed1b2a9ea795cd92523e84c 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_2.m
@@ -1,9 +1,31 @@
-%% exercise 3.1.2
-cdir = fileparts(mfilename('fullpath'));
-[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
-X = full(A)';
-attributeNames = cellstr(D);
+%% exercise 2.1.2
+% Data attributes to be plotted
+i = 1;
+j = 2;
 
-%% Display the result
-display(attributeNames);
-display(X);
\ No newline at end of file
+% Make a simple plot of the i'th attribute against the j'th attribute
+mfig('NanoNose: Data'); clf;
+plot(X(:,i), X(:,j),'o');
+axis tight
+
+% Make another more fancy plot that includes legend, class labels,
+% attribute names, and a title
+mfig('NanoNose: Classes'); clf; hold all;
+C = length(classNames);
+% Use a specific color for each class (easy to reuse across plots!):
+colors = get(gca, 'colororder');
+% Here we the standard colours from MATLAB, but you could define you own.
+for c = 0:C-1
+    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
+        'MarkerFaceColor', colors(c+1,:), ...
+        'MarkerEdgeAlpha', 0, ...
+        'MarkerFaceAlpha', .5);
+end
+% You can also avoid the loop by using e.g.: (but in this case, do not call legend(classNames) as it will overwrite the legend with wrong entries)
+% gscatter(X(:,i), X(:,j), classLabels)
+legend(classNames);
+axis tight
+xlabel(attributeNames{i});
+ylabel(attributeNames{j});
+title('NanoNose data');
+a = 234
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
index fdd498e1c67d51f512c47132bf4298325f5abec0..145ff5c7cdaf768b4b14906bc2c1bc014f08a37a 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_3.m
@@ -1 +1,28 @@
-%% exercise 3.1.3 cdir = fileparts(mfilename('fullpath')); TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt'); [A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts); X = full(A)'; attributeNames = cellstr(D); %% Display the result display(attributeNames); display(X);
\ No newline at end of file
+
+%% exercise 2.1.3
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculate the SVD of Y
+[U, S, V] = svd(Y);
+
+% Compute variance explained
+rho = diag(S).^2./sum(diag(S).^2);
+threshold = 0.90;
+
+% Plot variance explained
+mfig('NanoNose: Var. explained'); clf;
+hold on
+plot(rho, 'x-');
+plot(cumsum(rho), 'o-');
+plot([0,length(rho)], [threshold, threshold], 'k--');
+legend({'Individual','Cumulative','Threshold'}, ...
+    'Location','best');
+ylim([0, 1]);
+xlim([1, length(rho)]);
+grid minor
+xlabel('Principal component');
+ylabel('Variance explained value');
+title('Variance explained by principal components');
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
index 0518cd4748fdea2dec5daf77635c2de330e3bd2d..cf767df44e4c6129faff311e92485815c1278a45 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_4.m
@@ -1 +1,24 @@
-%% exercise 3.1.4 cdir = fileparts(mfilename('fullpath')); TMGOpts.stoplist = '../Data/stopWords.txt'; TMGOpts.stemming = 1; [A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts); X = full(A)'; attributeNames = cellstr(D); %% Display the result display(attributeNames); display(X);
\ No newline at end of file
+%% exercise 2.1.4
+
+% Index of the principal components
+i = 1;
+j = 2;
+
+% Compute the projection onto the principal components
+Z = U*S;
+
+% Plot PCA of data
+mfig('NanoNose: PCA Projection'); clf; hold all;
+C = length(classNames);
+colors = get(gca,'colororder');
+for c = 0:C-1
+    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
+        'MarkerFaceColor', colors(c+1,:), ...
+        'MarkerEdgeAlpha', 0, ...
+        'MarkerFaceAlpha', .5);
+end
+legend(classNames);
+axis tight
+xlabel(sprintf('PC %d', i));
+ylabel(sprintf('PC %d', j));
+title('PCA Projection of NanoNose data');
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
index ffa4d21982d415f00f3b6d9424dd8cec08486e77..2475302c15fab8118605b132c2c4efd285b512eb 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_5.m
@@ -1,21 +1,40 @@
-%% exercise 3.1.5
+%% exercise 2.1.5
+% We saw in 2.1.3 that the first 3 components explaiend more than 90
+% percent of the variance. Let's look at their coefficients:
+pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
+mfig('NanoNose: PCA Component Coefficients');
+h = bar(V(:,pcs));
+legendCell = cellstr(num2str(pcs', 'PC%-d'));
+legend(legendCell, 'location','best');
+xticklabels(attributeNames);
+grid
+xlabel('Attributes');
+ylabel('Component coefficients');
+title('NanoNose: PCA Component Coefficients');
 
-% Query vector
-q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
+% Inspecting the plot, we see that the 2nd principal component has large
+% (in magnitude) coefficients for attributes A, E and H. We can confirm
+% this by looking at it's numerical values directly, too:
+disp('PC2:')
+disp(V(:,2)') % notice the transpose for display in console
 
-%% Method #1 (a for loop)
-N = size(X,1); % Get the number of data objects
-sim = nan(N,1); % Allocate a vector for the similarity
-for i = 1:N
-    x = X(i,:); % Get the i'th data object
-    sim(i) = dot(q/norm(q),x/norm(x)); % Compute cosine similarity
-end
+% How does this translate to the actual data and its projections?
+% Looking at the data for water:
 
-%% Method #2 (one compact line of code)
-sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
+% Projection of water class onto the 2nd principal component.
+all_water_data = Y(y==4,:);
+
+disp('First water observation:')
+disp(all_water_data(1,:))
+
+% Based on the coefficients and the attribute values for the observation
+% displayed, would you expect the projection onto PC2 to be positive or
+% negative - why? Consider *both* the magnitude and sign of *both* the
+% coefficient and the attribute!
+
+% You can determine the projection by (remove comments):
+disp('...and its projection onto PC2:')
+disp(all_water_data(1,:) * V(:,2))
+% Try to explain why?
 
-%% Method #3 (use the "similarity" function)
-sim = similarity(X, q, 'cos');
-
-%% Display the result
-display(sim);
\ No newline at end of file
+
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
similarity index 100%
rename from exercises/02450Toolbox_Matlab/Scripts/ex2_1_6.m
rename to exercises/02450Toolbox_Matlab/Scripts/ex3_1_6.m
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
index f4115017e2054a948671c99dc72cc037b0309382..2f064b574ca36017f92dde65f45e22e78a017d1d 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_1.m
@@ -1,14 +1,28 @@
-% exercise 3.2.1
+%% exercise 2.2.1
 
-x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
+% Digit number to display
+i = 1;
 
-mean_x = mean(x);
-std_x = std(x);
-median_x = median(x);
-range_x = range(x);
+% Load data
+cdir = fileparts(mfilename('fullpath'));
+load(fullfile(cdir,'../Data/zipdata.mat'));
 
-%% Display results
-display(mean_x);
-display(std_x);
-display(median_x);
-display(range_x);
\ No newline at end of file
+% Extract digits data
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Visualize the i'th digit as a vector
+mfig('Digits: Data');
+subplot(4,1,4);
+imagesc(X(i,:));
+xlabel('Pixel number');
+title('Digit in vector format');
+set(gca, 'YTick', []);
+
+% Visualize the i'th digit as an image
+subplot(4,1,1:3);
+I = reshape(X(i,:), [16,16])';
+imagesc(I);
+colormap(1-gray);
+axis image off
+title('Digit as a image');
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
new file mode 100644
index 0000000000000000000000000000000000000000..0db965d7614fface2e85ca12af9d2e08cdaf0d1b
--- /dev/null
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_2_2.m
@@ -0,0 +1,87 @@
+%% exercise 2.2.2
+
+% Digits to include in analysis (to include all, n = 1:10);
+n = [0,1];
+
+% Number of principal components for reconstruction
+K = 22;
+
+% Digits to visualize
+nD = 1:5;
+
+%% Load data
+cdir = fileparts(mfilename('fullpath'));
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits
+X = traindata(:,2:end);
+y = traindata(:,1);
+classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9';'10'};
+classLabels = classNames(y+1);
+
+% Remove digits that are not to be inspected
+j = ismember(y, n);
+X = X(j,:);
+classLabels = classLabels(j);
+classNames = classNames(n+1);
+y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+
+% Obtain the PCA solution by calculate the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Compute the projection onto the principal components
+Z = U*S;
+
+% Compute variance explained
+rho = diag(S).^2./sum(diag(S).^2);
+
+%% Plot variance explained
+mfig('Digits: Var. explained'); clf;
+plot(rho, 'o-');
+title('Variance explained by principal components');
+xlabel('Principal component');
+ylabel('Variance explained value');
+
+%% Plot PCA of data
+mfig('Digits: PCA'); clf; hold all;
+C = length(classNames);
+for c = 0:C-1
+    plot(Z(y==c,1), Z(y==c,2), 'o');
+end
+legend(classNames);
+xlabel('PC 1');
+ylabel('PC 2');
+title('PCA of digits data');
+
+%% Visualize the reconstructed data from the firts K principal components
+mfig('Digits: Reconstruction'); clf;
+W = Z(:,1:K)*V(:,1:K)';
+D = length(nD);
+for d = 1:D
+    subplot(2,D,d);
+    I = reshape(X(nD(d),:), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Original');
+    subplot(2,D,d+D);
+    I = reshape(W(nD(d),:)+mean(X), [16,16])';
+    imagesc(I);
+    axis image off
+    title('Reconstructed');
+end
+colormap(1-gray);
+
+%% Visualize the principal components
+mfig('Digits: Principal components'); clf;
+for k = 1:K
+    N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
+    subplot(N2, N1, k);
+    I = reshape(V(:,k), [16,16])';
+    imagesc(I);
+    colormap(hot);
+    axis image off
+    title(sprintf('PC %d',k));
+end
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
index 009817333dff41cc9d112663e65d03703f5c59ae..2a8135ef9d9ff5647fd5be3bfcb9acfa7884be85 100644
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
+++ b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_1.m
@@ -1,65 +1,44 @@
-% exercise 3.3.1
-
-% Image to use as query
-i = 1;
-
-% Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
-SimilarityMeasure = 'cos';
-
-%% Load the CBCL face database
+%% exercise 2.3.1
+%% Load data
 cdir = fileparts(mfilename('fullpath'));
-load(fullfile(cdir,'../Data/digits.mat'));
-
-% You can try out the faces CBCL face database, too:
-%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
-
-transpose = true; % set to true if plotted images needs to be transposed
-
-[N,M] = size(X);
-imageDim = [sqrt(M),sqrt(M)];
-%% Search the face database for similar faces
-
-% Index of all other images than i
-noti = [1:i-1 i+1:N];
-% Compute similarity between image i and all others
-sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
-% Sort similarities
-[val, j] = sort(sim, 'descend');
-
-%% Plot query and result
-mfig('Faces: Query'); clf;
-subplot(3,5,1:5);
-
-img = reshape(X(i,:),imageDim);
-if transpose; img = img'; end;
-imagesc(img);
-
-axis image
-set(gca, 'XTick', [], 'YTick', []);
-ylabel(sprintf('Image #%d', i));
-title('Query image','FontWeight','bold');
-for k = 1:5
-    subplot(3,5,k+5)
-    ii = noti(j(k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.2f', val(k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Most similar images','FontWeight','bold'); end;
-end
-for k = 1:5
-    subplot(3,5,k+10)
-    ii = noti(j(end+1-k));
-    img = reshape(X(ii,:),imageDim);
-    if transpose; img = img'; end;
-    imagesc(img);
-    axis image
-    set(gca, 'XTick', [], 'YTick', []);
-    xlabel(sprintf('sim=%.3f', val(end+1-k)));
-    ylabel(sprintf('Image #%d', ii));
-    if k==3, title('Least similar images','FontWeight','bold'); end;
+load(fullfile(cdir,'../Data/zipdata.mat'));
+
+% Extract digits (training set)
+X = traindata(:,2:end);
+y = traindata(:,1);
+
+% Extract digits (test set)
+Xtest = testdata(:,2:end);
+ytest = testdata(:,1);
+
+% Subtract the mean from the data
+Y = bsxfun(@minus, X, mean(X));
+Ytest = bsxfun(@minus, Xtest, mean(X));
+
+% Obtain the PCA solution by calculate the SVD of Y
+[U, S, V] = svd(Y, 'econ');
+
+% Number of principal components to use, i.e. the reduced dimensionality
+Krange = [8,10,15,20,30,40,50,60,100,150];
+errorRate = zeros(length(Krange),1);
+for i = 1:length(Krange)
+    K=Krange(i);
+    % Compute the projection onto the principal components
+    Z = Y*V(:,1:K);
+    Ztest = Ytest*V(:,1:K);
+
+    % Classify digits using a K-nearest neighbour classifier
+    model=fitcknn(Z,y,'NumNeighbors',1);
+    yest = predict(model,Ztest);
+
+    errorRate(i) = nnz(ytest~=yest)/length(ytest);
+
+    % Display results
+    fprintf('Error rate %.1f%%\n',errorRate(i)*100);
 end
-colormap(gray);
+
+%% Visualize error rates vs. number of principal components considered
+mfig('Variance explained by principal components'); clf;
+plot(Krange,errorRate, 'o-');
+xlabel('Number of principal components K')
+ylabel('Error rate [%]')
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m b/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
deleted file mode 100644
index 52eb65d331a1a39824c0313a1f1db12600f9adcc..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex3_3_2.m
+++ /dev/null
@@ -1,18 +0,0 @@
-% exercise 3.3.2
-
-% Generate two data objects with M random attributes
-M = 5;
-x = rand(1,M);
-y = rand(1,M);
-
-% Two constants
-a = 1.5;
-b = 1.5;
-
-% Check the statements in the exercise
-similarity(x,y,'cos') - similarity(a*x,y,'cos')
-similarity(x,y,'ext') - similarity(a*x,y,'ext')
-similarity(x,y,'cor') - similarity(a*x,y,'cor')
-similarity(x,y,'cos') - similarity(b+x,y,'cos')
-similarity(x,y,'ext') - similarity(b+x,y,'ext')
-similarity(x,y,'cor') - similarity(b+x,y,'cor')
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m b/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
deleted file mode 100644
index 9a68ff4cf91cea0a1d5abf841a627e46c4751eae..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Matlab/Scripts/ex4_2_1.m
+++ /dev/null
@@ -1,31 +0,0 @@
-% exercise 4.2.1
-
-% Disable xlsread warning
-warning('off', 'MATLAB:xlsread:ActiveX');
-warning('off', 'MATLAB:xlsread:Mode');
-
-% Load the data into Matlab
-cdir = fileparts(mfilename('fullpath'));
-[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
-
-% Extract the rows and columns corresponding to the data
-if isnan(NUMERIC(1,1))
-    X = NUMERIC(2:end,:);
-else
-    X = NUMERIC;
-end
-
-% Extract attribute names from the first row
-attributeNames = RAW(1,1:4)';
-
-% Extract unique class names from the last column
-classLabels = RAW(2:end,5)';
-classNames = unique(classLabels);
-
-% Extract class labels that match the class names
-[y_,y] = ismember(classLabels, classNames); y = y'-1;
-
-% Get the number of data objects, attributes, and classes
-[N, M] = size(X);
-C = length(classNames);
-
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..60a39cf53d0415f6573b00f633de451559a70dff
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_2.py
@@ -0,0 +1,54 @@
+# exercise 3.1.4
+import importlib_resources
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+# Load the textDocs.txt as a long string into raw_file:
+with open(filename, "r") as f:
+    raw_file = f.read()
+# raw_file contains sentences seperated by newline characters,
+# so we split by '\n':
+corpus = raw_file.split("\n")
+# corpus is now list of "documents" (sentences), but some of them are empty,
+# because textDocs.txt has a lot of empty lines, we filter/remove them:
+corpus = list(filter(None, corpus))
+
+# Display the result
+print("Document-term matrix analysis")
+print()
+print("Corpus (5 documents/sentences):")
+print(np.asmatrix(corpus))
+print()
+
+
+# To automatically obtain the bag of words representation, we use sklearn's
+# feature_extraction.text module, which has a function CountVectorizer.
+# We make a CounterVectorizer:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b")
+# The token pattern is a regular expression (marked by the r), which ensures
+# that the vectorizer ignores digit/non-word tokens - in this case, it ensures
+# the 10 in the last document is not recognized as a token. It's not important
+# that you should understand it the regexp.
+
+# The object vectorizer can now be used to first 'fit' the vectorizer to the
+# corpus, and the subsequently transform the data. We start by fitting:
+vectorizer.fit(corpus)
+# The vectorizer has now determined the unique terms (or tokens) in the corpus
+# and we can extract them using:
+attributeNames = vectorizer.get_feature_names_out()
+print("Found terms:")
+print(attributeNames)
+print()
+
+# The next step is to count how many times each term is found in each document,
+# which we do using the transform function:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3ac96c2ad678edc8786ea59973292ad3b80a9e1
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_3.py
@@ -0,0 +1,40 @@
+# exercise 3.1.4
+import importlib_resources
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+# When making the CountVectorizer, we now input the stop words:
+vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords)
+# Determine the terms in the corpus
+vectorizer.fit(corpus)
+# ... and count the frequency of each term within a document:
+X = vectorizer.transform(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+N, M = X.shape
+
+# Display the result
+print("Document-term matrix analysis (using stop words)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X.toarray())
+print()
+print("Ran Exercise 3.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..e33ee98041ad4eddb1533e2553763c881c2e5e73
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_4.py
@@ -0,0 +1,66 @@
+# exercise 3.1.4
+import importlib_resources
+
+# We'll use a widely used stemmer based:
+# Porter, M. “An algorithm for suffix stripping.” Program 14.3 (1980): 130-137.
+# The stemmer is implemented in the most used natural language processing
+# package in Python, "Natural Langauge Toolkit" (NLTK):
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import CountVectorizer
+
+filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt")
+filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt")
+
+# As before, load the corpus and preprocess:
+with open(filename_docs, "r") as f:
+    raw_file = f.read()
+corpus = raw_file.split("\n")
+corpus = list(filter(None, corpus))
+
+# Load and process the stop words in a similar manner:
+with open(filename_stop, "r") as f:
+    raw_file = f.read()
+stopwords = raw_file.split("\n")
+
+
+# To enable stemming when using the sklearn-module, we need to parse an
+# "analyzer" to the vectorizer we've been using.
+# First, we make an object based on the PorterStemmer class, and we also make
+# an analyzer object:
+stemmer = PorterStemmer()
+analyzer = CountVectorizer(
+    token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords
+).build_analyzer()
+
+
+# Using these we'll make a function that can stem words:
+def stemmed_words(doc):
+    return (stemmer.stem(w) for w in analyzer(doc))
+
+
+# ... and finally, we make a vectorizer just like we've done before:
+vectorizer = CountVectorizer(analyzer=stemmed_words)
+
+# Determine the terms:
+vectorizer.fit(corpus)
+attributeNames = vectorizer.get_feature_names_out()
+
+# ... and count the occurences:
+X = vectorizer.transform(corpus)
+N, M = X.shape
+X = X.toarray()
+
+# Display the result
+print("Document-term matrix analysis (using stop words and stemming)")
+print()
+print("Number of documents (data objects, N):\t %i" % N)
+print("Number of terms (attributes, M):\t %i" % M)
+print()
+print("Found terms (no stop words, stemmed):")
+print(attributeNames)
+print()
+print("Document-term matrix:")
+print(X)
+print()
+print("Ran Exercise 3.1.4")
+print()
diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb3b3b3e15930768ffd46963e05ae16d66268455
--- /dev/null
+++ b/exercises/02450Toolbox_Python/Scripts/ex1_6_5.py
@@ -0,0 +1,38 @@
+# exercise 3.1.5
+import numpy as np
+import scipy.linalg as linalg
+from ex1_6_4 import *
+
+from dtuimldmtools import similarity
+
+# Query vector
+q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0])
+# notice, that you could get the query vector using the vectorizer, too:
+# q = vectorizer.transform(['matrix rank solv'])
+# q = np.asarray(q.toarray())
+# or use any other string:
+# q = vectorizer.transform(['Can I Google how to fix my problem?'])
+# q = np.asarray(q.toarray())
+
+# Method 1 ('for' loop - slow)
+N = np.shape(X)[0]
+# get the number of data objects
+sim = np.zeros((N, 1))  # allocate a vector for the similarity
+for i in range(N):
+    x = X[i, :]  # Get the i'th data object (here: document)
+    sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x)  # Compute cosine similarity
+
+# Method 2 (one line of code with no iterations - faster)
+sim = (q @ X.T).T / (
+    np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum())
+)
+
+# Method 3 (use the "similarity" function)
+sim = similarity(X, q, "cos")
+
+
+# Display the result
+print("Query vector:\n {0}\n".format(q))
+print("Similarity results:\n {0}".format(sim))
+
+print("Ran Exercise 3.1.5")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
index b210d13bfaca05d29b799ca0f1cc103802dee09e..dbc1dc0af5914508cd15466c199cdb24c2fd3bc2 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py
@@ -1,32 +1,19 @@
-# exercise 2.1.1
-import importlib_resources
+# exercise 3.2.1
 import numpy as np
-import xlrd
 
-# Load xls sheet with data
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls")
-doc = xlrd.open_workbook(filename).sheet_by_index(0)
+x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47])
 
-# Extract attribute names (1st row, column 4 to 12)
-attributeNames = doc.row_values(0, 3, 11)
+# Compute values
+mean_x = x.mean()
+std_x = x.std(ddof=1)
+median_x = np.median(x)
+range_x = x.max() - x.min()
 
-# Extract class names to python list,
-# then encode with integers (dict)
-classLabels = doc.col_values(0, 2, 92)
-classNames = sorted(set(classLabels))
-classDict = dict(zip(classNames, range(5)))
+# Display results
+print("Vector:", x)
+print("Mean:", mean_x)
+print("Standard Deviation:", std_x)
+print("Median:", median_x)
+print("Range:", range_x)
 
-# Extract vector y, convert to NumPy array
-y = np.asarray([classDict[value] for value in classLabels])
-
-# Preallocate memory, then extract excel data to matrix X
-X = np.empty((90, 8))
-for i, col_id in enumerate(range(3, 11)):
-    X[:, i] = np.asarray(doc.col_values(col_id, 2, 92))
-
-# Compute values of N, M and C.
-N = len(y)
-M = len(attributeNames)
-C = len(classNames)
-
-print("Ran Exercise 2.1.1")
+print("Ran Exercise 3.2.1")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
deleted file mode 100644
index 98847f60502ada91cb0b8289234d68daeb6959ba..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# exercise 2.1.2
-
-# Imports the numpy and xlrd package, then runs the ex2_1_1 code
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-
-# (requires data structures from ex. 2.1.1)
-
-
-# Data attributes to be plotted
-i = 0
-j = 1
-
-##
-# Make a simple plot of the i'th attribute against the j'th attribute
-# Notice that X is of matrix type (but it will also work with a numpy array)
-# X = np.array(X) #Try to uncomment this line
-plot(X[:, i], X[:, j], "o")
-
-# %%
-# Make another more fancy plot that includes legend, class labels,
-# attribute names, and a title.
-f = figure()
-title("NanoNose data")
-
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3)
-
-legend(classNames)
-xlabel(attributeNames[i])
-ylabel(attributeNames[j])
-
-# Output result to screen
-show()
-print("Ran Exercise 2.1.2")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
deleted file mode 100644
index c977a4c69b0e1a4cc311470122cbdffe59a72ae6..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# exercise 2.1.3
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(axis=0)
-
-# PCA by computing SVD of Y
-U, S, V = svd(Y, full_matrices=False)
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-threshold = 0.9
-
-# Plot variance explained
-plt.figure()
-plt.plot(range(1, len(rho) + 1), rho, "x-")
-plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-")
-plt.plot([1, len(rho)], [threshold, threshold], "k--")
-plt.title("Variance explained by principal components")
-plt.xlabel("Principal component")
-plt.ylabel("Variance explained")
-plt.legend(["Individual", "Cumulative", "Threshold"])
-plt.grid()
-plt.show()
-
-print("Ran Exercise 2.1.3")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
deleted file mode 100644
index 46788689ab957d3409370102720978e17d89a453..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# exercise 2.1.4
-# (requires data structures from ex. 2.2.1 and 2.2.3)
-from ex2_1_1 import *
-from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel
-from scipy.linalg import svd
-
-# Subtract mean value from data
-Y = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, Vh = svd(Y, full_matrices=False)
-# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose)
-# of the vector V. So, for us to obtain the correct V, we transpose:
-V = Vh.T
-
-# Project the centered data onto principal component space
-Z = Y @ V
-
-# Indices of the principal components to be plotted
-i = 0
-j = 1
-
-# Plot PCA of the data
-f = figure()
-title("NanoNose data: PCA")
-# Z = array(Z)
-for c in range(C):
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5)
-legend(classNames)
-xlabel("PC{0}".format(i + 1))
-ylabel("PC{0}".format(j + 1))
-
-# Output result to screen
-show()
-
-print("Ran Exercise 2.1.4")
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
deleted file mode 100644
index 364a20146ce59e5f312d1d59025b73dbc1a91d6f..0000000000000000000000000000000000000000
--- a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# exercise 2.2.4
-
-# (requires data structures from ex. 2.2.1)
-import matplotlib.pyplot as plt
-from ex2_1_1 import *
-from scipy.linalg import svd
-
-Y = X - np.ones((N, 1)) * X.mean(0)
-U, S, Vh = svd(Y, full_matrices=False)
-V = Vh.T
-N, M = X.shape
-
-# We saw in 2.1.3 that the first 3 components explaiend more than 90
-# percent of the variance. Let's look at their coefficients:
-pcs = [0, 1, 2]
-legendStrs = ["PC" + str(e + 1) for e in pcs]
-c = ["r", "g", "b"]
-bw = 0.2
-r = np.arange(1, M + 1)
-for i in pcs:
-    plt.bar(r + i * bw, V[:, i], width=bw)
-plt.xticks(r + bw, attributeNames)
-plt.xlabel("Attributes")
-plt.ylabel("Component coefficients")
-plt.legend(legendStrs)
-plt.grid()
-plt.title("NanoNose: PCA Component Coefficients")
-plt.show()
-
-# Inspecting the plot, we see that the 2nd principal component has large
-# (in magnitude) coefficients for attributes A, E and H. We can confirm
-# this by looking at it's numerical values directly, too:
-print("PC2:")
-print(V[:, 1].T)
-
-# How does this translate to the actual data and its projections?
-# Looking at the data for water:
-
-# Projection of water class onto the 2nd principal component.
-all_water_data = Y[y == 4, :]
-
-print("First water observation")
-print(all_water_data[0, :])
-
-# Based on the coefficients and the attribute values for the observation
-# displayed, would you expect the projection onto PC2 to be positive or
-# negative - why? Consider *both* the magnitude and sign of *both* the
-# coefficient and the attribute!
-
-# You can determine the projection by (remove comments):
-print("...and its projection onto PC2")
-print(all_water_data[0, :] @ V[:, 1])
-# Try to explain why?
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
index 798bb5b46dba276b98ff5c3f0da86e13f66bfe15..f5cfc462ae073133074a692c8080b0706d084ee5 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py
@@ -1,36 +1,77 @@
+# exercise 3.3.1
+
 import importlib_resources
+import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks
 from scipy.io import loadmat
 
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Index of the digit to display
-i = 0
+from dtuimldmtools import similarity
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat")
+# Image to use as query
+i = 1
+# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
+similarity_measure = "SMC"
+
+# Load the digits
 
 # Load Matlab data file to python dict structure
-mat_data = loadmat(filename)
+X = loadmat(filename)["X"]
+# You can also try the CBCL faces dataset (remember to change 'transpose')
+#X = loadmat('../Data/wildfaces_grayscale.mat')['X']
+N, M = X.shape
+transpose = False  # should the plotted images be transposed?
+
+
+# Search the face database for similar faces
+# Index of all other images than i
+noti = list(range(0,i)) + list(range(i+1,N))
+# Compute similarity between image i and all others
+sim = similarity(X[i,:], X[noti,:], similarity_measure)
+sim = sim.tolist()[0]
+# Tuples of sorted similarities and their indices
+sim_to_index = sorted(zip(sim,noti))
+
+
+# Visualize query image and 5 most/least similar images
+plt.figure(figsize=(12,8))
+plt.subplot(3,1,1)
 
-# Extract variables of interest
-testdata = mat_data["testdata"]
-traindata = mat_data["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
+img_hw = int(np.sqrt(len(X[0])))
+img = np.reshape(X[i], (img_hw,img_hw))
+if transpose: img = img.T
+plt.imshow(img, cmap=plt.cm.gray)
+plt.xticks([]); plt.yticks([])
+plt.title('Query image')
+plt.ylabel('image #{0}'.format(i))
 
-# Visualize the i'th digit as a vector
-f = figure()
-subplot(4, 1, 4)
-imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r)
-xlabel("Pixel number")
-title("Digit in vector format")
-yticks([])
+for ms in range(5):
 
-# Visualize the i'th digit as an image
-subplot(2, 1, 1)
-I = np.reshape(X[i, :], (16, 16))
-imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r)
-title("Digit as an image")
+    # 5 most similar images found
+    plt.subplot(3,5,6+ms)
+    im_id = sim_to_index[-ms-1][1]
+    im_sim = sim_to_index[-ms-1][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Most similar images')
 
-show()
+    # 5 least similar images found
+    plt.subplot(3,5,11+ms)
+    im_id = sim_to_index[ms][1]
+    im_sim = sim_to_index[ms][0]
+    img = np.reshape(X[im_id],(img_hw,img_hw))
+    if transpose: img = img.T
+    plt.imshow(img, cmap=plt.cm.gray)
+    plt.xlabel('sim={0:.3f}'.format(im_sim))
+    plt.ylabel('image #{0}'.format(im_id))
+    plt.xticks([]); plt.yticks([])
+    if ms==2: plt.title('Least similar images')
+
+plt.show()
 
-print("Ran Exercise 2.2.1")
+print('Ran Exercise 3.3.1')
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
index 3d6104b2aae59b5bddd2d2afa9157d3d2e181485..e7abec8d3c302ff9414b0afb8925596c1dc69d3f 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py
@@ -1,117 +1,42 @@
-# exercise 2.2.2
-import importlib_resources
-import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import (
-    cm,
-    figure,
-    imshow,
-    legend,
-    plot,
-    show,
-    subplot,
-    title,
-    xlabel,
-    ylabel,
-    yticks,
-)
-from scipy.io import loadmat
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-
-# Digits to include in analysis (to include all, n = range(10) )
-n = [0, 1]
-# Number of principal components for reconstruction
-K = 16
-# Digits to visualize
-nD = range(6)
-
-
-# Load Matlab data file to python dict structure
-# and extract variables of interest
-traindata = loadmat(filename)["traindata"]
-X = traindata[:, 1:]
-y = traindata[:, 0]
-
-N, M = X.shape
-C = len(n)
-
-classValues = n
-classNames = [str(num) for num in n]
-classDict = dict(zip(classNames, classValues))
-
-
-# Select subset of digits classes to be inspected
-class_mask = np.zeros(N).astype(bool)
-for v in n:
-    cmsk = y == v
-    class_mask = class_mask | cmsk
-X = X[class_mask, :]
-y = y[class_mask]
-N = X.shape[0]
-
-# Center the data (subtract mean column values)
-Xc = X - np.ones((N, 1)) * X.mean(0)
-
-# PCA by computing SVD of Y
-U, S, V = linalg.svd(Xc, full_matrices=False)
-# U = mat(U)
-V = V.T
-
-# Compute variance explained by principal components
-rho = (S * S) / (S * S).sum()
-
-# Project data onto principal component space
-Z = Xc @ V
-
-# Plot variance explained
-figure()
-plot(rho, "o-")
-title("Variance explained by principal components")
-xlabel("Principal component")
-ylabel("Variance explained value")
-
-
-# Plot PCA of the data
-f = figure()
-title("pixel vectors of handwr. digits projected on PCs")
-for c in n:
-    # select indices belonging to class c:
-    class_mask = y == c
-    plot(Z[class_mask, 0], Z[class_mask, 1], "o")
-legend(classNames)
-xlabel("PC1")
-ylabel("PC2")
+# exercise 3.2.2
+import numpy as np
 
-# Visualize the reconstructed data from the first K principal components
-# Select randomly D digits.
-figure(figsize=(10, 3))
-W = Z[:, range(K)] @ V[:, range(K)].T
-D = len(nD)
-for d in range(D):
-    digit_ix = np.random.randint(0, N)
-    subplot(2, D, int(d + 1))
-    I = np.reshape(X[digit_ix, :], (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Original")
-    subplot(2, D, D + d + 1)
-    I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16))
-    imshow(I, cmap=cm.gray_r)
-    title("Reconstr.")
+from dtuimldmtools import similarity
 
-# Visualize the pricipal components
-figure(figsize=(8, 6))
-for k in range(K):
-    N1 = int(np.ceil(np.sqrt(K)))
-    N2 = int(np.ceil(K / N1))
-    subplot(N2, N1, int(k + 1))
-    I = np.reshape(V[:, k], (16, 16))
-    imshow(I, cmap=cm.hot)
-    title("PC{0}".format(k + 1))
+# Generate two data objects with M random attributes
+M = 5
+x = np.random.rand(1, M)
+y = np.random.rand(1, M)
 
-# output to screen
-show()
+# Two constants
+a = 1.5
+b = 1.5
 
+# Check the statements in the exercise
+print(
+    "Cosine scaling: %.4f "
+    % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard scaling: %.4f "
+    % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation scaling: %.4f "
+    % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0]
+)
+print(
+    "Cosine translation: %.4f "
+    % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0]
+)
+print(
+    "ExtendedJaccard translation: %.4f "
+    % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0]
+)
+print(
+    "Correlation translation: %.4f "
+    % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0]
+)
 
-print("Ran Exercise 2.2.2")
+print("Ran Exercise 3.2.2")
\ No newline at end of file
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
index 9f665b581d0c86e7f5ba07bf5e943e04ff0a273e..e6b46e8ca7e49caa7258fa553737a598d2dda417 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py
@@ -1,58 +1,33 @@
-# exercise 2.3.1
+# exercise 4.2.1
 import importlib_resources
 import numpy as np
-import scipy.linalg as linalg
-from matplotlib.pyplot import figure, plot, show, xlabel, ylabel
-from scipy.io import loadmat
-from sklearn.neighbors import KNeighborsClassifier
-
-filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat")
-# Number of principal components to use for classification,
-# i.e. the reduced dimensionality
-K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150]
-
-# Load Matlab data file and extract training set and test set
-mat_data = loadmat(filename)
-X = mat_data["traindata"][:, 1:]
-y = mat_data["traindata"][:, 0]
-Xtest = mat_data["testdata"][:, 1:]
-ytest = mat_data["testdata"][:, 0]
-N, M = X.shape
-Ntest = Xtest.shape[0]  # or Xtest[:,0].shape
-
-# Subtract the mean from the data
-Y = X - np.ones((N, 1)) * X.mean(0)
-Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)
-
-# Obtain the PCA solution by calculate the SVD of Y
-U, S, V = linalg.svd(Y, full_matrices=False)
-V = V.T
-
-
-# Repeat classification for different values of K
-error_rates = []
-for k in K:
-    # Project data onto principal component space,
-    Z = Y @ V[:, :k]
-    Ztest = Ytest @ V[:, :k]
-
-    # Classify data with knn classifier
-    knn_classifier = KNeighborsClassifier(n_neighbors=1)
-    knn_classifier.fit(Z, y.ravel())
-    y_estimated = knn_classifier.predict(Ztest)
-
-    # Compute classification error rates
-    y_estimated = y_estimated.T
-    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
-    error_rates.append(er)
-    print("K={0}: Error rate: {1:.1f}%".format(k, er))
-
-# Visualize error rates vs. number of principal components considered
-figure()
-plot(K, error_rates, "o-")
-xlabel("Number of principal components K")
-ylabel("Error rate [%]")
-show()
-
-print("Ran Exercise 2.3.1")
+import xlrd
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls")
+# Load xls sheet with data
+doc = xlrd.open_workbook(filename).sheet_by_index(0)
+
+# Extract attribute names
+attributeNames = doc.row_values(0, 0, 4)
+
+# Extract class names to python list,
+# then encode with integers (dict)
+classLabels = doc.col_values(4, 1, 151)
+classNames = sorted(set(classLabels))
+classDict = dict(zip(classNames, range(len(classNames))))
+
+# Extract vector y, convert to NumPy matrix and transpose
+y = np.array([classDict[value] for value in classLabels])
+
+# Preallocate memory, then extract data to matrix X
+X = np.empty((150, 4))
+for i in range(4):
+    X[:, i] = np.array(doc.col_values(i, 1, 151)).T
+
+# Compute values of N, M and C.
+N = len(y) +M = len(attributeNames) +C = len(classNames) + +print("Ran Exercise 4.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py similarity index 94% rename from exercises/02450Toolbox_Python/Scripts/ex4_2_2.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_2.py index 4bef1c35cdf2df3450ee468be245671bfa4d2d72..40669020f57a582ca3cab76825569f7098c76b77 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_2.py @@ -3,7 +3,7 @@ import numpy as np # requires data from exercise 4.2.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import figure, hist, show, subplot, xlabel, ylim figure(figsize=(8, 7)) diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py similarity index 92% rename from exercises/02450Toolbox_Python/Scripts/ex4_2_3.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_3.py index a6420832b94689016a5d776b2de688666e5e04e9..d19cace3629050ab15f7e48117bf0321c8b15b9a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_3.py @@ -1,7 +1,7 @@ # Exercise 4.2.3 # requires data from exercise 4.2.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import boxplot, show, title, xticks, ylabel boxplot(X) diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py similarity index 97% rename from exercises/02450Toolbox_Python/Scripts/ex4_2_4.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_4.py index 4c394b19c505b2efd72030c49035f6dec7e4073a..d45db38b94e46011b94ad350dc76d17ce56c18c6 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_4.py @@ -1,6 +1,6 @@ # Exercise 4.2.4 # requires data from exercise 4.1.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import boxplot, figure, show, subplot, title, xticks, ylim figure(figsize=(14, 7)) diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py similarity index 97% rename from exercises/02450Toolbox_Python/Scripts/ex4_2_5.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_5.py index 67061cbd96c896a6df5018d3288a1e271a15b09a..eaac9413f25c6611a22d0ab64691595a423e036b 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_5.py @@ -1,7 +1,7 @@ # Exercise 4.2.5 # requires data from exercise 4.2.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import ( figure, legend, diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py similarity index 96% rename from exercises/02450Toolbox_Python/Scripts/ex4_2_6.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_6.py index 1f625d2d23ad211e32fb255e4f81f3ff246db9d1..f881cc83908231ee83c703d19405afd7a087b7c2 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_6.py @@ -1,7 +1,7 @@ # Exercise 4.2.6 # requires data from exercise 4.1.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import figure, show from mpl_toolkits.mplot3d import Axes3D diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py similarity index 95% rename from 
exercises/02450Toolbox_Python/Scripts/ex4_2_7.py rename to exercises/02450Toolbox_Python/Scripts/ex2_3_7.py index e34326e5e6d35f172de5329f12881cc9322dd1b1..08c9dbfb9114f731624b39a2bb99534a99c9adf5 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_7.py @@ -1,7 +1,7 @@ # Exercise 4.2.7 # requires data from exercise 4.2.1 -from ex4_2_1 import * +from ex2_3_1 import * from matplotlib.pyplot import ( cm, colorbar, diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_1.py similarity index 100% rename from exercises/02450Toolbox_Python/Scripts/ex4_3_1.py rename to exercises/02450Toolbox_Python/Scripts/ex2_4_1.py diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_4_2.py similarity index 100% rename from exercises/02450Toolbox_Python/Scripts/ex4_3_2.py rename to exercises/02450Toolbox_Python/Scripts/ex2_4_2.py diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py new file mode 100644 index 0000000000000000000000000000000000000000..b210d13bfaca05d29b799ca0f1cc103802dee09e --- /dev/null +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_1.py @@ -0,0 +1,32 @@ +# exercise 2.1.1 +import importlib_resources +import numpy as np +import xlrd + +# Load xls sheet with data +filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls") +doc = xlrd.open_workbook(filename).sheet_by_index(0) + +# Extract attribute names (1st row, column 4 to 12) +attributeNames = doc.row_values(0, 3, 11) + +# Extract class names to python list, +# then encode with integers (dict) +classLabels = doc.col_values(0, 2, 92) +classNames = sorted(set(classLabels)) +classDict = dict(zip(classNames, range(5))) + +# Extract vector y, convert to NumPy array +y = np.asarray([classDict[value] for value in classLabels]) + +# Preallocate memory, then extract excel data to matrix X +X = np.empty((90, 8)) +for i, col_id in enumerate(range(3, 11)): + X[:, i] = np.asarray(doc.col_values(col_id, 2, 92)) + +# Compute values of N, M and C. +N = len(y) +M = len(attributeNames) +C = len(classNames) + +print("Ran Exercise 2.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py index 60a39cf53d0415f6573b00f633de451559a70dff..6af386ac3591e7d180367857a81bc7e02770e29e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py @@ -1,54 +1,37 @@ -# exercise 3.1.4 -import importlib_resources -import numpy as np -from sklearn.feature_extraction.text import CountVectorizer - -filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") -# Load the textDocs.txt as a long string into raw_file: -with open(filename, "r") as f: - raw_file = f.read() -# raw_file contains sentences seperated by newline characters, -# so we split by '\n': -corpus = raw_file.split("\n") -# corpus is now list of "documents" (sentences), but some of them are empty, -# because textDocs.txt has a lot of empty lines, we filter/remove them: -corpus = list(filter(None, corpus)) - -# Display the result -print("Document-term matrix analysis") -print() -print("Corpus (5 documents/sentences):") -print(np.asmatrix(corpus)) -print() - - -# To automatically obtain the bag of words representation, we use sklearn's -# feature_extraction.text module, which has a function CountVectorizer. 
-# We make a CounterVectorizer: -vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b") -# The token pattern is a regular expression (marked by the r), which ensures -# that the vectorizer ignores digit/non-word tokens - in this case, it ensures -# the 10 in the last document is not recognized as a token. It's not important -# that you should understand it the regexp. - -# The object vectorizer can now be used to first 'fit' the vectorizer to the -# corpus, and the subsequently transform the data. We start by fitting: -vectorizer.fit(corpus) -# The vectorizer has now determined the unique terms (or tokens) in the corpus -# and we can extract them using: -attributeNames = vectorizer.get_feature_names_out() -print("Found terms:") -print(attributeNames) -print() - -# The next step is to count how many times each term is found in each document, -# which we do using the transform function: -X = vectorizer.transform(corpus) -N, M = X.shape -print("Number of documents (data objects, N):\t %i" % N) -print("Number of terms (attributes, M):\t %i" % M) -print() -print("Document-term matrix:") -print(X.toarray()) -print() -print("Ran Exercise 3.1.2") +# exercise 2.1.2 + +# Imports the numpy and xlrd packages, then runs the ex3_1_1 code +from ex3_1_1 import * +from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel + +# (requires data structures from ex. 2.1.1) + + +# Data attributes to be plotted +i = 0 +j = 1 + +## +# Make a simple plot of the i'th attribute against the j'th attribute +# Notice that X is of matrix type (but it will also work with a numpy array) +# X = np.array(X) #Try to uncomment this line +plot(X[:, i], X[:, j], "o") + +# %% +# Make another more fancy plot that includes legend, class labels, +# attribute names, and a title. +f = figure() +title("NanoNose data") + +for c in range(C): + # select indices belonging to class c: + class_mask = y == c + plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3) + +legend(classNames) +xlabel(attributeNames[i]) +ylabel(attributeNames[j]) + +# Output result to screen +show() +print("Ran Exercise 2.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py index c3ac96c2ad678edc8786ea59973292ad3b80a9e1..75b9c6b3817910a5ed34195e17c77655fab3595c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py @@ -1,40 +1,30 @@ -# exercise 3.1.4 -import importlib_resources -from sklearn.feature_extraction.text import CountVectorizer +# exercise 2.1.3 +# (requires data structures from ex.
2.1.1) +import matplotlib.pyplot as plt +from ex3_1_1 import * +from scipy.linalg import svd -filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") -filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt") +# Subtract mean value from data +Y = X - np.ones((N, 1)) * X.mean(axis=0) -# As before, load the corpus and preprocess: -with open(filename_docs, "r") as f: - raw_file = f.read() -corpus = raw_file.split("\n") -corpus = list(filter(None, corpus)) +# PCA by computing SVD of Y +U, S, V = svd(Y, full_matrices=False) -# Load and process the stop words in a similar manner: -with open(filename_stop, "r") as f: - raw_file = f.read() -stopwords = raw_file.split("\n") +# Compute variance explained by principal components +rho = (S * S) / (S * S).sum() -# When making the CountVectorizer, we now input the stop words: -vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords) -# Determine the terms in the corpus -vectorizer.fit(corpus) -# ... and count the frequency of each term within a document: -X = vectorizer.transform(corpus) -attributeNames = vectorizer.get_feature_names_out() -N, M = X.shape +threshold = 0.9 -# Display the result -print("Document-term matrix analysis (using stop words)") -print() -print("Number of documents (data objects, N):\t %i" % N) -print("Number of terms (attributes, M):\t %i" % M) -print() -print("Found terms (no stop words):") -print(attributeNames) -print() -print("Document-term matrix:") -print(X.toarray()) -print() -print("Ran Exercise 3.1.3") +# Plot variance explained +plt.figure() +plt.plot(range(1, len(rho) + 1), rho, "x-") +plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-") +plt.plot([1, len(rho)], [threshold, threshold], "k--") +plt.title("Variance explained by principal components") +plt.xlabel("Principal component") +plt.ylabel("Variance explained") +plt.legend(["Individual", "Cumulative", "Threshold"]) +plt.grid() +plt.show() + +print("Ran Exercise 2.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py index e33ee98041ad4eddb1533e2553763c881c2e5e73..87f40a76e9e346233d92b48cfc02604b34d4b8d6 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py @@ -1,66 +1,38 @@ -# exercise 3.1.4 -import importlib_resources - -# We'll use a widely used stemmer based: -# Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. -# The stemmer is implemented in the most used natural language processing -# package in Python, "Natural Langauge Toolkit" (NLTK): -from nltk.stem import PorterStemmer -from sklearn.feature_extraction.text import CountVectorizer -filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") -filename_stop = importlib_resources.files("dtuimldmtools").joinpath("data/stopWords.txt") -# As before, load the corpus and preprocess: -with open(filename_docs, "r") as f: - raw_file = f.read() -corpus = raw_file.split("\n") -corpus = list(filter(None, corpus)) -# Load and process the stop words in a similar manner: -with open(filename_stop, "r") as f: - raw_file = f.read() -stopwords = raw_file.split("\n") -# To enable stemming when using the sklearn-module, we need to parse an -"analyzer" to the vectorizer we've been using.
-# First, we make an object based on the PorterStemmer class, and we also make -# an analyzer object: -stemmer = PorterStemmer() -analyzer = CountVectorizer( - token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords -).build_analyzer() - - -# Using these we'll make a function that can stem words: -def stemmed_words(doc): - return (stemmer.stem(w) for w in analyzer(doc)) - - -# ... and finally, we make a vectorizer just like we've done before: -vectorizer = CountVectorizer(analyzer=stemmed_words) - -# Determine the terms: -vectorizer.fit(corpus) -attributeNames = vectorizer.get_feature_names_out() - -# ... and count the occurences: -X = vectorizer.transform(corpus) -N, M = X.shape -X = X.toarray() - -# Display the result -print("Document-term matrix analysis (using stop words and stemming)") -print() -print("Number of documents (data objects, N):\t %i" % N) -print("Number of terms (attributes, M):\t %i" % M) -print() -print("Found terms (no stop words, stemmed):") -print(attributeNames) -print() -print("Document-term matrix:") -print(X) -print() -print("Ran Exercise 3.1.4") -print() +# exercise 2.1.4 +# (requires data structures from ex. 2.1.1) +from ex3_1_1 import * +from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel +from scipy.linalg import svd + +# Subtract mean value from data +Y = X - np.ones((N, 1)) * X.mean(0) + +# PCA by computing SVD of Y +U, S, Vh = svd(Y, full_matrices=False) +# scipy.linalg.svd returns "Vh", which is the Hermitian (transpose) +# of the matrix V. So, for us to obtain the correct V, we transpose: +V = Vh.T + +# Project the centered data onto principal component space +Z = Y @ V + +# Indices of the principal components to be plotted +i = 0 +j = 1 + +# Plot PCA of the data +f = figure() +title("NanoNose data: PCA") +# Z = array(Z) +for c in range(C): + # select indices belonging to class c: + class_mask = y == c + plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5) +legend(classNames) +xlabel("PC{0}".format(i + 1)) +ylabel("PC{0}".format(j + 1)) + +# Output result to screen +show() + +print("Ran Exercise 2.1.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py index d28d50af788bc9a8e83f8db8e2461ce49ba0e7a8..b17766da22a5f049eef3be9a59ac2f91f91c995b 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py @@ -1,38 +1,53 @@ -# exercise 3.1.5 -import numpy as np -import scipy.linalg as linalg -from ex3_1_4 import * - -from dtuimldmtools import similarity - -# Query vector -q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]) -# notice, that you could get the query vector using the vectorizer, too: -# q = vectorizer.transform(['matrix rank solv']) -# q = np.asarray(q.toarray()) -# or use any other string: -# q = vectorizer.transform(['Can I Google how to fix my problem?']) -# q = np.asarray(q.toarray()) - -# Method 1 ('for' loop - slow) -N = np.shape(X)[0] -# get the number of data objects -sim = np.zeros((N, 1)) # allocate a vector for the similarity -for i in range(N): - x = X[i, :] # Get the i'th data object (here: document) - sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x) # Compute cosine similarity - -# Method 2 (one line of code with no iterations - faster) -sim = (q @ X.T).T / ( - np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum()) -) - -# Method 3 (use the "similarity" function) -sim = similarity(X, q, "cos") - - -# Display the result -print("Query vector:\n
{0}\n".format(q)) -print("Similarity results:\n {0}".format(sim)) - -print("Ran Exercise 3.1.5") +# exercise 2.2.4 + +# (requires data structures from ex. 2.2.1) +import matplotlib.pyplot as plt +from ex3_1_1 import * +from scipy.linalg import svd + +Y = X - np.ones((N, 1)) * X.mean(0) +U, S, Vh = svd(Y, full_matrices=False) +V = Vh.T +N, M = X.shape + +# We saw in 2.1.3 that the first 3 components explaiend more than 90 +# percent of the variance. Let's look at their coefficients: +pcs = [0, 1, 2] +legendStrs = ["PC" + str(e + 1) for e in pcs] +c = ["r", "g", "b"] +bw = 0.2 +r = np.arange(1, M + 1) +for i in pcs: + plt.bar(r + i * bw, V[:, i], width=bw) +plt.xticks(r + bw, attributeNames) +plt.xlabel("Attributes") +plt.ylabel("Component coefficients") +plt.legend(legendStrs) +plt.grid() +plt.title("NanoNose: PCA Component Coefficients") +plt.show() + +# Inspecting the plot, we see that the 2nd principal component has large +# (in magnitude) coefficients for attributes A, E and H. We can confirm +# this by looking at it's numerical values directly, too: +print("PC2:") +print(V[:, 1].T) + +# How does this translate to the actual data and its projections? +# Looking at the data for water: + +# Projection of water class onto the 2nd principal component. +all_water_data = Y[y == 4, :] + +print("First water observation") +print(all_water_data[0, :]) + +# Based on the coefficients and the attribute values for the observation +# displayed, would you expect the projection onto PC2 to be positive or +# negative - why? Consider *both* the magnitude and sign of *both* the +# coefficient and the attribute! + +# You can determine the projection by (remove comments): +print("...and its projection onto PC2") +print(all_water_data[0, :] @ V[:, 1]) +# Try to explain why? 
diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py similarity index 99% rename from exercises/02450Toolbox_Python/Scripts/ex2_1_6.py rename to exercises/02450Toolbox_Python/Scripts/ex3_1_6.py index 78250ef25ad0a9015e9c4ce7e8ab0268f48d89f2..25cf5208a741bd821c06df394ea4184fb1a05371 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_6.py @@ -1,6 +1,6 @@ ## exercise 2.1.6 import matplotlib.pyplot as plt -from ex2_1_1 import * +from ex3_1_1 import * from scipy.linalg import svd r = np.arange(1, X.shape[1] + 1) diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py index dbc1dc0af5914508cd15466c199cdb24c2fd3bc2..798bb5b46dba276b98ff5c3f0da86e13f66bfe15 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py @@ -1,19 +1,36 @@ -# exercise 3.2.1 +import importlib_resources import numpy as np +from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks +from scipy.io import loadmat -x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47]) +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") +# Index of the digit to display +i = 0 -# Compute values -mean_x = x.mean() -std_x = x.std(ddof=1) -median_x = np.median(x) -range_x = x.max() - x.min() +# Load Matlab data file to python dict structure +mat_data = loadmat(filename) -# Display results -print("Vector:", x) -print("Mean:", mean_x) -print("Standard Deviation:", std_x) -print("Median:", median_x) -print("Range:", range_x) +# Extract variables of interest +testdata = mat_data["testdata"] +traindata = mat_data["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] -print("Ran Exercise 3.2.1") + +# Visualize the i'th digit as a vector +f = figure() +subplot(4, 1, 4) +imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r) +xlabel("Pixel number") +title("Digit in vector format") +yticks([]) + +# Visualize the i'th digit as an image +subplot(2, 1, 1) +I = np.reshape(X[i, :], (16, 16)) +imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r) +title("Digit as an image") + +show() + +print("Ran Exercise 2.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py new file mode 100644 index 0000000000000000000000000000000000000000..3d6104b2aae59b5bddd2d2afa9157d3d2e181485 --- /dev/null +++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_2.py @@ -0,0 +1,117 @@ +# exercise 2.2.2 +import importlib_resources +import numpy as np +import scipy.linalg as linalg +from matplotlib.pyplot import ( + cm, + figure, + imshow, + legend, + plot, + show, + subplot, + title, + xlabel, + ylabel, + yticks, +) +from scipy.io import loadmat + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") + +# Digits to include in analysis (to include all, n = range(10) ) +n = [0, 1] +# Number of principal components for reconstruction +K = 16 +# Digits to visualize +nD = range(6) + + +# Load Matlab data file to python dict structure +# and extract variables of interest +traindata = loadmat(filename)["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] + +N, M = X.shape +C = len(n) + +classValues = n +classNames = [str(num) for num in n] +classDict = dict(zip(classNames, classValues)) + + +# Select subset of digits classes to be inspected +class_mask = 
np.zeros(N).astype(bool) +for v in n: + cmsk = y == v + class_mask = class_mask | cmsk +X = X[class_mask, :] +y = y[class_mask] +N = X.shape[0] + +# Center the data (subtract mean column values) +Xc = X - np.ones((N, 1)) * X.mean(0) + +# PCA by computing SVD of the centered data Xc +U, S, V = linalg.svd(Xc, full_matrices=False) +# U = mat(U) +V = V.T + +# Compute variance explained by principal components +rho = (S * S) / (S * S).sum() + +# Project data onto principal component space +Z = Xc @ V + +# Plot variance explained +figure() +plot(rho, "o-") +title("Variance explained by principal components") +xlabel("Principal component") +ylabel("Variance explained value") + + +# Plot PCA of the data +f = figure() +title("Pixel vectors of handwritten digits projected on PCs") +for c in n: + # select indices belonging to class c: + class_mask = y == c + plot(Z[class_mask, 0], Z[class_mask, 1], "o") +legend(classNames) +xlabel("PC1") +ylabel("PC2") + + +# Visualize the reconstructed data from the first K principal components +# Select D digits at random. +figure(figsize=(10, 3)) +W = Z[:, range(K)] @ V[:, range(K)].T +D = len(nD) +for d in range(D): + digit_ix = np.random.randint(0, N) + subplot(2, D, int(d + 1)) + I = np.reshape(X[digit_ix, :], (16, 16)) + imshow(I, cmap=cm.gray_r) + title("Original") + subplot(2, D, D + d + 1) + I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16)) + imshow(I, cmap=cm.gray_r) + title("Reconstr.") + + +# Visualize the principal components +figure(figsize=(8, 6)) +for k in range(K): + N1 = int(np.ceil(np.sqrt(K))) + N2 = int(np.ceil(K / N1)) + subplot(N2, N1, int(k + 1)) + I = np.reshape(V[:, k], (16, 16)) + imshow(I, cmap=cm.hot) + title("PC{0}".format(k + 1)) + +# output to screen +show() + +print("Ran Exercise 2.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py index f5cfc462ae073133074a692c8080b0706d084ee5..9f665b581d0c86e7f5ba07bf5e943e04ff0a273e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py @@ -1,77 +1,58 @@ -# exercise 3.3.1 +# exercise 2.3.1 import importlib_resources -import matplotlib.pyplot as plt import numpy as np +import scipy.linalg as linalg +from matplotlib.pyplot import figure, plot, show, xlabel, ylabel from scipy.io import loadmat - -from dtuimldmtools import similarity - -filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat") -# Image to use as query -i = 1 - -# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' -similarity_measure = "SMC" - -# Load the digits -# Load Matlab data file to python dict structure -X = loadmat(filename)["X"] -# You can also try the CBCL faces dataset (remember to change 'transpose') -#X = loadmat('../Data/wildfaces_grayscale.mat')['X'] +from sklearn.neighbors import KNeighborsClassifier + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") +# Number of principal components to use for classification, +# i.e. the reduced dimensionality +K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150] + +# Load Matlab data file and extract training set and test set +mat_data = loadmat(filename) +X = mat_data["traindata"][:, 1:] +y = mat_data["traindata"][:, 0] +Xtest = mat_data["testdata"][:, 1:] +ytest = mat_data["testdata"][:, 0] N, M = X.shape -transpose = False # should the plotted images be transposed?
- - -# Search the face database for similar faces -# Index of all other images than i -noti = list(range(0,i)) + list(range(i+1,N)) -# Compute similarity between image i and all others -sim = similarity(X[i,:], X[noti,:], similarity_measure) -sim = sim.tolist()[0] -# Tuples of sorted similarities and their indices -sim_to_index = sorted(zip(sim,noti)) - - -# Visualize query image and 5 most/least similar images -plt.figure(figsize=(12,8)) -plt.subplot(3,1,1) - -img_hw = int(np.sqrt(len(X[0]))) -img = np.reshape(X[i], (img_hw,img_hw)) -if transpose: img = img.T -plt.imshow(img, cmap=plt.cm.gray) -plt.xticks([]); plt.yticks([]) -plt.title('Query image') -plt.ylabel('image #{0}'.format(i)) - - -for ms in range(5): - - # 5 most similar images found - plt.subplot(3,5,6+ms) - im_id = sim_to_index[-ms-1][1] - im_sim = sim_to_index[-ms-1][0] - img = np.reshape(X[im_id],(img_hw,img_hw)) - if transpose: img = img.T - plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel('sim={0:.3f}'.format(im_sim)) - plt.ylabel('image #{0}'.format(im_id)) - plt.xticks([]); plt.yticks([]) - if ms==2: plt.title('Most similar images') - - # 5 least similar images found - plt.subplot(3,5,11+ms) - im_id = sim_to_index[ms][1] - im_sim = sim_to_index[ms][0] - img = np.reshape(X[im_id],(img_hw,img_hw)) - if transpose: img = img.T - plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel('sim={0:.3f}'.format(im_sim)) - plt.ylabel('image #{0}'.format(im_id)) - plt.xticks([]); plt.yticks([]) - if ms==2: plt.title('Least similar images') - -plt.show() - -print('Ran Exercise 3.3.1') +Ntest = Xtest.shape[0] # or Xtest[:,0].shape + +# Subtract the mean from the data +Y = X - np.ones((N, 1)) * X.mean(0) +Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0) + +# Obtain the PCA solution by calculating the SVD of Y +U, S, V = linalg.svd(Y, full_matrices=False) +V = V.T + + +# Repeat classification for different values of K +error_rates = [] +for k in K: + # Project data onto principal component space, + Z = Y @ V[:, :k] + Ztest = Ytest @ V[:, :k] + + # Classify data with knn classifier + knn_classifier = KNeighborsClassifier(n_neighbors=1) + knn_classifier.fit(Z, y.ravel()) + y_estimated = knn_classifier.predict(Ztest) + + # Compute classification error rates + y_estimated = y_estimated.T + er = (sum(ytest != y_estimated) / float(len(ytest))) * 100 + error_rates.append(er) + print("K={0}: Error rate: {1:.1f}%".format(k, er)) + +# Visualize error rates vs.
number of principal components considered +figure() +plot(K, error_rates, "o-") +xlabel("Number of principal components K") +ylabel("Error rate [%]") +show() + +print("Ran Exercise 2.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py deleted file mode 100644 index e7abec8d3c302ff9414b0afb8925596c1dc69d3f..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py +++ /dev/null @@ -1,42 +0,0 @@ -# exercise 3.2.2 - -import numpy as np - -from dtuimldmtools import similarity - -# Generate two data objects with M random attributes -M = 5 -x = np.random.rand(1, M) -y = np.random.rand(1, M) - -# Two constants -a = 1.5 -b = 1.5 - -# Check the statements in the exercise -print( - "Cosine scaling: %.4f " - % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0] -) -print( - "ExtendedJaccard scaling: %.4f " - % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0] -) -print( - "Correlation scaling: %.4f " - % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0] -) -print( - "Cosine translation: %.4f " - % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0] -) -print( - "ExtendedJaccard translation: %.4f " - % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0] -) -print( - "Correlation translation: %.4f " - % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0] -) - -print("Ran Exercise 3.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py deleted file mode 100644 index e6b46e8ca7e49caa7258fa553737a598d2dda417..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py +++ /dev/null @@ -1,33 +0,0 @@ -# exercise 4.2.1 - -import importlib_resources -import numpy as np -import xlrd - -filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls") -# Load xls sheet with data -doc = xlrd.open_workbook(filename).sheet_by_index(0) - -# Extract attribute names -attributeNames = doc.row_values(0, 0, 4) - -# Extract class names to python list, -# then encode with integers (dict) -classLabels = doc.col_values(4, 1, 151) -classNames = sorted(set(classLabels)) -classDict = dict(zip(classNames, range(len(classNames)))) - -# Extract vector y, convert to NumPy matrix and transpose -y = np.array([classDict[value] for value in classLabels]) - -# Preallocate memory, then extract data to matrix X -X = np.empty((150, 4)) -for i in range(4): - X[:, i] = np.array(doc.col_values(i, 1, 151)).T - -# Compute values of N, M and C. 
-N = len(y) -M = len(attributeNames) -C = len(classNames) - -print("Ran Exercise 4.2.1") diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_2.R b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R new file mode 100644 index 0000000000000000000000000000000000000000..d3a68728715b5ebcd46797215edaea83f389e448 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex1_6_2.R @@ -0,0 +1,19 @@ +#################### +# Exercise 3.1.2 +#################### + +rm(list = ls()) # Clear work space + +library(tm) +textfolder = "./Data/" + +(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en"))) + +inspect(docs) + +docs_nopunct <- tm_map(docs, removePunctuation) +dtm <- DocumentTermMatrix(docs_nopunct) + +inspect(dtm) + + diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_3.R b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R new file mode 100644 index 0000000000000000000000000000000000000000..4180e92904af380e5417c81b30f2044db6102ea0 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex1_6_3.R @@ -0,0 +1,23 @@ +#################### +# Exercise 3.1.3 +#################### + +rm(list = ls()) # Clear work space + +library(tm) +textfolder <- "./Data/" + +(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en"))) + +inspect(docs) +docs_nopunct <- tm_map(docs, removePunctuation) +dtm <- DocumentTermMatrix(docs_nopunct) + +mystopwords <- scan("./Data/stopWords.txt", character(0)) + +docs_nostopwords <- tm_map(docs, removeWords, mystopwords) +inspect(docs_nostopwords) +dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE)) +inspect(dtm_nostopwords) + +control <- list(stopwords = TRUE) diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_4.R b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R new file mode 100644 index 0000000000000000000000000000000000000000..5ad1d634acf03543105e49b1e1e8ee6a0ba61870 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex1_6_4.R @@ -0,0 +1,29 @@ +#################### +# Exercise 3.1.4 +#################### + +rm(list = ls()) # Clear work space + +library(tm) # install.packages("tm") +library(SnowballC) # install.packages("SnowballC") + +textfolder <- "./Data/" + +(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en"))) + +inspect(docs) +docs_nopunct <- tm_map(docs, removePunctuation) +dtm <- DocumentTermMatrix(docs_nopunct) + +mystopwords <- scan("./Data/stopWords.txt", character(0)) + +docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords) +dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE)) +inspect(dtm_nostopwords) + +docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english") + + +inspect(docs_stemmed) +dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE)) +inspect(dtm_stemmed) diff --git a/exercises/02450Toolbox_R/Scripts/ex1_6_5.R b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R new file mode 100644 index 0000000000000000000000000000000000000000..a9c4b7d1d19f693fbde4fc96101ca4aab53ba0d8 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex1_6_5.R @@ -0,0 +1,19 @@ +#################### +# Exercise 3.1.5 +#################### + +source("Scripts/ex1_6_4.R") +source("Tools/similarity.R") + +size <- dim(dtm_stemmed) +cosmeas <- c() +q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0) +dtmat <- matrix(dtm_stemmed, dtm_stemmed$nrow, dtm_stemmed$ncol) + +for (irow in 1:size[1]) { + doc <- 
dtm_stemmed[irow, ] + cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), 'cos') +} + +print("Cosine similarity from q to docs: ") +print(cosmeas) diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R index 7861e15a48b09ebbb0a87f27a7366b651ff048d0..725f7564a291d7ba29252e89ef5d0f0c1adc7679 100644 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_1.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_1_1.R @@ -1,36 +1,17 @@ #################### -# Exercise 2.1.1 +# Exercise 3.2.1 #################### rm(list = ls()) # Clear work space -# Read data into R -data <- read.csv("./Data/nanonose.csv", check.names = FALSE) +x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47) -# Extract class labels of observations -classLabels <- colnames(data) -classLabels <- classLabels[-(1:2)] +mean(x) +sd(x) +median(x) +diff(range(x)) -# Extract attributes, i.e. sensor names -attributeNames <- data[3:10, 1] - -# Remove first two rows and columns and transpose data matrix -X <- t(data[-(1:2), -(1:2)]) - -# Check that dimensions are as they should be (90 rows and 8 columns) -N <- dim(X)[1] -M <- dim(X)[2] - -# Assign the class labels as row names and the attributes as column names -rownames(X) <- classLabels -colnames(X) <- attributeNames - -# Extract the class names present in data -classNames <- sort(unique(classLabels)) -C <- length(classNames) - -# Extract numeric class assignments -y <- as.numeric(as.factor(classLabels)) - 1 - -# Clean up a bit in the variables, since we'll call this script from later scripts: -rm(data) +# Range returns the minimum and maximum of the vector x. +# To get the range, we must take the maximum minus the minimum. +# We do this using the function diff, which finds differences +# between consecutive elements in a vector. diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R b/exercises/02450Toolbox_R/Scripts/ex2_1_2.R deleted file mode 100644 index d99b5ebcc5c289c968bc0e73da451a2d85fbb7bb..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_2.R +++ /dev/null @@ -1,20 +0,0 @@ -#################### -# Exercise 2.1.2 -#################### - -# Run ex2.1.1: -source('Scripts/ex2_1_1.R') - -# Choose which sensors to plot -i = 1 -j = 2 - -# Make simple plot -plot(X[ , i], X[ , j]) - -## Make more fancy plot -library(ggplot2) -ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) + - theme(legend.position = c(0.88, 0.77), legend.title= element_blank()) + - labs(x = attributeNames[i], y = attributeNames[j]) - diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R b/exercises/02450Toolbox_R/Scripts/ex2_1_3.R deleted file mode 100644 index 05b805dcee60b003f1332fd99d127e29d0a09a99..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_3.R +++ /dev/null @@ -1,45 +0,0 @@ -#################### -# Exercise 2.1.3 -#################### - -source("Scripts/ex2_1_1.R") - -# Subtract the column means form columns of X -Y <- t(apply(X, 1, "-", colMeans(X))) - -# You can check the column means using: colMeans(Y) -colMeans(Y) - -# PCA by computing SVD of Y: -s <- svd(Y) -diagS <- s$d - -rho <- diagS^2 / sum(diagS^2) - -# PCA can also be done with the built-in function prcomp. -# Confirm that you get the same result. 
-rho2 <- summary(prcomp(X, center = TRUE))$importance[2,] - - -threshold <- 0.9 - -xlimits <- c(1, M) -plot(rho, - type = "o", - main = "Variance explained by principal componenets", - xlab = "Principal components", - ylab = "Variance explained", - xlim = xlimits, - ylim = c(0, 1), - col = "blue" -) - -lines(cumsum(rho), type = "o", col = "orange") -lines(xlimits, c(threshold, threshold), lty = "dashed") - -legend("right", # Define position - legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend - col = c("orange", "blue", "black"), lty = c(1, 1, 2), # Match appereance of lines - cex = 1, bg = "lightblue" -) # Setup how the box looks (cex controls size) - diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R b/exercises/02450Toolbox_R/Scripts/ex2_1_4.R deleted file mode 100644 index 3b43e4700e2a6c2d4f447166eee754a0f476a04a..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_4.R +++ /dev/null @@ -1,20 +0,0 @@ -#################### -# Exercise 2.1.4 -#################### -source("Scripts/ex2_1_3.R") - -# Manual projecting data onto principal component. -Z <- s$u %*% diag(s$d) - -# Doing it with the built-in function. -Z <- prcomp(X)$x - -i <- 1 -j <- 3 - -## Make fancy plot -library(ggplot2) -ggplot() + - geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) + - theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) + - labs(x = colnames(Z)[i], y = colnames(Z)[j]) diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R b/exercises/02450Toolbox_R/Scripts/ex2_1_5.R deleted file mode 100644 index 4441f4c2b54c833f1008794ce7799a4aff91738e..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_5.R +++ /dev/null @@ -1,45 +0,0 @@ -#################### -# Exercise 2.1.5 -#################### -source("Scripts/ex2_1_1.R") - -Y <- t(apply(X, 1, "-", colMeans(X))) # subtract the column means form columns of X - -# Computing PCA: -pca <- prcomp(X) -V <- pca$rotation -Z <- pca$x - -# We saw in 2.1.3 that the first 3 components explained more than 90 -# percent of the variance. Let's look at their coefficients: -pcs <- 1:3 -test <- as.data.frame(melt(data.table(V[, pcs]))) -ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) + - geom_bar(position="dodge", stat = "identity") + - labs(fill="PC", x = "Attributes", y = "Component coefficients") - -# Inspecting the plot, we see that the 2nd principal component has large -# (in magnitude) coefficients for attributes A, E and H. We can confirm -# this by looking at it's numerical values directly, too: -cat("PC2:", V[, 1]) - -# How does this translate to the actual data and its projections? -# Looking at the data for water: - -# Projection of water class onto the 2nd principal component. -water <- Y[y == 4, ] -cat("First water observation:", water[1, ]) - -# Based on the coefficients and the attribute values for the observation -# displayed, would you expect the projection onto PC2 to be positive or -# negative - why? Consider *both* the magnitude and sign of *both* the -# coefficient and the attribute! - -# You can determine the projection by (remove comments): -print("...and its projection onto PC2") -print(as.numeric(t(water[1, ]) %*% V[, 2])) - -# Try to explain why. 
-# This is equivalent to: -Z[1, 2] - diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R index ff04967032ed9c6df0d9c33ec81aef0f5aeafdbc..91eccf7115949659685286b1285dced806f6039e 100644 --- a/exercises/02450Toolbox_R/Scripts/ex2_2_1.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_2_1.R @@ -1,42 +1,67 @@ #################### -# Exercise 2.2.1 +# Exercise 3.3.1 #################### +rm(list = ls()) # Clear work space) -rm(list = ls()) # Clear work space +source("Tools/binarize.R") +source("Tools/similarity.R") -# Load the library R.matlab to enable the function readMat, -# which allows R to read the matlab .mat format. -# Install with "install.packages("R.matlab")" -library(R.matlab) - -# The row of training data that we will look at +# Image to use as query i <- 1 -# Read in the data -data <- readMat(file.path("Data", "zipdata.mat")) +# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' +SimilarityMeasure <- "Jaccard" + +library(R.matlab) +data <- readMat(file.path("./Data/digits.mat")) + +# You can also try it on the faces dataset: +# data <- readMat(file.path("./Data/wildfaces_grayscale.mat")) -# Check that the structure dat contains two matrices, testdata and traindata names(data) +X <- data$X +dimX <- dim(X) +N <- dimX[1] +M <- dimX[2] +Q <- matrix(X[i, ], ncol = M) # The query image +Y <- X[-i, ] # All images except the i'th + +# The next function is in the setup.R, load it if you have not already +# source("setup.R") +sim <- similarity(Q, Y, SimilarityMeasure) + +# Sort similarities +sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE) +val <- sort_result$x +j <- sort_result$ix +nj <- length(j) -# The features, i.e. images of digits, are stored in the rows of traindata, -# except for the first column. The first column contains the class of the row, -# i.e. the digit identity -X <- data$traindata[, 2:dim(data$traindata)[2]] -y <- data$traindata[, 1] +# Plot five most similar and dissimilar images +npics <- 5 +ndigits <- 4 +mostsim <- j[1:npics] +leastsim <- j[(nj - npics + 1):nj] -par(mfrow = c(1, 2)) +imageDim <- c(sqrt(M), sqrt(M)) +dim(Q) <- imageDim # reshape Q -# View the i'th row of X -image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format") +{ + dev.new(width = 2.5, height = 3) + image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32)) +} -# Make ticks corresponding to pixels, i.e. values in row i -axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ])) +{ + dev.new(width = 2 * npics, height = 5) + layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE)) + for (d in 1:npics) { + similarImage <- Y[mostsim[d], ] + dim(similarImage) <- imageDim + image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32)) -# Extract the i'th row of X and store it as the matrix I -I <- X[i, ] -# Make the matrix I have dimensions 16 by 16 -dim(I) <- c(16, 16) + dissimilarImage <- Y[leastsim[d], ] + dim(dissimilarImage) <- imageDim -# view the digit in the i'th row of X as an image. The function image rotates the matrix, so we need to rearrange the columns in the matrix I in order to get the correct view of the digit. 
-image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image") + image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32)) + } +} diff --git a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R index 997d298cafa7666e850bf4b6854cc7a548b20897..3cdc43ec18cec69145b8c9e47da735c2b53d1e34 100644 --- a/exercises/02450Toolbox_R/Scripts/ex2_2_2.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_2_2.R @@ -1,105 +1,22 @@ #################### -# Exercise 2.2.2 +# Exercise 3.3.2 #################### -source("Scripts/ex2_2_1.R") +source("Tools/similarity.R") +# Generate two data objects with M random (standard uniformly distributed) attributes +M = 5; +x = as.matrix(runif(M)); +y = as.matrix(runif(M)); -# Choose which digits to consider in the analysis -# This is like in the exercise -digits_to_inspect <- 0:1 - -# If you want to try more digits -# digits_to_inspect = 0:9 - -# Find the observations that relate to the digits chosen in digits_to_inspect -inds <- !is.na(match(y, digits_to_inspect)) - -# extract the rows of X found above -X <- X[inds, ] -y <- y[inds] - -# Get the column means of X, subtract them from each row, -# and perform and SVD on the resulting matrix -means <- colMeans(X) -Xzeromean <- t(apply(X, 1, "-", means)) -svdres <- svd(Xzeromean) - -# Extract the matrices containing the left and right singular vectors, respectively -U <- svdres$u -V <- svdres$v - -# ---------------------------------------------------- -# Calculate and plot the variance explained by the PCs -# ---------------------------------------------------- - -pcvariance <- svdres$d^2 / sum(svdres$d^2) - -{ - par(mfrow=c(1,1)) - plot(cumsum(pcvariance), main = "Data variance explained by PCs", - xlab = "Number of PCs included in variance sum", - ylab = "Proportion of variance explained") - - # First 22 PCs should explain 90% - abline(v = 22) -} - -# --------------------------------------------------------------------- -# Plot principal components 1 and 2 against each other in a scatterplot, -# i.e. 
plot the projections of observations onto PCs 1 and 2 -# --------------------------------------------------------------------- - -# These two ways of calculating pc_projections are equivalent -pc_projections <- Xzeromean %*% V -pc_projections <- U %*% diag(svdres$d) - -pcproj1 <- pc_projections[, 1] -pcproj2 <- pc_projections[, 2] - -{ - par(mfrow=c(1,1)) - plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)), - type = "n", xlab = "PC 1", ylab = "PC 2") - points(pcproj1[y == 0], pcproj2[y == 0], col = "red") - points(pcproj1[y == 1], pcproj2[y == 1], col = "green") - legend("topleft", legend = c("0", "1"), fill = c("red", "green")) -} - - -# ---------------------------------- -# Reconstruction of images of digits -# ---------------------------------- - -# Number of PCs to include in reconstruction of digits -K <- 5 - -# Digits to visualize -nD <- 60:63 - -reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K]) - -par(mar = c(1, 1, 1, 1)) -layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE)) -for (d in 1:length(nD)) { - origImage <- X[nD[d], ] - dim(origImage) <- c(16, 16) - image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32)) - reconstructedImage <- reconstructions[nD[d], ] + means - dim(reconstructedImage) <- c(16, 16) - image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32)) -} - -# ------------- -# Visualize PCs -# ------------- - -# visualize the first 12 PCs -par(mfrow = c(3, 4), mar = c(2, 2, 2, 2)) -for (nth_pc in 1:12) { - pc <- t(V[, nth_pc]) - dim(pc) <- c(16, 16) - # view image of PC - image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc)) -} +# Two constants +a = 1.5; +b = 1.5; +# Check the statements in the exercise +similarity(x,y,'cos') - similarity(a*x,y,'cos') +similarity(x,y,'ext') - similarity(a*x,y,'ext') +similarity(x,y,'cor') - similarity(a*x,y,'cor') +similarity(x,y,'cos') - similarity(b+x,y,'cos') +similarity(x,y,'ext') - similarity(b+x,y,'ext') +similarity(x,y,'cor') - similarity(b+x,y,'cor') diff --git a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R index 39726ea5f21ccd4bb41fc140f5df9f6542b34891..98f03304399f73d86970a8b9f0b324e750b99f96 100644 --- a/exercises/02450Toolbox_R/Scripts/ex2_3_1.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_1.R @@ -1,61 +1,30 @@ #################### -# Exercise 2.3.1 +# Exercise 4.2.1 #################### - rm(list = ls()) # Clear work space -library(FNN) -library(R.matlab) - -# Read in the data -data <- readMat(file.path("Data", "zipdata.mat")) +# Read data into R +data <- read.csv("./Data/iris.csv") -# Check that the structure dat contains two matrices, testdata and traindata +# Inspect the contents of the variable "data". names(data) +head(data) -# The features, i.e. images of digits, are stored in the rows of traindata, -# except for the first column. The first column contains the class of the row, -# i.e. 
the digit identity -X <- data$traindata[, 2:dim(data$traindata)[2]] -y <- data$traindata[, 1] - -Xtest <- data$testdat[, 2:dim(data$testdat)[2]] -ytest <- data$testdat[, 1] - -# This is like in the exercise -digits_to_inspect <- 0:1 - -# If you want to try more digits -# digits_to_inspect = 0:9 - -# Find the observations that relate to the digits chosen in digits_to_inspect -inds <- !is.na(match(y, digits_to_inspect)) -indstest <- !is.na(match(ytest, digits_to_inspect)) - -# Extract the rows of X found above -X <- X[inds, ] -y <- y[inds] -Xtest <- Xtest[indstest, ] -ytest <- ytest[indstest] - - -# Get the column means of X, subtract them from each row, -# and perform and SVD on the resulting matrix -means <- colMeans(X) -Xzeromean <- t(apply(X, 1, "-", means)) -Xtestzeromean <- t(apply(Xtest, 1, "-", means)) -svdres <- svd(Xzeromean) +# Extract the rows and columns corresponding to the data, i.e. the attribute values +X <- data[, 1:4] -# Extract the matrices containing the left and right singular vectors, respectively -U <- svdres$u -V <- svdres$v +# Extract attribute names from the first row +attributeNames <- colnames(data)[1:(dim(data)[2] - 1)] -K <- 5 # Number of principal components to use -pc_projections <- Xzeromean %*% V[, 1:K] -pc_projectionstest <- Xtestzeromean %*% V[, 1:K] +# Extract unique class names from the last column +classLabels <- data[, 5] +classNames <- unique(classLabels) -preds <- knn(pc_projections, pc_projectionstest, cl = y) +# Extract class labels that match the class names +y <- match(classLabels, classNames) - 1 -error_rate <- mean(preds != ytest) -print(error_rate) +# Get the number of data objects, attributes, and classes +N <- dim(X)[1] +M <- dim(X)[2] +C <- length(classNames) diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R similarity index 96% rename from exercises/02450Toolbox_R/Scripts/ex4_2_2.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_2.R index 0ae27d25a2c7ae714db4d34e71a791827fd4c6e8..f9558da2f3640027f3862bd0192f660b1a74d5fa 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_2.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_2.R @@ -2,7 +2,7 @@ # Exercise 4.2.2 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") # ---------------- # With base R plot diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R similarity index 81% rename from exercises/02450Toolbox_R/Scripts/ex4_2_3.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_3.R index a05fc0c826c900001527be00c3633efaa865cd04..cceef6d4af761fb61283e31a713095ee8aa6ced1 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_3.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_3.R @@ -2,7 +2,7 @@ # Exercise 4.2.3 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") par(mfrow=c(1,1)) boxplot(X, main="Boxplots of attribute values") diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R similarity index 96% rename from exercises/02450Toolbox_R/Scripts/ex4_2_4.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_4.R index 7cff927fc7f748463e5b4b00194da1d1687c7662..dd45ae4f9e2ee192f4b7baf2bf132e7f33f67821 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_4.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_4.R @@ -1,7 +1,7 @@ #################### # Exercise 4.2.4 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") # Done with base R plot yvals <- c() diff --git 
a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R similarity index 92% rename from exercises/02450Toolbox_R/Scripts/ex4_2_5.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_5.R index bd6b45df9d785a990b7bf03f83068a30f84403f2..700748406b8be01bc57fac91e77ad90ef89a4d42 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_5.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_5.R @@ -1,7 +1,7 @@ #################### # Exercise 4.2.5 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") observationColors <- c("blue", "green3", "red")[unclass(y+1)] { diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R similarity index 97% rename from exercises/02450Toolbox_R/Scripts/ex4_2_6.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_6.R index 29079a731fe91884ce4451e5d626a8390b347272..15a729e371f07988a210c190dbf1b173f928a8db 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_6.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_6.R @@ -1,7 +1,7 @@ #################### # Exercise 4.2.6 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") # Make a non-interactive 3d plot of the data. Install the package scatterplot3d library(scatterplot3d) # install.packages("scatterplot3d") diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R similarity index 95% rename from exercises/02450Toolbox_R/Scripts/ex4_2_7.R rename to exercises/02450Toolbox_R/Scripts/ex2_3_7.R index c80379396992eb071da6571bf292b6fa549f43f4..f22824150a68ef1dba88ee0c42e8923a8d27ff37 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_7.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_3_7.R @@ -1,7 +1,7 @@ #################### # Exercise 4.2.7 #################### -source("Scripts/ex4_2_1.R") +source("Scripts/ex2_3_1.R") # Standardize each column using the function scale (see ?scale) X_scaled <- as.data.frame(scale(X)) diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_1.R b/exercises/02450Toolbox_R/Scripts/ex2_4_1.R similarity index 100% rename from exercises/02450Toolbox_R/Scripts/ex4_3_1.R rename to exercises/02450Toolbox_R/Scripts/ex2_4_1.R diff --git a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R similarity index 94% rename from exercises/02450Toolbox_R/Scripts/ex4_3_2.R rename to exercises/02450Toolbox_R/Scripts/ex2_4_2.R index cc9848e5f00676464d9b198d9f8101a0bbf16be7..cbec5c62842dffb3006b3844277c8aa0457476f9 100644 --- a/exercises/02450Toolbox_R/Scripts/ex4_3_2.R +++ b/exercises/02450Toolbox_R/Scripts/ex2_4_2.R @@ -2,7 +2,7 @@ # Exercise 4.3.2 #################### -source("Scripts/ex4_3_1.R") +source("Scripts/ex2_4_1.R") library(ggplot2) library(GGally) # install.packages("GGally") diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_1.R b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R new file mode 100644 index 0000000000000000000000000000000000000000..7861e15a48b09ebbb0a87f27a7366b651ff048d0 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_1.R @@ -0,0 +1,36 @@ +#################### +# Exercise 2.1.1 +#################### + +rm(list = ls()) # Clear work space + +# Read data into R +data <- read.csv("./Data/nanonose.csv", check.names = FALSE) + +# Extract class labels of observations +classLabels <- colnames(data) +classLabels <- classLabels[-(1:2)] + +# Extract attributes, i.e. 
sensor names +attributeNames <- data[3:10, 1] + +# Remove first two rows and columns and transpose data matrix +X <- t(data[-(1:2), -(1:2)]) + +# Check that dimensions are as they should be (90 rows and 8 columns) +N <- dim(X)[1] +M <- dim(X)[2] + +# Assign the class labels as row names and the attributes as column names +rownames(X) <- classLabels +colnames(X) <- attributeNames + +# Extract the class names present in data +classNames <- sort(unique(classLabels)) +C <- length(classNames) + +# Extract numeric class assignments +y <- as.numeric(as.factor(classLabels)) - 1 + +# Clean up a bit in the variables, since we'll call this script from later scripts: +rm(data) diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R index d3a68728715b5ebcd46797215edaea83f389e448..d99b5ebcc5c289c968bc0e73da451a2d85fbb7bb 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_1_2.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_2.R @@ -1,19 +1,20 @@ #################### -# Exercise 3.1.2 +# Exercise 2.1.2 #################### -rm(list = ls()) # Clear work space +# Run exercise 2.1.1: +source('Scripts/ex3_1_1.R') -library(tm) -textfolder = "./Data/" +# Choose which sensors to plot +i = 1 +j = 2 -(docs <- Corpus(DirSource(textfolder, pattern="textDocs*"), readerControl = list(language="en"))) - -inspect(docs) - -docs_nopunct <- tm_map(docs, removePunctuation) -dtm <- DocumentTermMatrix(docs_nopunct) - -inspect(dtm) +# Make simple plot +plot(X[ , i], X[ , j]) +## Make more fancy plot +library(ggplot2) +ggplot() + geom_point(aes(x = X[ , i], y = X[ , j], color = classLabels), size=4, alpha=0.5) + + theme(legend.position = c(0.88, 0.77), legend.title = element_blank()) + + labs(x = attributeNames[i], y = attributeNames[j]) diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R index 4180e92904af380e5417c81b30f2044db6102ea0..b38a52290d2988124ca034e4751e7e7375ccd926 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_1_3.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_3.R @@ -1,23 +1,45 @@ #################### -# Exercise 3.1.3 +# Exercise 2.1.3 #################### -rm(list = ls()) # Clear work space +source("Scripts/ex3_1_1.R") -library(tm) -textfolder <- "./Data/" +# Subtract the column means from the columns of X +Y <- t(apply(X, 1, "-", colMeans(X))) -(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en"))) +# You can check the column means using: colMeans(Y) +colMeans(Y) -inspect(docs) -docs_nopunct <- tm_map(docs, removePunctuation) -dtm <- DocumentTermMatrix(docs_nopunct) +# PCA by computing SVD of Y: +s <- svd(Y) +diagS <- s$d -mystopwords <- scan("./Data/stopWords.txt", character(0)) +rho <- diagS^2 / sum(diagS^2) -docs_nostopwords <- tm_map(docs, removeWords, mystopwords) -inspect(docs_nostopwords) -dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE)) -inspect(dtm_nostopwords) +# PCA can also be done with the built-in function prcomp. +# Confirm that you get the same result.
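+# (Optional sketch of why the two agree, assuming the stats::prcomp defaults: +# prcomp returns sdev = s$d / sqrt(N - 1), and the sqrt(N - 1) factor cancels +# in the variance ratio, so the commented-out check below should reproduce rho. +# The names pca_check and rho_check are only illustrative, not part of the exercise.) +# pca_check <- prcomp(X, center = TRUE) +# rho_check <- pca_check$sdev^2 / sum(pca_check$sdev^2) +# all.equal(as.numeric(rho), as.numeric(rho_check))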
+rho2 <- summary(prcomp(X, center = TRUE))$importance[2,] + + +threshold <- 0.9 + +xlimits <- c(1, M) +plot(rho, + type = "o", + main = "Variance explained by principal components", + xlab = "Principal components", + ylab = "Variance explained", + xlim = xlimits, + ylim = c(0, 1), + col = "blue" +) + +lines(cumsum(rho), type = "o", col = "orange") +lines(xlimits, c(threshold, threshold), lty = "dashed") + +legend("right", # Define position + legend = c("Individual", "Cumulative", "Threshold"), # Set strings for legend + col = c("blue", "orange", "black"), lty = c(1, 1, 2), # Match appearance of lines + cex = 1, bg = "lightblue" +) # Set up how the box looks (cex controls size) -control <- list(stopwords = TRUE) diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R index 5ad1d634acf03543105e49b1e1e8ee6a0ba61870..3b43e4700e2a6c2d4f447166eee754a0f476a04a 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_1_4.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_4.R @@ -1,29 +1,20 @@ #################### -# Exercise 3.1.4 +# Exercise 2.1.4 #################### +source("Scripts/ex3_1_3.R") -rm(list = ls()) # Clear work space +# Manually project the data onto the principal components. +Z <- s$u %*% diag(s$d) -library(tm) # install.packages("tm") -library(SnowballC) # install.packages("SnowballC") +# Doing it with the built-in function. +Z <- prcomp(X)$x -textfolder <- "./Data/" +i <- 1 +j <- 3 -(docs <- Corpus(DirSource(textfolder, pattern = "textDocs*"), readerControl = list(language = "en"))) - -inspect(docs) -docs_nopunct <- tm_map(docs, removePunctuation) -dtm <- DocumentTermMatrix(docs_nopunct) - -mystopwords <- scan("./Data/stopWords.txt", character(0)) - -docs_nostopwords <- tm_map(docs_nopunct, removeWords, mystopwords) -dtm_nostopwords <- DocumentTermMatrix(docs_nostopwords, control = list(removePunctuation = TRUE, stopwords = TRUE)) -inspect(dtm_nostopwords) - -docs_stemmed <- tm_map(docs_nostopwords, stemDocument, language = "english") - - -inspect(docs_stemmed) -dtm_stemmed <- DocumentTermMatrix(docs_stemmed, control = list(stopwords = TRUE)) -inspect(dtm_stemmed) +## Make fancy plot +library(ggplot2) +ggplot() + + geom_point(aes(x = Z[, i], y = Z[, j], color = classLabels), size = 4, alpha = 0.5) + + theme(legend.position = c(0.88, 0.22), legend.title = element_blank()) + + labs(x = colnames(Z)[i], y = colnames(Z)[j]) diff --git a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R index efe3e7b136de8bd693240e8f8168f4e460d629a9..bb02536666ef680aa67cfdf71b6e91504e2cf931 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_1_5.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_5.R @@ -1,19 +1,45 @@ #################### -# Exercise 3.1.5 +# Exercise 2.1.5 #################### +source("Scripts/ex3_1_1.R") -source("Scripts/ex3_1_4.R") -source("Tools/similarity.R") +Y <- t(apply(X, 1, "-", colMeans(X))) # subtract the column means from the columns of X -size <- dim(dtm_stemmed) -cosmeas <- c() -q <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0) -dtmat <- matrix(dtm_stemmed, dtm_stemmed$nrow, dtm_stemmed$ncol) +# Computing PCA: +pca <- prcomp(X) +V <- pca$rotation +Z <- pca$x -for (irow in 1:size[1]) { - doc <- dtm_stemmed[irow, ] - cosmeas[irow] <- similarity(t(dtmat[irow, ]), t(q), 'cos') -} +# We saw in 2.1.3 that the first 3 components explained more than 90 +# percent of the variance.
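+# (Optional sketch, reusing the pca object defined above: the cumulative share +# of variance of the first three PCs should come out above 0.9.) +# print(cumsum(pca$sdev^2 / sum(pca$sdev^2))[3])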
Let's look at their coefficients: +library(ggplot2) +library(data.table) # provides data.table() and melt(), used below +pcs <- 1:3 +test <- as.data.frame(melt(data.table(V[, pcs]))) +ggplot(test, aes(x = rep(1:8, length(pcs)), y = value, fill=variable)) + + geom_bar(position="dodge", stat = "identity") + + labs(fill="PC", x = "Attributes", y = "Component coefficients") + +# Inspecting the plot, we see that the 2nd principal component has large +# (in magnitude) coefficients for attributes A, E and H. We can confirm +# this by looking at its numerical values directly, too: +cat("PC2:", V[, 2]) + +# How does this translate to the actual data and its projections? +# Looking at the data for water: + +# Projection of water class onto the 2nd principal component. +water <- Y[y == 4, ] +cat("First water observation:", water[1, ]) + +# Based on the coefficients and the attribute values for the observation +# displayed, would you expect the projection onto PC2 to be positive or +# negative - why? Consider *both* the magnitude and sign of *both* the +# coefficient and the attribute! + +# You can determine the projection directly: +print("...and its projection onto PC2") +print(as.numeric(t(water[1, ]) %*% V[, 2])) + +# This is equivalent to: +Z[1, 2] +# Try to explain why. -print("Cosine similarity from q to docs: ") -print(cosmeas) diff --git a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R similarity index 99% rename from exercises/02450Toolbox_R/Scripts/ex2_1_6.R rename to exercises/02450Toolbox_R/Scripts/ex3_1_6.R index f0a329dd022b47e14de6af383acfedf4a86092b9..83faf47dbd9116ccf326b94e3344fe76f8a036fe 100644 --- a/exercises/02450Toolbox_R/Scripts/ex2_1_6.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_1_6.R @@ -1,7 +1,7 @@ #################### # Exercise 2.1.6 #################### -source("Scripts/ex2_1_1.R") +source("Scripts/ex3_1_1.R") # Try this *later* (for last), and explain the effect X_s = X # Make a to be "scaled" version of X diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R index 725f7564a291d7ba29252e89ef5d0f0c1adc7679..ff04967032ed9c6df0d9c33ec81aef0f5aeafdbc 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_2_1.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_2_1.R @@ -1,17 +1,42 @@ #################### -# Exercise 3.2.1 +# Exercise 2.2.1 #################### rm(list = ls()) # Clear work space -x <- c(-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47) +# Load the library R.matlab to enable the function readMat, +# which allows R to read the MATLAB .mat format. +# Install with "install.packages("R.matlab")" +library(R.matlab) -mean(x) -sd(x) -median(x) -diff(range(x)) +# The row of training data that we will look at +i <- 1 -# Range returns the minimum and maximum of the vector x. -# To get the range, we must take the maximum minus the minimum. -# We do this using the function diff, which finds differences -# between consecutive elements in a vector. +# Read in the data +data <- readMat(file.path("Data", "zipdata.mat")) + +# Check that the structure data contains two matrices, testdata and traindata +names(data) + +# The features, i.e. images of digits, are stored in the rows of traindata, +# except for the first column. The first column contains the class of the row, +# i.e. the digit identity +X <- data$traindata[, 2:dim(data$traindata)[2]] +y <- data$traindata[, 1] + +par(mfrow = c(1, 2)) + +# View the i'th row of X +image(as.matrix(X[i, ]), axes = FALSE, xlab = "Pixel number", main = "Digit in vector format") + +# Make ticks corresponding to pixels, i.e.
values in row i +axis(1, at = (1:length(X[i, ])) / length(X[i, ]), labels = 1:length(X[i, ])) + +# Extract the i'th row of X and store it as the matrix I +I <- X[i, ] + +# Make the matrix I have dimensions 16 by 16 +dim(I) <- c(16, 16) + +# View the digit in the i'th row of X as an image. The function image rotates the matrix, so we need to rearrange the columns in the matrix I in order to get the correct view of the digit. +image(I[, ncol(I):1], col = gray((32:0) / 32), main = "Digit as an image") diff --git a/exercises/02450Toolbox_R/Scripts/ex3_2_2.R b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R new file mode 100644 index 0000000000000000000000000000000000000000..d1d81de672b8e982cbdb3c01f531afaf6cdf7d56 --- /dev/null +++ b/exercises/02450Toolbox_R/Scripts/ex3_2_2.R @@ -0,0 +1,105 @@ +#################### +# Exercise 2.2.2 +#################### +source("Scripts/ex3_2_1.R") + + +# Choose which digits to consider in the analysis + +# This is like in the exercise +digits_to_inspect <- 0:1 + +# If you want to try more digits +# digits_to_inspect = 0:9 + +# Find the observations that relate to the digits chosen in digits_to_inspect +inds <- !is.na(match(y, digits_to_inspect)) + +# Extract the rows of X found above +X <- X[inds, ] +y <- y[inds] + +# Get the column means of X, subtract them from each row, +# and perform an SVD on the resulting matrix +means <- colMeans(X) +Xzeromean <- t(apply(X, 1, "-", means)) +svdres <- svd(Xzeromean) + +# Extract the matrices containing the left and right singular vectors, respectively +U <- svdres$u +V <- svdres$v + +# ---------------------------------------------------- +# Calculate and plot the variance explained by the PCs +# ---------------------------------------------------- + +pcvariance <- svdres$d^2 / sum(svdres$d^2) + +{ + par(mfrow=c(1,1)) + plot(cumsum(pcvariance), main = "Data variance explained by PCs", + xlab = "Number of PCs included in variance sum", + ylab = "Proportion of variance explained") + + # First 22 PCs should explain 90% + abline(v = 22) +} + +# --------------------------------------------------------------------- +# Plot principal components 1 and 2 against each other in a scatterplot, +# i.e.
plot the projections of observations onto PCs 1 and 2 +# --------------------------------------------------------------------- + +# These two ways of calculating pc_projections are equivalent +pc_projections <- Xzeromean %*% V +pc_projections <- U %*% diag(svdres$d) + +pcproj1 <- pc_projections[, 1] +pcproj2 <- pc_projections[, 2] + +{ + par(mfrow=c(1,1)) + plot(c(min(pcproj1), max(pcproj1)), c(min(pcproj2), max(pcproj2)), + type = "n", xlab = "PC 1", ylab = "PC 2") + points(pcproj1[y == 0], pcproj2[y == 0], col = "red") + points(pcproj1[y == 1], pcproj2[y == 1], col = "green") + legend("topleft", legend = c("0", "1"), fill = c("red", "green")) +} + + +# ---------------------------------- +# Reconstruction of images of digits +# ---------------------------------- + +# Number of PCs to include in reconstruction of digits +K <- 5 + +# Digits to visualize +nD <- 60:63 + +reconstructions <- pc_projections[, 1:K] %*% t(V[, 1:K]) + +par(mar = c(1, 1, 1, 1)) +layout(matrix(c(1:length(nD), (length(nD) + 1):(2 * length(nD))), 2, length(nD), byrow = FALSE)) +for (d in 1:length(nD)) { + origImage <- X[nD[d], ] + dim(origImage) <- c(16, 16) + image(origImage[, ncol(origImage):1], main = "Original", col = gray((32:0) / 32)) + reconstructedImage <- reconstructions[nD[d], ] + means + dim(reconstructedImage) <- c(16, 16) + image(reconstructedImage[, ncol(reconstructedImage):1], main = "Reconstruction", col = gray((32:0) / 32)) +} + +# ------------- +# Visualize PCs +# ------------- + +# visualize the first 12 PCs +par(mfrow = c(3, 4), mar = c(2, 2, 2, 2)) +for (nth_pc in 1:12) { + pc <- t(V[, nth_pc]) + dim(pc) <- c(16, 16) + # view image of PC + image(pc[, ncol(pc):1], col = gray((32:0) / 32), main = paste("PC ", nth_pc)) +} + diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R index 91eccf7115949659685286b1285dced806f6039e..39726ea5f21ccd4bb41fc140f5df9f6542b34891 100644 --- a/exercises/02450Toolbox_R/Scripts/ex3_3_1.R +++ b/exercises/02450Toolbox_R/Scripts/ex3_3_1.R @@ -1,67 +1,61 @@ #################### -# Exercise 3.3.1 +# Exercise 2.3.1 #################### -rm(list = ls()) # Clear work space) -source("Tools/binarize.R") -source("Tools/similarity.R") - -# Image to use as query -i <- 1 - -# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' -SimilarityMeasure <- "Jaccard" +rm(list = ls()) # Clear work space +library(FNN) library(R.matlab) -data <- readMat(file.path("./Data/digits.mat")) -# You can also try it on the faces dataset: -# data <- readMat(file.path("./Data/wildfaces_grayscale.mat")) +# Read in the data +data <- readMat(file.path("Data", "zipdata.mat")) +# Check that the structure dat contains two matrices, testdata and traindata names(data) -X <- data$X -dimX <- dim(X) -N <- dimX[1] -M <- dimX[2] -Q <- matrix(X[i, ], ncol = M) # The query image -Y <- X[-i, ] # All images except the i'th - -# The next function is in the setup.R, load it if you have not already -# source("setup.R") -sim <- similarity(Q, Y, SimilarityMeasure) - -# Sort similarities -sort_result <- sort(sim, decreasing = TRUE, index.return = TRUE) -val <- sort_result$x -j <- sort_result$ix -nj <- length(j) - -# Plot five most similar and dissimilar images -npics <- 5 -ndigits <- 4 -mostsim <- j[1:npics] -leastsim <- j[(nj - npics + 1):nj] - -imageDim <- c(sqrt(M), sqrt(M)) -dim(Q) <- imageDim # reshape Q - -{ - dev.new(width = 2.5, height = 3) - image(t(Q[nrow(Q):1, ]), main = "Query image", col = gray((0:32) / 32)) -} - -{ - dev.new(width = 2 * npics, 
height = 5) - layout(matrix(c(1:length(mostsim), (length(mostsim) + 1):(2 * length(mostsim))), 2, length(mostsim), byrow = FALSE)) - for (d in 1:npics) { - similarImage <- Y[mostsim[d], ] - dim(similarImage) <- imageDim - image(t(similarImage[nrow(similarImage):1, ]), main = paste("Similarity:", format(val[d], digits = ndigits)), col = gray((0:32) / 32)) - - - dissimilarImage <- Y[leastsim[d], ] - dim(dissimilarImage) <- imageDim - - image(t(dissimilarImage[nrow(dissimilarImage):1, ]), main = paste("Similarity:", format(val[nj - d + 1], digits = ndigits)), col = gray((0:32) / 32)) - } -} + +# The features, i.e. images of digits, are stored in the rows of traindata, +# except for the first column. The first column contains the class of the row, +# i.e. the digit identity +X <- data$traindata[, 2:dim(data$traindata)[2]] +y <- data$traindata[, 1] + +Xtest <- data$testdat[, 2:dim(data$testdat)[2]] +ytest <- data$testdat[, 1] + +# This is like in the exercise +digits_to_inspect <- 0:1 + +# If you want to try more digits +# digits_to_inspect = 0:9 + +# Find the observations that relate to the digits chosen in digits_to_inspect +inds <- !is.na(match(y, digits_to_inspect)) +indstest <- !is.na(match(ytest, digits_to_inspect)) + +# Extract the rows of X found above +X <- X[inds, ] +y <- y[inds] +Xtest <- Xtest[indstest, ] +ytest <- ytest[indstest] + + +# Get the column means of X, subtract them from each row, +# and perform an SVD on the resulting matrix +means <- colMeans(X) +Xzeromean <- t(apply(X, 1, "-", means)) +Xtestzeromean <- t(apply(Xtest, 1, "-", means)) +svdres <- svd(Xzeromean) + +# Extract the matrices containing the left and right singular vectors, respectively +U <- svdres$u +V <- svdres$v + +K <- 5 # Number of principal components to use +pc_projections <- Xzeromean %*% V[, 1:K] +pc_projectionstest <- Xtestzeromean %*% V[, 1:K] + +# Classify the test projections with FNN::knn (k = 1 by default) +preds <- knn(pc_projections, pc_projectionstest, cl = y) + +error_rate <- mean(preds != ytest) +print(error_rate) + diff --git a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R b/exercises/02450Toolbox_R/Scripts/ex3_3_2.R deleted file mode 100644 index 3cdc43ec18cec69145b8c9e47da735c2b53d1e34..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex3_3_2.R +++ /dev/null @@ -1,22 +0,0 @@ -#################### -# Exercise 3.3.2 -#################### -source("Tools/similarity.R") - -# Generate two data objects with M random (standard uniformly distributed) attributes -M = 5; -x = as.matrix(runif(M)); -y = as.matrix(runif(M)); - - -# Two constants -a = 1.5; -b = 1.5; - -# Check the statements in the exercise -similarity(x,y,'cos') - similarity(a*x,y,'cos') -similarity(x,y,'ext') - similarity(a*x,y,'ext') -similarity(x,y,'cor') - similarity(a*x,y,'cor') -similarity(x,y,'cos') - similarity(b+x,y,'cos') -similarity(x,y,'ext') - similarity(b+x,y,'ext') -similarity(x,y,'cor') - similarity(b+x,y,'cor') diff --git a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R b/exercises/02450Toolbox_R/Scripts/ex4_2_1.R deleted file mode 100644 index 98f03304399f73d86970a8b9f0b324e750b99f96..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_R/Scripts/ex4_2_1.R +++ /dev/null @@ -1,30 +0,0 @@ -#################### -# Exercise 4.2.1 -#################### -rm(list = ls()) # Clear work space - -# Read data into R -data <- read.csv("./Data/iris.csv") - -# Inspect the contents of the variable "data". -names(data) -head(data) - -# Extract the rows and columns corresponding to the data, i.e.
the attribute values -X <- data[, 1:4] - -# Extract attribute names from the first row -attributeNames <- colnames(data)[1:(dim(data)[2] - 1)] - -# Extract unique class names from the last column -classLabels <- data[, 5] -classNames <- unique(classLabels) - -# Extract class labels that match the class names -y <- match(classLabels, classNames) - 1 - -# Get the number of data objects, attributes, and classes -N <- dim(X)[1] -M <- dim(X)[2] -C <- length(classNames) -