diff --git a/exercises/02450Toolbox_Matlab/Scripts/default.txt b/exercises/02450Toolbox_Matlab/Scripts/default.txt deleted file mode 100644 index 065eaa168a1aeeee91a3c6d1427c257895e4cf79..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Matlab/Scripts/default.txt +++ /dev/null @@ -1,101 +0,0 @@ -Rule (Support, Confidence) -11 -> 13 (39.3242%, 74.9395%) -13 -> 11 (39.3242%, 78.1033%) -15 -> 22 (38.1187%, 75.2348%) -22 -> 15 (38.1187%, 76.4556%) -16 -> 21 (37.5952%, 76.2058%) -21 -> 16 (37.5952%, 74.9763%) -9 -> 15 (37.2621%, 72.366%) -15 -> 9 (37.2621%, 73.5441%) -12 -> 14 (36.5006%, 76.8024%) -14 -> 12 (36.5006%, 73.5144%) -7 -> 13 (35.2951%, 70.0346%) -13 -> 7 (35.2951%, 70.1008%) -10 -> 16 (35.1047%, 72.3676%) -16 -> 10 (35.1047%, 71.1576%) -7 -> 11 (34.7874%, 69.0274%) -11 -> 7 (34.7874%, 66.2938%) -8 -> 14 (34.5495%, 69.6514%) -14 -> 8 (34.5495%, 69.5847%) -1 -> 15 (34.1529%, 63.0454%) -15 -> 1 (34.1529%, 67.4076%) -1 -> 9 (33.915%, 62.6061%) -9 -> 1 (33.915%, 65.8657%) -9 -> 22 (33.677%, 65.4036%) -22 -> 9 (33.677%, 67.5469%) -3 -> 9 (33.566%, 66.0012%) -9 -> 3 (33.566%, 65.1879%) -4 -> 11 (32.9156%, 66.9787%) -11 -> 4 (32.9156%, 62.7267%) -5 -> 1 (32.5032%, 61.8286%) -10 -> 21 (32.3287%, 66.6449%) -21 -> 10 (32.3287%, 64.4733%) -8 -> 12 (31.9162%, 64.3428%) -12 -> 8 (31.9162%, 67.1562%) -9 -> 19 (31.71%, 61.5835%) -19 -> 9 (31.71%, 63.1197%) -7 -> 15 (31.5038%, 62.5118%) -15 -> 7 (31.5038%, 62.1791%) -3 -> 12 (31.2976%, 61.5409%) -12 -> 3 (31.2976%, 65.8545%) -17 -> 19 (31.2341%, 60.7716%) -19 -> 17 (31.2341%, 62.1724%) -4 -> 10 (31.2183%, 63.5249%) -10 -> 4 (31.2183%, 64.3558%) -3 -> 19 (31.0596%, 61.073%) -19 -> 3 (31.0596%, 61.8251%) -8 -> 17 (31.0596%, 62.6159%) -17 -> 8 (31.0596%, 60.4321%) -7 -> 20 (30.8852%, 61.2842%) -20 -> 7 (30.8852%, 62.0657%) -3 -> 1 (30.8693%, 60.6987%) -3 -> 15 (30.8376%, 60.6363%) -15 -> 3 (30.8376%, 60.8641%) -20 -> 11 (30.7582%, 61.8106%) -8 -> 19 (30.7265%, 61.9444%) -19 -> 8 (30.7265%, 61.162%) -8 -> 1 (30.5838%, 61.6565%) -19 -> 1 (30.5679%, 60.8462%) -13 -> 22 (30.4727%, 60.523%) -22 -> 13 (30.4727%, 61.1199%) -14 -> 3 (30.4569%, 61.3419%) -8 -> 16 (30.441%, 61.3687%) -16 -> 8 (30.441%, 61.7042%) -14 -> 1 (30.3775%, 61.1821%) -14 -> 21 (30.2665%, 60.9585%) -21 -> 14 (30.2665%, 60.3606%) -10 -> 11 (30.2189%, 62.2956%) -19 -> 15 (30.1713%, 60.0568%) -14 -> 17 (30.1396%, 60.7029%) -18 -> 7 (30.0603%, 61.8473%) -4 -> 5 (30.0127%, 61.0717%) -20 -> 13 (30.0127%, 60.3124%) -10 -> 20 (29.981%, 61.8051%) -20 -> 10 (29.981%, 60.2486%) -18 -> 1 (29.9651%, 61.6514%) -4 -> 20 (29.9651%, 60.9748%) -20 -> 4 (29.9651%, 60.2168%) -4 -> 13 (29.9492%, 60.9425%) -14 -> 19 (29.9016%, 60.2236%) -12 -> 1 (29.8541%, 62.8171%) -18 -> 20 (29.6003%, 60.9008%) -2 -> 16 (29.3147%, 63.9668%) -18 -> 5 (29.2513%, 60.1828%) -12 -> 9 (29.2354%, 61.5154%) -12 -> 19 (28.5216%, 60.0134%) -2 -> 10 (28.2519%, 61.6476%) -2 -> 11 (28.1567%, 61.4399%) -1,3 -> 9 (21.9543%, 71.1202%) -1,9 -> 3 (21.9543%, 64.7334%) -3,9 -> 1 (21.9543%, 65.4064%) -1,3 -> 12 (20.3363%, 65.8787%) -1,12 -> 3 (20.3363%, 68.119%) -3,12 -> 1 (20.3363%, 64.9772%) -1,5 -> 15 (20.2887%, 62.4207%) -5,15 -> 1 (20.2887%, 76.2217%) -1,7 -> 15 (20.2411%, 85.8104%) -7,15 -> 1 (20.2411%, 64.2497%) -1,3 -> 15 (20.0666%, 65.0051%) -3,15 -> 1 (20.0666%, 65.072%) -1,8 -> 12 (20.0666%, 65.612%) -1,12 -> 8 (20.0666%, 67.2157%) diff --git a/exercises/02450Toolbox_Python/Scripts/check_installation.py b/exercises/02450Toolbox_Python/Scripts/check_installation.py index 
bb1b41223e9a32b42202a07c307ee31d8e075e6e..cc381d5bdc256865e9e50b13350359d25d5188c6 100644 --- a/exercises/02450Toolbox_Python/Scripts/check_installation.py +++ b/exercises/02450Toolbox_Python/Scripts/check_installation.py @@ -1,5 +1,5 @@ """ -This is a helper function which can help you and the TAs debug your Python setup. +This is a helper function which can help you debug the Python installation """ import os import sklearn diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py index c66a9b557b92caede6af2b051105199be4fb3aa4..a427cac0a34cafd553024aad1d0263d143330238 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py @@ -9,40 +9,38 @@ from dtuimldmtools import clusterval filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") - # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] +classNames = [name[0][0] for name in mat_data['classNames']] N, M = X.shape C = len(classNames) - # Maximum number of clusters: K = 10 # Allocate variables: -Rand = np.zeros((K - 1,)) -Jaccard = np.zeros((K - 1,)) -NMI = np.zeros((K - 1,)) +Rand = np.zeros((K-1,)) +Jaccard = np.zeros((K-1,)) +NMI = np.zeros((K-1,)) -for k in range(K - 1): +for k in range(K-1): # run K-means clustering: - # cls = Pycluster.kcluster(X,k+1)[0] - centroids, cls, inertia = k_means(X, k + 2) + #cls = Pycluster.kcluster(X,k+1)[0] + centroids, cls, inertia = k_means(X,k+2) # compute cluster validities: - Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls) - + Rand[k], Jaccard[k], NMI[k] = clusterval(y,cls) + # Plot results: figure(1) -title("Cluster validity") -plot(np.arange(K - 1) + 2, Rand) -plot(np.arange(K - 1) + 2, Jaccard) -plot(np.arange(K - 1) + 2, NMI) -legend(["Rand", "Jaccard", "NMI"], loc=4) +title('Cluster validity') +plot(np.arange(K-1)+2, Rand) +plot(np.arange(K-1)+2, Jaccard) +plot(np.arange(K-1)+2, NMI) +legend(['Rand', 'Jaccard', 'NMI'], loc=4) show() -print("Ran Exercise 10.1.3") +print('Ran Exercise 10.1.3') diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py index 3b60ca2c570a0e45a8002ec69aed1f3f571026a8..08e6653fe56d38f593c00541d1fecd9f30aa5187 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py @@ -7,17 +7,18 @@ from sklearn.cluster import k_means filename = importlib_resources.files("dtuimldmtools").joinpath("data/wildfaces.mat") - # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -# mat_data = loadmat('../Data/digits.mat') #<-- uncomment this for using the digits dataset -X = mat_data["X"] +#filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat") #<-- uncomment this for using the digits dataset +#mat_data = loadmat('../Data/digits.mat') #<-- uncomment this for using the digits dataset + +X = mat_data['X'] N, M = X.shape # Image resolution and number of colors -x = 40 # <-- change this for using the digits dataset -y = 40 # <-- change this for using the digits dataset -c = 3 # <-- change this for using the digits dataset +x = 40 #<-- 
change this for using the digits dataset +y = 40 #<-- change this for using the digits dataset +c = 3 #<-- change this for using the digits dataset # Number of clusters: @@ -34,48 +35,43 @@ centroids, cls, inertia = k_means(X, K, verbose=True, max_iter=100, n_init=S) # Plot centroids plt.figure(1) -n1 = int(np.ceil(np.sqrt(K / 2))) -n2 = int(np.ceil(float(K) / n1)) +n1 = int(np.ceil(np.sqrt(K/2))) +n2 = int(np.ceil(float(K)/n1)) -# For black and white, cmap=plt.cm.binary, else default -cmap = plt.cm.binary if c == 1 else None +#For black and white, cmap=plt.cm.binary, else default +cmap = plt.cm.binary if c==1 else None for k in range(K): - plt.subplot(n1, n2, k + 1) + plt.subplot(n1,n2,k+1) # Reshape centroids to fit resolution and colors - img = np.reshape(centroids[k, :], (c, x, y)).T - if c == 1: # if color is single-color/gray scale + img = np.reshape(centroids[k,:],(c,x,y)).T + if c == 1: # if color is single-color/gray scale # Squeeze out singleton dimension # and flip the image (cancel out previos transpose) img = np.squeeze(img).T - plt.imshow(img, interpolation="None", cmap=cmap) - plt.xticks([]) - plt.yticks([]) - if k == np.floor((n2 - 1) / 2): - plt.title("Centroids") - -# Plot few randomly selected faces and their nearest centroids -L = 5 # number of images to plot + plt.imshow(img,interpolation='None', cmap=cmap) + plt.xticks([]); plt.yticks([]) + if k==np.floor((n2-1)/2): plt.title('Centroids') + +# Plot few randomly selected faces and their nearest centroids +L = 5 # number of images to plot j = np.random.randint(0, N, L) plt.figure(2) for l in range(L): - plt.subplot(2, L, l + 1) - img = np.resize(X[j[l], :], (c, x, y)).T + plt.subplot(2,L,l+1) + img = np.resize(X[j[l],:],(c,x,y)).T if c == 1: img = np.squeeze(img).T - plt.imshow(img, interpolation="None", cmap=cmap) - plt.xticks([]) - plt.yticks([]) - if l == np.floor((L - 1) / 2): - plt.title("Randomly selected faces and their centroids") - plt.subplot(2, L, L + l + 1) - img = np.resize(centroids[cls[j[l]], :], (c, x, y)).T + plt.imshow(img,interpolation='None', cmap=cmap) + plt.xticks([]); plt.yticks([]) + if l==np.floor((L-1)/2): plt.title('Randomly selected faces and their centroids') + plt.subplot(2,L,L+l+1) + img = np.resize(centroids[cls[j[l]],:],(c,x,y)).T if c == 1: img = np.squeeze(img).T - plt.imshow(img, interpolation="None", cmap=cmap) - plt.xticks([]) - plt.yticks([]) + plt.imshow(img,interpolation='None', cmap=cmap) + plt.xticks([]); plt.yticks([]) plt.show() -print("Ran Exercise 10.1.5") +print('Ran Exercise 10.1.5') diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py index f0fca9634def84749f1819bfc78cf58cac693c1f..f5cfc462ae073133074a692c8080b0706d084ee5 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py @@ -18,67 +18,60 @@ similarity_measure = "SMC" # Load Matlab data file to python dict structure X = loadmat(filename)["X"] # You can also try the CBCL faces dataset (remember to change 'transpose') -# X = loadmat('../Data/wildfaces_grayscale.mat')['X'] +#X = loadmat('../Data/wildfaces_grayscale.mat')['X'] N, M = X.shape -transpose = False # should the plotted images be transposed? +transpose = False # should the plotted images be transposed? 
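# Illustrative aside, not part of the patch: this script (ex3_3_1.py) calls dtuimldmtools.similarity
# with similarity_measure = "SMC". As a hedged sketch of what two of the standard measures compute
# (the toolbox function is the authoritative implementation; the helper names and the 0.5
# binarization threshold below are assumptions for illustration only):
import numpy as np

def smc_similarity(q, X, threshold=0.5):
    # Simple Matching Coefficient: fraction of attributes on which the two
    # binarized vectors agree (both above or both below the threshold).
    qb, Xb = q > threshold, X > threshold
    return (Xb == qb).mean(axis=1)

def cosine_similarity(q, X):
    # Cosine of the angle between the query vector q and each row of X.
    return X @ q / (np.linalg.norm(X, axis=1) * np.linalg.norm(q))

# Usage sketch, mirroring the script: scores = smc_similarity(X[i, :], X[noti, :])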
-# Search the face database for similar faces +# Search the face database for similar faces # Index of all other images than i -noti = list(range(0, i)) + list(range(i + 1, N)) +noti = list(range(0,i)) + list(range(i+1,N)) # Compute similarity between image i and all others -sim = similarity(X[i, :], X[noti, :], similarity_measure) +sim = similarity(X[i,:], X[noti,:], similarity_measure) sim = sim.tolist()[0] # Tuples of sorted similarities and their indices -sim_to_index = sorted(zip(sim, noti)) +sim_to_index = sorted(zip(sim,noti)) # Visualize query image and 5 most/least similar images -plt.figure(figsize=(12, 8)) -plt.subplot(3, 1, 1) +plt.figure(figsize=(12,8)) +plt.subplot(3,1,1) img_hw = int(np.sqrt(len(X[0]))) -img = np.reshape(X[i], (img_hw, img_hw)) -if transpose: - img = img.T +img = np.reshape(X[i], (img_hw,img_hw)) +if transpose: img = img.T plt.imshow(img, cmap=plt.cm.gray) -plt.xticks([]) -plt.yticks([]) -plt.title("Query image") -plt.ylabel("image #{0}".format(i)) +plt.xticks([]); plt.yticks([]) +plt.title('Query image') +plt.ylabel('image #{0}'.format(i)) for ms in range(5): + # 5 most similar images found - plt.subplot(3, 5, 6 + ms) - im_id = sim_to_index[-ms - 1][1] - im_sim = sim_to_index[-ms - 1][0] - img = np.reshape(X[im_id], (img_hw, img_hw)) - if transpose: - img = img.T + plt.subplot(3,5,6+ms) + im_id = sim_to_index[-ms-1][1] + im_sim = sim_to_index[-ms-1][0] + img = np.reshape(X[im_id],(img_hw,img_hw)) + if transpose: img = img.T plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel("sim={0:.3f}".format(im_sim)) - plt.ylabel("image #{0}".format(im_id)) - plt.xticks([]) - plt.yticks([]) - if ms == 2: - plt.title("Most similar images") + plt.xlabel('sim={0:.3f}'.format(im_sim)) + plt.ylabel('image #{0}'.format(im_id)) + plt.xticks([]); plt.yticks([]) + if ms==2: plt.title('Most similar images') # 5 least similar images found - plt.subplot(3, 5, 11 + ms) + plt.subplot(3,5,11+ms) im_id = sim_to_index[ms][1] im_sim = sim_to_index[ms][0] - img = np.reshape(X[im_id], (img_hw, img_hw)) - if transpose: - img = img.T + img = np.reshape(X[im_id],(img_hw,img_hw)) + if transpose: img = img.T plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel("sim={0:.3f}".format(im_sim)) - plt.ylabel("image #{0}".format(im_id)) - plt.xticks([]) - plt.yticks([]) - if ms == 2: - plt.title("Least similar images") - + plt.xlabel('sim={0:.3f}'.format(im_sim)) + plt.ylabel('image #{0}'.format(im_id)) + plt.xticks([]); plt.yticks([]) + if ms==2: plt.title('Least similar images') + plt.show() -print("Ran Exercise 3.3.1") +print('Ran Exercise 3.3.1') diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py index 2be950077126609cf895cae0dbacbc49a8be79a0..64f7350e3bec2f9fe166dc3c74b2df93e6e7e244 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py @@ -11,161 +11,121 @@ from dtuimldmtools import bmplot, feature_selector_lr filename = importlib_resources.files("dtuimldmtools").joinpath("data/body.mat") # Load data from matlab file mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'][0]] N, M = X.shape ## Crossvalidation # Create crossvalidation partition for evaluation K = 5 -CV = model_selection.KFold(n_splits=K, shuffle=True) +CV = model_selection.KFold(n_splits=K,shuffle=True) # Initialize 
variables -Features = np.zeros((M, K)) -Error_train = np.empty((K, 1)) -Error_test = np.empty((K, 1)) -Error_train_fs = np.empty((K, 1)) -Error_test_fs = np.empty((K, 1)) -Error_train_nofeatures = np.empty((K, 1)) -Error_test_nofeatures = np.empty((K, 1)) - -k = 0 +Features = np.zeros((M,K)) +Error_train = np.empty((K,1)) +Error_test = np.empty((K,1)) +Error_train_fs = np.empty((K,1)) +Error_test_fs = np.empty((K,1)) +Error_train_nofeatures = np.empty((K,1)) +Error_test_nofeatures = np.empty((K,1)) + +k=0 for train_index, test_index in CV.split(X): + # extract training and test set for current CV fold - X_train = X[train_index, :] + X_train = X[train_index,:] y_train = y[train_index] - X_test = X[test_index, :] + X_test = X[test_index,:] y_test = y[test_index] internal_cross_validation = 10 - + # Compute squared error without using the input data at all - Error_train_nofeatures[k] = ( - np.square(y_train - y_train.mean()).sum() / y_train.shape[0] - ) - Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0] + Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0] + Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0] # Compute squared error with all features selected (no feature selection) m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train) - Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0] - Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0] + Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0] + Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0] # Compute squared error with feature subset selection - textout = "" - selected_features, features_record, loss_record = feature_selector_lr( - X_train, y_train, internal_cross_validation, display=textout - ) - - Features[selected_features, k] = 1 + textout = '' + selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation,display=textout) + + Features[selected_features,k] = 1 # .. alternatively you could use module sklearn.feature_selection if len(selected_features) == 0: - print( - "No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y)." - ) + print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).' 
) else: - m = lm.LinearRegression(fit_intercept=True).fit( - X_train[:, selected_features], y_train - ) - Error_train_fs[k] = ( - np.square(y_train - m.predict(X_train[:, selected_features])).sum() - / y_train.shape[0] - ) - Error_test_fs[k] = ( - np.square(y_test - m.predict(X_test[:, selected_features])).sum() - / y_test.shape[0] - ) - + m = lm.LinearRegression(fit_intercept=True).fit(X_train[:,selected_features], y_train) + Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0] + Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0] + figure(k) - subplot(1, 2, 1) - plot(range(1, len(loss_record)), loss_record[1:]) - xlabel("Iteration") - ylabel("Squared error (crossvalidation)") - - subplot(1, 3, 3) - bmplot( - attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:] - ) - clim(-1.5, 0) - xlabel("Iteration") + subplot(1,2,1) + plot(range(1,len(loss_record)), loss_record[1:]) + xlabel('Iteration') + ylabel('Squared error (crossvalidation)') + + subplot(1,3,3) + bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:]) + clim(-1.5,0) + xlabel('Iteration') - print("Cross validation fold {0}/{1}".format(k + 1, K)) - print("Train indices: {0}".format(train_index)) - print("Test indices: {0}".format(test_index)) - print("Features no: {0}\n".format(selected_features.size)) + print('Cross validation fold {0}/{1}'.format(k+1,K)) + print('Train indices: {0}'.format(train_index)) + print('Test indices: {0}'.format(test_index)) + print('Features no: {0}\n'.format(selected_features.size)) - k += 1 + k+=1 # Display results -print("\n") -print("Linear regression without feature selection:\n") -print("- Training error: {0}".format(Error_train.mean())) -print("- Test error: {0}".format(Error_test.mean())) -print( - "- R^2 train: {0}".format( - (Error_train_nofeatures.sum() - Error_train.sum()) - / Error_train_nofeatures.sum() - ) -) -print( - "- R^2 test: {0}".format( - (Error_test_nofeatures.sum() - Error_test.sum()) / Error_test_nofeatures.sum() - ) -) -print("Linear regression with feature selection:\n") -print("- Training error: {0}".format(Error_train_fs.mean())) -print("- Test error: {0}".format(Error_test_fs.mean())) -print( - "- R^2 train: {0}".format( - (Error_train_nofeatures.sum() - Error_train_fs.sum()) - / Error_train_nofeatures.sum() - ) -) -print( - "- R^2 test: {0}".format( - (Error_test_nofeatures.sum() - Error_test_fs.sum()) - / Error_test_nofeatures.sum() - ) -) +print('\n') +print('Linear regression without feature selection:\n') +print('- Training error: {0}'.format(Error_train.mean())) +print('- Test error: {0}'.format(Error_test.mean())) +print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum())) +print('- R^2 test: {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum())) +print('Linear regression with feature selection:\n') +print('- Training error: {0}'.format(Error_train_fs.mean())) +print('- Test error: {0}'.format(Error_test_fs.mean())) +print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum())) +print('- R^2 test: {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum())) figure(k) -subplot(1, 3, 2) -bmplot(attributeNames, range(1, Features.shape[1] + 1), -Features) -clim(-1.5, 0) -xlabel("Crossvalidation fold") -ylabel("Attribute") +subplot(1,3,2) +bmplot(attributeNames, 
range(1,Features.shape[1]+1), -Features) +clim(-1.5,0) +xlabel('Crossvalidation fold') +ylabel('Attribute') # Inspect selected feature coefficients effect on the entire dataset and # plot the fitted model residual error as function of each attribute to # inspect for systematic structure in the residual -f = 2 # cross-validation fold to inspect -ff = Features[:, f - 1].nonzero()[0] +f=2 # cross-validation fold to inspect +ff=Features[:,f-1].nonzero()[0] if len(ff) == 0: - print( - "\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y)." - ) + print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).' ) else: - m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y) - - y_est = m.predict(X[:, ff]) - residual = y - y_est - - figure(k + 1, figsize=(12, 6)) - title( - "Residual error vs. Attributes for features selected in cross-validation fold {0}".format( - f - ) - ) - for i in range(0, len(ff)): - subplot(2, int(np.ceil(len(ff) / 2)), i + 1) - plot(X[:, ff[i]], residual, ".") - xlabel(attributeNames[ff[i]]) - ylabel("residual error") - - + m = lm.LinearRegression(fit_intercept=True).fit(X[:,ff], y) + + y_est= m.predict(X[:,ff]) + residual=y-y_est + + figure(k+1, figsize=(12,6)) + title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f)) + for i in range(0,len(ff)): + subplot(2, int( np.ceil(len(ff)/2)), i+1) + plot(X[:,ff[i]],residual,'.') + xlabel(attributeNames[ff[i]]) + ylabel('residual error') + + show() -print("Ran Exercise 6.2.1") +print('Ran Exercise 6.2.1') diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py index ced5d6063730861eacc33c25a87429565dec69a2..f04ee01ab424588250a67e58079b0b199a2b57bf 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py @@ -17,9 +17,7 @@ from scipy.io import loadmat from sklearn.metrics import confusion_matrix from sklearn.neighbors import KNeighborsClassifier -filename = importlib_resources.files("dtuimldmtools").joinpath( - "synth1.mat" -) # <-- change the number to change dataset +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth3.mat") # <-- change the number to change dataset # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py index 60a9781933948bef1770546e976a4307a1aaf39e..4f54813207980fe035e700fcafdd8517f7decd87 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py @@ -10,12 +10,11 @@ from dtuimldmtools import * from dtuimldmtools.statistics.statistics import correlated_ttest loss = 2 -X, y = X[:, :10], X[:, 10:] +X,y = X[:,:10], X[:,10:] # This script crates predictions from three KNN classifiers using cross-validation -K = 10 +K = 10 # We presently set J=K m = 1 -J = 0 r = [] kf = model_selection.KFold(n_splits=K) @@ -24,7 +23,7 @@ for dm in range(m): yhat = [] for train_index, test_index in kf.split(X): - X_train, y_train = X[train_index, :], y[train_index] + X_train, y_train = X[train_index,:], y[train_index] X_test, y_test = X[test_index, :], y[test_index] mA = sklearn.linear_model.LinearRegression().fit(X_train, y_train) @@ -33,30 +32,26 @@ for dm in range(m): yhatA = mA.predict(X_test) yhatB = mB.predict(X_test)[:, np.newaxis] # justsklearnthings 
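# Aside, not part of the patch: further down, this script (ex7_3_1.py) feeds the per-fold loss
# differences r into correlated_ttest(r, rho, alpha) with rho = 1/K. A minimal sketch of the
# Nadeau-Bengio-style correction this is based on (an assumption about the exact internals; the
# dtuimldmtools function remains the authoritative implementation):
import numpy as np
import scipy.stats as st

def correlated_ttest_sketch(r, rho, alpha=0.05):
    r = np.asarray(r, dtype=float)
    J = len(r)
    rhat = r.mean()
    # Standard deviation inflated to account for the correlation between folds.
    sigma = np.sqrt((1 / J + rho / (1 - rho)) * r.var(ddof=1))
    ci = st.t.interval(1 - alpha, J - 1, loc=rhat, scale=sigma)
    p = 2 * st.t.cdf(-abs(rhat) / sigma, df=J - 1)
    return p, ci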
y_true.append(y_test) - yhat.append(np.concatenate([yhatA, yhatB], axis=1)) + yhat.append( np.concatenate([yhatA, yhatB], axis=1) ) - r.append( - np.mean(np.abs(yhatA - y_test) ** loss - np.abs(yhatB - y_test) ** loss) - ) + r.append( np.mean( np.abs( yhatA-y_test ) ** loss - np.abs( yhatB-y_test) ** loss ) ) # Initialize parameters and run test appropriate for setup II alpha = 0.05 -rho = 1 / K +rho = 1/K p_setupII, CI_setupII = correlated_ttest(r, rho, alpha=alpha) if m == 1: - y_true = np.concatenate(y_true)[:, 0] + y_true = np.concatenate(y_true)[:,0] yhat = np.concatenate(yhat) # note our usual setup I ttest only makes sense if m=1. - zA = np.abs(y_true - yhat[:, 0]) ** loss - zB = np.abs(y_true - yhat[:, 1]) ** loss + zA = np.abs(y_true - yhat[:,0] ) ** loss + zB = np.abs(y_true - yhat[:,1] ) ** loss z = zA - zB - CI_setupI = st.t.interval( - 1 - alpha, len(z) - 1, loc=np.mean(z), scale=st.sem(z) - ) # Confidence interval + CI_setupI = st.t.interval(1 - alpha, len(z) - 1, loc=np.mean(z), scale=st.sem(z)) # Confidence interval p_setupI = st.t.cdf(-np.abs(np.mean(z)) / st.sem(z), df=len(z) - 1) # p-value - print([p_setupII, p_setupI]) - print(CI_setupII, CI_setupI) + print( [p_setupII, p_setupI] ) + print(CI_setupII, CI_setupI ) \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py index cd0cdc5c0721ec6581c6ba07be43e8ac4b3a299c..fcad65118964c00311958383913ba49cf5d06750 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py @@ -1,21 +1,19 @@ # exercise 9.1.1 - - import importlib_resources -import numpy as np from matplotlib.pyplot import figure, show +import numpy as np from scipy.io import loadmat -from sklearn.linear_model import LogisticRegression - from dtuimldmtools import BinClassifierEnsemble, bootstrap, dbplot, dbprobplot +from sklearn.linear_model import LogisticRegression filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth5.mat") + # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] +classNames = [name[0][0] for name in mat_data['classNames']] N, M = X.shape C = len(classNames) @@ -25,17 +23,18 @@ C = len(classNames) L = 100 # Weights for selecting samples in each bootstrap -weights = np.ones((N, 1), dtype=float) / N +weights = np.ones((N,1),dtype=float)/N # Storage of trained log.reg. 
classifiers fitted in each bootstrap -logits = [0] * L +logits = [0]*L votes = np.zeros((N,)) # For each round of bagging for l in range(L): + # Extract training set by random sampling with replacement from X and y X_train, y_train = bootstrap(X, y, N, weights) - + # Fit logistic regression model to training data and save result logit_classifier = LogisticRegression() logit_classifier.fit(X_train, y_train) @@ -43,22 +42,20 @@ for l in range(L): y_est = logit_classifier.predict(X).T votes = votes + y_est - ErrorRate = (y != y_est).sum(dtype=float) / N - print("Error rate: {:2.2f}%".format(ErrorRate * 100)) - + ErrorRate = (y!=y_est).sum(dtype=float)/N + print('Error rate: {:2.2f}%'.format(ErrorRate*100)) + # Estimated value of class labels (using 0.5 as threshold) by majority voting -y_est_ensemble = votes > (L / 2) +y_est_ensemble = votes>(L/2) # Compute error rate -ErrorRate = (y != y_est_ensemble).sum(dtype=float) / N -print("Error rate: {:3.2f}%".format(ErrorRate * 100)) +ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N +print('Error rate: {:3.2f}%'.format(ErrorRate*100)) ce = BinClassifierEnsemble(logits) -figure(1) -dbprobplot(ce, X, y, "auto", resolution=200) -figure(2) -dbplot(ce, X, y, "auto", resolution=200) +figure(1); dbprobplot(ce, X, y, 'auto', resolution=200) +figure(2); dbplot(ce, X, y, 'auto', resolution=200) show() -print("Ran Exercise 9.1.1") +print('Ran Exercise 9.1.1') \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py index a3ebddbc97f19582253abcb6c2d4b80a055e06f2..252a3c9f06a7a04db04760b9137a4bf9668e30a1 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py @@ -1,23 +1,19 @@ # exercise 9.1.2 - - import importlib_resources import matplotlib.pyplot as plt import numpy as np from scipy.io import loadmat -from sklearn.linear_model import LogisticRegression - from dtuimldmtools import BinClassifierEnsemble, bootstrap, dbplot, dbprobplot +from sklearn.linear_model import LogisticRegression filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth5.mat") - # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] +classNames = [name[0][0] for name in mat_data['classNames']] N, M = X.shape C = len(classNames) @@ -27,80 +23,78 @@ C = len(classNames) L = 100 # Weights for selecting samples in each bootstrap -weights = np.ones((N,), dtype=float) / N +weights = np.ones((N,),dtype=float)/N # Storage of trained log.reg. classifiers fitted in each bootstrap -logits = [0] * L -alpha = np.ones((L,)) -votes = np.zeros((N, 1)) +logits = [0]*L +alpha = np.ones( (L,) ) +votes = np.zeros((N,1)) epsi = 0 -y_all = np.zeros((N, L)) +y_all = np.zeros((N,L)) y = y > 0.5 # For each round of boosting for l in range(L): + # Extract training set by random sampling with replacement from X and y - while True: - # not a thing of beauty, however log.reg. fails if presented with less than two classes. - X_train, y_train = bootstrap(X, y, N, weights) - if not (all(y_train == 0) or all(y_train == 1)): - break - + while True : + # not a thing of beauty, however log.reg. 
fails if presented with less than two classes. + X_train, y_train = bootstrap(X, y, N, weights) + if not (all(y_train==0) or all(y_train == 1)) : break + # Fit logistic regression model to training data and save result - # turn off regularization with C. + # turn off regularization with C. logit_classifier = LogisticRegression(C=1000) - logit_classifier.fit(X_train, y_train) + logit_classifier.fit(X_train, y_train ) logits[l] = logit_classifier y_est = logit_classifier.predict(X).T > 0.5 - - y_all[:, l] = 1.0 * y_est - v = (y_est != y).T - ErrorRate = np.multiply(weights, v).sum() + + y_all[:,l] = 1.0 * y_est + v = (y_est != y).T + ErrorRate = np.multiply(weights,v).sum() epsi = ErrorRate - - alphai = 0.5 * np.log((1 - epsi) / epsi) - - weights[y_est == y] = weights[y_est == y] * np.exp(-alphai) - weights[y_est != y] = weights[y_est != y] * np.exp(alphai) - + + alphai = 0.5 * np.log( (1-epsi)/epsi) + + weights[y_est == y] = weights[y_est == y] * np.exp( -alphai ) + weights[y_est != y] = weights[y_est != y] * np.exp( alphai ) + weights = weights / sum(weights) - + votes = votes + y_est alpha[l] = alphai - print("Error rate: {:2.2f}%".format(ErrorRate * 100)) - - + print('Error rate: {:2.2f}%'.format(ErrorRate*100)) + + # Estimated value of class labels (using 0.5 as threshold) by majority voting -alpha = alpha / sum(alpha) +alpha = alpha/sum(alpha) y_est_ensemble = y_all @ alpha > 0.5 -# y_est_ensemble = votes > (L/2) -# y_est_ensemble = mat(y_all) * mat(alpha) - (1-mat(y_all)) * mat(alpha) > 0 -ErrorRateEnsemble = sum(y_est_ensemble != y) / N +#y_est_ensemble = votes > (L/2) +#y_est_ensemble = mat(y_all) * mat(alpha) - (1-mat(y_all)) * mat(alpha) > 0 +ErrorRateEnsemble = sum(y_est_ensemble != y)/N # Compute error rate -# ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N -print("Error rate for ensemble classifier: {:.1f}%".format(ErrorRateEnsemble * 100)) - -ce = BinClassifierEnsemble(logits, alpha) -# ce = BinClassifierEnsemble(logits) # What happens if alpha is not included? -plt.figure(1) -dbprobplot(ce, X, y, "auto", resolution=200) -plt.figure(2) -dbplot(ce, X, y, "auto", resolution=200) -# plt.figure(3); plt.plot(alpha); - -# %% -plt.figure(4, figsize=(8, 8)) +#ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N +print('Error rate for ensemble classifier: {:.1f}%'.format(ErrorRateEnsemble*100)) + +ce = BinClassifierEnsemble(logits,alpha) +#ce = BinClassifierEnsemble(logits) # What happens if alpha is not included? 
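# Aside, not part of the patch: the boosting loop above spells out the classical AdaBoost update
# inline. A minimal standalone sketch of one reweighting round (the helper name is hypothetical;
# the script's inline version is what actually runs):
import numpy as np

def adaboost_reweight(weights, y, y_est):
    # Weighted error of the current round's classifier.
    eps = weights[y_est != y].sum()
    # Classifier importance: grows as the weighted error drops below 0.5.
    alpha = 0.5 * np.log((1 - eps) / eps)
    # Down-weight correctly classified samples, up-weight mistakes, then renormalize.
    w = weights * np.exp(np.where(y_est == y, -alpha, alpha))
    return w / w.sum(), alpha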
+plt.figure(1); dbprobplot(ce, X, y, 'auto', resolution=200) +plt.figure(2); dbplot(ce, X, y, 'auto', resolution=200) +#plt.figure(3); plt.plot(alpha); + +#%% +plt.figure(4,figsize=(8,8)) for i in range(2): - plt.plot(X[(y_est_ensemble == i), 0], X[(y_est_ensemble == i), 1], "br"[i] + "o") + plt.plot(X[ (y_est_ensemble==i),0],X[ (y_est_ensemble==i),1],'br'[i] + 'o') ## Incomment the below lines to investigate miss-classifications -# for i in range(2): +#for i in range(2): # plt.plot(X[ (y==i),0],X[ (y==i),1],'br'[i] + '.') -plt.xlabel("Feature 1") -plt.ylabel("Feature 2") +plt.xlabel('Feature 1') +plt.ylabel('Feature 2') plt.show() -print("Ran Exercise 9.1.2") +print('Ran Exercise 9.1.2') \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py index bba69f4da4a880649c0e5e29bbc1187319c7ffb7..296caf5fbe200202623b3c26168abc7da860f210 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py @@ -1,21 +1,18 @@ # exercise 9.1.3 - import importlib_resources from matplotlib.pyplot import figure, show from scipy.io import loadmat -from sklearn.ensemble import RandomForestClassifier - from dtuimldmtools import dbplot, dbprobplot +from sklearn.ensemble import RandomForestClassifier filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth7.mat") - # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] +classNames = [name[0][0] for name in mat_data['classNames']] N, M = X.shape C = len(classNames) @@ -30,15 +27,13 @@ y_est = rf_classifier.predict(X).T y_est_prob = rf_classifier.predict_proba(X).T # Compute classification error -ErrorRate = (y != y_est).sum(dtype=float) / N -print("Error rate: {:.2f}%".format(ErrorRate * 100)) +ErrorRate = (y!=y_est).sum(dtype=float)/N +print('Error rate: {:.2f}%'.format(ErrorRate*100)) -# Plot decision boundaries -figure(1) -dbprobplot(rf_classifier, X, y, "auto", resolution=400) -figure(2) -dbplot(rf_classifier, X, y, "auto", resolution=400) +# Plot decision boundaries +figure(1); dbprobplot(rf_classifier, X, y, 'auto', resolution=400) +figure(2); dbplot(rf_classifier, X, y, 'auto', resolution=400) show() -print("Ran Exercise 9.1.3") +print('Ran Exercise 9.1.3') \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py index 8c6fe679b05e39dd43a22bd4b9a50cd5042d4f1a..a582f7f20c63fa72e58d5c074706d740da195293 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py @@ -1,24 +1,19 @@ # exercise 9.2.1 - import importlib_resources from matplotlib.pyplot import figure, show - -# import numpy as np from scipy.io import loadmat -from sklearn.linear_model import LogisticRegression from sklearn.model_selection import StratifiedKFold - -from dtuimldmtools import confmatplot, rocplot +from sklearn.linear_model import LogisticRegression +from dtuimldmtools import rocplot, confmatplot filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") - # Load Matlab data file and extract variables of interest mat_data = 
loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"][0]] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'][0]] +classNames = [name[0][0] for name in mat_data['classNames']] N, M = X.shape C = len(classNames) @@ -26,27 +21,27 @@ C = len(classNames) K = 2 CV = StratifiedKFold(K, shuffle=True) -k = 0 -for train_index, test_index in CV.split(X, y): +k=0 +for train_index, test_index in CV.split(X,y): print(train_index) # extract training and test set for current CV fold - X_train, y_train = X[train_index, :], y[train_index] - X_test, y_test = X[test_index, :], y[test_index] + X_train, y_train = X[train_index,:], y[train_index] + X_test, y_test = X[test_index,:], y[test_index] logit_classifier = LogisticRegression() logit_classifier.fit(X_train, y_train) y_test_est = logit_classifier.predict(X_test).T - p = logit_classifier.predict_proba(X_test)[:, 1].T + p = logit_classifier.predict_proba(X_test)[:,1].T figure(k) rocplot(p, y_test) - figure(k + 1) - confmatplot(y_test, y_test_est) - - k += 2 + figure(k+1) + confmatplot(y_test,y_test_est) -show() + k+=2 + +show() -print("Ran Exercise 9.2.1") +print('Ran Exercise 9.2.1') \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py index a6e2a331b6b3debc5f1affd77ec5bc681f563973..138ce3dea61637541f17fb2b3820f898abb838ab 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py @@ -1,26 +1,22 @@ # exercise 9.2.2 - import importlib_resources from matplotlib.pyplot import figure, show - -# import numpy as np from scipy.io import loadmat -from sklearn.linear_model import LogisticRegression from sklearn.model_selection import StratifiedKFold - -from dtuimldmtools import confmatplot, rocplot +from sklearn.linear_model import LogisticRegression +from dtuimldmtools import rocplot, confmatplot filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") # Load Matlab data file and extract variables of interest mat_data = loadmat(filename) -X = mat_data["X"] -y = mat_data["y"].squeeze() -attributeNames = [name[0] for name in mat_data["attributeNames"][0]] -classNames = [name[0][0] for name in mat_data["classNames"]] +X = mat_data['X'] +y = mat_data['y'].squeeze() +attributeNames = [name[0] for name in mat_data['attributeNames'][0]] +classNames = [name[0][0] for name in mat_data['classNames']] -attribute_included = 10 # alcohol contents -X = X[:, attribute_included].reshape(-1, 1) +attribute_included = 10 # alcohol contents +X = X[:,attribute_included].reshape(-1,1) attributeNames = attributeNames[attribute_included] N, M = X.shape C = len(classNames) @@ -29,25 +25,25 @@ C = len(classNames) K = 2 CV = StratifiedKFold(K, shuffle=True) -k = 0 -for train_index, test_index in CV.split(X, y): +k=0 +for train_index, test_index in CV.split(X,y): print(train_index) # extract training and test set for current CV fold - X_train, y_train = X[train_index, :], y[train_index] - X_test, y_test = X[test_index, :], y[test_index] + X_train, y_train = X[train_index,:], y[train_index] + X_test, y_test = X[test_index,:], y[test_index] logit_classifier = LogisticRegression() logit_classifier.fit(X_train, y_train) y_test_est = logit_classifier.predict(X_test).T - p = logit_classifier.predict_proba(X_test)[:, 1].T + p = 
logit_classifier.predict_proba(X_test)[:,1].T figure(k) - rocplot(p, y_test) - - figure(k + 1) - confmatplot(y_test, y_test_est) + rocplot(p,y_test) - k += 2 + figure(k+1) + confmatplot(y_test,y_test_est) -show() + k+=2 + +show() \ No newline at end of file diff --git a/exercises/02450Toolbox_R/02450Toolbox_R_Development.Rproj b/exercises/02450Toolbox_R/02450Toolbox_R_Development.Rproj new file mode 100644 index 0000000000000000000000000000000000000000..8e3c2ebc99e2e337f7d69948b93529a437590b27 --- /dev/null +++ b/exercises/02450Toolbox_R/02450Toolbox_R_Development.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX
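# Aside, not part of the patch: ex9_2_1.py and ex9_2_2.py above rely on the toolbox's rocplot and
# confmatplot. As a sketch only, the same quantities can be read off directly with sklearn.metrics,
# assuming p holds the predicted probability of class 1 and y_test_est the hard class predictions
# (as in those scripts):
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

def roc_and_confusion(y_test, p, y_test_est):
    fpr, tpr, thresholds = roc_curve(y_test, p)  # operating points of the ROC curve
    auc = roc_auc_score(y_test, p)               # area under that ROC curve
    cm = confusion_matrix(y_test, y_test_est)    # rows = true class, columns = predicted class
    return fpr, tpr, auc, cm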