Commit 18c851b3 authored by Stas Syrota

Adjusted scripts

parent 00de7bf7
Showing 249 additions and 227 deletions
File deleted
File deleted
The Google matrix P is a model of the internet
P_ij is nonzero if there is a link from webpage i to j
The Google matrix is used to rank all Web pages
The ranking is done by solving a matrix eigenvalue problem
England dropped out of the top 10 in the FIFA ranking
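These sentences belong to one of the small text documents removed by this commit. Since they describe web-page ranking as a matrix eigenvalue problem, the following is a minimal, illustrative sketch of that idea; the toy link matrix and the damping factor 0.85 are assumptions for illustration, not course material:

```
import numpy as np

# Toy link structure (assumed for illustration): L[i, j] = 1 if page i links to page j
L = np.array([[0, 1, 1, 0],
              [1, 0, 0, 1],
              [1, 0, 0, 1],
              [0, 0, 1, 0]], dtype=float)
P = L / L.sum(axis=1, keepdims=True)  # row-stochastic "Google matrix"

d = 0.85                              # damping factor
n = P.shape[0]
r = np.ones(n) / n                    # start from a uniform ranking

for _ in range(100):                  # power iteration towards the leading left eigenvector
    r = d * (r @ P) + (1 - d) / n

print(np.argsort(-r))                 # page indices ranked from most to least important
```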
File deleted
File deleted
File deleted
File deleted
File deleted
File deleted
# 02450 Toolbox - Python
## Installation
The exercise scripts inside `/Scripts` depend on a course-specific package, [dtuimldmtools](https://pypi.org/project/dtuimldmtools/), which needs to be installed.

We recommend using a Python virtual environment via [Anaconda](https://www.anaconda.com/download/) or [Miniconda](https://docs.conda.io/projects/miniconda/en/latest/miniconda-install.html) and installing the package inside it. To set up such an environment, follow the guide provided by [DTU Python support](https://pythonsupport.dtu.dk/python/install-conda.html).

Once the environment is set up, the package can be installed by running the following command:
```
pip install dtuimldmtools
```
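A quick way to verify the installation is to import the package from the environment you just created (a minimal check; it only confirms that the package is importable):

```
python -c "import dtuimldmtools"
```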
## Dataset
@@ -21,4 +21,4 @@ c = np.arange(100, 95, -1)
d = np.arange(1.2, 1.9, 0.1)
e = np.pi * np.arange(0, 2.5, 0.5)
## exercise 0.4.4
import numpy as np
# Extracting the elements from vectors is easy. Consider the
# following definition of x and the echoed results
x = np.concatenate([np.zeros(2), np.arange(0, 3.6, 0.6), np.ones(3)])
@@ -17,7 +18,7 @@ x[1::2] # return every other element of x starting from the 2nd
# Inserting numbers into vectors is also easy. Using the same
# definition of x, observe the results when typing
y = x
y[1::2] = np.pi
# Notice that we're inserting the same scalar value "pi" into all elements
# that we index y with
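# NOTE (added sketch, not part of the original script): "y = x" above only binds
# a second name to the same array, which is why the assignment to y also changed x.
# Use .copy() when an independent array is needed; the name z is new, introduced here.
z = x.copy()     # independent copy of x
z[1::2] = 0      # modifying z leaves x and y untouched
print(x[1::2])   # still pi at the odd indices
print(z[1::2])   # zeros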
@@ -51,14 +51,14 @@ a4 = np.eye(3) # diagonal array
a5 = np.random.rand(2, 3) # random array
a6 = a1.copy() # copy
a7 = a1 # alias
m1 = np.matrix("1 2 3; 4 5 6; 7 8 9") # define matrix by string
m2 = np.asmatrix(a1.copy()) # copy array into matrix
m3 = np.mat(np.array([1, 2, 3])) # map array onto matrix
a8 = np.asarray(m1) # map matrix onto array
# It is easy to extract and/or modify selected items from arrays/matrices.
# Here is how you can index matrix elements:
m = np.matrix("1 2 3; 4 5 6; 7 8 9")
m[0, 0] # first element
m[-1, -1] # last element
m[0, :] # first row
@@ -67,7 +67,7 @@ m[1:3,-1] # view on selected rows&columns
# Similarly, you can selectively assign values to matrix elements or columns:
m[-1, -1] = 10000
m[0:2, -1] = np.matrix("100; 1000")
m[:, 0] = 0
# Logical indexing can be used to change or take only elements that
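# NOTE (added sketch, not part of the original script): a minimal example of the
# logical indexing mentioned in the truncated comment above -- a boolean condition
# selects exactly the elements that satisfy it.
mask = m > 5     # boolean matrix, True where the condition holds
print(m[mask])   # take only the elements larger than 5
m[mask] = 0      # or overwrite exactly those elements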
## exercise 0.5.1
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(0, 1, 0.1)
f = np.exp(x)
plt.figure(1)
plt.plot(x, f)
plt.xlabel("x")
plt.ylabel("f(x)=exp(x)")
plt.title("The exponential function")
plt.show()
## exercise 0.5.2
import matplotlib.pyplot as plt
import numpy as np
# We simulate measurements every 100 ms for a period of 10 seconds
t = np.arange(0, 10, 0.1)
@@ -14,40 +13,40 @@ sensor2 = 3*np.cos(t)+0.5*np.random.normal(size=len(t))
# Change the font size to make axis and title readable
font_size = 15
plt.rcParams.update({"font.size": font_size})
# Define the name of the curves
legend_strings = ["Sensor 1", "Sensor 2"]
# Start plotting the simulated measurements
plt.figure(1)
# Plot the sensor 1 output as a function of time, and
# make the curve red and fully drawn
plt.plot(t, sensor1, "r-")
# Plot the sensor 2 output as a function of time, and
# make the curve blue and dashed
plt.plot(t, sensor2, "b--")
# Ensure that the limits on the axis fit the data
plt.axis("tight")
# Add a grid in the background
plt.grid()
# Add a legend describing each curve, place it at the "best" location
# so as to minimize the amount of curve it covers
plt.legend(legend_strings, loc="best")
# Add labels to the axes
plt.xlabel("Time [s]")
plt.ylabel("Voltage [mV]")
# Add a title to the plot
plt.title("Sensor outputs")
# Export the figure
plt.savefig("ex1_5_2.png")
# Show the figure in the console
plt.show()
# exercise 10.1.1
import importlib_resources
from matplotlib.pyplot import figure, show
from scipy.io import loadmat
from sklearn.cluster import k_means
from dtuimldmtools import clusterplot
filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat")
# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()
attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
classNames = [name[0][0] for name in mat_data["classNames"]]
N, M = X.shape
C = len(classNames)
@@ -24,4 +28,4 @@ figure(figsize=(14,9))
clusterplot(X, cls, centroids, y)
show()
print("Ran Exercise 10.1.1")
# exercise 10.1.3
import importlib_resources
import numpy as np
from matplotlib.pyplot import figure, legend, plot, show, title, ylim
from scipy.io import loadmat
from sklearn.cluster import k_means
from dtuimldmtools import clusterval
filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat")
# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()
attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
classNames = [name[0][0] for name in mat_data["classNames"]]
N, M = X.shape
C = len(classNames)
@@ -34,11 +38,11 @@ for k in range(K-1):
# Plot results:
figure(1)
title("Cluster validity")
plot(np.arange(K - 1) + 2, Rand)
plot(np.arange(K - 1) + 2, Jaccard)
plot(np.arange(K - 1) + 2, NMI)
legend(["Rand", "Jaccard", "NMI"], loc=4)
show()
print("Ran Exercise 10.1.3")
# exercise 10_1_5
import importlib_resources
import numpy as np
from matplotlib import pyplot as plt
from scipy.io import loadmat
from sklearn.cluster import k_means
filename = importlib_resources.files("dtuimldmtools").joinpath("data/wildfaces.mat")
# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
# mat_data = loadmat('../Data/digits.mat') #<-- uncomment this for using the digits dataset
X = mat_data["X"]
N, M = X.shape
# Image resolution and number of colors
x = 40 # <-- change this for using the digits dataset
@@ -45,9 +48,11 @@ for k in range(K):
# Squeeze out singleton dimension
# and flip the image (cancel out the previous transpose)
img = np.squeeze(img).T
plt.imshow(img, interpolation="None", cmap=cmap)
plt.xticks([])
plt.yticks([])
if k == np.floor((n2 - 1) / 2):
plt.title("Centroids")
# Plot few randomly selected faces and their nearest centroids
L = 5 # number of images to plot
@@ -58,16 +63,19 @@ for l in range(L):
img = np.resize(X[j[l], :], (c, x, y)).T
if c == 1:
img = np.squeeze(img).T
plt.imshow(img, interpolation="None", cmap=cmap)
plt.xticks([])
plt.yticks([])
if l == np.floor((L - 1) / 2):
plt.title("Randomly selected faces and their centroids")
plt.subplot(2, L, L + l + 1)
img = np.resize(centroids[cls[j[l]], :], (c, x, y)).T
if c == 1:
img = np.squeeze(img).T
plt.imshow(img, interpolation="None", cmap=cmap)
plt.xticks([])
plt.yticks([])
plt.show()
print("Ran Exercise 10.1.5")
# exercise 10.2.1
import importlib_resources
from matplotlib.pyplot import figure, show
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.io import loadmat
from dtuimldmtools import clusterplot
filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat")
# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()
attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
classNames = [name[0][0] for name in mat_data["classNames"]]
N, M = X.shape
C = len(classNames)
# Perform hierarchical/agglomerative clustering on data matrix
Method = "single"
Metric = "euclidean"
Z = linkage(X, method=Method, metric=Metric)
# Compute and display clusters by thresholding the dendrogram
Maxclust = 4
cls = fcluster(Z, criterion="maxclust", t=Maxclust)
figure(1)
clusterplot(X, cls.reshape(cls.shape[0], 1), y=y)
# Display dendrogram
max_display_levels = 6
figure(2, figsize=(10, 4))
dendrogram(
Z, truncate_mode="level", p=max_display_levels, color_threshold=Z[-Maxclust + 1, 2]
)
show()
print("Ran Exercise 10.2.1")
# exercise 11.1.1
import importlib_resources
import numpy as np
from matplotlib.pyplot import figure, show
from scipy.io import loadmat
from sklearn.mixture import GaussianMixture
from dtuimldmtools import clusterplot
filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth2.mat")
# Load Matlab data file and extract variables of interest
mat_data = loadmat(filename)
X = mat_data["X"]
y = mat_data["y"].squeeze()
attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
classNames = [name[0][0] for name in mat_data["classNames"]]
# X_old = X
# X = np.hstack([X,X])
N, M = X.shape
C = len(classNames)
# Number of clusters
K = 4
cov_type = "full" # e.g. 'full' or 'diag'
# define the initialization procedure (initial value of means)
initialization_method = "random" # 'random' or 'kmeans'
# random signifies random initialization, kmeans means we run a K-means and use the
# result as the starting point. K-means might converge faster/better than
# random, but might also cause the algorithm to get stuck in a poor local minimum
@@ -28,15 +33,21 @@ initialization_method = 'random'# 'random' or 'kmeans'
reps = 1
# number of fits with different initializations; the best result will be kept
# Fit Gaussian mixture model
gmm = GaussianMixture(
n_components=K,
covariance_type=cov_type,
n_init=reps,
tol=1e-6,
reg_covar=1e-6,
init_params=initialization_method,
).fit(X)
cls = gmm.predict(X)
# extract cluster labels
cds = gmm.means_
# extract cluster centroids (means of gaussians)
covs = gmm.covariances_
# extract cluster shapes (covariances of gaussians)
if cov_type.lower() == "diag":
new_covs = np.zeros([K, M, M])
count = 0
@@ -58,4 +69,4 @@ show()
# clusterplot(X[:,idx], clusterid=cls, centroids=cds[:,idx], y=y, covars=covs[:,idx,:][:,:,idx])
# show()
print("Ran Exercise 11.1.1")