# exercise 1.5.2 import importlib_resources import numpy as np # You can read data from excel spreadsheets after installing and importing xlrd # module. In most cases, you will need only few functions to accomplish it: # open_workbook(), col_values(), row_values() import xlrd # If you need more advanced reference, or if you are interested how to write # data to excel files, see the following tutorial: # http://www.simplistix.co.uk/presentations/python-excel.pdf} # Get path to the datafile filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls") # Print the location of the iris.xls file on your computer. # You should inspect it manually to understand the format and content print("\nLocation of the iris.xls file: {}".format(filename)) # Load xls sheet with data # There's only a single sheet in the .xls, so we take out that sheet doc = xlrd.open_workbook(filename).sheet_by_index(0) # Extract attribute names attributeNames = doc.row_values(rowx=0, start_colx=0, end_colx=4) # Try calling help(doc.row_values). You'll see that the above means # that we extract columns 0 through 4 from the first row of the document, # which contains the header of the xls files (where the attributen names are) # Extract class names to python list, then encode with integers (dict) just as # we did previously. The class labels are in the 5th column, in the rows 2 to # and up to 151: classLabels = doc.col_values(4, 1, 151) # check out help(doc.col_values) classNames = sorted(set(classLabels)) classDict = dict(zip(classNames, range(len(classNames)))) # Extract vector y, convert to NumPy array y = np.array([classDict[value] for value in classLabels]) # Preallocate memory, then extract data to matrix X X = np.empty((150, 4)) for i in range(4): X[:, i] = np.array(doc.col_values(i, 1, 151)).T # Compute values of N, M and C. N = len(y) M = len(attributeNames) C = len(classNames)