From 2c93bc6d68d82abf5f0622c195b1ff15b1f6eb25 Mon Sep 17 00:00:00 2001
From: Matte23
Date: Thu, 31 Oct 2024 16:22:50 +0100
Subject: [PATCH] labs: Add second lab (partial)

---
 Labs/Lab2 - Numpy.ipynb | 765 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 765 insertions(+)
 create mode 100644 Labs/Lab2 - Numpy.ipynb

diff --git a/Labs/Lab2 - Numpy.ipynb b/Labs/Lab2 - Numpy.ipynb
new file mode 100644
index 0000000..6db024e
--- /dev/null
+++ b/Labs/Lab2 - Numpy.ipynb
@@ -0,0 +1,765 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b1ea060a47a6211d",
+   "metadata": {},
+   "source": [
+    "# LAB #2: Numpy\n",
+    "\n",
+    "## Introduction\n",
+    "In this laboratory, you will perform some operations with NumPy arrays in order to build your first Machine Learning model. \n",
+    "In particular, you will build a NumPy-based version of the K-Nearest Neighbors algorithm (a.k.a. KNN).\n",
+    "\n",
+    "## 0 Preliminary steps\n",
+    "### 0.1 NumPy\n",
+    "Make sure you have the NumPy library installed; its use is strongly recommended for this laboratory.\n",
+    "NumPy is the fundamental package for scientific computing with Python. You can read more about it on\n",
+    "the official documentation.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9246699975edf562",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install numpy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad497ed1d0092203",
+   "metadata": {},
+   "source": [
+    "### 0.2 Iris dataset download \n",
+    "For this lab, you will need two of the datasets you have already met: Iris and MNIST. Please refer to\n",
+    "Laboratory 1 for a complete description of the datasets.\n",
+    "Iris. You can download it from:\n",
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a838a5ed77a24051",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# linux users\n",
+    "# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -O iris.csv\n",
+    "# windows users\n",
+    "! pip install wget\n",
+    "import wget\n",
+    "wget.download(\"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\", \"iris.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef169d9060adb9a7",
+   "metadata": {},
+   "source": [
+    "## 1 Exercises \n",
+    "Note that exercises marked with a ($\star$) are optional; you should focus on completing the other ones first."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a820274dc6b6f678",
+   "metadata": {},
+   "source": [
+    "## 1.1 Iris Analysis with Numpy\n",
+    "As you might remember from Lab. 1, the Iris dataset collects the measurements of different Iris flowers,\n",
+    "and each data point is characterized by 4 **features** (sepal length, sepal width, petal length, petal width) and is associated with 1 **label** (i.e. an Iris species - Setosa, Versicolor, or Virginica), which in this case is the last element of the row (last column of the csv file). "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46864c46cf9f9387",
+   "metadata": {},
+   "source": [
+    "1. Load the Iris dataset. You can use the `csv` library that we saw in the last laboratory or read it with the standard `open(filename, mode)`. \n",
+    "In the second case, remember to split the fields correctly and to strip newline characters. In either case, check for empty lines. \n",
\n", + "This time remember to store the 4 features in a numpy array `x` of shape (n_sample, 4) and the labels in a different array `y` of shape (n_sample,) converting the 3 different species to a corresponding numerical value. E.g.,\n", + " - Iris-setosa: 0\n", + " - Iris-versicolor: 1\n", + " - Iris-virginica: 2\n", + "\n", + "In order to check you have correctly loaded the data, print the shape of the two arrays: you should find\n", + "(150, 4) for `x` and (150,) for `y`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a977ccc88ef2ca39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(150, 4)\n", + "(150,)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "def type_mapper(type):\n", + " match type:\n", + " case b\"Iris-setosa\":\n", + " return 0\n", + " case b\"Iris-versicolor\":\n", + " return 1\n", + " case b\"Iris-virginica\":\n", + " return 2\n", + " \n", + " return -1\n", + "\n", + "raw_csv = np.loadtxt(\"iris.csv\",\n", + " delimiter=\",\", dtype=float, converters={4:type_mapper})\n", + "\n", + "x = raw_csv[:,0:4]\n", + "y = raw_csv[:,4]\n", + "\n", + "print(x.shape)\n", + "print(y.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "5050d162966956ce", + "metadata": {}, + "source": [ + "2. Compute again the mean and standard deviation for each class by means of the numpy functions" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "33bfaed602d4bc3e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Metrics for specie 0\n", + "Sepal length for mean: 5.006, std_dev: 0.3489469873777391\n", + "Sepal width mean: 3.418, std_dev: 0.37719490982779713\n", + "Petal length mean: 1.464, std_dev: 0.17176728442867112\n", + "Petal width mean: 0.244, std_dev: 0.10613199329137281\n", + "\n", + "Metrics for specie 1\n", + "Sepal length for mean: 5.936, std_dev: 0.5109833656783751\n", + "Sepal width mean: 2.7700000000000005, std_dev: 0.31064449134018135\n", + "Petal length mean: 4.26, std_dev: 0.4651881339845203\n", + "Petal width mean: 1.3259999999999998, std_dev: 0.19576516544063705\n", + "\n", + "Metrics for specie 2\n", + "Sepal length for mean: 6.587999999999998, std_dev: 0.6294886813914926\n", + "Sepal width mean: 2.974, std_dev: 0.3192553836664309\n", + "Petal length mean: 5.5520000000000005, std_dev: 0.546347874526844\n", + "Petal width mean: 2.0260000000000002, std_dev: 0.2718896835115301\n", + "\n" + ] + } + ], + "source": [ + "for i in range(3):\n", + " iris = x[np.ma.masked_where(y, y==i)]\n", + "\n", + " print(f\"Metrics for specie {i}\")\n", + " print(f\"Sepal length for mean: {iris[:,0].mean()}, std_dev: {iris[:,0].std()}\")\n", + " print(f\"Sepal width mean: {iris[:,1].mean()}, std_dev: {iris[:,1].std()}\")\n", + " print(f\"Petal length mean: {iris[:,2].mean()}, std_dev: {iris[:,2].std()}\")\n", + " print(f\"Petal width mean: {iris[:,3].mean()}, std_dev: {iris[:,3].std()}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "1f84beb708797ba9", + "metadata": {}, + "source": [ + "3. Compute the distances among two samples (e.g., the $36^{th}$ and the $81^{th}$, the $13^{th}$ and the $15^{th}$) \n", + "by means of the `np.linalg.norm(a-b)` function which computes the norm of `a-b`, i.e., the euclidean distance between the feature of the `a` and of the `b` samples. 
\n", + " - Can you guess if the two couples of samples belong to the same species?\n", + " - From the mean and standard deviations computed before can you guess which species? " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4a47fb722be07fb4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.7892651361962706\n", + "1.4317821063276353\n" + ] + } + ], + "source": [ + "print(np.linalg.norm(x[35]-x[81]))\n", + "print(np.linalg.norm(x[12]-x[14]))" + ] + }, + { + "cell_type": "markdown", + "id": "9dc024bce0c0dd04", + "metadata": { + "collapsed": false + }, + "source": [ + "TODO: write your comment here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd802b47b8519bb3", + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + " " + ] + }, + { + "cell_type": "markdown", + "id": "f3fa448bd7bc9d94", + "metadata": { + "collapsed": false + }, + "source": [ + "TODO: write your comment here" + ] + }, + { + "cell_type": "markdown", + "id": "dcceaccd4a1a7526", + "metadata": { + "collapsed": false + }, + "source": [ + "4. Find the k nearest neighbors of a sample in the dataset.\n", + " - Define a function `k_nearest_neighbors(x, x_set, k)` that takes as input a sample `x` and a set of sample (i.e., a matrix) `x_set` and returns the indices of the `k` nearest neighbors of `x` in `x_set`.\n", + " - Reuse the `euclidean_distance` function that you defined before to do so. \n", + " - Remember that the `x_set` is a matrix of shape ($N_{samples}, N_{features}$), so you have to compute the distance between `x` and each row of `x_set`. \n", + " - In order to find the indices of the `k` nearest neighbors, you can use the `argsort` function that returns the indices that would sort an array\n", + " - Apply the function to the $36^{th}$ sample of the dataset with $k=5$.\n", + " - Print the indices of the $5$ nearest neighbors.\n", + " - Print the labels of the $5$ nearest neighbors. Can you guess the label of the $36^{th}$ sample?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b93f94748b3841e3", + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Label of 0 nearest neighbor: 0.0\n", + "Label of 1 nearest neighbor: 0.0\n", + "Label of 2 nearest neighbor: 0.0\n", + "Label of 3 nearest neighbor: 0.0\n", + "Label of 4 nearest neighbor: 0.0\n", + "Real label: 0.0\n" + ] + } + ], + "source": [ + "def k_nearest_neighbors(x: np.ndarray, x_set: np.ndarray, k: int):\n", + " distances = np.linalg.norm(x-x_set, axis=1)\n", + " distances_sorted = np.argsort(distances)\n", + "\n", + " return distances_sorted[0:k]\n", + "\n", + "indices = k_nearest_neighbors(x[35], x, 5)\n", + "for i, k in enumerate(indices):\n", + " print(f\"Label of {i} nearest neighbor: {y[k]}\")\n", + "\n", + "print(f\"Real label: {y[35]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4de2b1c8798fc98e", + "metadata": {}, + "source": [ + "TODO: write your comment here" + ] + }, + { + "cell_type": "markdown", + "id": "9dd1f94b256663e8", + "metadata": {}, + "source": [ + "## 1.2 KNN design and implementation\n", + "In this exercise, you will implement your own version of the K-Nearest Neighbors (KNN) algorithm, and you will use it to assign an\n", + "Iris species (i.e. a label) to flowers whose species is unknown.\n", + "\n", + "The KNN algorithm is straightforward. 
+    "Suppose that the measurements (e.g., the iris features) and the respective labels (e.g., the iris species) of a set of samples are known in advance. \n",
+    "\n",
+    "Then, whenever we want to label a new sample, we look at the K most similar points (a.k.a. neighbors) and assign a label accordingly. \n",
+    "\n",
+    "The simplest solution is using a majority voting scheme: if the majority of the neighbors votes for a label, we will go for it. \n",
+    "This approach is naive only at first sight: the local similarity assumed by KNN happens to be roughly true, as you have seen in the previous exercises.\n",
+    "Even though this reasoning does not generalize well, KNN provides a valid baseline for your tasks.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5d185976071690ce",
+   "metadata": {},
+   "source": [
+    "1. Let’s identify a portion of our data for which we will try to guess the species. Randomly select 20%\n",
+    "of the records and store the first four columns (i.e. the features representing each flower) into a\n",
+    "two-dimensional numpy array of shape ($N_{test}, 4$); you can call it `X_test`, where $N_{test}$ is 20% of the total number of samples.\n",
+    "For the same records, store the label column (i.e. the one with the species values) into another array, namely `y_test`. \n",
+    "This is the data that will be used to test the accuracy of your KNN implementation and its correct functioning (i.e. the testing data)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "a642f03b563650e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1. 0. 0. 2. 2. 0. 2. 1. 1. 1. 1. 1. 2. 2. 2. 1. 1. 2. 2. 2. 1. 2. 2. 2.\n",
+      " 0. 2. 1. 0. 2. 1.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# sample 20% of the indices without replacement to form the test set\n",
+    "test_subset_indices = np.random.choice(len(y), size=int(len(y)*0.2), replace=False)\n",
+    "X_test = x[test_subset_indices]\n",
+    "y_test = y[test_subset_indices]\n",
+    "\n",
+    "print(y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "192e5663358e8e82",
+   "metadata": {},
+   "source": [
+    "2. Store the remaining 80% of the records in the same way. In this case, use the names X_train and y_train for the arrays.\n",
+    "This is the data that your model will use as ground-truth knowledge (i.e. the training data, from which we extract the knowledge and that we will use for comparison).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "b9f1639cc7fe3b53",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
+      " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.\n",
+      " 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n",
+      " 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.\n",
+      " 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# boolean mask that is True for every sample not drawn for the test set\n",
+    "train_mask = np.ones(len(y), dtype=bool)\n",
+    "train_mask[test_subset_indices] = False\n",
+    "X_train = x[train_mask]\n",
+    "y_train = y[train_mask]\n",
+    "\n",
+    "print(y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dbbc62af2fef1d5c",
+   "metadata": {},
+   "source": [
+    "3. Focus now on the KNN technique. \n",
+    "From next month on, you will use the `scikit-learn` package. Many of its functionalities\n",
+    "are exposed via an object-oriented interface.\n",
+    "With this paradigm in mind, implement now the KNN\n",
+    "algorithm and expose it as a Python class. The bare skeleton of your class should look like this (you\n",
+    "are free to add other methods if you want to).\n",
+    "\n",
+    "```\n",
+    "class KNearestNeighbors:\n",
+    "    def __init__(self, k):\n",
+    "        \"\"\"\n",
+    "        Store the value of k in an attribute of the class and initialize other attributes.\n",
+    "        :param k : int, number of neighbors to consider.\n",
+    "        \"\"\"\n",
+    "        pass # TODO: implement it!\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        \"\"\"\n",
+    "        Store the 'prior knowledge' of your model that will be used\n",
+    "        to predict new labels.\n",
+    "        :param X : input data points, ndarray, shape = (R,C).\n",
+    "        :param y : input labels, ndarray, shape = (R,).\n",
+    "        \"\"\"\n",
+    "        pass # TODO: implement it!\n",
+    "    \n",
+    "    def predict(self, X):\n",
+    "        \"\"\"Run the KNN classification on X.\n",
+    "        :param X: input data points, ndarray, shape = (N,C).\n",
+    "        :return: labels : ndarray, shape = (N,).\n",
+    "        \"\"\"\n",
+    "        pass # TODO: implement it!\n",
+    "```\n",
+    "\n",
+    "Implement the `__init__` and `fit` methods first. \n",
+    "- In the `__init__` method, you should store the value of `k` in a private attribute of the class.\n",
+    "- In the `fit` method you should only store the training data in private attributes of the class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "b5de6a78df7f8585",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-10T12:53:39.426246Z",
+     "start_time": "2024-10-10T12:53:39.420295Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "class KNearestNeighbors:\n",
+    "    def __init__(self, k):\n",
+    "        \"\"\"\n",
+    "        Store the value of k in an attribute of the class and initialize other attributes.\n",
+    "        :param k : int, number of neighbors to consider.\n",
+    "        \"\"\"\n",
+    "        self.k = k\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        \"\"\"\n",
+    "        Store the 'prior knowledge' of your model that will be used\n",
+    "        to predict new labels.\n",
+    "        :param X : input data points, ndarray, shape = (R,C).\n",
+    "        :param y : input labels, ndarray, shape = (R,).\n",
+    "        \"\"\"\n",
+    "        # store the parameters, not the globals, so the model only sees the training data\n",
+    "        self.X = X\n",
+    "        self.y = y\n",
+    "\n",
+    "    def vote(self, labels: np.ndarray):\n",
+    "        # majority voting: return the label that occurs most often\n",
+    "        voting = np.unique(labels, return_counts=True)\n",
+    "        return voting[0][voting[1].argmax()]\n",
+    "\n",
+    "    def predict(self, X):\n",
+    "        \"\"\"Run the KNN classification on X.\n",
+    "        :param X: input data points, ndarray, shape = (N,C).\n",
+    "        :return: labels : ndarray, shape = (N,).\n",
+    "        \"\"\"\n",
+    "        # one row of distances per input sample, one column per training sample\n",
+    "        distances = [np.linalg.norm(x-self.X, axis=1) for x in X]\n",
+    "        distances_sorted = np.argsort(distances)\n",
+    "        # labels of the k nearest training samples for each input sample\n",
+    "        nearest_neighbors_labels = self.y[distances_sorted[:,0:self.k]]\n",
+    "\n",
+    "        return np.apply_along_axis(self.vote, 1, nearest_neighbors_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ad6f4fc7071bff0",
+   "metadata": {},
+   "source": [
+    "4. Implement the `predict` method. The function receives as input a numpy array with N rows and C\n",
+    "columns, corresponding to N flowers. The method assigns to each row one of the three Iris species \n",
+    "using the KNN algorithm, and returns the predicted species as a numpy array. \n",
+    "\n",
+    " - For finding the nearest neighbors, you can either re-use the previously defined `k_nearest_neighbors` function or \n",
+    "implement a new one exploiting the numpy broadcasting capabilities in order to avoid iterating over the sample matrix `X` (a possible broadcasting sketch is shown after the next cell).\n",
+    " - Then, assign the *predicted label* to each sample using a majority voting scheme, i.e., the label that appears most frequently among the k nearest neighbors. To do so you can use the `np.unique(neighbors_labels, return_counts=True)` function that returns the unique labels and their counts. \n",
+    " - Finally, return the predicted labels as a numpy array."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c227627e47cc7253",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-10T13:03:44.621187Z",
+     "start_time": "2024-10-10T13:03:44.609767Z"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
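+  {
+   "cell_type": "markdown",
+   "id": "3e7a1c5b9d2f4860",
+   "metadata": {},
+   "source": [
+    "For reference, here is a minimal sketch of the broadcasting approach mentioned above, assuming the `X_train` and `X_test` arrays defined earlier; the helper name `pairwise_distances` is our own choice for this illustration, not part of the required interface."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b4d2f6a0c1e3957",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch only: compute the whole (N_test, N_train) distance matrix at once.\n",
+    "def pairwise_distances(A, B):\n",
+    "    # A[:, None, :] has shape (N, 1, C) and broadcasts against B of shape (M, C),\n",
+    "    # producing an (N, M, C) array of differences; the norm over the last axis\n",
+    "    # then yields the (N, M) matrix of Euclidean distances.\n",
+    "    return np.linalg.norm(A[:, None, :] - B[None, :, :], axis=-1)\n",
+    "\n",
+    "# usage example: indices of the 5 nearest training samples for every test sample\n",
+    "dist = pairwise_distances(X_test, X_train)\n",
+    "nearest = np.argsort(dist, axis=1)[:, :5]"
+   ]
+  },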
\n", + "\n", + " - For finding nearest neighbours, you can either re-use the previously defined `k_nearest_neighbors` function or \n", + "implement a new one exploiting the numpy broadcasting capabilities in order to avoid iterating over the sample matrix `X`.\n", + " - Then, assign the *predicted label* to each sample using a majority voting scheme, i.e., the label that appears most frequently among the k nearest neighbors. To do so you can use the `np.unique(neighbours_labels, return_count=True)` function that returns the unique labels and their counts. \n", + " - Finally, return the predicted labels as a numpy array." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c227627e47cc7253", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-10T13:03:44.621187Z", + "start_time": "2024-10-10T13:03:44.609767Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "4cbd1131d3ba785d", + "metadata": {}, + "source": [ + "5. Now let’s fit the KNN model with the X_train and y_train data. Then, try to use your KNN model\n", + "to predict the species for each record in X_test and store them in a nupy array called y_pred.\n", + "As we did in the previous lab, check how many Iris species in the array y_pred have been guessed correctly computing with respect to the ones in y_test computing the accuracy. \n", + " - A prediction is correct if `y_pred[i] == y_test[i]`. To get the accuracy then compute the ratio between the number of correct guesses and the total number of guesses is known. \n", + " - If all labels are assigned correctly ((y_pred == y_test).all() == True), the accuracy of the model is 100%. \n", + " - Instead, if none of the guessed species corresponds to the real one ((y_pred == y_test).any() == False), the accuracy is 0%\n" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "ca4f0b4bbe44c9fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8666666666666667\n" + ] + } + ], + "source": [ + "knn = KNearestNeighbors(5)\n", + "knn.fit(X_train, Y_train)\n", + "predictions = knn.predict(X_test)\n", + "correct_guesses = predictions == Y_test\n", + "accuracy = np.count_nonzero(correct_guesses == True) / len(correct_guesses)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "id": "7514fc82de74b729", + "metadata": {}, + "source": [ + "6. ($\\star$) As a software developer, you might want to increase the functionalities of your product and\n", + "publish newer versions over time. The better your code is structured and organized, the lower is the\n", + "effort to release updates.\n", + "As such, extend your KNN implementation adding the parameter `distance`. This has to be one among:\n", + " - Euclidean distance: $ euclidean(p,q) = \\sqrt{\\sum_{i=1}^{n} (p_i _- q_i)^2} $\n", + " - Manhattan distance: $ manhattan(p,q) = \\sum_{i=1}^n |p_i - q_i|$\n", + " - Cosine distance: $ cosine(p, q) = 1 - \\frac{\\sum_{i=1}^n p_i q_i}{ \\sqrt{\\sum^n_{i=1} p^2_i} \\cdot \\sqrt{\\sum^n_{i=1} q_i^2}}$\n", + "\n", + "If any of this distance is not already implemented in `numpy` implement it yourself" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "436c6395a2f3d853", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "24c76d735fe65dbd", + "metadata": {}, + "source": [ + "\n", + "7. 
+  {
+   "cell_type": "markdown",
+   "id": "24c76d735fe65dbd",
+   "metadata": {},
+   "source": [
+    "\n",
+    "7. ($\star$) Again, extend now your KNN implementation by adding the parameter `weights` to the constructor,\n",
+    "as shown below:\n",
+    "\n",
+    "```\n",
+    "class KNearestNeighbors:\n",
+    "    def __init__(self, k, distance_metric=\"euclidean\", weights=\"uniform\"):\n",
+    "        self.k = k\n",
+    "        self.distance_metric = distance_metric\n",
+    "        self.weights = weights\n",
+    "```\n",
+    "\n",
+    "Change your KNN implementation to accept a new weighting scheme for the labels. If weights=\"distance\",\n",
+    "weight neighbor votes by the inverse of their distance (for the distance, again, use\n",
+    "distance_metric). The weight for a neighbor n of the point p is:\n",
+    "\n",
+    "$\n",
+    "w(p, n) = \frac{1}{distance\_metric(p, n)}\n",
+    "$\n",
+    "\n",
+    "Instead, if the default is chosen (weights=\"uniform\"), use the majority voting you already implemented\n",
+    "earlier (a possible weighted-voting sketch is shown after the next cell).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a84262b9fd13d9f1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
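+  {
+   "cell_type": "markdown",
+   "id": "2d7f5b3c9e1a4086",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the weighted vote, assuming you already have the integer labels and the distances of the k nearest neighbors of one query point; `weighted_vote` is our own helper name, not part of the required interface."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a0e8d4b2c7f1593",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def weighted_vote(labels, distances, eps=1e-12):\n",
+    "    # each neighbor votes with weight 1/distance; eps avoids a division\n",
+    "    # by zero when a neighbor coincides with the query point\n",
+    "    weights = 1.0 / (distances + eps)\n",
+    "    # np.bincount sums the weights of the votes received by each integer label\n",
+    "    scores = np.bincount(labels.astype(int), weights=weights)\n",
+    "    return scores.argmax()"
+   ]
+  },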
+  {
+   "cell_type": "markdown",
+   "id": "54f1e2a662695741",
+   "metadata": {},
+   "source": [
+    "8. ($\star$) Test the modularity of the implementation by applying it to a different dataset. Ideally, you should\n",
+    "not change the code of your KNN python class.\n",
+    "- Download the MNIST dataset and retain only 100 samples per digit. You will end up with a dataset of 1000 samples.\n",
+    "- Define again four numpy arrays as you did in the previous exercises.\n",
+    "- Apply your KNN as you did for the Iris dataset.\n",
+    "- Evaluate the accuracy on MNIST’s y_test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b720ef714195eb68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# download MNIST dataset\n",
+    "\n",
+    "# linux users\n",
+    "#! wget https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv -O mnist.csv\n",
+    "\n",
+    "# windows users\n",
+    "! pip install wget\n",
+    "import wget\n",
+    "wget.download(\"https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv\", \"mnist.csv\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "id": "77afcee410ef94ac",
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0 0 0 ... 0 0 0]\n",
+      " [0 0 0 ... 0 0 0]\n",
+      " [0 0 0 ... 0 0 0]\n",
+      " ...\n",
+      " [9 0 0 ... 0 0 0]\n",
+      " [9 0 0 ... 0 0 0]\n",
+      " [9 0 0 ... 0 0 0]]\n",
+      "(1000, 784)\n",
+      "(1000,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# extracting MNIST dataset\n",
+    "import numpy as np\n",
+    "\n",
+    "# the first column holds the digit label, the remaining 784 columns the pixel values\n",
+    "raw_csv = np.loadtxt(\"mnist.csv\", delimiter=\",\", dtype=int)\n",
+    "\n",
+    "dataset_reduced = np.empty((0,785), dtype=int)\n",
+    "\n",
+    "# keep only the first 100 samples of each digit\n",
+    "for i in range(10):\n",
+    "    items_with_digit = raw_csv[raw_csv[:,0] == i]\n",
+    "    dataset_reduced = np.concatenate((dataset_reduced, items_with_digit[0:100,:]))\n",
+    "\n",
+    "print(dataset_reduced)\n",
+    "\n",
+    "x = dataset_reduced[:,1:]\n",
+    "y = dataset_reduced[:,0]\n",
+    "\n",
+    "print(x.shape)\n",
+    "print(y.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "id": "d1a0834dd8885a2b",
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define four numpy arrays X_train, y_train, X_test, y_test\n",
+    "test_subset_indices = np.random.choice(len(y), size=int(len(y)*0.2), replace=False)\n",
+    "X_test = x[test_subset_indices]\n",
+    "y_test = y[test_subset_indices]\n",
+    "\n",
+    "train_mask = np.ones(len(y), dtype=bool)\n",
+    "train_mask[test_subset_indices] = False\n",
+    "X_train = x[train_mask]\n",
+    "y_train = y[train_mask]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 171,
+   "id": "c03d2add840c1531",
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.885\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Apply KNN on MNIST\n",
+    "knn = KNearestNeighbors(5)\n",
+    "knn.fit(X_train, y_train)\n",
+    "y_pred = knn.predict(X_test)\n",
+    "correct_guesses = y_pred == y_test\n",
+    "accuracy = np.count_nonzero(correct_guesses) / len(correct_guesses)\n",
+    "print(accuracy)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}