diff --git a/.gitignore b/.gitignore
index 6769e21..ca2ff6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,6 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
\ No newline at end of file
+#.idea/
+
+notebooks/
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
index a673446..5ef33a7 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -3,4 +3,6 @@ good-names=
     X,
    y,
    n,
-    i
\ No newline at end of file
+    i,
+    j,
+    k,
\ No newline at end of file
diff --git a/cmnemoi_learn/__init__.py b/cmnemoi_learn/__init__.py
index 2d8d8dd..83d6361 100644
--- a/cmnemoi_learn/__init__.py
+++ b/cmnemoi_learn/__init__.py
@@ -3,4 +3,4 @@
 with high quality development practices.
 """
 
-__all__ = ["classification", "regression"]
+__all__ = ["classification", "metrics", "regression"]
diff --git a/cmnemoi_learn/classification/__init__.py b/cmnemoi_learn/classification/__init__.py
index 77e1974..3cb01ed 100644
--- a/cmnemoi_learn/classification/__init__.py
+++ b/cmnemoi_learn/classification/__init__.py
@@ -1,7 +1,8 @@
 """
-Module implementing machine learning models for regression tasks.
+Module implementing machine learning models for classification tasks.
 """
 
+from ._knn_classifier import KNNClassifier
 from ._logistic_regression import LogisticRegression
 
-__all__ = ["LogisticRegression"]
+__all__ = ["KNNClassifier", "LogisticRegression"]
diff --git a/cmnemoi_learn/classification/_knn_classifier.py b/cmnemoi_learn/classification/_knn_classifier.py
new file mode 100644
index 0000000..19e7352
--- /dev/null
+++ b/cmnemoi_learn/classification/_knn_classifier.py
@@ -0,0 +1,65 @@
+"""
+File defining a K-Nearest Neighbors classifier model.
+"""
+from typing import Callable, Self
+import numpy as np
+
+from ._abstract_classifier import AbstractClassifier
+from ..metrics import manhattan_distance
+
+
+class KNNClassifier(AbstractClassifier):
+    """K-Nearest Neighbors (KNN) classifier model.
+
+    The KNN classifier memorizes all instances of
+    the training set passed to the `fit` method.
+
+    Then, for each new instance passed to the `predict`
+    method, it finds the K nearest training instances
+    according to a given norm (L1, L2, ...) and returns
+    the majority label as the prediction.
+
+    For the moment, only the L1 and L2 norms are available,
+    via the `metrics` module.
+
+    Args:
+        k (int): The number of neighbors to evaluate.
+        distance (Callable): The function used to compute the distance
+            between points. Defaults to the Manhattan distance.
+    """
+
+    def __init__(
+        self, k: int, distance: Callable[[np.ndarray, np.ndarray], float] = manhattan_distance
+    ) -> None:
+        super().__init__()
+        self.k = k
+        self.dataset = np.array([])
+        self.distance = distance
+
+    def fit(self, X: np.ndarray, y: np.ndarray) -> Self:
+        self.dataset = np.concatenate([X, self._reshape_ndarray(y)], axis=1)
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        nb_rows, max_column = X.shape
+        # compute the distance between each input to predict and every point in the training set
+        distances_between_input_and_dataset = np.full((nb_rows, self.dataset.shape[0]), np.inf)
+        for i, x_i in enumerate(X):
+            for j, x_j in enumerate(self.dataset[:, :max_column]):
+                distances_between_input_and_dataset[i, j] = self.distance(x_i, x_j)
+
+        # get the K nearest neighbors and associate them with their labels
+        nearest_neighbor_indexes = np.full((nb_rows, self.k), 0)
+        nearest_neighbor_labels = np.full((nb_rows, self.k), 0)
+        for i, distance in enumerate(distances_between_input_and_dataset):
+            nearest_neighbor_indexes[i] = np.argpartition(distance, kth=self.k, axis=-1)[: self.k]
+            nearest_neighbor_labels[i] = self.dataset[nearest_neighbor_indexes[i], max_column]
+
+        # count the number of occurrences of each label among the nearest neighbors and return
+        # the label with the highest count
+        return np.array(
+            [
+                np.argmax(np.bincount(nearest_neighbor_labels[i]))
+                for i in range(len(nearest_neighbor_labels))
+            ]
+        )
diff --git a/cmnemoi_learn/metrics.py b/cmnemoi_learn/metrics.py
new file mode 100644
index 0000000..73e1b9b
--- /dev/null
+++ b/cmnemoi_learn/metrics.py
@@ -0,0 +1,33 @@
+"""Module defining machine learning metrics and distances"""
+
+import numpy as np
+
+
+def euclidian_distance(v_1: np.ndarray, v_2: np.ndarray) -> float:
+    """Compute the Euclidean distance (L2 norm) between two vectors.
+
+    Args:
+        v_1: Vector as numpy array
+        v_2: Vector as numpy array
+
+    Returns:
+        float: Euclidean distance
+    """
+    if v_1.shape != v_2.shape:
+        raise ValueError("v_1 and v_2 should have the same shape")
+    return np.sqrt(np.sum((v_1 - v_2) ** 2))
+
+
+def manhattan_distance(v_1: np.ndarray, v_2: np.ndarray) -> float:
+    """Compute Manhattan distance (L1 norm) between two vectors.
+ + Args: + v_1: Vector as numpy array + v_2 Vector as numpy array + + Returns: + float: Manhattan distance + """ + if v_1.shape != v_2.shape: + raise ValueError("v_1 and v_2 should have the same shape") + return np.sum(np.abs(v_1 - v_2)) diff --git a/mypy.ini b/mypy.ini index 5d5a0ca..e91fba1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,2 +1,5 @@ +[mypy-scipy.*] +ignore_missing_imports = True + [mypy-sklearn.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/notebooks/notebook.ipynb b/notebooks/notebook.ipynb deleted file mode 100644 index e01361e..0000000 --- a/notebooks/notebook.ipynb +++ /dev/null @@ -1,339 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'matplotlib'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdatasets\u001b[39;00m \u001b[39mimport\u001b[39;00m make_blobs, make_circles\n\u001b[1;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmetrics\u001b[39;00m \u001b[39mimport\u001b[39;00m accuracy_score, log_loss\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'matplotlib'" - ] - } - ], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.datasets import make_blobs, make_circles\n", - "from sklearn.metrics import accuracy_score, log_loss\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fonctions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def initialisation(dimensions):\n", - " \n", - " parametres = {}\n", - " C = len(dimensions)\n", - "\n", - " np.random.seed(1)\n", - "\n", - " for c in range(1, C):\n", - " parametres['W' + str(c)] = np.random.randn(dimensions[c], dimensions[c - 1])\n", - " parametres['b' + str(c)] = np.random.randn(dimensions[c], 1)\n", - "\n", - " return parametres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def forward_propagation(X, parametres):\n", - " \n", - " activations = {'A0': X}\n", - "\n", - " C = len(parametres) // 2\n", - "\n", - " for c in range(1, C + 1):\n", - "\n", - " Z = parametres['W' + str(c)].dot(activations['A' + str(c - 1)]) + parametres['b' + str(c)]\n", - " activations['A' + str(c)] = 1 / (1 + np.exp(-Z))\n", - "\n", - " return activations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def back_propagation(y, parametres, activations):\n", - "\n", - " m = y.shape[1]\n", - " C = len(parametres) // 2\n", - "\n", - " dZ = activations['A' + str(C)] - y\n", - " gradients = {}\n", - "\n", - " for c in reversed(range(1, C + 1)):\n", - " gradients['dW' + str(c)] = 1/m * np.dot(dZ, activations['A' + str(c - 1)].T)\n", - " gradients['db' + str(c)] = 1/m * np.sum(dZ, axis=1, keepdims=True)\n", - " if c > 1:\n", - " dZ = np.dot(parametres['W' + str(c)].T, dZ) * activations['A' + str(c - 1)] * (1 - activations['A' + str(c - 1)])\n", - "\n", - " return gradients" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def update(gradients, parametres, learning_rate):\n", - "\n", - " C = len(parametres) // 2\n", - "\n", - " for c in range(1, C + 1):\n", - " parametres['W' + str(c)] = parametres['W' + str(c)] - learning_rate * gradients['dW' + str(c)]\n", - " parametres['b' + str(c)] = parametres['b' + str(c)] - learning_rate * gradients['db' + str(c)]\n", - "\n", - " return parametres" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def predict(X, parametres):\n", - " activations = forward_propagation(X, parametres)\n", - " C = len(parametres) // 2\n", - " Af = activations['A' + str(C)]\n", - " return Af >= 0.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "def deep_neural_network(X, y, hidden_layers = (16, 16, 16), learning_rate = 0.001, n_iter = 3000):\n", - " \n", - " # initialisation parametres\n", - " dimensions = list(hidden_layers)\n", - " dimensions.insert(0, X.shape[0])\n", - " dimensions.append(y.shape[0])\n", - " np.random.seed(1)\n", - " parametres = initialisation(dimensions)\n", - "\n", - " # tableau numpy contenant les futures accuracy et log_loss\n", - " training_history = np.zeros((int(n_iter), 2))\n", - "\n", - " C = len(parametres) // 2\n", - "\n", - " # gradient descent\n", - " for i in tqdm(range(n_iter)):\n", - "\n", - " activations = forward_propagation(X, parametres)\n", - " gradients = back_propagation(y, parametres, activations)\n", - " parametres = update(gradients, parametres, learning_rate)\n", - " Af = activations['A' + str(C)]\n", - "\n", - " # calcul du log_loss et de l'accuracy\n", - " training_history[i, 0] = (log_loss(y.flatten(), Af.flatten()))\n", - " y_pred = predict(X, parametres)\n", - " training_history[i, 1] = (accuracy_score(y.flatten(), y_pred.flatten()))\n", - "\n", - " # Plot courbe d'apprentissage\n", - " plt.figure(figsize=(12, 4))\n", - " plt.subplot(1, 2, 1)\n", - " plt.plot(training_history[:, 0], label='train loss')\n", - " plt.legend()\n", - " plt.subplot(1, 2, 2)\n", - " plt.plot(training_history[:, 1], label='train acc')\n", - " plt.legend()\n", - " plt.show()\n", - "\n", - " return training_history" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. 
\n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "X, y = make_circles(n_samples=100, noise=0.1, factor=0.3, random_state=0)\n", - "X = X.T\n", - "y = y.reshape((1, y.shape[0]))\n", - "\n", - "print('dimensions de X:', X.shape)\n", - "print('dimensions de y:', y.shape)\n", - "\n", - "plt.scatter(X[0, :], X[1, :], c=y, cmap='summer')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [ - "deep_neural_network(X, y, hidden_layers = (16, 16, 16), learning_rate = 0.1, n_iter = 3000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with 'cmnemoi-learn-sm69zs_S-py3.11' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: '/Users/charles/Library/Caches/pypoetry/virtualenvs/cmnemoi-learn-sm69zs_S-py3.11/bin/python -m pip install ipykernel -U --force-reinstall'" - ] - } - ], - "source": [] - } - ], - "metadata": { - "interpreter": { - "hash": "038c04557dfd72b4d6039cb7951b93ffe7520921b6515cb88d8784deedfaf89f" - }, - "kernelspec": { - "display_name": "Python 3.7.9 ('base')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pyproject.toml b/pyproject.toml index b9b5ee5..e4d2820 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cmnemoi-learn" -version = "0.4.0" +version = "0.5.0" description = "Machine Learning from scratch by Charles-Meldhine Madi Mnemoi" authors = ["Charles-Meldhine Madi Mnemoi "] license = "MIT" diff --git a/tests/classification/test_knn_classifier.py b/tests/classification/test_knn_classifier.py new file mode 100644 index 0000000..4436cb7 --- /dev/null +++ b/tests/classification/test_knn_classifier.py @@ -0,0 +1,106 @@ +""" +Unit tests for Logistic Regression model against sklearn implementation +""" +import numpy as np +import pytest + +from sklearn.datasets import load_iris +from sklearn.neighbors import KNeighborsClassifier as SklearnKNNClassifier +from sklearn.metrics import accuracy_score + +from cmnemoi_learn.classification import KNNClassifier + +RANDOM_STATE = 42 + +np.random.seed(RANDOM_STATE) + + +@pytest.mark.parametrize("k", [1, 3, 7]) +def test_predict_moons_dataset(classification_moons_dataset: np.ndarray, k: int) -> None: + """ + Test `predict` on a circle pattern dataset. 
diff --git a/tests/classification/test_knn_classifier.py b/tests/classification/test_knn_classifier.py
new file mode 100644
index 0000000..4436cb7
--- /dev/null
+++ b/tests/classification/test_knn_classifier.py
@@ -0,0 +1,107 @@
+"""
+Unit tests for the K-Nearest Neighbors classifier against the sklearn implementation
+"""
+import numpy as np
+import pytest
+
+from sklearn.datasets import load_iris
+from sklearn.neighbors import KNeighborsClassifier as SklearnKNNClassifier
+from sklearn.metrics import accuracy_score
+
+from cmnemoi_learn.classification import KNNClassifier
+from cmnemoi_learn.metrics import euclidian_distance
+
+RANDOM_STATE = 42
+
+np.random.seed(RANDOM_STATE)
+
+
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_predict_moons_dataset(classification_moons_dataset: np.ndarray, k: int) -> None:
+    """
+    Test `predict` on a moons pattern dataset.
+    """
+    X, y = classification_moons_dataset
+    cmnemoi_model = KNNClassifier(k=k)
+    cmnemoi_model = cmnemoi_model.fit(X, y)
+
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=1)
+    sklearn_model = sklearn_model.fit(X, y)
+
+    cmnemoi_prediction = cmnemoi_model.predict(X)
+    sklearn_prediction = sklearn_model.predict(X)
+
+    assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
+
+
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_predict_linear_dataset(classification_linear_dataset: np.ndarray, k: int) -> None:
+    """
+    Test `predict` on a linearly separable dataset.
+    """
+    X, y = classification_linear_dataset
+    cmnemoi_model = KNNClassifier(k=k)
+    cmnemoi_model = cmnemoi_model.fit(X, y)
+
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=1)  # p = 1 for Manhattan distance
+    sklearn_model = sklearn_model.fit(X, y)
+
+    cmnemoi_prediction = cmnemoi_model.predict(X)
+    sklearn_prediction = sklearn_model.predict(X)
+
+    assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
+
+
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_predict_iris_dataset(k: int) -> None:
+    """
+    Test `predict` on the Iris dataset.
+    """
+    X, y = load_iris(return_X_y=True)
+    cmnemoi_model = KNNClassifier(k=k)
+    cmnemoi_model = cmnemoi_model.fit(X, y)
+
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=1)
+    sklearn_model = sklearn_model.fit(X, y)
+
+    cmnemoi_prediction = cmnemoi_model.predict(X)
+    sklearn_prediction = sklearn_model.predict(X)
+
+    assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
+
+
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_predict_with_l2_norm(k: int) -> None:
+    """
+    Test `predict` on the Iris dataset with the L2 (Euclidean) norm.
+    """
+    X, y = load_iris(return_X_y=True)
+    cmnemoi_model = KNNClassifier(k=k, distance=euclidian_distance)
+    cmnemoi_model = cmnemoi_model.fit(X, y)
+
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=2)  # p = 2 for Euclidean distance
+    sklearn_model = sklearn_model.fit(X, y)
+
+    cmnemoi_prediction = cmnemoi_model.predict(X)
+    sklearn_prediction = sklearn_model.predict(X)
+
+    assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
+
+
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_score(classification_linear_dataset: np.ndarray, k: int) -> None:
+    """Test `score` method against sklearn implementation.
+
+    Args:
+        classification_linear_dataset (np.ndarray): Linearly separable dataset.
+    """
+    X, y = classification_linear_dataset
+    model = KNNClassifier(k=k)
+    model = model.fit(X, y)
+
+    y_pred = model.predict(X)
+
+    cmnemoi_accuracy = model.score(X, y)
+    sklearn_accuracy = accuracy_score(y_pred, y)
+
+    assert cmnemoi_accuracy == sklearn_accuracy
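The neighbor selection inside `predict` relies on `np.argpartition` rather than a full sort. A small illustrative sketch of that step and of the `bincount`/`argmax` majority vote (not part of the patch; the toy numbers are invented):

```python
import numpy as np

k = 3
# Hypothetical distances from one query point to five training points.
distances = np.array([0.9, 0.1, 0.4, 0.8, 0.2])
# Integer labels of those five training points.
labels = np.array([0, 1, 1, 0, 1])

# argpartition places the k smallest values (in arbitrary order) in the first
# k positions, so the first k indices are the k nearest neighbors.
nearest = np.argpartition(distances, kth=k)[:k]
print(nearest)  # some ordering of [1, 4, 2]

# Majority vote: bincount counts each label among the neighbors,
# argmax returns the most frequent one.
print(np.argmax(np.bincount(labels[nearest])))  # 1
```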
diff --git a/tests/conftest.py b/tests/conftest.py
index 33c921e..47de09b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,7 @@
 """
 Fixtures for unit tests
 """
+from typing import Tuple
 import numpy as np
 
 from sklearn.datasets import (
@@ -18,7 +19,7 @@
 @pytest.fixture
-def classification_linear_dataset() -> np.ndarray:
+def classification_linear_dataset() -> Tuple[np.ndarray, np.ndarray]:
     """Classification dataset linearly separable
 
     `X, y = classification_linear_dataset` to use
@@ -36,7 +37,7 @@
-def classification_linear_dataset_with_small_n_big_p() -> np.ndarray:
+def classification_linear_dataset_with_small_n_big_p() -> Tuple[np.ndarray, np.ndarray]:
     """Classification dataset linearly separable with small n and big p
     (under determined problem).
 
     `X, y = classification_linear_dataset_with_small_n_big_p` to use
@@ -55,7 +56,7 @@
-def classification_moons_dataset() -> np.ndarray:
+def classification_moons_dataset() -> Tuple[np.ndarray, np.ndarray]:
     """Classification dataset with circles pattern (non linear)
 
     `X, y = classification_moons_dataset` to use
@@ -66,7 +67,7 @@
-def regression_friedman_dataset() -> np.ndarray:
+def regression_friedman_dataset() -> Tuple[np.ndarray, np.ndarray]:
     """Regression dataset which follows friedman #2 problem pattern
 
     `X, y = regression_friedman_dataset` to use
@@ -77,7 +78,7 @@
-def regression_linear_dataset() -> np.ndarray:
+def regression_linear_dataset() -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Regression dataset which follows a linear pattern
 
     `X, y = regression_linear_dataset` to use
@@ -95,7 +96,7 @@
-def regression_linear_dataset_with_small_n_big_p() -> np.ndarray:
+def regression_linear_dataset_with_small_n_big_p() -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Regression dataset which follows a linear pattern
 
     `X, y = regression_linear_dataset_with_small_n_big_p` to use
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
new file mode 100644
index 0000000..fef2909
--- /dev/null
+++ b/tests/metrics/test_metrics.py
@@ -0,0 +1,42 @@
+"""
+Unit tests for metrics and distances against well-known implementations
+"""
+import numpy as np
+import pytest
+from scipy.spatial import distance
+
+from cmnemoi_learn.metrics import euclidian_distance, manhattan_distance
+
+
+def test_manhattan_distance() -> None:
+    """Test for Manhattan distance"""
+    v_1 = np.random.randn(1, 10).ravel()
+    v_2 = np.random.randn(1, 10).ravel()
+
+    assert manhattan_distance(v_1, v_2) == distance.cityblock(v_1, v_2)
+
+
+def test_manhattan_distance_raises_exception_if_arrays_not_same_size() -> None:
+    """Test that Manhattan distance raises an exception when shapes differ"""
+    v_1 = np.random.randn(1, 9).ravel()
+    v_2 = np.random.randn(1, 10).ravel()
+
+    with pytest.raises(ValueError, match="v_1 and v_2 should have the same shape"):
+        manhattan_distance(v_1, v_2)
+
+
+def test_euclidian_distance() -> None:
+    """Test for Euclidean distance"""
+    v_1 = np.random.randn(1, 10).ravel()
+    v_2 = np.random.randn(1, 10).ravel()
+
+    assert np.isclose(euclidian_distance(v_1, v_2), distance.euclidean(v_1, v_2))
+
+
+def test_euclidian_distance_raises_exception_if_arrays_not_same_size() -> None:
+    """Test that Euclidean distance raises an exception when shapes differ"""
+    v_1 = np.random.randn(1, 9).ravel()
+    v_2 = np.random.randn(1, 10).ravel()
+
+    with pytest.raises(ValueError, match="v_1 and v_2 should have the same shape"):
+        euclidian_distance(v_1, v_2)
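As a closing illustration (not part of the patch), the new distance helpers can be sanity-checked by hand against their SciPy counterparts, the same comparison the metrics tests above automate; the example vectors are arbitrary.

```python
import numpy as np
from scipy.spatial import distance

from cmnemoi_learn.metrics import euclidian_distance, manhattan_distance

v_1 = np.array([1.0, 2.0, 3.0])
v_2 = np.array([4.0, 0.0, 3.0])

# L1 norm: |1-4| + |2-0| + |3-3| = 5
print(manhattan_distance(v_1, v_2), distance.cityblock(v_1, v_2))
# L2 norm: sqrt(9 + 4 + 0) ≈ 3.606
print(euclidian_distance(v_1, v_2), distance.euclidean(v_1, v_2))
```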