feat: Add support for Euclidean distance for KNNClassifier

cmnemoi · Nov 14, 2023 · b4def1f · b4def1f
1 parent 82a8f31
commit b4def1f
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 9 deletions.
diff --git a/cmnemoi_learn/classification/_knn_classifier.py b/cmnemoi_learn/classification/_knn_classifier.py
@@ -1,7 +1,7 @@
 """
 File defining a K-Nearest Neighbors classifier model.
 """
-from typing import Self
+from typing import Callable, Self
 import numpy as np
 
 from ._abstract_classifier import AbstractClassifier
@@ -19,16 +19,22 @@ class KNNClassifier(AbstractClassifier):
     a specific norm (L1, L2,...) and returns
     the majority label as prediction.
 
-    (For the moment only the L2 norm, ie. Euclidian distance, is available)
+    For the moment only L1 and L2 norms are available,
+    via the `metrics` module.
 
     Args:
         k (int): The number of neighbors to evaluate.
+        distance (Callable): The function to use to compute the distance
+        between points. By default: Manhattan distance
     """
 
-    def __init__(self, k: int) -> None:
+    def __init__(
+        self, k: int, distance: Callable[[np.ndarray, np.ndarray], float] = manhattan_distance
+    ) -> None:
         super().__init__()
         self.k = k
         self.dataset = np.array([])
+        self.distance = distance
 
     def fit(self, X: np.ndarray, y: np.ndarray) -> Self:
         self.dataset = np.concatenate([X, self._reshape_ndarray(y)], axis=1)
@@ -40,7 +46,7 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         distances_between_input_and_dataset = np.full((nb_rows, self.dataset.shape[0]), np.inf)
         for i, x_i in enumerate(X):
             for j, x_j in enumerate(self.dataset[:, :max_column]):
-                distances_between_input_and_dataset[i, j] = manhattan_distance(x_i, x_j)
+                distances_between_input_and_dataset[i, j] = self.distance(x_i, x_j)
 
         # get K nearest neighbors and associate them their labels
         nearest_neighbor_indexes = np.full((nb_rows, self.k), 0)

diff --git a/tests/classification/test_knn_classifier.py b/tests/classification/test_knn_classifier.py
@@ -15,7 +15,7 @@
 np.random.seed(RANDOM_STATE)
 
 
-@pytest.mark.parametrize("k", [1, 3, 10])
+@pytest.mark.parametrize("k", [1, 3, 7])
 def test_predict_moons_dataset(classification_moons_dataset: np.ndarray, k: int) -> None:
     """
     Test `predict` on a circle pattern dataset.
@@ -33,7 +33,7 @@ def test_predict_moons_dataset(classification_moons_dataset: np.ndarray, k: int)
     assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
 
 
-@pytest.mark.parametrize("k", [1, 3, 10])
+@pytest.mark.parametrize("k", [1, 3, 7])
 def test_predict_linear_dataset(classification_linear_dataset: np.ndarray, k: int) -> None:
     """
     Test `predict` on a linearly separable dataset.
@@ -42,7 +42,7 @@ def test_predict_linear_dataset(classification_linear_dataset: np.ndarray, k: in
     cmnemoi_model = KNNClassifier(k=k)
     cmnemoi_model = cmnemoi_model.fit(X, y)
 
-    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=1)
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=1)  # p = 1 for Manhattan distance
     sklearn_model = sklearn_model.fit(X, y)
 
     cmnemoi_prediction = cmnemoi_model.predict(X)
@@ -51,7 +51,7 @@ def test_predict_linear_dataset(classification_linear_dataset: np.ndarray, k: in
     assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
 
 
-@pytest.mark.parametrize("k", [1, 3, 10])
+@pytest.mark.parametrize("k", [1, 3, 7])
 def test_predict_iris_dataset(k: int) -> None:
     """
     Test `predict` on Iris dataset.
@@ -68,8 +68,25 @@ def test_predict_iris_dataset(k: int) -> None:
 
     assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
 
+@pytest.mark.parametrize("k", [1, 3, 7])
+def test_predict_with_l2_norm(k: int) -> None:
+    """
+    Test `predict` with the L2 norm (Euclidean distance) on Iris dataset, against sklearn.
+    """
+    X, y = load_iris(return_X_y=True)
+    cmnemoi_model = KNNClassifier(k=k, distance=lambda a, b: float(np.linalg.norm(a - b)))
+    cmnemoi_model = cmnemoi_model.fit(X, y)
+
+    sklearn_model = SklearnKNNClassifier(n_neighbors=k, p=2)  # p = 2 for Euclidean distance
+    sklearn_model = sklearn_model.fit(X, y)
+
+    cmnemoi_prediction = cmnemoi_model.predict(X)
+    sklearn_prediction = sklearn_model.predict(X)
+
+    assert np.array_equal(cmnemoi_prediction, sklearn_prediction)
+
 
-@pytest.mark.parametrize("k", [1, 3, 10])
+@pytest.mark.parametrize("k", [1, 3, 7])
 def test_score(classification_linear_dataset: np.ndarray, k: int) -> None:
     """Test `score` method against sklearn implementation.