Skip to content

Commit

Permalink
knn implementation with regression and classification, readme updated…
Browse files Browse the repository at this point in the history
… and folders reorganized to look like sklearn to facilitate study
  • Loading branch information
SalatielBairros committed Jan 2, 2024
1 parent ddba2cf commit c674dd8
Show file tree
Hide file tree
Showing 12 changed files with 263 additions and 44 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
name: Unit tests execution
run-name: Unit Test Execution by @${{ github.actor }}

on:
push:
Expand Down
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ The name "caos" is because all learning is caotic. You'll have joy and frustrati
Each algorithm has its own folder with a README.md file explaining the algorithm and how to run it. All the explanations are written in Portuguese, but the code is written in English to facilitate the comparison with other implementations.
More information will be added here in the future as more algorithms are implemented.

- [Linear Regression](https://github.com/SalatielBairros/CaosML/tree/main/src/regression/linear)
- [Logistic Regression](https://github.com/SalatielBairros/CaosML/tree/main/src/classification/logistic)
- [Linear Regression](https://github.com/SalatielBairros/CaosML/tree/main/src/linear_model/linear_regression)
- [Logistic Regression](https://github.com/SalatielBairros/CaosML/tree/main/src/linear_model/logistic_regression)
- [KNN](https://github.com/SalatielBairros/CaosML/tree/main/src/neighbors/knn)

### Current implementing:

- KNN
- Decision Tree

### Next

- Decision Tree
- Random Forest
7 changes: 5 additions & 2 deletions src/core/distances.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import numpy as np

def euclidean_distances(x_train: np.array, x_test_point: np.array):
    """Return the Euclidean (L2) distance from each row of *x_train* to *x_test_point*."""
    # Broadcast the subtraction over the rows, then reduce each row to its L2 norm.
    return np.linalg.norm(x_train - x_test_point, axis=1)
def euclidean_distances(x: np.array, target: np.array):
    """Euclidean (L2) distance between *target* and every row of *x*."""
    squared_diffs = np.square(x - target)
    return np.sqrt(squared_diffs.sum(axis=1))

def manhattan_distances(x: np.array, target: np.array):
    """Manhattan (L1) distance between *target* and every row of *x*."""
    absolute_diffs = np.abs(x - target)
    return absolute_diffs.sum(axis=1)
4 changes: 4 additions & 0 deletions src/core/exceptions/not_implemented.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class NotImplementedException(Exception):
    """Raised when a requested option (distance metric, task, ...) has no implementation."""

    def __init__(self, message: str):
        # Prefix every message with a uniform marker so logs are easy to grep.
        super().__init__(f'Not implemented error: {message}')
File renamed without changes.
File renamed without changes.
File renamed without changes.
92 changes: 55 additions & 37 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,56 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from classification.logistic.logistic_regression import LogisticRegression
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
import time

# Demo script: compares the local logistic-regression implementation (trained
# with two different methods) against scikit-learn on the two-exam admission
# dataset, printing accuracy and wall-clock time for each run.
dataset = pd.read_csv('https://raw.githubusercontent.com/animesh-agarwal/Machine-Learning/master/LogisticRegression/data/marks.txt',
                      header=None, names=['Exam 1', 'Exam 2', 'Admitted'])

X = dataset[['Exam 1', 'Exam 2']].copy()
y = dataset['Admitted'].copy()

# Visual sanity check: scatter the two exam scores colored by admission label.
labels, index = np.unique(dataset["Admitted"], return_inverse=True)
plt.scatter(dataset['Exam 1'], dataset['Exam 2'], marker='o', c=index)
plt.show()

lr = LogisticRegression(random_state=42)
print("Training with loss_minimization:")
start_time = time.time()
result = (lr
          .fit(X, y, learning_rate=0.001, epochs=1000000, calculate_cost=False, save_steps=False, random_inicialization=True, method='loss_minimization')
          .accuracy(X, y))
print(f'Result: {result} in {str(time.time() - start_time)} seconds')

print("\n Training with maximum_likehood")
start_time = time.time()
result = (lr
          .fit(X, y, learning_rate=0.001, epochs=1000000, calculate_cost=False, save_steps=False, random_inicialization=True, method='maximum_likehood')
          .accuracy(X, y))
print(f'Result: {result} in {str(time.time() - start_time)} seconds')

print("\n Training with SKLearn")
start_time = time.time()
sk_lr = SKLogisticRegression(penalty=None, random_state=42, max_iter=1000000)
sk_lr.fit(X, y)
score = sk_lr.score(X, y)
# BUG FIX: this line previously printed `result` (the local model's accuracy)
# instead of the sklearn `score` computed just above.
print(f'Result: {score} in {str(time.time() - start_time)} seconds')
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from neighbors.knn.knn import KNN

# Demo script: compares the local KNN implementation (Euclidean and Manhattan
# distances) against scikit-learn's KNeighborsClassifier on the iris dataset.
SEED = 42
np.random.seed(SEED)

iris = datasets.load_iris()

x = iris['data']
y = iris['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# Distance-based models are scale-sensitive, so the features are normalized.
scaler = Normalizer().fit(x_train)
normalized_x_train = scaler.transform(x_train)
normalized_x_test = scaler.transform(x_test)

# BUG FIX: the normalized arrays were computed but never used — every model
# was fit and evaluated on the raw features. All three models now consume the
# normalized data, matching the methodology described in the README.
sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
sk_model.fit(normalized_x_train, y_train)

sk_train_predictions = sk_model.predict(normalized_x_train)
sk_test_predictions = sk_model.predict(normalized_x_test)

sk_train_score = accuracy_score(y_train, sk_train_predictions)
sk_test_score = accuracy_score(y_test, sk_test_predictions)

print(f'[SKLEARN] Train score: {sk_train_score}')
print(f'[SKLEARN] Test score: {sk_test_score}')

knn_model = KNN(k=5)
train_predictions = knn_model.fit_predict(normalized_x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(normalized_x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL EUCLIDEAN] Train score: {train_score}')
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}')

knn_model = KNN(k=5, distance='manhattan')
train_predictions = knn_model.fit_predict(normalized_x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(normalized_x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL MANHATTAN] Train score: {train_score}')
print(f'[LOCAL MANHATTAN] Test score: {test_score}')
116 changes: 116 additions & 0 deletions src/neighbors/knn/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# KNN (K Nearest Neighbor)

## Introdução

O algoritmo KNN é baseado no princípio de que pontos próximos em um espaço de características tendem a ter rótulos semelhantes. Ele pode ser usado tanto para classificação quanto para regressão. Na classificação, o KNN atribui a uma instância de teste a classe mais frequente entre seus k vizinhos mais próximos. Na regressão, ele estima um valor pela média (ou outra medida) dos valores de seus vizinhos mais próximos.

Uma das principais características desse algoritmo é a simplicidade do uso. Por ser um "lazy learning", o processo de treinamento prévio é quase inexistente, dependendo dos dados enviados. Por causa disso, porém, esse algoritmo possui algumas desvantagens, como:

* Custo computacional que cresce diretamente relacionado com a quantidade de dimensões e dados;
* Alguns formatos de dados mais convexos não apresentam boa performance.

## Implementação

O passo-a-passo da implementação do KNN é bastante simples.

1. Calcula a distância entre um ponto e todos os pontos do dataset. Essa etapa pode ser otimizada com algoritmos que evitam que o cálculo seja feito com todos os pontos sempre. Normalmente as otimizações utilizam alguma estrutura de grafo ou árvore. A distância pode ser a euclidiana ou manhattan.
2. Escolhe-se os `k` elementos com menor distância do ponto escolhido.
3. Realiza-se uma votação. Existem vários algoritmos possíveis de votação. A implementação mais simples (e a escolhida aqui) é retornar a label mais frequente dentre os vizinhos.

Dessa forma, a implementação fica:

```python
def predict(self, x: np.array):
predictions = []

for item in x:
distances = self.__calculate_distances__(item)
distances_with_labels = np.c_[distances, self.labels]
nearest = self.__nearest_neighborns__(distances_with_labels)
prediction = self.__voting__(nearest[:,1])
predictions.append(prediction)

return np.asarray(predictions)
```

## Utilizando o algoritmo

Segue abaixo um exemplo de uso do algoritmo:

```python
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from neighbors.knn.knn import KNN

SEED = 42
np.random.seed(SEED)

iris = datasets.load_iris()

x = iris['data']
y = iris['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

scaler = Normalizer().fit(x_train)
normalized_x_train = scaler.transform(x_train)
normalized_x_test = scaler.transform(x_test)

sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
sk_model.fit(x_train, y_train)

sk_train_predictions = sk_model.predict(x_train)
sk_test_predictions = sk_model.predict(x_test)

sk_train_score = accuracy_score(y_train, sk_train_predictions)
sk_test_score = accuracy_score(y_test, sk_test_predictions)

print(f'[SKLEARN] Train score: {sk_train_score}')
print(f'[SKLEARN] Test score: {sk_test_score}')

knn_model = KNN(k=5)
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL EUCLIDEAN] Train score: {train_score}')
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}')

knn_model = KNN(k=5, distance='manhattan')
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL MANHATTAN] Train score: {train_score}')
print(f'[LOCAL MANHATTAN] Test score: {test_score}')

```

O output da execução acima é:

```bash
[SKLEARN] Train score: 0.9666666666666667
[SKLEARN] Test score: 1.0
[LOCAL EUCLIDEAN] Train score: 0.9666666666666667
[LOCAL EUCLIDEAN] Test score: 1.0
[LOCAL MANHATTAN] Train score: 0.9666666666666667
[LOCAL MANHATTAN] Test score: 1.0
```

Algumas observações sobre a execução:

* É extremamente importante que os dados estejam normalizados. A diferença de escala afeta diretamente as medidas de distância (especialmente a euclidiana).
* Note como a implementação do Scikit Learn possui a mesma resposta para o dataset iris. A diferença ficará sempre no tempo de execução, visto que a lib `sklearn` é melhor otimizada, especialmente no cálculo das distâncias.
* Sobre as medidas de distância e quando escolher cada uma, assista a [esta aula](https://www.youtube.com/watch?v=h0e2HAPTGF4&t=2362s&ab_channel=MITOpenCourseWare) de introdução ao aprendizado de máquina do MIT.
* Para realizar a chamada utilizando regressão, basta configurar o parâmetro em `KNN(k=5, task='regression')`.
60 changes: 60 additions & 0 deletions src/neighbors/knn/knn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from collections import Counter
import numpy as np
from core.distances import euclidean_distances, manhattan_distances
from core.exceptions.not_implemented import NotImplementedException

class KNN:
    """K-Nearest-Neighbors estimator supporting classification and regression.

    Lazy learner: ``fit`` only stores the training data; all the work happens
    at ``predict`` time by comparing each query point against every stored
    sample (brute force).
    """

    def __init__(self, k: int, distance: str = 'euclidean', task: str = 'classification'):
        # k: number of neighbors consulted per prediction.
        # distance: 'euclidean' or 'manhattan'.
        # task: 'classification' (majority vote) or 'regression' (neighbor mean).
        self.k = k
        self.train_data = []
        self.labels = []
        self.distance = distance
        self.task = task

    def fit(self, x: np.array, y: np.array):
        """Store the training samples and their labels/targets."""
        self.train_data = x
        self.labels = y

    def predict(self, x: np.array):
        """Predict a label (classification) or value (regression) for each row of x.

        Raises NotImplementedException for an unknown task or distance.
        """
        predictions = []

        for item in x:
            distances = self.__calculate_distances__(item)
            # BUG FIX: labels were previously stacked onto the distances with
            # np.c_, which upcast integer labels to float (predictions came
            # back as floats) and broke entirely for non-numeric labels.
            # Selecting labels by sorted index preserves their dtype.
            nearest_labels = self.__nearest_neighborns__(distances)

            if self.task == 'classification':
                prediction = self.__voting__(nearest_labels)
            elif self.task == 'regression':
                prediction = self.__regression__(nearest_labels)
            else:
                raise NotImplementedException(f"task {self.task}")
            predictions.append(prediction)

        return np.asarray(predictions)

    def fit_predict(self, x: np.array, y: np.array):
        """Fit on (x, y) and immediately predict on the same x."""
        self.fit(x, y)
        return self.predict(x)

    def accuracy(self, x: np.array, y: np.array):
        """Fraction of predictions on x that exactly match y."""
        predictions = self.predict(x)
        return np.mean(predictions == y)

    def __calculate_distances__(self, target: np.array) -> np.array:
        # Distance from every stored training sample to the single target point.
        if self.distance == 'euclidean':
            return euclidean_distances(self.train_data, target)
        if self.distance == 'manhattan':
            return manhattan_distances(self.train_data, target)

        raise NotImplementedException(f'{self.distance} distance')

    def __nearest_neighborns__(self, distances: np.array):
        # Labels of the k training samples with the smallest distances.
        # Stable sort makes tie-breaking deterministic (first occurrence wins).
        nearest_indexes = np.argsort(distances, kind='stable')[:self.k]
        return np.asarray(self.labels)[nearest_indexes]

    def __voting__(self, nearest_neighborns_labels: np.array):
        # Majority vote; Counter breaks ties by first occurrence among neighbors.
        return Counter(nearest_neighborns_labels.tolist()).most_common(1)[0][0]

    def __regression__(self, nearest_neighborns_target: np.array):
        # Mean of the neighbors' target values.
        return np.mean(nearest_neighborns_target)
18 changes: 17 additions & 1 deletion tests/core/test_distances.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from unittest import TestCase
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances as sk_ed
from src.core.distances import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances as sk_mh
from src.core.distances import euclidean_distances, manhattan_distances

class TestDistances(TestCase):
def test_should_calculate_euclidean_distance(self):
Expand All @@ -18,3 +19,18 @@ def test_should_calculate_euclidean_distance(self):

for distance_actual, distance_expected in zip(distances, expected_distances):
assert round(distance_actual, 4) == round(distance_expected[0], 4)

def test_should_calculate_manhattan_distance(self):
    """The local manhattan_distances must agree with sklearn's pairwise version."""
    points = np.array([
        [0.5, 0.3, 0.23],
        [0.4, 0.4, 0.43],
        [0.3, 0.5, 0.33]
    ])

    target = np.array([0.3, 0.5, 0.33])

    actual = manhattan_distances(points, target)
    expected = sk_mh(points, [target])

    # Compare to 4 decimal places to tolerate float rounding differences.
    for got, want in zip(actual, expected):
        assert round(got, 4) == round(want[0], 4)

0 comments on commit c674dd8

Please sign in to comment.