-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
knn implementation with regression and classification, readme updated…
… and folders reorganized to look like sklearn to facilitate study
- Loading branch information
1 parent
ddba2cf
commit c674dd8
Showing
12 changed files
with
263 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
name: Unit tests execution | ||
run-name: Unit Test Execution by @${{ github.actor }} | ||
|
||
on: | ||
push: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
import numpy as np | ||
|
||
def euclidean_distances(x: np.ndarray, target: np.ndarray) -> np.ndarray:
    """Euclidean (L2) distance between each row of ``x`` and ``target``.

    Args:
        x: 2-D array of shape (n_samples, n_features).
        target: 1-D array of shape (n_features,), broadcast against each row.

    Returns:
        1-D array of shape (n_samples,) with one distance per row of ``x``.
    """
    # Fix: the diff residue left two definitions of euclidean_distances
    # (the old x_train/x_test_point signature shadowed by this one); only
    # the final version is kept. np.ndarray replaces np.array in the
    # annotations because np.array is a factory function, not a type.
    return np.sqrt(np.sum((x - target) ** 2, axis=1))


def manhattan_distances(x: np.ndarray, target: np.ndarray) -> np.ndarray:
    """Manhattan (L1) distance between each row of ``x`` and ``target``.

    Args:
        x: 2-D array of shape (n_samples, n_features).
        target: 1-D array of shape (n_features,), broadcast against each row.

    Returns:
        1-D array of shape (n_samples,) with one distance per row of ``x``.
    """
    return np.sum(np.absolute(x - target), axis=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
class NotImplementedException(Exception):
    """Raised when a requested option (distance, task, ...) has no implementation."""

    def __init__(self, message: str):
        # Prefix the caller-supplied detail with a fixed, recognizable marker.
        super().__init__(f'Not implemented error: {message}')
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,56 @@ | ||
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from neighbors.knn.knn import KNN

# Example: compare the local KNN implementation against scikit-learn on iris.
# (The diff residue interleaved the removed logistic-regression script with
# this one; only the post-commit KNN script is kept.)

# Fixed seed so the train/test split — and therefore the scores — reproduce.
SEED = 42
np.random.seed(SEED)

# Iris: 150 samples, 4 numeric features, 3 balanced classes.
iris = datasets.load_iris()

x = iris['data']
y = iris['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

# NOTE(review): the normalized arrays below are computed but never used — both
# models are fit on the raw features. Confirm whether the intent was to train
# on normalized_x_train / normalized_x_test instead.
scaler = Normalizer().fit(x_train)
normalized_x_train = scaler.transform(x_train)
normalized_x_test = scaler.transform(x_test)

# Reference implementation: scikit-learn brute-force KNN with the same k.
sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
sk_model.fit(x_train, y_train)

sk_train_predictions = sk_model.predict(x_train)
sk_test_predictions = sk_model.predict(x_test)

sk_train_score = accuracy_score(y_train, sk_train_predictions)
sk_test_score = accuracy_score(y_test, sk_test_predictions)

print(f'[SKLEARN] Train score: {sk_train_score}')
print(f'[SKLEARN] Test score: {sk_test_score}')

# Local implementation with the default (euclidean) distance.
knn_model = KNN(k=5)
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL EUCLIDEAN] Train score: {train_score}')
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}')

# Local implementation with the manhattan (L1) distance.
knn_model = KNN(k=5, distance='manhattan')
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL MANHATTAN] Train score: {train_score}')
print(f'[LOCAL MANHATTAN] Test score: {test_score}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
# KNN (K Nearest Neighbor) | ||
|
||
## Introdução | ||
|
||
O algoritmo KNN é baseado no princípio de que pontos próximos em um espaço de características tendem a ter rótulos semelhantes. Ele pode ser usado tanto para classificação quanto para regressão. Na classificação, o KNN atribui a uma instância de teste a classe mais frequente entre seus k vizinhos mais próximos. Na regressão, ele estima um valor pela média (ou outra medida) dos valores de seus vizinhos mais próximos. | ||
|
||
Uma das principais características desse algoritmo é a simplicidade do uso. Por ser um "lazy learning", o processo de treinamento prévio é quase inexistente, dependendo dos dados enviados. Por causa disso, porém, esse algoritmo possui algumas desvantagens, como: | ||
|
||
* Custo computacional que cresce diretamente relacionado com a quantidade de dimensões e dados; | ||
* Alguns formatos de dados mais convexos não apresentam boa performance. | ||
|
||
## Implementação | ||
|
||
O passo-a-passo da implementação do KNN é bastante simples. | ||
|
||
1. Calcula a distância entre um ponto e todos os pontos do dataset. Essa etapa pode ser otimizada com algoritmos que evitam que o cálculo seja feito com todos os pontos sempre. Normalmente as otimizações utilizam alguma estrutura de grafo ou árvore. A distância pode ser a euclidiana ou manhattan | ||
2. Escolhe-se os `k` elementos com menor distância do ponto escolhido. | ||
3. Realiza-se uma votação. Existem vários algoritmos possíveis de votação. A implementação mais simples (e a escolhida aqui) é retornar a label mais frequente dentre os vizinhos. | ||
|
||
Dessa forma, a implementação fica: | ||
|
||
```python | ||
def predict(self, x: np.array): | ||
predictions = [] | ||
|
||
for item in x: | ||
distances = self.__calculate_distances__(item) | ||
distances_with_labels = np.c_[distances, self.labels] | ||
nearest = self.__nearest_neighborns__(distances_with_labels) | ||
prediction = self.__voting__(nearest[:,1]) | ||
predictions.append(prediction) | ||
|
||
return np.asarray(predictions) | ||
``` | ||
|
||
## Utilizando o algoritmo | ||
|
||
Segue abaixo um exemplo de uso do algoritmo: | ||
|
||
```python | ||
import numpy as np | ||
from sklearn import datasets | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.preprocessing import Normalizer | ||
from sklearn.metrics import accuracy_score | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from classification.knn.knn_classifier import KNNClassifier | ||
|
||
SEED = 42 | ||
np.random.seed(SEED) | ||
|
||
iris = datasets.load_iris() | ||
|
||
x = iris['data'] | ||
y = iris['target'] | ||
|
||
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED) | ||
|
||
x_train = np.asarray(x_train) | ||
x_test = np.asarray(x_test) | ||
y_train = np.asarray(y_train) | ||
y_test = np.asarray(y_test) | ||
|
||
scaler = Normalizer().fit(x_train) | ||
normalized_x_train = scaler.transform(x_train) | ||
normalized_x_test = scaler.transform(x_test) | ||
|
||
sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute') | ||
sk_model.fit(x_train, y_train) | ||
|
||
sk_train_predictions = sk_model.predict(x_train) | ||
sk_test_predictions = sk_model.predict(x_test) | ||
|
||
sk_train_score = accuracy_score(y_train, sk_train_predictions) | ||
sk_test_score = accuracy_score(y_test, sk_test_predictions) | ||
|
||
print(f'[SKLEARN] Train score: {sk_train_score}') | ||
print(f'[SKLEARN] Test score: {sk_test_score}') | ||
|
||
knn_model = KNNClassifier(k=5) | ||
train_predictions = knn_model.fit_predict(x_train, y_train) | ||
train_score = accuracy_score(y_train, train_predictions) | ||
test_predictions = knn_model.predict(x_test) | ||
test_score = accuracy_score(y_test, test_predictions) | ||
|
||
print(f'[LOCAL EUCLIDEAN] Train score: {train_score}') | ||
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}') | ||
|
||
knn_model = KNNClassifier(k=5, distance='manhattan') | ||
train_predictions = knn_model.fit_predict(x_train, y_train) | ||
train_score = accuracy_score(y_train, train_predictions) | ||
test_predictions = knn_model.predict(x_test) | ||
test_score = accuracy_score(y_test, test_predictions) | ||
|
||
print(f'[LOCAL MANHATTAN] Train score: {train_score}') | ||
print(f'[LOCAL MANHATTAN] Test score: {test_score}') | ||
|
||
``` | ||
|
||
O output da execução acima é: | ||
|
||
```bash | ||
[SKLEARN] Train score: 0.9666666666666667 | ||
[SKLEARN] Test score: 1.0 | ||
[LOCAL EUCLIDEAN] Train score: 0.9666666666666667 | ||
[LOCAL EUCLIDEAN] Test score: 1.0 | ||
[LOCAL MANHATTAN] Train score: 0.9666666666666667 | ||
[LOCAL MANHATTAN] Test score: 1.0 | ||
``` | ||
|
||
Algumas observações sobre a execução: | ||
|
||
* É extremamente importante que os dados estejam normalizados. A diferença de escala afeta diretamente as medidas de distância (especialmente a euclidiana). | ||
* Note como a implementação do Scikit Learn possui a mesma resposta para o dataset iris. A diferença ficará sempre no tempo de execução, visto que a lib `sklearn` é melhor otimizada, especialmente no cálculo das distâncias. | ||
* Sobre as medidas de distância e quando escolher cada uma, assista a [esta aula](https://www.youtube.com/watch?v=h0e2HAPTGF4&t=2362s&ab_channel=MITOpenCourseWare) de introdução ao aprendizado de máquina do MIT. | ||
* Para realizar a chamada utilizando regressão, basta configurar o parâmetro em `KNNClassifier(k=5, task='regression')`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from collections import Counter | ||
import numpy as np | ||
from core.distances import euclidean_distances, manhattan_distances | ||
from core.exceptions.not_implemented import NotImplementedException | ||
|
||
class KNN:
    """Brute-force k-nearest-neighbors estimator.

    Supports classification (majority vote among the k nearest training
    points) and regression (mean of their targets), with 'euclidean' or
    'manhattan' distance.
    """

    def __init__(self, k: int, distance: str = 'euclidean', task: str = 'classification'):
        # Number of neighbors consulted for each prediction.
        self.k = k
        self.train_data = []
        self.labels = []
        # Distance metric name: 'euclidean' or 'manhattan'.
        self.distance = distance
        # Prediction mode: 'classification' or 'regression'.
        self.task = task

    def fit(self, x: np.ndarray, y: np.ndarray):
        """Memorize the training set (KNN is a lazy learner: no training step)."""
        # Stored as ndarrays so predict() can index labels by sorted position.
        self.train_data = np.asarray(x)
        self.labels = np.asarray(y)

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Return one prediction per row of ``x``.

        Raises:
            NotImplementedException: for an unknown task or distance name.
        """
        predictions = []

        for item in x:
            distances = self.__calculate_distances__(item)
            nearest_labels = self.__nearest_neighborns__(distances)

            if self.task == 'classification':
                prediction = self.__voting__(nearest_labels)
            elif self.task == 'regression':
                prediction = self.__regression__(nearest_labels)
            else:
                raise NotImplementedException(f"task {self.task}")
            predictions.append(prediction)

        return np.asarray(predictions)

    def fit_predict(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit on (x, y) and immediately predict on x."""
        self.fit(x, y)
        return self.predict(x)

    def accuracy(self, x: np.ndarray, y: np.ndarray) -> float:
        """Fraction of rows of ``x`` whose prediction equals ``y``."""
        predictions = self.predict(x)
        return np.mean(predictions == y)

    def __calculate_distances__(self, target: np.ndarray) -> np.ndarray:
        """Distance from every training row to ``target`` per the configured metric.

        Raises:
            NotImplementedException: for an unknown distance name.
        """
        if self.distance == 'euclidean':
            return euclidean_distances(self.train_data, target)
        if self.distance == 'manhattan':
            return manhattan_distances(self.train_data, target)

        raise NotImplementedException(f'{self.distance} distance')

    def __nearest_neighborns__(self, distances: np.ndarray) -> np.ndarray:
        """Labels of the k training points with the smallest distances.

        Fix: the previous version built np.c_[distances, labels] and sorted
        its first column, which coerced labels to the common dtype — integer
        labels came back as floats, and non-numeric labels would have turned
        the distance sort into a string comparison. Indexing the labels with
        argsort of the 1-D distance array keeps their original dtype.
        """
        nearest_indices = np.argsort(distances)[:self.k]
        return np.asarray(self.labels)[nearest_indices]

    def __voting__(self, nearest_neighborns_labels: np.ndarray):
        """Most frequent label among the neighbors (simple majority vote)."""
        return Counter(nearest_neighborns_labels).most_common()[0][0]

    def __regression__(self, nearest_neighborns_target: np.ndarray):
        """Mean of the neighbors' targets."""
        return np.mean(nearest_neighborns_target)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters