random forest pt 1
SalatielBairros committed Jun 28, 2024
1 parent f2fc9c9 commit 06168cd
Showing 5 changed files with 132 additions and 595 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -20,8 +20,8 @@ More information will be added here in the future as more algorithms are implemented

### Currently implementing:

- Decision Tree
- Random Forest

### Next

- Random Forest
- K-Means
19 changes: 19 additions & 0 deletions src/ensemble/random_forest/README.md
@@ -0,0 +1,19 @@
# Random Forest

[UNDER DEVELOPMENT]

- An ensemble of uncorrelated decision trees

## Bootstrap

A statistical sampling technique in which elements are selected at random, with replacement. This means an element can be chosen more than once and that some elements may never be chosen at all.
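A minimal sketch of the idea in NumPy (the dataset here is hypothetical):

```python
import numpy as np

rng = np.random.default_rng(42)
data = np.arange(10)  # hypothetical dataset of 10 elements

# Bootstrap sample: same size as the original, drawn with replacement.
bootstrap = rng.choice(data, size=len(data), replace=True)
print(bootstrap)                   # some elements appear more than once...
print(set(data) - set(bootstrap))  # ...and some never appear (out-of-bag)
```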

## Training the trees

Each tree is trained on a Bootstrap sample. Moreover, not every feature is given to every tree: the features are also chosen by random sampling. This helps avoid overfitting and guards against a feature that scores well on entropy or Gini impurity without providing any real information gain (an ID column, for example).
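A sketch of the per-tree feature sampling, assuming a hypothetical fraction of 0.7 (mirroring the `max_features` default used in the classifier below):

```python
import numpy as np

rng = np.random.default_rng(42)
n_features = 10
max_features = 0.7  # fraction of the features each tree is allowed to see

# Each tree receives its own random subset of feature indexes,
# sampled without replacement.
n_selected = int(n_features * max_features)
feature_indexes = rng.permutation(n_features)[:n_selected]
print(feature_indexes)  # 7 of the 10 feature indexes, in random order
```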

## Classification

Classification is performed by majority vote across all trees.

// Couldn't the number of votes be interpreted as the probability of each class? According to ChatGPT, yes; sklearn even exposes a predict_proba property.
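A sketch of the majority vote, with vote fractions as a rough probability estimate (note that sklearn's predict_proba actually averages the per-tree class probabilities rather than counting raw votes):

```python
from collections import Counter

# Hypothetical votes from five trees for a single sample.
votes = ['setosa', 'setosa', 'versicolor', 'setosa', 'virginica']

counts = Counter(votes)
prediction = counts.most_common(1)[0][0]
probabilities = {label: n / len(votes) for label, n in counts.items()}

print(prediction)     # setosa
print(probabilities)  # {'setosa': 0.6, 'versicolor': 0.2, 'virginica': 0.2}
```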
109 changes: 109 additions & 0 deletions src/ensemble/random_forest/random_forest_classifier.py
@@ -0,0 +1,109 @@
from src.tree.decision_tree_classifier import DecisionTreeClassifier
import numpy as np
from collections import defaultdict, Counter

class RandomForestClassifier:
    def __init__(self,
                 n_estimators=50,
                 criterion='entropy',
                 max_depth=5,
                 max_features: float = 0.7,
                 min_estimator_size: int = 3,
                 random_state: int = None) -> None:
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_estimator_size = min_estimator_size
        self.random_state = random_state

        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Create one estimator per tree; bootstrap samples are assigned
        # later, when fit is called.
        trees = []
        for _ in range(n_estimators):
            trees.append(TreeEstimator(criterion, min_estimator_size, max_depth))

        self.estimators: list[TreeEstimator] = trees

    def __bootstrap__(self, training_data_size: int, training_data_n_features: int):
        # Rows: sampled with replacement, same size as the training data.
        sample_indexes = list(range(training_data_size))
        selected_rows_indexes = np.random.choice(sample_indexes, training_data_size)
        # Features: a random subset, sampled without replacement.
        n_features_to_select = int(training_data_n_features * self.max_features)
        selected_columns_indexes = np.random.permutation(training_data_n_features)[:n_features_to_select]

        return selected_rows_indexes, selected_columns_indexes

    def __bootstrap_all_estimators__(self, training_data_size: int, training_data_n_features: int):
        for estimator in self.estimators:
            idx_rows, idx_columns = self.__bootstrap__(training_data_size, training_data_n_features)
            # The rows that were never drawn form the out-of-bag set.
            all_indexes = set(range(training_data_size))
            out_of_bag = all_indexes - set(idx_rows)

            estimator.set_bootstrap_result(
                idx_rows,
                idx_columns,
                out_of_bag
            )

    def fit(self, X: np.array, y: np.array):
        training_data_size, training_data_n_features = X.shape
        self.__bootstrap_all_estimators__(training_data_size, training_data_n_features)
        for estimator in self.estimators:
            estimator.fit(X, y)

    def out_of_bag_score(self, X: np.array, y: np.array):
        all_predictions = []
        for estimator in self.estimators:
            predictions = estimator.out_of_bag_predictions(X)
            all_predictions.append(predictions)

        # Group every estimator's predictions by row index.
        idx_group = defaultdict(list)

        for predictions_list in all_predictions:
            for idx, prediction in predictions_list:
                idx_group[idx].append(prediction)

        # Majority vote per row; the score is the fraction of correct votes.
        result = []
        for idx, predictions in idx_group.items():
            most_frequent_value = Counter(predictions).most_common(1)[0][0]
            correct = 1 if y[idx] == most_frequent_value else 0
            result.append((idx, most_frequent_value, correct))

        return np.mean(np.array(result)[:, 2])

    def predict(self, X):
        # Majority vote across all trees, row by row.
        predictions = []
        for row in X:
            votes = [estimator.predict(row) for estimator in self.estimators]
            predictions.append(Counter(votes).most_common(1)[0][0])
        return np.array(predictions)

class TreeEstimator:
    def __init__(self, criterion: str, min_estimator_size: int, max_depth: int) -> None:
        self.estimator_model = DecisionTreeClassifier(criterion, min_estimator_size, max_depth)
        self.bootstrap_row_indexes = []
        self.feature_indexes = []
        self.out_of_bag_indexes = []

    def set_bootstrap_result(self, row_indexes: list, feature_indexes: list, out_of_bag: list):
        # Store as numpy arrays so they can be used for fancy indexing in fit.
        self.bootstrap_row_indexes = np.array(list(row_indexes))
        self.feature_indexes = np.array(list(feature_indexes))
        self.out_of_bag_indexes = np.array(list(out_of_bag))

    def fit(self, X: np.array, y: np.array):
        # Select the bootstrap rows and this tree's feature columns.
        current_X = X[self.bootstrap_row_indexes[:, np.newaxis], self.feature_indexes]
        current_y = y[self.bootstrap_row_indexes]
        self.estimator_model.fit(current_X, current_y)

    def out_of_bag_predictions(self, X):
        # Predict the rows this tree never saw during training.
        response = []
        for oob_idx in self.out_of_bag_indexes:
            row = X[oob_idx, self.feature_indexes]
            prediction = self.estimator_model.predict(row)
            response.append((oob_idx, prediction))
        return response

    def predict(self, row):
        # The tree only knows the features it was trained on.
        return self.estimator_model.predict(row[self.feature_indexes])
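A hypothetical usage sketch (assuming the repository's DecisionTreeClassifier behaves as imported above; the iris dataset is used only for illustration):

```python
import numpy as np
from sklearn import datasets
from src.ensemble.random_forest.random_forest_classifier import RandomForestClassifier

iris = datasets.load_iris()
X, y = np.asarray(iris['data']), np.asarray(iris['target'])

model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
model.fit(X, y)
print(model.out_of_bag_score(X, y))  # accuracy on rows each tree never saw
print(model.predict(X[:5]))          # majority-vote predictions
```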
56 changes: 2 additions & 54 deletions src/main.py
@@ -1,56 +1,4 @@
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from neighbors.knn.knn import KNN

SEED = 42
np.random.seed(SEED)

iris = datasets.load_iris()

x = iris['data']
y = iris['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

scaler = Normalizer().fit(x_train)
normalized_x_train = scaler.transform(x_train)
normalized_x_test = scaler.transform(x_test)

sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
sk_model.fit(x_train, y_train)

sk_train_predictions = sk_model.predict(x_train)
sk_test_predictions = sk_model.predict(x_test)

sk_train_score = accuracy_score(y_train, sk_train_predictions)
sk_test_score = accuracy_score(y_test, sk_test_predictions)

print(f'[SKLEARN] Train score: {sk_train_score}')
print(f'[SKLEARN] Test score: {sk_test_score}')

knn_model = KNN(k=5)
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL EUCLIDEAN] Train score: {train_score}')
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}')

knn_model = KNN(k=5, distance='manhattan')
train_predictions = knn_model.fit_predict(x_train, y_train)
train_score = accuracy_score(y_train, train_predictions)
test_predictions = knn_model.predict(x_test)
test_score = accuracy_score(y_test, test_predictions)

print(f'[LOCAL MANHATTAN] Train score: {train_score}')
print(f'[LOCAL MANHATTAN] Test score: {test_score}')
# Quick sanity check of numpy column indexing on a list of tuples,
# as used in out_of_bag_score.
tu = [(1, 2), (3, 4), (5, 6)]
print(np.array(tu)[:,1])