-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f2fc9c9
commit 06168cd
Showing
5 changed files
with
132 additions
and
595 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Random Forest | ||
|
||
[EM DESENVOLVIMENTO] | ||
|
||
- Coleção de árvores de decisão não correlacionadas entre si | ||
|
||
## Bootstrap | ||
|
||
Uma técnica estatística de amostragem em que os elementos são selecionados aleatoriamente, com reposição. Isso significa que um elemento pode ser escolhido mais de uma vez e que alguns elementos podem nunca ser escolhidos. | ||
|
||
## Treinamento das árvores | ||
|
||
Cada árvore é treinada com uma amostra obtida por Bootstrap. Além disso, nem todas as features são usadas em todas as árvores: elas também são escolhidas por amostragem aleatória, para tentar evitar o overfitting e reduzir o impacto de alguma feature que gere bons resultados em entropia e gini, mas não tenha ganho real de informação (como um id, por exemplo). | ||
|
||
## Classificação | ||
|
||
A classe final é decidida por votação majoritária entre todas as árvores. | ||
|
||
// A proporção de votos recebida por cada classe não poderia ser interpretada como a probabilidade de a amostra pertencer a essa classe? Segundo o ChatGPT, sim; no sklearn existe inclusive o método predict_proba. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
from src.tree.decision_tree_classifier import DecisionTreeClassifier | ||
import numpy as np | ||
from collections import defaultdict, Counter | ||
|
||
class RandomForestClasifier:
    """Random forest classifier built from bootstrap-trained decision trees.

    Every tree is wrapped in a ``TreeEstimator`` and receives its own
    bootstrap sample of rows (drawn with replacement) plus a random subset of
    the feature columns. Rows a tree never drew form its out-of-bag set, which
    ``out_of_bag_score`` uses to estimate accuracy.

    Args:
        n_estimators: Number of trees to create.
        criterion: Forwarded unchanged to every ``TreeEstimator``.
        max_depth: Forwarded unchanged to every ``TreeEstimator``.
        max_features: Fraction of the feature columns handed to each tree.
        min_estimator_size: Forwarded unchanged to every ``TreeEstimator``.
        random_state: When not ``None``, seeds NumPy's global random
            generator so the bootstrap draws are reproducible.
    """

    def __init__(self,
                 n_estimators=50,
                 criterion='entropy',
                 max_depth=5,
                 max_features: float = 0.7,
                 min_estimator_size: int = 3,
                 random_state: int = None) -> None:
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_estimator_size = min_estimator_size
        self.random_state = random_state

        if self.random_state is not None:
            # NOTE: this seeds the *global* NumPy generator, which is the one
            # the bootstrap helpers below draw from.
            np.random.seed(self.random_state)

        self.estimators: list[TreeEstimator] = [
            TreeEstimator(criterion, min_estimator_size, max_depth)
            for _ in range(n_estimators)
        ]

    def __bootstrap__(self, training_data_size: int, training_data_n_features: int):
        """Draw one bootstrap sample.

        Args:
            training_data_size: Number of rows in the training data.
            training_data_n_features: Number of columns in the training data.

        Returns:
            A ``(row_indexes, column_indexes)`` tuple: the rows are sampled
            with replacement (same count as the training data), the columns
            are a random subset without repetition.
        """
        selected_rows_indexes = np.random.choice(training_data_size, size=training_data_size)

        # int() truncates, so a small max_features on a narrow dataset would
        # otherwise leave the tree with zero columns to train on.
        n_features_to_select = max(1, int(training_data_n_features * self.max_features))
        selected_columns_indexes = np.random.permutation(training_data_n_features)[:n_features_to_select]

        return selected_rows_indexes, selected_columns_indexes

    def __bootstrap_all_estimators__(self, training_data_size: int, training_data_n_features: int):
        """Give every estimator a fresh bootstrap sample and its out-of-bag rows."""
        # Loop-invariant: the full set of row indexes is the same for every tree.
        all_indexes = set(range(training_data_size))

        for estimator in self.estimators:
            idx_rows, idx_columns = self.__bootstrap__(training_data_size, training_data_n_features)
            # Rows that were never drawn for this tree are its out-of-bag rows.
            out_of_bag = sorted(all_indexes - set(idx_rows.tolist()))

            estimator.set_bootstrap_result(
                idx_rows,
                idx_columns,
                out_of_bag
            )

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Bootstrap every estimator and train it on its own sample.

        Args:
            X: Two-dimensional training data (rows by features).
            y: Labels, one per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        training_data_size, training_data_n_features = X.shape
        self.__bootstrap_all_estimators__(training_data_size, training_data_n_features)
        for estimator in self.estimators:
            estimator.fit(X, y)

    def out_of_bag_score(self, X: np.ndarray, y: np.ndarray):
        """Return the accuracy of the majority vote over out-of-bag rows.

        For each row, only the trees that did not train on it vote; the most
        frequent prediction is compared with ``y``.

        Args:
            X: The data the forest was fitted on.
            y: The labels the forest was fitted on.

        Returns:
            The fraction of voted rows whose majority prediction equals the
            true label.

        Raises:
            ValueError: If no estimator produced an out-of-bag prediction
                (for example when ``fit`` has not been called).
        """
        votes_by_row = defaultdict(list)
        for estimator in self.estimators:
            # Each estimator yields (row_index, prediction) pairs.
            # NOTE(review): predictions must be hashable for Counter below -
            # confirm against what the tree's predict() returns for one row.
            for idx, prediction in estimator.out_of_bag_predictions(X):
                votes_by_row[idx].append(prediction)

        if not votes_by_row:
            raise ValueError(
                'No out-of-bag predictions available; fit the forest before scoring.'
            )

        # Score the hit flags directly instead of packing (idx, label, flag)
        # tuples into one array, which breaks for non-numeric labels.
        hits = [
            1 if y[idx] == Counter(votes).most_common(1)[0][0] else 0
            for idx, votes in votes_by_row.items()
        ]
        return np.mean(hits)

    def predict(self, X):
        """Not implemented yet; currently returns ``None``."""
        # TODO: combine the per-tree predictions by majority vote.
        return None
|
||
class TreeEstimator:
    """A single decision tree plus the bootstrap bookkeeping that belongs to it.

    Stores which rows and feature columns the tree is trained on and which
    rows were left out (out-of-bag), and applies that selection whenever data
    is handed to the wrapped ``DecisionTreeClassifier``.
    """

    def __init__(self, criterion: str, min_estimator_size: int, max_depth: int) -> None:
        self.estimator_model = DecisionTreeClassifier(criterion, min_estimator_size, max_depth)
        self.bootstrap_row_indexes = []
        self.feature_indexes = []
        self.out_of_bag_indexes = []

    def set_bootstrap_result(self, row_indexes: list, feature_indexes: list, out_of_bag: list):
        """Replace the stored bootstrap selection.

        Args:
            row_indexes: Row indexes (repeats allowed) the tree trains on.
            feature_indexes: Column indexes the tree trains on.
            out_of_bag: Row indexes that are not part of ``row_indexes``.
        """
        # Copies are stored as plain lists so later changes to the caller's
        # containers do not leak into this estimator.
        self.bootstrap_row_indexes = list(row_indexes)
        self.feature_indexes = list(feature_indexes)
        self.out_of_bag_indexes = list(out_of_bag)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the wrapped tree on the stored rows and feature columns.

        Args:
            X: Full two-dimensional training data.
            y: Labels, one per row of ``X``.
        """
        # The indexes are kept as Python lists, which cannot be sliced with
        # ``[:, np.newaxis]``; convert to arrays before fancy indexing.
        rows = np.asarray(self.bootstrap_row_indexes, dtype=int)
        cols = np.asarray(self.feature_indexes, dtype=int)

        # np.ix_ selects the rows-by-columns cross product.
        current_X = np.asarray(X)[np.ix_(rows, cols)]
        current_y = np.asarray(y)[rows]
        self.estimator_model.fit(current_X, current_y)

    def out_of_bag_predictions(self, X):
        """Predict every out-of-bag row with the wrapped tree.

        Args:
            X: The full data the estimator was fitted on.

        Returns:
            A list of ``(row_index, prediction)`` tuples, one per out-of-bag row.
        """
        data = np.asarray(X)
        cols = np.asarray(self.feature_indexes, dtype=int)
        # Each row is passed on its own as a 1-D slice of the selected columns.
        # NOTE(review): confirm the tree's predict() accepts a single 1-D row.
        return [
            (oob_idx, self.estimator_model.predict(data[oob_idx, cols]))
            for oob_idx in self.out_of_bag_indexes
        ]

    def predict(self, X):
        """Predict with the wrapped tree using only its own feature columns.

        Args:
            X: Data with the same column layout as the data passed to ``fit``.

        Returns:
            Whatever the wrapped model's ``predict`` returns.
        """
        # The tree was trained on a column subset, so the same subset must be
        # taken here; ``...`` keeps this valid for one row or many.
        cols = np.asarray(self.feature_indexes, dtype=int)
        return self.estimator_model.predict(np.asarray(X)[..., cols])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,4 @@ | ||
import numpy as np | ||
from sklearn import datasets | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.preprocessing import Normalizer | ||
from sklearn.metrics import accuracy_score | ||
from sklearn.neighbors import KNeighborsClassifier | ||
from neighbors.knn.knn import KNN | ||
|
||
# Fixed seed shared by NumPy's global generator and the train/test split.
SEED = 42
np.random.seed(SEED)

# Iris data: feature matrix and target labels.
iris = datasets.load_iris()
x, y = iris['data'], iris['target']

# 80/20 shuffled split; each part is coerced to an ndarray.
x_train, x_test, y_train, y_test = (
    np.asarray(part)
    for part in train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)
)

# Normalized copies of both splits (the models below are fed the raw arrays).
scaler = Normalizer().fit(x_train)
normalized_x_train, normalized_x_test = scaler.transform(x_train), scaler.transform(x_test)

# Reference result from scikit-learn's brute-force k-NN.
sk_model = KNeighborsClassifier(n_neighbors=5, algorithm='brute')
sk_model.fit(x_train, y_train)

sk_train_predictions, sk_test_predictions = sk_model.predict(x_train), sk_model.predict(x_test)
sk_train_score, sk_test_score = (
    accuracy_score(y_train, sk_train_predictions),
    accuracy_score(y_test, sk_test_predictions),
)

print(f'[SKLEARN] Train score: {sk_train_score}')
print(f'[SKLEARN] Test score: {sk_test_score}')


def _fit_and_score(model):
    """Fit *model* on the training split and score it on both splits.

    Returns the train predictions, train accuracy, test predictions and test
    accuracy, in that order.
    """
    fitted_predictions = model.fit_predict(x_train, y_train)
    fitted_score = accuracy_score(y_train, fitted_predictions)
    held_out_predictions = model.predict(x_test)
    held_out_score = accuracy_score(y_test, held_out_predictions)
    return fitted_predictions, fitted_score, held_out_predictions, held_out_score


# Local implementation with its default distance.
knn_model = KNN(k=5)
train_predictions, train_score, test_predictions, test_score = _fit_and_score(knn_model)

print(f'[LOCAL EUCLIDEAN] Train score: {train_score}')
print(f'[LOCAL EUCLIDEAN] Test score: {test_score}')

# Local implementation with the Manhattan distance.
knn_model = KNN(k=5, distance='manhattan')
train_predictions, train_score, test_predictions, test_score = _fit_and_score(knn_model)

print(f'[LOCAL MANHATTAN] Train score: {train_score}')
print(f'[LOCAL MANHATTAN] Test score: {test_score}')

# Scratch check: take the second column of a list of pairs.
tu = [(1, 2), (3, 4), (5, 6)]
second_column = np.array(tu)[:, 1]
print(second_column)
Oops, something went wrong.