-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a84c340
commit 68be73a
Showing
12 changed files
with
9,416 additions
and
0 deletions.
There are no files selected for viewing
1,001 changes: 1,001 additions & 0 deletions
1,001
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xte.csv
Large diffs are not rendered by default.
Oops, something went wrong.
1,001 changes: 1,001 additions & 0 deletions
1,001
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xte_mat100.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2,001 changes: 2,001 additions & 0 deletions
2,001
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xtr.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2,001 changes: 2,001 additions & 0 deletions
2,001
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xtr_mat100.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2,001 changes: 2,001 additions & 0 deletions
2,001
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Ytr.csv
Large diffs are not rendered by default.
Oops, something went wrong.
181 changes: 181 additions & 0 deletions
181
Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/dataloader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
import numpy as np, pandas as pd | ||
from matplotlib import pyplot as plt | ||
from collections import defaultdict | ||
|
||
class Dataloader:
    """Load the sequence-classification data and turn raw sequences into
    fixed-length feature vectors via a bag-of-substrings ("spectrum")
    representation, optionally concatenated with a precomputed numeric
    representation (the *_mat100 files).

    Typical flow: ``load_data`` -> ``split_data``.
    """

    def vocab(self, Df, subseq_size=5):
        """Build the vocabulary of all length-``subseq_size`` substrings.

        Parameters
        ----------
        Df : iterable of rows where ``row[0]`` is a sequence string.
        subseq_size : length of the substrings ("k-mers") to index.

        Returns a dict mapping substring -> contiguous integer index
        (0..n-1 in first-seen order) and stores it on ``self.vocabulary``.
        """
        V = {}
        for row in Df:
            s = row[0]
            for i in range(len(s) - subseq_size + 1):
                key = s[i:i + subseq_size]
                if key not in V:
                    # len(V) yields the next free index, 0-based.
                    V[key] = len(V)
        self.vocabulary = V
        return V

    def transform1(self, Df, V, subseq_size=8, count=False, scale=10.):
        """Vectorize each sequence against vocabulary ``V``.

        count=False: binary presence indicators per substring.
        count=True : substring occurrence counts divided by ``scale``
                     (soft normalization; original hard-coded 10).

        NOTE: ``subseq_size`` must match the value used to build ``V``
        (its default 8 differs from ``vocab``'s default 5 — ``load_data``
        always passes it explicitly); a substring absent from ``V``
        raises KeyError, so build ``V`` on data covering ``Df``.
        """
        n = len(V)
        NewDf = []
        for row in Df:
            s = row[0]
            vector = [0.] * n
            if count:
                counts = defaultdict(float)
                for i in range(len(s) - subseq_size + 1):
                    counts[s[i:i + subseq_size]] += 1.
                for key, c in counts.items():
                    vector[V[key]] = c / scale
            else:
                for j in range(len(s) - subseq_size + 1):
                    vector[V[s[j:j + subseq_size]]] = 1.
            NewDf.append(vector)
        return np.array(NewDf)

    def transform2(self, Xmat):
        """Parse the precomputed numeric representation.

        Each row of ``Xmat`` (a DataFrame) holds one whitespace-separated
        string of floats; returns the corresponding 2-D float array.
        """
        rows = [[float(tok) for tok in row[0].split()] for row in Xmat.values]
        return np.array(rows)

    def load_data(self, dirXtr, dirXtr_mat, dirX_te, dirXte_mat, dir_y,
                  use_data_mat=False, subseq_size=5, count=True):
        """Read the CSV files and return (train, test) feature matrices.

        The last column of each returned matrix is the label: train labels
        are mapped from {0, 1} to {-1, 1} (SVM/logistic convention); test
        rows get a dummy 0 so both matrices share one layout.
        """
        X = pd.read_csv(dirXtr)
        y = pd.read_csv(dir_y)
        Xte = pd.read_csv(dirX_te)
        Xtr_mat_df = pd.read_csv(dirXtr_mat)
        Xte_mat_df = pd.read_csv(dirXte_mat)

        Df = X.values[:, 1:]    # drop the Id column
        Dfe = Xte.values[:, 1:]
        # Build the vocabulary on train+test so transform1 never meets an
        # unknown substring.
        V = self.vocab(np.concatenate([Df, Dfe]), subseq_size=subseq_size)
        Df = self.transform1(Df, V, subseq_size=subseq_size, count=count)
        Dfe = self.transform1(Dfe, V, subseq_size=subseq_size, count=count)

        if use_data_mat:
            # Append the precomputed numeric features column-wise.
            Df = np.concatenate([Df, self.transform2(Xtr_mat_df)], axis=1)
            Dfe = np.concatenate([Dfe, self.transform2(Xte_mat_df)], axis=1)

        labels = y.values[:, 1:].flatten()
        labels[labels == 0] = -1.
        Df = np.concatenate([Df, labels.reshape(-1, 1)], axis=1)
        Dfe = np.concatenate([Dfe, np.zeros((len(Dfe), 1))], axis=1)

        return np.array(Df, dtype=float), np.array(Dfe, dtype=float)

    def split_data(self, Df, train_size=0.8):
        """Shuffle rows (uses the global numpy RNG) and split train/val.

        The last column of ``Df`` is the label.
        Returns (Xtrain, ytrain, Xval, yval).
        """
        perm = np.random.permutation(len(Df))
        Df = Df[perm]
        cut = int(train_size * len(Df))
        return (np.array(Df[:cut, :-1]), np.array(Df[:cut, -1]),
                np.array(Df[cut:, :-1]), np.array(Df[cut:, -1]))
|
||
class CrossValidation:
    """K-fold cross-validation driver that grid-searches one hyperparameter:
    polynomial degree (kernel SVM), C (plain SVM) or lambda (logistic
    regression).

    NOTE(review): ``crossvalidate`` reads module-level names ``SVM`` and
    ``LR`` that are not defined in this file — they must exist in the
    calling scope before it runs.  TODO: pass the model in explicitly.
    """

    def __init__(self, model_name='svm', k=5):
        self.k = k
        assert model_name in ['svm', 'logistic_regression'], 'the model name should be svm or logistic_regression'
        self.model_name = model_name

    def Kfold(self, X, y):
        """Shuffle (X, y) with the global numpy RNG and cut them into
        ``self.k`` equally sized folds (any remainder rows are dropped).

        Stores the folds on ``self.Folds`` and returns ``self`` so calls
        can be chained.
        """
        k = self.k
        perm = np.random.permutation(len(X))
        X, y = X[perm], y[perm]
        size = len(X) // k  # X and y are assumed to have equal length
        self.Folds = [[X[i * size:(i + 1) * size], y[i * size:(i + 1) * size]]
                      for i in range(k)]
        return self

    def _training_set(self, holdout):
        """Stack every fold except ``holdout`` into one (X, y) training set."""
        rest = [self.Folds[l] for l in range(self.k) if l != holdout]
        Xt, yt = rest[0]
        for Xf, yf in rest[1:]:
            Xt = np.vstack([Xt, Xf])
            yt = np.hstack([yt, yf])
        return Xt, yt

    def _cv_score(self, fit, predict, accuracy):
        """Mean validation accuracy over the k folds for one setting."""
        Acc = []
        for i in range(self.k):
            Xv, yv = self.Folds[i]
            Xt, yt = self._training_set(i)
            fit(Xt, yt)
            Acc.append(accuracy(predict(Xv), yv))
        return np.mean(Acc)

    def _grid_search(self, values, set_param, fit, predict, accuracy):
        """Try each candidate value in order and return the best one.

        Progress and per-value validation accuracies are printed, matching
        the original (triplicated) implementation.
        """
        best_value, best_val_acc = None, 0.0
        for j, value in enumerate(values):
            print('*' * 50)
            print('Progress: {}/{}'.format(j + 1, len(values)))
            print('*' * 50)
            set_param(value)
            acc = self._cv_score(fit, predict, accuracy)
            print('Val acc: ', acc)
            if acc > best_val_acc:
                best_val_acc, best_value = acc, value
        print('Best val acc: ', best_val_acc)
        return best_value

    def crossvalidate(self):
        """Grid-search the relevant hyperparameter of the global model.

        Returns the best degree (kernel SVM), C (plain SVM) or lambda
        (logistic regression) found by k-fold cross-validation; ``Kfold``
        must have been called first.
        """
        if self.model_name == 'svm':
            if SVM.use_kernel:
                # Look for the best value of the polynomial-kernel degree.
                return self._grid_search(
                    np.linspace(1, 3, 8),
                    lambda d: setattr(SVM, 'degree', d),
                    SVM.fit_kernel, SVM.Kernel_predict, SVM.compute_accuracy)
            # Look for the best value of C for a simple (no-kernel) SVM.
            return self._grid_search(
                10 ** np.linspace(-1, 3, 5),
                lambda C: setattr(SVM, 'C', C),
                SVM.fit_no_kernel, SVM.predict_no_kernel,
                SVM.compute_accuracy)
        # Logistic regression: search the regularization strength lambda.
        # LR.fit takes (y, X, ...) — note the swapped argument order.
        return self._grid_search(
            10 ** np.linspace(-1, 1, 5),
            lambda lam: setattr(LR, 'lambd', lam),
            lambda X, y: LR.fit(y, X, max_iter=10),
            LR.predict, LR.compute_accuracy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import numpy as np, pandas as pd | ||
from Data_Dataloader_CrossVal.dataloader import Dataloader, CrossValidation | ||
from Models.models import MySVM, LogisticRegression | ||
# ---------------------------------------------------------------------------
# Train a kernel SVM on spectrum features (+ precomputed mat100 features)
# and write the Kaggle-style submission file.
# ---------------------------------------------------------------------------
DataLoader = Dataloader()
Xtrf = './Data_Dataloader_CrossVal/Xtr.csv'
Xtrmatf = './Data_Dataloader_CrossVal/Xtr_mat100.csv'
Xtef = './Data_Dataloader_CrossVal/Xte.csv'
Xtematf = './Data_Dataloader_CrossVal/Xte_mat100.csv'
Ytrf = './Data_Dataloader_CrossVal/Ytr.csv'
Df_tr, Df_te = DataLoader.load_data(Xtrf, Xtrmatf, Xtef, Xtematf, Ytrf,
                                    use_data_mat=True, subseq_size=6, count=True)
np.random.seed(101)
X_train, y_train, X_val, y_val = DataLoader.split_data(Df_tr, train_size=0.9)
# BUG FIX: the original ran Df_te through split_data(train_size=1.), which
# SHUFFLES the rows — the submission Ids (0..n-1) then no longer match the
# original test-set order.  Strip the dummy label column directly instead.
# (NOTE(review): this also skips one np.random.permutation draw; confirm the
# downstream model fit does not depend on the exact global RNG state.)
X_test = Df_te[:, :-1]

# Uncomment the following to run the logistic regression

# LR = LogisticRegression()
# LR.lambd = 0.0001
# LR.fit(y_train, X_train, max_iter=20)
# pred=LR.predict(X_val)
# print('val acc:', LR.compute_accuracy(y_val, pred))
# pred = LR.predict(X_train)
# print('train acc:', LR.compute_accuracy(y_train, pred))
# Yte = LR.predict(X_test)
# Yte[Yte==-1]=0

# Configure and fit the combined (poly * gauss) kernel SVM.
SVM = MySVM()
SVM.kernels = ['poly', 'gauss']
SVM.combine_kernel = 'prod'
SVM.C = 1
SVM.std = 35.
SVM.degree = 2.14
SVM.fit_kernel(X_train, y_train)
y_pred = SVM.Kernel_predict(X_train)
print('train accuracy:', SVM.compute_accuracy(y_pred, y_train))
y_pred = SVM.Kernel_predict(X_val)
print('val accuracy:', SVM.compute_accuracy(y_pred, y_val))

# Predict the test set and map labels back from {-1, 1} to {0, 1}.
Yte = SVM.Kernel_predict(X_test)
Yte[Yte == -1] = 0
Yte = [int(v) for v in Yte]
submission_df = pd.DataFrame({"Id": list(range(len(Yte))), "Bound": Yte})
submission_df.to_csv('Submission.csv', columns=["Id", "Bound"], index=False)
print()
print("Please find the generated submission file (Submission.csv) in the main repository")
print()
Binary file added
BIN
+6.45 KB
Kernel-Methods/KM-Project-Kouagou-Kanubala/Models/__pycache__/models.cpython-37.pyc
Binary file not shown.
Oops, something went wrong.