Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Jean-KOUAGOU committed Jun 1, 2020
1 parent a84c340 commit 68be73a
Show file tree
Hide file tree
Showing 12 changed files with 9,416 additions and 0 deletions.
1,001 changes: 1,001 additions & 0 deletions Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xte.csv

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2,001 changes: 2,001 additions & 0 deletions Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Xtr.csv

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2,001 changes: 2,001 additions & 0 deletions Kernel-Methods/KM-Project-Kouagou-Kanubala/Data_Dataloader_CrossVal/Ytr.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict

class Dataloader:
    """Turn the DNA-sequence CSV files into numeric feature matrices.

    Features are a bag of fixed-length subsequences (k-mers): each distinct
    substring of length ``subseq_size`` gets a vocabulary index, and each
    sequence becomes a vector over that vocabulary (binary presence or
    scaled counts). Optionally the precomputed ``*_mat100`` numeric
    features are concatenated on the right.
    """

    def vocab(self, Df, subseq_size=5):
        """Build a subsequence -> index vocabulary over all rows of Df.

        Each row of Df is a 1-element array whose single entry is the raw
        sequence string. Every contiguous substring of length
        ``subseq_size`` is assigned a unique index in first-seen order.
        The mapping is also cached on ``self.vocabulary``.
        """
        V = {}
        counter = -1
        for seq in Df:
            s = seq[0]
            for i in range(len(s) - subseq_size + 1):
                key = s[i:i + subseq_size]
                if key not in V:
                    counter += 1
                    V[key] = counter
        self.vocabulary = V
        return V

    def transform1(self, Df, V, subseq_size=8, count=False):
        """Vectorize each sequence in Df against vocabulary V.

        count=False: binary presence vector (1. if the k-mer occurs).
        count=True:  occurrence counts scaled by 1/10 (scaling constant
        kept from the original tuning).

        Raises KeyError if a k-mer of a sequence is missing from V, so V
        must have been built over (a superset of) Df.
        """
        NewDf = []
        n = len(V)
        for seq in Df:
            s = seq[0]
            vector = [0] * n
            if not count:
                for j in range(len(s) - subseq_size + 1):
                    vector[V[s[j:j + subseq_size]]] = 1.
            else:
                counts = defaultdict(float)
                for i in range(len(s) - subseq_size + 1):
                    counts[s[i:i + subseq_size]] += 1.
                # one write per distinct k-mer instead of one per occurrence
                for key, c in counts.items():
                    vector[V[key]] = c / 10.
            NewDf.append(vector)
        return np.array(NewDf)

    def transform2(self, Xmat):
        """Parse a single-column DataFrame of space-separated floats into
        a 2-D float array (one row per input row)."""
        rows = [[float(tok) for tok in seq[0].split()] for seq in Xmat.values]
        return np.array(rows)

    def load_data(self, dirXtr, dirXtr_mat, dirX_te, dirXte_mat, dir_y, use_data_mat=False, subseq_size=5, count=True):
        """Load train/test CSVs and return (train, test) float matrices.

        The vocabulary is built on train+test sequences so both share one
        feature space. Labels are remapped {0 -> -1} and appended as the
        LAST column of the train matrix; the test matrix gets a zero
        placeholder column so both have the same width.
        """
        X, y, Xte = pd.read_csv(dirXtr), pd.read_csv(dir_y), pd.read_csv(dirX_te)
        Xtr, Xtest = pd.read_csv(dirXtr_mat), pd.read_csv(dirXte_mat)

        Df = X.values[:, 1:]       # drop the Id column
        Dfe = Xte.values[:, 1:]
        D = np.concatenate([Df, Dfe])

        V = self.vocab(D, subseq_size=subseq_size)
        Df = self.transform1(Df, V, subseq_size=subseq_size, count=count)
        Dfe = self.transform1(Dfe, V, subseq_size=subseq_size, count=count)

        if use_data_mat:
            # concatenate the precomputed numeric (mat100) features
            Xtr_mat = self.transform2(Xtr)
            Xte_mat = self.transform2(Xtest)
            Df = np.concatenate([Df, Xtr_mat], axis=1)
            Dfe = np.concatenate([Dfe, Xte_mat], axis=1)

        y = y.values[:, 1:].flatten()
        y[y == 0] = -1.  # SVM-style labels in {-1, +1}

        Df = np.concatenate([Df, y.reshape(-1, 1)], axis=1)
        Dfe = np.concatenate([Dfe, np.zeros(len(Dfe)).reshape(-1, 1)], axis=1)

        return np.array(Df, dtype=float), np.array(Dfe, dtype=float)

    def split_data(self, Df, train_size=0.8):
        """Shuffle Df and split it into (Xtrain, ytrain, Xval, yval).

        The label is assumed to be the last column (as produced by
        load_data). Uses the global numpy RNG, so seed beforehand for a
        reproducible split.
        """
        Df = Df[np.random.permutation(range(len(Df)))]
        cut = int(train_size * len(Df))
        Xtrain, ytrain = Df[:cut, :-1], Df[:cut, -1]
        Xval, yval = Df[cut:, :-1], Df[cut:, -1]
        return np.array(Xtrain), np.array(ytrain), np.array(Xval), np.array(yval)

class CrossValidation:
    """K-fold cross-validation grid search for one hyper-parameter.

    NOTE(review): crossvalidate() reads module-level globals ``SVM`` and
    ``LR`` that are not defined in this file — the caller apparently must
    inject trained-model objects into this module's namespace first;
    confirm and consider passing the model in explicitly.
    """

    def __init__(self, model_name='svm', k=5):
        # number of folds
        self.k = k
        assert model_name in ['svm', 'logistic_regression'], 'the model name should be svm or logistic_regression'
        self.model_name = model_name

    def Kfold(self, X, y):
        """Shuffle (X, y) jointly and split into self.k equal folds.

        Any remainder rows (len(X) % k) are dropped. Stores the folds as
        self.Folds (list of [X_fold, y_fold]) and returns self.
        """
        k = self.k
        order = np.random.permutation(range(len(X)))
        X, y = X[order], y[order]
        size = len(X) // k
        self.Folds = [[X[i * size:(i + 1) * size], y[i * size:(i + 1) * size]]
                      for i in range(k)]
        return self

    def _merge_folds(self, exclude):
        """Concatenate every fold except index ``exclude`` into one
        (Xtrain, ytrain) pair."""
        rest = [self.Folds[l] for l in range(self.k) if l != exclude]
        Xt, yt = rest[0][0], rest[0][1]
        for Xl, yl in rest[1:]:
            Xt, yt = np.vstack([Xt, Xl]), np.hstack([yt, yl])
        return Xt, yt

    def _search(self, values, set_param, fit, predict, accuracy):
        """Grid-search helper: for each candidate value, set it on the
        model, run k-fold fit/predict, and return the best-scoring value.

        Prints per-candidate progress and running best, like the original
        hand-unrolled loops.
        """
        best_value, best_val_acc = None, 0.0
        for j, value in enumerate(values):
            print('*' * 50)
            print('Progress: {}/{}'.format(j + 1, len(values)))
            print('*' * 50)
            set_param(value)
            Acc = []
            for i in range(self.k):
                Xv, yv = self.Folds[i]
                Xt, yt = self._merge_folds(i)
                fit(Xt, yt)
                Acc.append(accuracy(predict(Xv), yv))
            acc = np.mean(Acc)
            print('Val acc: ', acc)
            if acc > best_val_acc:
                best_val_acc = acc
                best_value = value
            print('Best val acc: ', best_val_acc)
        return best_value

    def crossvalidate(self):
        """Run the grid search for the configured model and return the
        best hyper-parameter (degree, C, or lambda)."""
        if self.model_name == 'svm':
            if SVM.use_kernel:
                # Look for the best value of the polynomial degree
                return self._search(
                    np.linspace(1, 3, 8),
                    lambda d: setattr(SVM, 'degree', d),
                    SVM.fit_kernel, SVM.Kernel_predict, SVM.compute_accuracy)
            # Look for the best value of C for a simple (no-kernel) SVM
            return self._search(
                10 ** np.linspace(-1, 3, 5),
                lambda C: setattr(SVM, 'C', C),
                SVM.fit_no_kernel, SVM.predict_no_kernel, SVM.compute_accuracy)
        # Logistic regression: look for the best penalty lambda
        return self._search(
            10 ** np.linspace(-1, 1, 5),
            lambda lam: setattr(LR, 'lambd', lam),
            lambda Xt, yt: LR.fit(yt, Xt, max_iter=10),  # LR.fit takes (y, X)
            LR.predict, LR.compute_accuracy)
43 changes: 43 additions & 0 deletions Kernel-Methods/KM-Project-Kouagou-Kanubala/MAIN_SCRIPT.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Train the kernel SVM on k-mer + mat100 features and write Submission.csv.

Expects the Data_Dataloader_CrossVal/ CSV files relative to the working
directory.
"""
import numpy as np, pandas as pd
from Data_Dataloader_CrossVal.dataloader import Dataloader, CrossValidation
from Models.models import MySVM, LogisticRegression

DataLoader = Dataloader()
Xtrf, Xtrmatf, Xtef, Xtematf, Ytrf = './Data_Dataloader_CrossVal/Xtr.csv', './Data_Dataloader_CrossVal/Xtr_mat100.csv', './Data_Dataloader_CrossVal/Xte.csv', './Data_Dataloader_CrossVal/Xte_mat100.csv', './Data_Dataloader_CrossVal/Ytr.csv'
# Build k-mer count features concatenated with the precomputed mat100 features.
Df_tr, Df_te = DataLoader.load_data(Xtrf, Xtrmatf, Xtef, Xtematf, Ytrf, use_data_mat=True, subseq_size=6, count=True)
np.random.seed(101)  # fixed seed -> reproducible train/val split
X_train, y_train, X_val, y_val = DataLoader.split_data(Df_tr, train_size=0.9)
# BUGFIX: the original obtained X_test via split_data(Df_te, train_size=1.),
# which permutes the rows, so the submission Ids no longer matched the test
# file's row order. Strip the zero placeholder label column directly instead,
# preserving the original test-row order.
X_test = Df_te[:, :-1]

# Uncomment the following to run the logistic regression

# LR = LogisticRegression()
# LR.lambd = 0.0001
# LR.fit(y_train, X_train, max_iter=20)
# pred=LR.predict(X_val)
# print('val acc:', LR.compute_accuracy(y_val, pred))
# pred = LR.predict(X_train)
# print('train acc:', LR.compute_accuracy(y_train, pred))
# Yte = LR.predict(X_test)
# Yte[Yte==-1]=0

# Kernel SVM: product of a polynomial and a Gaussian kernel
# (hyper-parameters below were tuned via CrossValidation).
SVM = MySVM()
SVM.kernels = ['poly', 'gauss']
SVM.combine_kernel = 'prod'
SVM.C = 1
SVM.std = 35.
SVM.degree = 2.14
SVM.fit_kernel(X_train, y_train)
y_pred = SVM.Kernel_predict(X_train)
print('train accuracy:', SVM.compute_accuracy(y_pred, y_train))
y_pred = SVM.Kernel_predict(X_val)
print('val accuracy:', SVM.compute_accuracy(y_pred, y_val))

# Predict on the test set and write the {0,1}-labelled submission file.
Yte = SVM.Kernel_predict(X_test)
Yte[Yte == -1] = 0  # map SVM labels {-1,+1} back to {0,1}
Yte = [int(v) for v in Yte]
submission_df = pd.DataFrame({"Id": list(range(len(Yte))), "Bound": Yte})
submission_df.to_csv('Submission.csv', columns=["Id", "Bound"], index=False)
print()
print("Please find the generated submission file (Submission.csv) in the main repository")
print()
Binary file not shown.
Loading

0 comments on commit 68be73a

Please sign in to comment.