From 74c17ae426b4cd6a16c057462ce624cd4970e2dd Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 19:27:20 -0600 Subject: [PATCH 1/8] updated to python 3.7 --- .gitignore | 3 +++ skipthoughts.py | 55 ++++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 36ad723..bdde598 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ target/ # Downloaded data files data/ + +# Downloaded tables +tables/ diff --git a/skipthoughts.py b/skipthoughts.py index 1a6011d..c18b517 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -2,11 +2,13 @@ Skip-thought vectors ''' import os +import warnings +warnings.filterwarnings("ignore") import theano import theano.tensor as tensor -import cPickle as pkl +import pickle as pkl import numpy import copy import nltk @@ -20,8 +22,10 @@ #-----------------------------------------------------------------------------# # Specify model and table locations here #-----------------------------------------------------------------------------# -path_to_models = '/u/rkiros/public_html/models/' -path_to_tables = '/u/rkiros/public_html/models/' +#path_to_models = '/u/rkiros/public_html/models/' +#path_to_tables = '/u/rkiros/public_html/models/' +path_to_models = 'skip-thoughts/data/' +path_to_tables = 'skip-thoughts/tables/' #-----------------------------------------------------------------------------# path_to_umodel = path_to_models + 'uni_skip.npz' @@ -33,7 +37,7 @@ def load_model(): Load the model with saved tables """ # Load model options - print 'Loading model parameters...' + print( 'Loading model parameters...') with open('%s.pkl'%path_to_umodel, 'rb') as f: uoptions = pkl.load(f) with open('%s.pkl'%path_to_bmodel, 'rb') as f: @@ -48,18 +52,18 @@ def load_model(): btparams = init_tparams(bparams) # Extractor functions - print 'Compiling encoders...' + print('Compiling encoders...') embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') # Tables - print 'Loading tables...' + print('Loading tables...') utable, btable = load_tables() # Store everything we need in a dictionary - print 'Packing up...' + print( 'Packing up...') model = {} model['uoptions'] = uoptions model['boptions'] = boptions @@ -76,8 +80,8 @@ def load_tables(): Load the tables """ words = [] - utable = numpy.load(path_to_tables + 'utable.npy') - btable = numpy.load(path_to_tables + 'btable.npy') + utable = numpy.load(path_to_tables + 'utable.npy', allow_pickle=True, encoding='latin1') + btable = numpy.load(path_to_tables + 'btable.npy', allow_pickle=True, encoding='latin1') f = open(path_to_tables + 'dictionary.txt', 'rb') for line in f: words.append(line.decode('utf-8').strip()) @@ -93,13 +97,13 @@ class Encoder(object): """ def __init__(self, model): - self._model = model + self._model = model def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): - """ - Encode sentences in the list X. Each entry will return a vector - """ - return encode(self._model, X, use_norm, verbose, batch_size, use_eos) + """ + Encode sentences in the list X. 
Each entry will return a vector + """ + return encode(self._model, X, use_norm, verbose, batch_size, use_eos) def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): @@ -125,7 +129,7 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) # Get features. This encodes by length, in order to avoid wasting computation for k in ds.keys(): if verbose: - print k + print(k) numbatches = len(ds[k]) / batch_size + 1 for minibatch in range(numbatches): caps = ds[k][minibatch::numbatches] @@ -194,10 +198,10 @@ def nn(model, text, vectors, query, k=5): scores = numpy.dot(qf, vectors.T).flatten() sorted_args = numpy.argsort(scores)[::-1] sentences = [text[a] for a in sorted_args[:k]] - print 'QUERY: ' + query - print 'NEAREST: ' + print('QUERY: {}'.format(query)) + print( 'NEAREST: ') for i, s in enumerate(sentences): - print s, sorted_args[i] + print("{} {}".format(s, sorted_args[i])) def word_features(table): @@ -221,10 +225,10 @@ def nn_words(table, wordvecs, query, k=10): scores = numpy.dot(qf, wordvecs.T).flatten() sorted_args = numpy.argsort(scores)[::-1] words = [keys[a] for a in sorted_args[:k]] - print 'QUERY: ' + query - print 'NEAREST: ' + print('QUERY: '.format(query)) + print('NEAREST: ') for i, w in enumerate(words): - print w + print(w) def _p(pp, name): @@ -239,7 +243,7 @@ def init_tparams(params): initialize Theano shared variables according to the initial parameters """ tparams = OrderedDict() - for kk, pp in params.iteritems(): + for kk, pp in params.items(): tparams[kk] = theano.shared(params[kk], name=kk) return tparams @@ -249,7 +253,7 @@ def load_params(path, params): load parameters """ pp = numpy.load(path) - for kk, vv in params.iteritems(): + for kk, vv in params.items(): if kk not in pp: warnings.warn('%s is not in the archive'%kk) continue @@ -340,6 +344,7 @@ def build_encoder_bi(tparams, options): # some utilities def ortho_weight(ndim): W = numpy.random.randn(ndim, ndim) + #W = numpy.random.Generator.standard_normal(size=(ndim, ndim)) u, s, v = numpy.linalg.svd(W) return u.astype('float32') @@ -350,7 +355,9 @@ def norm_weight(nin,nout=None, scale=0.1, ortho=True): if nout == nin and ortho: W = ortho_weight(nin) else: - W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) + W = numpy.random.uniform (low=-scale, high=scale, size=(nin, nout)) + #W = numpy.random.RandomState.uniform(low=-scale, high=scale, size=(nin, nout)) + return W.astype('float32') From 8bc8884c1d6df7293d17c7f2725b95911911411d Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 22:39:24 -0600 Subject: [PATCH 2/8] making the github folder a module --- __init__.py | 5 +++++ dataset_handler.py | 2 +- eval_classification.py | 22 +++++++++++++--------- skipthoughts.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/__init__.py b/__init__.py index e69de29..c01a960 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1,5 @@ +from . import skipthoughts +from . import dataset_handler +from . import nbsvm +from . import eval_classification + diff --git a/dataset_handler.py b/dataset_handler.py index 897e1f4..847ba9b 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -23,7 +23,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): text, labels = shuffle_data(pos+neg, labels, seed=seed) z['text'] = text z['labels'] = labels - print 'Computing skip-thought vectors...' 
+ print( 'Computing skip-thought vectors...') features = encoder.encode(text, verbose=False) return z, features diff --git a/eval_classification.py b/eval_classification.py index a3adc3d..4faac25 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,15 +1,17 @@ # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) import numpy as np -import sys +#import sys import nbsvm -import dataset_handler +#import dataset_handler +import importlib +st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack from sklearn.linear_model import LogisticRegression -from sklearn.cross_validation import KFold - +#from sklearn.cross_validation import KFold +from sklearn.model_selection import KFold def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): """ @@ -21,11 +23,13 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA' """ # Load the dataset and extract features - z, features = dataset_handler.load_data(encoder, name, loc=loc, seed=seed) + z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) scan = [2**t for t in range(0,9,1)] npts = len(z['text']) - kf = KFold(npts, n_folds=k, random_state=seed) + #kf = KFold(npts, n_folds=k, random_state=seed) + kf = KFold(n_splits=k, random_state=seed) + kf.split(npts) scores = [] for train, test in kf: @@ -74,8 +78,8 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Get the index of the best score s_ind = np.argmax(scanscores) s = scan[s_ind] - print scanscores - print s + print (scanscores) + print (s) # NB (if applicable) if use_nb: @@ -90,7 +94,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) - print scores + print (scores) return scores diff --git a/skipthoughts.py b/skipthoughts.py index c18b517..edc4979 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -130,7 +130,7 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) for k in ds.keys(): if verbose: print(k) - numbatches = len(ds[k]) / batch_size + 1 + numbatches = int(len(ds[k]) / batch_size + 1) for minibatch in range(numbatches): caps = ds[k][minibatch::numbatches] From f73076990c68020e204823ccc6bf11366fb33b8b Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 22:45:35 -0600 Subject: [PATCH 3/8] updated KFold usage --- eval_classification.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index 4faac25..5431ac7 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -29,9 +29,9 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) - kf.split(npts) + scores = [] - for train, test in kf: + for train, test in kf.split(npts): # Split data X_train = features[train] @@ -46,9 +46,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals for s in scan: # Inner KFold - innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) + #innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) + innerkf = KFold(n_splits=k, random_state=seed+1) innerscores = [] - for innertrain, innertest in innerkf: + for innertrain, innertest in innerkf.split(len(X_train)): # Split data X_innertrain = 
X_train[innertrain] From af5d28e6680868f64486da080f7fb5c43d415498 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 00:09:20 -0600 Subject: [PATCH 4/8] making naive bayes svm work --- eval_classification.py | 34 ++++++++++++++++------------------ nbsvm.py | 6 +++--- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index 5431ac7..e19a847 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -2,7 +2,7 @@ import numpy as np #import sys -import nbsvm +#import nbsvm #import dataset_handler import importlib st = importlib.import_module("skip-thoughts") @@ -13,7 +13,7 @@ #from sklearn.cross_validation import KFold from sklearn.model_selection import KFold -def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): +def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): """ Evaluate features with nested K-fold cross validation Outer loop: Held-out evaluation @@ -25,31 +25,29 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - scan = [2**t for t in range(0,9,1)] - npts = len(z['text']) + scan = [2**t for t in range(0,3,1)] + #npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) scores = [] - for train, test in kf.split(npts): + for train_index, test_index in kf.split(features): # Split data - X_train = features[train] - y_train = z['labels'][train] - X_test = features[test] - y_test = z['labels'][test] + X_train, y_train = features[train_index], z['labels'][train_index] + X_test, y_test = features[test_index], z['labels'][test_index] - Xraw = [z['text'][i] for i in train] - Xraw_test = [z['text'][i] for i in test] + + Xraw = [z['text'][i] for i in train_index] + Xraw_test = [z['text'][i] for i in test_index] scanscores = [] for s in scan: # Inner KFold - #innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) innerkf = KFold(n_splits=k, random_state=seed+1) innerscores = [] - for innertrain, innertest in innerkf.split(len(X_train)): + for innertrain, innertest in innerkf.split(X_train): # Split data X_innertrain = X_train[innertrain] @@ -107,11 +105,11 @@ def compute_nb(X, y, Z): labels = [int(t) for t in y] ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] - poscounts = nbsvm.build_dict(ptrain, [1,2]) - negcounts = nbsvm.build_dict(ntrain, [1,2]) - dic, r = nbsvm.compute_ratio(poscounts, negcounts) - trainX = nbsvm.process_text(X, dic, r, [1,2]) - devX = nbsvm.process_text(Z, dic, r, [1,2]) + poscounts = st.nbsvm.build_dict(ptrain, [1,2]) + negcounts = st.nbsvm.build_dict(ntrain, [1,2]) + dic, r = st.nbsvm.compute_ratio(poscounts, negcounts) + trainX = st.nbsvm.process_text(X, dic, r, [1,2]) + devX = st.nbsvm.process_text(Z, dic, r, [1,2]) return trainX, devX diff --git a/nbsvm.py b/nbsvm.py index 30670dd..4f59c2b 100644 --- a/nbsvm.py +++ b/nbsvm.py @@ -1,8 +1,8 @@ # Naive-Bayes features # Derived from https://github.com/mesnilgr/nbsvm -import os -import pdb +#import os +#import pdb import numpy as np from collections import Counter from scipy.sparse import lil_matrix @@ -26,7 +26,7 @@ def build_dict(X, grams): def compute_ratio(poscounts, negcounts, alpha=1): - alltokens = list(set(poscounts.keys() + negcounts.keys())) + alltokens = list(set(list(poscounts.keys()) + 
list(negcounts.keys()))) dic = dict((t, i) for i, t in enumerate(alltokens)) d = len(dic) p, q = np.ones(d) * alpha , np.ones(d) * alpha From 14cdcb7029d5c42b01d8bff660b5ad1c9754da6c Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 01:07:20 -0600 Subject: [PATCH 5/8] added option ACLIMBD to read the data from the 2011 Maas et al paper Learning Word Vectors for Sentiment Analysis --- dataset_handler.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 847ba9b..7921b11 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -2,12 +2,14 @@ import numpy as np from numpy.random import RandomState -import os.path +from os.path import join, isfile +from os import listdir + def load_data(encoder, name, loc='./data/', seed=1234): """ - Load one of MR, CR, SUBJ or MPQA + Load one of MR, CR, SUBJ or MPQA, ACLIMBD """ z = {} if name == 'MR': @@ -18,6 +20,8 @@ def load_data(encoder, name, loc='./data/', seed=1234): pos, neg = load_cr(loc=loc) elif name == 'MPQA': pos, neg = load_mpqa(loc=loc) + elif name == 'ACLIMBD': + pos, neg = load_aclimbd(loc=loc) labels = compute_labels(pos, neg) text, labels = shuffle_data(pos+neg, labels, seed=seed) @@ -33,14 +37,34 @@ def load_rt(loc='./data/'): Load the MR dataset """ pos, neg = [], [] - with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f: + with open(join(loc, 'rt-polarity.pos'), 'rb') as f: for line in f: pos.append(line.decode('latin-1').strip()) - with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f: + with open(join(loc, 'rt-polarity.neg'), 'rb') as f: for line in f: neg.append(line.decode('latin-1').strip()) return pos, neg +def load_aclimbd(loc='./data/'): + """ + Load the acl imbd dataset + """ + pos, neg = [], [] + pos_path = join(loc, 'pos') + onlyfiles = [f for f in listdir(pos_path) if isfile(join(pos_path, f))] + for f_name in onlyfiles: + with open(join(pos_path,f_name), 'rb') as f: + for line in f: + pos.append(line.decode('latin-1').strip()) + + neg_path = join(loc, 'neg') + onlyfiles = [f for f in listdir(neg_path) if isfile(join(neg_path, f))] + for f_name in onlyfiles: + with open(join(neg_path, f_name), 'rb') as f: + for line in f: + neg.append(line.decode('latin-1').strip()) + return pos, neg + def load_subj(loc='./data/'): """ From 4a59ad07088d13c8cf1b3c0695460a2e8ec8da88 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 16:32:32 -0600 Subject: [PATCH 6/8] saving sentence embedding to disk --- dataset_handler.py | 30 ++++++++++++++++++++++++--- eval_classification.py | 46 +++++++++++++++++++++++++++++++++++------- skipthoughts.py | 14 ++++++++++++- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 7921b11..78b3ce3 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -2,6 +2,9 @@ import numpy as np from numpy.random import RandomState +from nltk import sent_tokenize +from nltk.tokenize import word_tokenize +import string from os.path import join, isfile from os import listdir @@ -28,7 +31,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): z['text'] = text z['labels'] = labels print( 'Computing skip-thought vectors...') - features = encoder.encode(text, verbose=False) + features = encoder.encode(text, verbose=False, use_eos=False) return z, features @@ -55,14 +58,35 @@ def load_aclimbd(loc='./data/'): for f_name in onlyfiles: with open(join(pos_path,f_name), 'rb') as f: for line in f: - 
pos.append(line.decode('latin-1').strip()) + l = line.decode('latin-1').strip() + sentences = sent_tokenize(l) + for s in sentences: + tokens = word_tokenize(s) + tokens = [w.lower() for w in tokens] + table = str.maketrans(',', ' ', '!?@#%&*"\'') + words = [w.translate(table) for w in tokens] + # remove remaining tokens that are not alphabetic + sentence = " ".join(words) + sent = sentence.split() + pos.append(" ".join(sent)) neg_path = join(loc, 'neg') onlyfiles = [f for f in listdir(neg_path) if isfile(join(neg_path, f))] for f_name in onlyfiles: with open(join(neg_path, f_name), 'rb') as f: for line in f: - neg.append(line.decode('latin-1').strip()) + l = line.decode('latin-1').strip() + sentences = sent_tokenize(l) + for s in sentences: + tokens = word_tokenize(s) + tokens = [w.lower() for w in tokens] + table = str.maketrans(',', ' ', '!?@#%&*"\'') + words = [w.translate(table) for w in tokens] + # remove remaining tokens that are not alphabetic + sentence = " ".join(words) + sent = sentence.split() + neg.append(" ".join(sent)) + return pos, neg diff --git a/eval_classification.py b/eval_classification.py index e19a847..4eb6eef 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,16 +1,17 @@ # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) +from time import process_time, time import numpy as np -#import sys -#import nbsvm -#import dataset_handler +import pickle +from joblib import dump, load import importlib +from os import getcwd +from os.path import join, isfile st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack from sklearn.linear_model import LogisticRegression -#from sklearn.cross_validation import KFold from sklearn.model_selection import KFold def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): @@ -24,12 +25,23 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False """ # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - + + time_stamp = time.strftime("%Y%m%d-%H%M%S") + file_name = 'sentence_embeddings_' + time_stamp + '.dat' + file_name_full_path = join(loc, 'skip-thoughts', 'data', file_name) + print("Saving embeddings to file {0}".format(file_name_full_path)) + with open(file_name_full_path, 'wb') as f: + pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) + scan = [2**t for t in range(0,3,1)] #npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) + start_time = process_time() + print("Started 'eval_nested_kfold'".format(start_time)) + scores = [] for train_index, test_index in kf.split(features): @@ -89,12 +101,19 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Train classifier clf = LogisticRegression(C=s) clf.fit(X_train, y_train) - + + cwd = getcwd() + dump(clf, join(cwd,'skip-thoughts/models', 'best_logit.joblib.gz')) + # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) print (scores) + end_time = process_time() + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format(end_time - start_time)) + return scores @@ -113,4 +132,17 @@ def compute_nb(X, y, Z): return trainX, devX - +def eval_test_data(encoder, name, loc='./data/'): + """ + load previously saved logistic regression model to predict on test data. 
+ Only works on ACLIMBD, because it has train and test data stored separately. + """ + acc = 0.0 + if name == 'ACLIMBD': + full_path_model_file = join(loc, 'skip-thoughts', 'models', 'best_logit.joblib.gz') + if isfile(full_path_model_file): + full_path_test = join(loc, 'skip-thoughts', 'data', 'aclImdb', 'test') + z, features = st.dataset_handler.load_data(encoder, name, loc=full_path_test) + clf = load(full_path_model_file) + acc = clf.score(features, z['labels']) + return acc \ No newline at end of file diff --git a/skipthoughts.py b/skipthoughts.py index edc4979..aef6396 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -1,7 +1,8 @@ ''' Skip-thought vectors ''' -import os +from time import process_time +import datetime import warnings warnings.filterwarnings("ignore") @@ -110,6 +111,10 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) """ Encode sentences in the list X. Each entry will return a vector """ + start_time = process_time() + + print(datetime.datetime.now().strftime("Started 'encode' at %Y-%m-%d %H:%M:%S")) + # first, do preprocessing X = preprocess(X) @@ -167,6 +172,13 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) bfeatures[c] = bff[ind] features = numpy.c_[ufeatures, bfeatures] + + end_time = process_time() + + + print(datetime.datetime.now().strftime("Finished 'encode' at %Y-%m-%d %H:%M:%S")) + + print("Elapsed time in seconds for encoding: {0:4.2f}".format(end_time - start_time)) return features From bb7bb8b720e837fc55a45789795e7dd599547c56 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 19:57:15 -0600 Subject: [PATCH 7/8] fixing path to save features and labels --- dataset_handler.py | 3 +-- eval_classification.py | 8 +++++--- skipthoughts.py | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 78b3ce3..b429314 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -4,7 +4,6 @@ from numpy.random import RandomState from nltk import sent_tokenize from nltk.tokenize import word_tokenize -import string from os.path import join, isfile from os import listdir @@ -31,7 +30,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): z['text'] = text z['labels'] = labels print( 'Computing skip-thought vectors...') - features = encoder.encode(text, verbose=False, use_eos=False) + features = encoder.encode(text, verbose=False, use_eos=True) return z, features diff --git a/eval_classification.py b/eval_classification.py index 4eb6eef..ae0edee 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,6 +1,7 @@ # Experiment scripts for binary classification benchmarks (e.g. 
MR, CR, MPQA, SUBJ) -from time import process_time, time +from time import process_time +import time import numpy as np import pickle from joblib import dump, load @@ -26,9 +27,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - time_stamp = time.strftime("%Y%m%d-%H%M%S") + time_stamp = time.strftime("%Y%m%d_%H%M%S") file_name = 'sentence_embeddings_' + time_stamp + '.dat' - file_name_full_path = join(loc, 'skip-thoughts', 'data', file_name) + cwd = getcwd() + file_name_full_path = join(cwd, 'skip-thoughts', 'data', file_name) print("Saving embeddings to file {0}".format(file_name_full_path)) with open(file_name_full_path, 'wb') as f: pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/skipthoughts.py b/skipthoughts.py index aef6396..bbabbd8 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -11,7 +11,6 @@ import pickle as pkl import numpy -import copy import nltk from collections import OrderedDict, defaultdict From 25316c1c221c4a32713fea505afa5f129eca6371 Mon Sep 17 00:00:00 2001 From: padames Date: Tue, 9 Jun 2020 09:12:41 -0600 Subject: [PATCH 8/8] attempting to evaluate test data, missing encoding of test features --- eval_classification.py | 166 ++++++++++++++++++++++++++++++++++------- 1 file changed, 140 insertions(+), 26 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index ae0edee..5010d77 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -8,6 +8,7 @@ import importlib from os import getcwd from os.path import join, isfile + st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack @@ -15,6 +16,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.model_selection import KFold + def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): """ Evaluate features with nested K-fold cross validation @@ -26,7 +28,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False """ # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - + time_stamp = time.strftime("%Y%m%d_%H%M%S") file_name = 'sentence_embeddings_' + time_stamp + '.dat' cwd = getcwd() @@ -35,15 +37,15 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False with open(file_name_full_path, 'wb') as f: pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) - - scan = [2**t for t in range(0,3,1)] - #npts = len(z['text']) - #kf = KFold(npts, n_folds=k, random_state=seed) + + scan = [2 ** t for t in range(0, 3, 1)] + # npts = len(z['text']) + # kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) - + start_time = process_time() print("Started 'eval_nested_kfold'".format(start_time)) - + scores = [] for train_index, test_index in kf.split(features): @@ -51,7 +53,6 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False X_train, y_train = features[train_index], z['labels'][train_index] X_test, y_test = features[test_index], z['labels'][test_index] - Xraw = [z['text'][i] for i in train_index] Xraw_test = [z['text'][i] for i in test_index] @@ -59,10 +60,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False for s in scan: # Inner KFold - innerkf = KFold(n_splits=k, random_state=seed+1) + innerkf = 
KFold(n_splits=k, random_state=seed + 1) innerscores = [] for innertrain, innertest in innerkf.split(X_train): - + # Split data X_innertrain = X_train[innertrain] y_innertrain = y_train[innertrain] @@ -83,7 +84,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False clf.fit(X_innertrain, y_innertrain) acc = clf.score(X_innertest, y_innertest) innerscores.append(acc) - print (s, acc) + print(s, acc) # Append mean score scanscores.append(np.mean(innerscores)) @@ -91,30 +92,31 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Get the index of the best score s_ind = np.argmax(scanscores) s = scan[s_ind] - print (scanscores) - print (s) - + print(scanscores) + print(s) + # NB (if applicable) if use_nb: NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) X_train = hstack((X_train, NBtrain)) X_test = hstack((X_test, NBtest)) - + # Train classifier clf = LogisticRegression(C=s) clf.fit(X_train, y_train) - + cwd = getcwd() - dump(clf, join(cwd,'skip-thoughts/models', 'best_logit.joblib.gz')) - + dump(clf, join(cwd, 'skip-thoughts/models', 'best_logit.joblib.gz')) + # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) - print (scores) + print(scores) end_time = process_time() - - print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format(end_time - start_time)) + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format( + end_time - start_time)) return scores @@ -126,11 +128,11 @@ def compute_nb(X, y, Z): labels = [int(t) for t in y] ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] - poscounts = st.nbsvm.build_dict(ptrain, [1,2]) - negcounts = st.nbsvm.build_dict(ntrain, [1,2]) + poscounts = st.nbsvm.build_dict(ptrain, [1, 2]) + negcounts = st.nbsvm.build_dict(ntrain, [1, 2]) dic, r = st.nbsvm.compute_ratio(poscounts, negcounts) - trainX = st.nbsvm.process_text(X, dic, r, [1,2]) - devX = st.nbsvm.process_text(Z, dic, r, [1,2]) + trainX = st.nbsvm.process_text(X, dic, r, [1, 2]) + devX = st.nbsvm.process_text(Z, dic, r, [1, 2]) return trainX, devX @@ -147,4 +149,116 @@ def eval_test_data(encoder, name, loc='./data/'): z, features = st.dataset_handler.load_data(encoder, name, loc=full_path_test) clf = load(full_path_model_file) acc = clf.score(features, z['labels']) - return acc \ No newline at end of file + return acc + + +def eval_test_data_from_saved_embeddings_classification(embeddings_file_name, + k=3, + use_nb=False, + embeddings_loc='./data/', + model_loc='./data/', + seed=1234): + """ + load previously saved logistic regression model to predict on test data. + Only works on ACLIMBD, because it has train and test data stored separately. 
+ """ + acc = 0.0 + full_path_embeddings = join(embeddings_loc, 'data', embeddings_file_name) + model_full_path = '' + if not isfile(full_path_embeddings): + print("Embeddings file not found") + exit(1) + else: + print("Loading embeddings from file {0}".format(full_path_embeddings)) + with open(full_path_embeddings, 'rb') as f: + z = pickle.load(f) + features = pickle.load(f) + # pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) + # pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) + + scan = [2 ** t for t in range(0, 3, 1)] + + kf = KFold(n_splits=k, random_state=seed) + + start_time = process_time() + print("Started 'eval_nested_kfold'".format(start_time)) + + scores = [] + for train_index, test_index in kf.split(features): + + # Split data + X_train, y_train = features[train_index], z['labels'][train_index] + X_test, y_test = features[test_index], z['labels'][test_index] + + Xraw = [z['text'][i] for i in train_index] + Xraw_test = [z['text'][i] for i in test_index] + + scan_scores = [] + for s in scan: + + # Inner KFold + innerkf = KFold(n_splits=k, random_state=seed + 1) + innerscores = [] + for innertrain, innertest in innerkf.split(X_train): + + # Split data + X_innertrain = X_train[innertrain] + y_innertrain = y_train[innertrain] + X_innertest = X_train[innertest] + y_innertest = y_train[innertest] + + Xraw_innertrain = [Xraw[i] for i in innertrain] + Xraw_innertest = [Xraw[i] for i in innertest] + + # NB (if applicable) + if use_nb: + NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest) + X_innertrain = hstack((X_innertrain, NBtrain)) + X_innertest = hstack((X_innertest, NBtest)) + + # Train classifier + clf = LogisticRegression(C=s) + clf.fit(X_innertrain, y_innertrain) + acc = clf.score(X_innertest, y_innertest) + innerscores.append(acc) + print(s, acc) + + # Append mean score + scan_scores.append(np.mean(innerscores)) + + # Get the index of the best score + s_ind = np.argmax(scan_scores) + s = scan[s_ind] + print(scan_scores) + print(s) + + # NB (if applicable) + if use_nb: + NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) + X_train = hstack((X_train, NBtrain)) + X_test = hstack((X_test, NBtest)) + + # Train classifier + clf = LogisticRegression(C=s) + clf.fit(X_train, y_train) + + time_stamp = time.strftime("%Y%m%d_%H%M%S") + model_file_name = 'best_logit.' + time_stamp + '.joblib.gz' + model_full_path = join(model_loc, model_file_name) + + dump(clf, model_full_path) + + # Evaluate + acc = clf.score(X_test, y_test) + scores.append(acc) + print(scores) + + end_time = process_time() + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format( + end_time - start_time)) + + clf = load(model_full_path) + acc = clf.score(features, z['labels']) + + return acc
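
Two reference sketches follow; neither is part of the patches above. First, the reason patch 1 adds allow_pickle=True and encoding='latin1' in load_tables: utable.npy and btable.npy were saved under Python 2 and hold pickled object arrays, which numpy on Python 3 refuses to load without an explicit opt-in and a byte-string encoding. A minimal sketch under that assumption, reusing the paths set at the top of skipthoughts.py (the encoding argument to pickle.load is an addition here, not something the patch itself passes):

import pickle
import numpy

path_to_models = 'skip-thoughts/data/'
path_to_tables = 'skip-thoughts/tables/'

# Object arrays pickled under Python 2 need an explicit opt-in and encoding on Python 3
utable = numpy.load(path_to_tables + 'utable.npy', allow_pickle=True, encoding='latin1')
btable = numpy.load(path_to_tables + 'btable.npy', allow_pickle=True, encoding='latin1')

# The model option files were written with cPickle; the stdlib pickle reads them, and
# encoding='latin1' is the safe choice if they contain Python 2 byte strings
with open(path_to_models + 'uni_skip.npz.pkl', 'rb') as f:
    uoptions = pickle.load(f, encoding='latin1')

words = []
with open(path_to_tables + 'dictionary.txt', 'rb') as f:
    for line in f:
        words.append(line.decode('utf-8').strip())

Second, a self-contained sketch of the nested K-fold evaluation that eval_nested_kfold converges on across patches 2-8, written against sklearn.model_selection.KFold, the replacement for the removed sklearn.cross_validation.KFold. The random features, labels, fold counts, and seeds are placeholders rather than values from the repository; only the C grid mirrors the patched code. One caveat: KFold honours random_state only when shuffle=True, and the patched calls KFold(n_splits=k, random_state=seed) leave shuffle at its default of False, which recent scikit-learn releases reject and older ones warn about or ignore.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

features = np.random.randn(200, 64)         # placeholder for encoder.encode(text)
labels = np.random.randint(0, 2, size=200)  # placeholder for z['labels']
scan = [2 ** t for t in range(0, 3)]        # candidate C values, as in the patch

# n_splits is given at construction; the data only goes to .split()
outer = KFold(n_splits=5, shuffle=True, random_state=1234)

scores = []
for train_index, test_index in outer.split(features):
    X_train, y_train = features[train_index], labels[train_index]
    X_test, y_test = features[test_index], labels[test_index]

    # Inner loop: pick C by mean accuracy across the inner folds
    inner = KFold(n_splits=5, shuffle=True, random_state=1235)
    mean_scores = []
    for C in scan:
        fold_scores = []
        for inner_train, inner_test in inner.split(X_train):
            clf = LogisticRegression(C=C, max_iter=1000)
            clf.fit(X_train[inner_train], y_train[inner_train])
            fold_scores.append(clf.score(X_train[inner_test], y_train[inner_test]))
        mean_scores.append(np.mean(fold_scores))

    # Refit on the full outer training split with the best C and score the held-out fold
    best_C = scan[int(np.argmax(mean_scores))]
    clf = LogisticRegression(C=best_C, max_iter=1000)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

print(np.mean(scores))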