From 74c17ae426b4cd6a16c057462ce624cd4970e2dd Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 19:27:20 -0600 Subject: [PATCH 1/8] updated to python 3.7 --- .gitignore | 3 +++ skipthoughts.py | 55 ++++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 36ad723..bdde598 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ target/ # Downloaded data files data/ + +# Downloaded tables +tables/ diff --git a/skipthoughts.py b/skipthoughts.py index 1a6011d..c18b517 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -2,11 +2,13 @@ Skip-thought vectors ''' import os +import warnings +warnings.filterwarnings("ignore") import theano import theano.tensor as tensor -import cPickle as pkl +import pickle as pkl import numpy import copy import nltk @@ -20,8 +22,10 @@ #-----------------------------------------------------------------------------# # Specify model and table locations here #-----------------------------------------------------------------------------# -path_to_models = '/u/rkiros/public_html/models/' -path_to_tables = '/u/rkiros/public_html/models/' +#path_to_models = '/u/rkiros/public_html/models/' +#path_to_tables = '/u/rkiros/public_html/models/' +path_to_models = 'skip-thoughts/data/' +path_to_tables = 'skip-thoughts/tables/' #-----------------------------------------------------------------------------# path_to_umodel = path_to_models + 'uni_skip.npz' @@ -33,7 +37,7 @@ def load_model(): Load the model with saved tables """ # Load model options - print 'Loading model parameters...' + print( 'Loading model parameters...') with open('%s.pkl'%path_to_umodel, 'rb') as f: uoptions = pkl.load(f) with open('%s.pkl'%path_to_bmodel, 'rb') as f: @@ -48,18 +52,18 @@ def load_model(): btparams = init_tparams(bparams) # Extractor functions - print 'Compiling encoders...' + print('Compiling encoders...') embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') # Tables - print 'Loading tables...' + print('Loading tables...') utable, btable = load_tables() # Store everything we need in a dictionary - print 'Packing up...' + print( 'Packing up...') model = {} model['uoptions'] = uoptions model['boptions'] = boptions @@ -76,8 +80,8 @@ def load_tables(): Load the tables """ words = [] - utable = numpy.load(path_to_tables + 'utable.npy') - btable = numpy.load(path_to_tables + 'btable.npy') + utable = numpy.load(path_to_tables + 'utable.npy', allow_pickle=True, encoding='latin1') + btable = numpy.load(path_to_tables + 'btable.npy', allow_pickle=True, encoding='latin1') f = open(path_to_tables + 'dictionary.txt', 'rb') for line in f: words.append(line.decode('utf-8').strip()) @@ -93,13 +97,13 @@ class Encoder(object): """ def __init__(self, model): - self._model = model + self._model = model def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): - """ - Encode sentences in the list X. Each entry will return a vector - """ - return encode(self._model, X, use_norm, verbose, batch_size, use_eos) + """ + Encode sentences in the list X. 
Each entry will return a vector + """ + return encode(self._model, X, use_norm, verbose, batch_size, use_eos) def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): @@ -125,7 +129,7 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) # Get features. This encodes by length, in order to avoid wasting computation for k in ds.keys(): if verbose: - print k + print(k) numbatches = len(ds[k]) / batch_size + 1 for minibatch in range(numbatches): caps = ds[k][minibatch::numbatches] @@ -194,10 +198,10 @@ def nn(model, text, vectors, query, k=5): scores = numpy.dot(qf, vectors.T).flatten() sorted_args = numpy.argsort(scores)[::-1] sentences = [text[a] for a in sorted_args[:k]] - print 'QUERY: ' + query - print 'NEAREST: ' + print('QUERY: {}'.format(query)) + print( 'NEAREST: ') for i, s in enumerate(sentences): - print s, sorted_args[i] + print("{} {}".format(s, sorted_args[i])) def word_features(table): @@ -221,10 +225,10 @@ def nn_words(table, wordvecs, query, k=10): scores = numpy.dot(qf, wordvecs.T).flatten() sorted_args = numpy.argsort(scores)[::-1] words = [keys[a] for a in sorted_args[:k]] - print 'QUERY: ' + query - print 'NEAREST: ' + print('QUERY: '.format(query)) + print('NEAREST: ') for i, w in enumerate(words): - print w + print(w) def _p(pp, name): @@ -239,7 +243,7 @@ def init_tparams(params): initialize Theano shared variables according to the initial parameters """ tparams = OrderedDict() - for kk, pp in params.iteritems(): + for kk, pp in params.items(): tparams[kk] = theano.shared(params[kk], name=kk) return tparams @@ -249,7 +253,7 @@ def load_params(path, params): load parameters """ pp = numpy.load(path) - for kk, vv in params.iteritems(): + for kk, vv in params.items(): if kk not in pp: warnings.warn('%s is not in the archive'%kk) continue @@ -340,6 +344,7 @@ def build_encoder_bi(tparams, options): # some utilities def ortho_weight(ndim): W = numpy.random.randn(ndim, ndim) + #W = numpy.random.Generator.standard_normal(size=(ndim, ndim)) u, s, v = numpy.linalg.svd(W) return u.astype('float32') @@ -350,7 +355,9 @@ def norm_weight(nin,nout=None, scale=0.1, ortho=True): if nout == nin and ortho: W = ortho_weight(nin) else: - W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) + W = numpy.random.uniform (low=-scale, high=scale, size=(nin, nout)) + #W = numpy.random.RandomState.uniform(low=-scale, high=scale, size=(nin, nout)) + return W.astype('float32') From 8bc8884c1d6df7293d17c7f2725b95911911411d Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 22:39:24 -0600 Subject: [PATCH 2/8] making the github folder a module --- __init__.py | 5 +++++ dataset_handler.py | 2 +- eval_classification.py | 22 +++++++++++++--------- skipthoughts.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/__init__.py b/__init__.py index e69de29..c01a960 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1,5 @@ +from . import skipthoughts +from . import dataset_handler +from . import nbsvm +from . import eval_classification + diff --git a/dataset_handler.py b/dataset_handler.py index 897e1f4..847ba9b 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -23,7 +23,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): text, labels = shuffle_data(pos+neg, labels, seed=seed) z['text'] = text z['labels'] = labels - print 'Computing skip-thought vectors...' 
+ print( 'Computing skip-thought vectors...') features = encoder.encode(text, verbose=False) return z, features diff --git a/eval_classification.py b/eval_classification.py index a3adc3d..4faac25 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,15 +1,17 @@ # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) import numpy as np -import sys +#import sys import nbsvm -import dataset_handler +#import dataset_handler +import importlib +st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack from sklearn.linear_model import LogisticRegression -from sklearn.cross_validation import KFold - +#from sklearn.cross_validation import KFold +from sklearn.model_selection import KFold def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): """ @@ -21,11 +23,13 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA' """ # Load the dataset and extract features - z, features = dataset_handler.load_data(encoder, name, loc=loc, seed=seed) + z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) scan = [2**t for t in range(0,9,1)] npts = len(z['text']) - kf = KFold(npts, n_folds=k, random_state=seed) + #kf = KFold(npts, n_folds=k, random_state=seed) + kf = KFold(n_splits=k, random_state=seed) + kf.split(npts) scores = [] for train, test in kf: @@ -74,8 +78,8 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Get the index of the best score s_ind = np.argmax(scanscores) s = scan[s_ind] - print scanscores - print s + print (scanscores) + print (s) # NB (if applicable) if use_nb: @@ -90,7 +94,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) - print scores + print (scores) return scores diff --git a/skipthoughts.py b/skipthoughts.py index c18b517..edc4979 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -130,7 +130,7 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) for k in ds.keys(): if verbose: print(k) - numbatches = len(ds[k]) / batch_size + 1 + numbatches = int(len(ds[k]) / batch_size + 1) for minibatch in range(numbatches): caps = ds[k][minibatch::numbatches] From f73076990c68020e204823ccc6bf11366fb33b8b Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Sun, 7 Jun 2020 22:45:35 -0600 Subject: [PATCH 3/8] updated KFold usage --- eval_classification.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index 4faac25..5431ac7 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -29,9 +29,9 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) - kf.split(npts) + scores = [] - for train, test in kf: + for train, test in kf.split(npts): # Split data X_train = features[train] @@ -46,9 +46,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals for s in scan: # Inner KFold - innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) + #innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) + innerkf = KFold(n_splits=k, random_state=seed+1) innerscores = [] - for innertrain, innertest in innerkf: + for innertrain, innertest in innerkf.split(len(X_train)): # Split data X_innertrain = 
X_train[innertrain] From af5d28e6680868f64486da080f7fb5c43d415498 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 00:09:20 -0600 Subject: [PATCH 4/8] making naive bayes svm work --- eval_classification.py | 34 ++++++++++++++++------------------ nbsvm.py | 6 +++--- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index 5431ac7..e19a847 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -2,7 +2,7 @@ import numpy as np #import sys -import nbsvm +#import nbsvm #import dataset_handler import importlib st = importlib.import_module("skip-thoughts") @@ -13,7 +13,7 @@ #from sklearn.cross_validation import KFold from sklearn.model_selection import KFold -def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False): +def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): """ Evaluate features with nested K-fold cross validation Outer loop: Held-out evaluation @@ -25,31 +25,29 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - scan = [2**t for t in range(0,9,1)] - npts = len(z['text']) + scan = [2**t for t in range(0,3,1)] + #npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) scores = [] - for train, test in kf.split(npts): + for train_index, test_index in kf.split(features): # Split data - X_train = features[train] - y_train = z['labels'][train] - X_test = features[test] - y_test = z['labels'][test] + X_train, y_train = features[train_index], z['labels'][train_index] + X_test, y_test = features[test_index], z['labels'][test_index] - Xraw = [z['text'][i] for i in train] - Xraw_test = [z['text'][i] for i in test] + + Xraw = [z['text'][i] for i in train_index] + Xraw_test = [z['text'][i] for i in test_index] scanscores = [] for s in scan: # Inner KFold - #innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1) innerkf = KFold(n_splits=k, random_state=seed+1) innerscores = [] - for innertrain, innertest in innerkf.split(len(X_train)): + for innertrain, innertest in innerkf.split(X_train): # Split data X_innertrain = X_train[innertrain] @@ -107,11 +105,11 @@ def compute_nb(X, y, Z): labels = [int(t) for t in y] ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] - poscounts = nbsvm.build_dict(ptrain, [1,2]) - negcounts = nbsvm.build_dict(ntrain, [1,2]) - dic, r = nbsvm.compute_ratio(poscounts, negcounts) - trainX = nbsvm.process_text(X, dic, r, [1,2]) - devX = nbsvm.process_text(Z, dic, r, [1,2]) + poscounts = st.nbsvm.build_dict(ptrain, [1,2]) + negcounts = st.nbsvm.build_dict(ntrain, [1,2]) + dic, r = st.nbsvm.compute_ratio(poscounts, negcounts) + trainX = st.nbsvm.process_text(X, dic, r, [1,2]) + devX = st.nbsvm.process_text(Z, dic, r, [1,2]) return trainX, devX diff --git a/nbsvm.py b/nbsvm.py index 30670dd..4f59c2b 100644 --- a/nbsvm.py +++ b/nbsvm.py @@ -1,8 +1,8 @@ # Naive-Bayes features # Derived from https://github.com/mesnilgr/nbsvm -import os -import pdb +#import os +#import pdb import numpy as np from collections import Counter from scipy.sparse import lil_matrix @@ -26,7 +26,7 @@ def build_dict(X, grams): def compute_ratio(poscounts, negcounts, alpha=1): - alltokens = list(set(poscounts.keys() + negcounts.keys())) + alltokens = list(set(list(poscounts.keys()) + 
list(negcounts.keys()))) dic = dict((t, i) for i, t in enumerate(alltokens)) d = len(dic) p, q = np.ones(d) * alpha , np.ones(d) * alpha From 14cdcb7029d5c42b01d8bff660b5ad1c9754da6c Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 01:07:20 -0600 Subject: [PATCH 5/8] added option ACLIMBD to read the data from the 2011 Maas et al paper Learning Word Vectors for Sentiment Analysis --- dataset_handler.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 847ba9b..7921b11 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -2,12 +2,14 @@ import numpy as np from numpy.random import RandomState -import os.path +from os.path import join, isfile +from os import listdir + def load_data(encoder, name, loc='./data/', seed=1234): """ - Load one of MR, CR, SUBJ or MPQA + Load one of MR, CR, SUBJ or MPQA, ACLIMBD """ z = {} if name == 'MR': @@ -18,6 +20,8 @@ def load_data(encoder, name, loc='./data/', seed=1234): pos, neg = load_cr(loc=loc) elif name == 'MPQA': pos, neg = load_mpqa(loc=loc) + elif name == 'ACLIMBD': + pos, neg = load_aclimbd(loc=loc) labels = compute_labels(pos, neg) text, labels = shuffle_data(pos+neg, labels, seed=seed) @@ -33,14 +37,34 @@ def load_rt(loc='./data/'): Load the MR dataset """ pos, neg = [], [] - with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f: + with open(join(loc, 'rt-polarity.pos'), 'rb') as f: for line in f: pos.append(line.decode('latin-1').strip()) - with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f: + with open(join(loc, 'rt-polarity.neg'), 'rb') as f: for line in f: neg.append(line.decode('latin-1').strip()) return pos, neg +def load_aclimbd(loc='./data/'): + """ + Load the acl imbd dataset + """ + pos, neg = [], [] + pos_path = join(loc, 'pos') + onlyfiles = [f for f in listdir(pos_path) if isfile(join(pos_path, f))] + for f_name in onlyfiles: + with open(join(pos_path,f_name), 'rb') as f: + for line in f: + pos.append(line.decode('latin-1').strip()) + + neg_path = join(loc, 'neg') + onlyfiles = [f for f in listdir(neg_path) if isfile(join(neg_path, f))] + for f_name in onlyfiles: + with open(join(neg_path, f_name), 'rb') as f: + for line in f: + neg.append(line.decode('latin-1').strip()) + return pos, neg + def load_subj(loc='./data/'): """ From 4a59ad07088d13c8cf1b3c0695460a2e8ec8da88 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 16:32:32 -0600 Subject: [PATCH 6/8] saving sentence embedding to disk --- dataset_handler.py | 30 ++++++++++++++++++++++++--- eval_classification.py | 46 +++++++++++++++++++++++++++++++++++------- skipthoughts.py | 14 ++++++++++++- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 7921b11..78b3ce3 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -2,6 +2,9 @@ import numpy as np from numpy.random import RandomState +from nltk import sent_tokenize +from nltk.tokenize import word_tokenize +import string from os.path import join, isfile from os import listdir @@ -28,7 +31,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): z['text'] = text z['labels'] = labels print( 'Computing skip-thought vectors...') - features = encoder.encode(text, verbose=False) + features = encoder.encode(text, verbose=False, use_eos=False) return z, features @@ -55,14 +58,35 @@ def load_aclimbd(loc='./data/'): for f_name in onlyfiles: with open(join(pos_path,f_name), 'rb') as f: for line in f: - 
pos.append(line.decode('latin-1').strip()) + l = line.decode('latin-1').strip() + sentences = sent_tokenize(l) + for s in sentences: + tokens = word_tokenize(s) + tokens = [w.lower() for w in tokens] + table = str.maketrans(',', ' ', '!?@#%&*"\'') + words = [w.translate(table) for w in tokens] + # remove remaining tokens that are not alphabetic + sentence = " ".join(words) + sent = sentence.split() + pos.append(" ".join(sent)) neg_path = join(loc, 'neg') onlyfiles = [f for f in listdir(neg_path) if isfile(join(neg_path, f))] for f_name in onlyfiles: with open(join(neg_path, f_name), 'rb') as f: for line in f: - neg.append(line.decode('latin-1').strip()) + l = line.decode('latin-1').strip() + sentences = sent_tokenize(l) + for s in sentences: + tokens = word_tokenize(s) + tokens = [w.lower() for w in tokens] + table = str.maketrans(',', ' ', '!?@#%&*"\'') + words = [w.translate(table) for w in tokens] + # remove remaining tokens that are not alphabetic + sentence = " ".join(words) + sent = sentence.split() + neg.append(" ".join(sent)) + return pos, neg diff --git a/eval_classification.py b/eval_classification.py index e19a847..4eb6eef 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,16 +1,17 @@ # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ) +from time import process_time, time import numpy as np -#import sys -#import nbsvm -#import dataset_handler +import pickle +from joblib import dump, load import importlib +from os import getcwd +from os.path import join, isfile st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack from sklearn.linear_model import LogisticRegression -#from sklearn.cross_validation import KFold from sklearn.model_selection import KFold def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): @@ -24,12 +25,23 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False """ # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - + + time_stamp = time.strftime("%Y%m%d-%H%M%S") + file_name = 'sentence_embeddings_' + time_stamp + '.dat' + file_name_full_path = join(loc, 'skip-thoughts', 'data', file_name) + print("Saving embeddings to file {0}".format(file_name_full_path)) + with open(file_name_full_path, 'wb') as f: + pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) + scan = [2**t for t in range(0,3,1)] #npts = len(z['text']) #kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) + start_time = process_time() + print("Started 'eval_nested_kfold'".format(start_time)) + scores = [] for train_index, test_index in kf.split(features): @@ -89,12 +101,19 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Train classifier clf = LogisticRegression(C=s) clf.fit(X_train, y_train) - + + cwd = getcwd() + dump(clf, join(cwd,'skip-thoughts/models', 'best_logit.joblib.gz')) + # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) print (scores) + end_time = process_time() + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format(end_time - start_time)) + return scores @@ -113,4 +132,17 @@ def compute_nb(X, y, Z): return trainX, devX - +def eval_test_data(encoder, name, loc='./data/'): + """ + load previously saved logistic regression model to predict on test data. 
+ Only works on ACLIMBD, because it has train and test data stored separately. + """ + acc = 0.0 + if name == 'ACLIMBD': + full_path_model_file = join(loc, 'skip-thoughts', 'models', 'best_logit.joblib.gz') + if isfile(full_path_model_file): + full_path_test = join(loc, 'skip-thoughts', 'data', 'aclImdb', 'test') + z, features = st.dataset_handler.load_data(encoder, name, loc=full_path_test) + clf = load(full_path_model_file) + acc = clf.score(features, z['labels']) + return acc \ No newline at end of file diff --git a/skipthoughts.py b/skipthoughts.py index edc4979..aef6396 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -1,7 +1,8 @@ ''' Skip-thought vectors ''' -import os +from time import process_time +import datetime import warnings warnings.filterwarnings("ignore") @@ -110,6 +111,10 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) """ Encode sentences in the list X. Each entry will return a vector """ + start_time = process_time() + + print(datetime.datetime.now().strftime("Started 'encode' at %Y-%m-%d %H:%M:%S")) + # first, do preprocessing X = preprocess(X) @@ -167,6 +172,13 @@ def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False) bfeatures[c] = bff[ind] features = numpy.c_[ufeatures, bfeatures] + + end_time = process_time() + + + print(datetime.datetime.now().strftime("Finished 'encode' at %Y-%m-%d %H:%M:%S")) + + print("Elapsed time in seconds for encoding: {0:4.2f}".format(end_time - start_time)) return features From bb7bb8b720e837fc55a45789795e7dd599547c56 Mon Sep 17 00:00:00 2001 From: Pablo Adames Date: Mon, 8 Jun 2020 19:57:15 -0600 Subject: [PATCH 7/8] fixing path to save features and labels --- dataset_handler.py | 3 +-- eval_classification.py | 8 +++++--- skipthoughts.py | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dataset_handler.py b/dataset_handler.py index 78b3ce3..b429314 100644 --- a/dataset_handler.py +++ b/dataset_handler.py @@ -4,7 +4,6 @@ from numpy.random import RandomState from nltk import sent_tokenize from nltk.tokenize import word_tokenize -import string from os.path import join, isfile from os import listdir @@ -31,7 +30,7 @@ def load_data(encoder, name, loc='./data/', seed=1234): z['text'] = text z['labels'] = labels print( 'Computing skip-thought vectors...') - features = encoder.encode(text, verbose=False, use_eos=False) + features = encoder.encode(text, verbose=False, use_eos=True) return z, features diff --git a/eval_classification.py b/eval_classification.py index 4eb6eef..ae0edee 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -1,6 +1,7 @@ # Experiment scripts for binary classification benchmarks (e.g. 
MR, CR, MPQA, SUBJ) -from time import process_time, time +from time import process_time +import time import numpy as np import pickle from joblib import dump, load @@ -26,9 +27,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - time_stamp = time.strftime("%Y%m%d-%H%M%S") + time_stamp = time.strftime("%Y%m%d_%H%M%S") file_name = 'sentence_embeddings_' + time_stamp + '.dat' - file_name_full_path = join(loc, 'skip-thoughts', 'data', file_name) + cwd = getcwd() + file_name_full_path = join(cwd, 'skip-thoughts', 'data', file_name) print("Saving embeddings to file {0}".format(file_name_full_path)) with open(file_name_full_path, 'wb') as f: pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/skipthoughts.py b/skipthoughts.py index aef6396..bbabbd8 100644 --- a/skipthoughts.py +++ b/skipthoughts.py @@ -11,7 +11,6 @@ import pickle as pkl import numpy -import copy import nltk from collections import OrderedDict, defaultdict From 25316c1c221c4a32713fea505afa5f129eca6371 Mon Sep 17 00:00:00 2001 From: padames Date: Tue, 9 Jun 2020 09:12:41 -0600 Subject: [PATCH 8/8] attempting to evaluate test data, missing encoding of test features --- eval_classification.py | 166 ++++++++++++++++++++++++++++++++++------- 1 file changed, 140 insertions(+), 26 deletions(-) diff --git a/eval_classification.py b/eval_classification.py index ae0edee..5010d77 100644 --- a/eval_classification.py +++ b/eval_classification.py @@ -8,6 +8,7 @@ import importlib from os import getcwd from os.path import join, isfile + st = importlib.import_module("skip-thoughts") from scipy.sparse import hstack @@ -15,6 +16,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.model_selection import KFold + def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False): """ Evaluate features with nested K-fold cross validation @@ -26,7 +28,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False """ # Load the dataset and extract features z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed) - + time_stamp = time.strftime("%Y%m%d_%H%M%S") file_name = 'sentence_embeddings_' + time_stamp + '.dat' cwd = getcwd() @@ -35,15 +37,15 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False with open(file_name_full_path, 'wb') as f: pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) - - scan = [2**t for t in range(0,3,1)] - #npts = len(z['text']) - #kf = KFold(npts, n_folds=k, random_state=seed) + + scan = [2 ** t for t in range(0, 3, 1)] + # npts = len(z['text']) + # kf = KFold(npts, n_folds=k, random_state=seed) kf = KFold(n_splits=k, random_state=seed) - + start_time = process_time() print("Started 'eval_nested_kfold'".format(start_time)) - + scores = [] for train_index, test_index in kf.split(features): @@ -51,7 +53,6 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False X_train, y_train = features[train_index], z['labels'][train_index] X_test, y_test = features[test_index], z['labels'][test_index] - Xraw = [z['text'][i] for i in train_index] Xraw_test = [z['text'][i] for i in test_index] @@ -59,10 +60,10 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False for s in scan: # Inner KFold - innerkf = KFold(n_splits=k, random_state=seed+1) + innerkf = 
KFold(n_splits=k, random_state=seed + 1) innerscores = [] for innertrain, innertest in innerkf.split(X_train): - + # Split data X_innertrain = X_train[innertrain] y_innertrain = y_train[innertrain] @@ -83,7 +84,7 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False clf.fit(X_innertrain, y_innertrain) acc = clf.score(X_innertest, y_innertest) innerscores.append(acc) - print (s, acc) + print(s, acc) # Append mean score scanscores.append(np.mean(innerscores)) @@ -91,30 +92,31 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False # Get the index of the best score s_ind = np.argmax(scanscores) s = scan[s_ind] - print (scanscores) - print (s) - + print(scanscores) + print(s) + # NB (if applicable) if use_nb: NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) X_train = hstack((X_train, NBtrain)) X_test = hstack((X_test, NBtest)) - + # Train classifier clf = LogisticRegression(C=s) clf.fit(X_train, y_train) - + cwd = getcwd() - dump(clf, join(cwd,'skip-thoughts/models', 'best_logit.joblib.gz')) - + dump(clf, join(cwd, 'skip-thoughts/models', 'best_logit.joblib.gz')) + # Evaluate acc = clf.score(X_test, y_test) scores.append(acc) - print (scores) + print(scores) end_time = process_time() - - print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format(end_time - start_time)) + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format( + end_time - start_time)) return scores @@ -126,11 +128,11 @@ def compute_nb(X, y, Z): labels = [int(t) for t in y] ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0] ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1] - poscounts = st.nbsvm.build_dict(ptrain, [1,2]) - negcounts = st.nbsvm.build_dict(ntrain, [1,2]) + poscounts = st.nbsvm.build_dict(ptrain, [1, 2]) + negcounts = st.nbsvm.build_dict(ntrain, [1, 2]) dic, r = st.nbsvm.compute_ratio(poscounts, negcounts) - trainX = st.nbsvm.process_text(X, dic, r, [1,2]) - devX = st.nbsvm.process_text(Z, dic, r, [1,2]) + trainX = st.nbsvm.process_text(X, dic, r, [1, 2]) + devX = st.nbsvm.process_text(Z, dic, r, [1, 2]) return trainX, devX @@ -147,4 +149,116 @@ def eval_test_data(encoder, name, loc='./data/'): z, features = st.dataset_handler.load_data(encoder, name, loc=full_path_test) clf = load(full_path_model_file) acc = clf.score(features, z['labels']) - return acc \ No newline at end of file + return acc + + +def eval_test_data_from_saved_embeddings_classification(embeddings_file_name, + k=3, + use_nb=False, + embeddings_loc='./data/', + model_loc='./data/', + seed=1234): + """ + load previously saved logistic regression model to predict on test data. + Only works on ACLIMBD, because it has train and test data stored separately. 
+ """ + acc = 0.0 + full_path_embeddings = join(embeddings_loc, 'data', embeddings_file_name) + model_full_path = '' + if not isfile(full_path_embeddings): + print("Embeddings file not found") + exit(1) + else: + print("Loading embeddings from file {0}".format(full_path_embeddings)) + with open(full_path_embeddings, 'rb') as f: + z = pickle.load(f) + features = pickle.load(f) + # pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL) + # pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL) + + scan = [2 ** t for t in range(0, 3, 1)] + + kf = KFold(n_splits=k, random_state=seed) + + start_time = process_time() + print("Started 'eval_nested_kfold'".format(start_time)) + + scores = [] + for train_index, test_index in kf.split(features): + + # Split data + X_train, y_train = features[train_index], z['labels'][train_index] + X_test, y_test = features[test_index], z['labels'][test_index] + + Xraw = [z['text'][i] for i in train_index] + Xraw_test = [z['text'][i] for i in test_index] + + scan_scores = [] + for s in scan: + + # Inner KFold + innerkf = KFold(n_splits=k, random_state=seed + 1) + innerscores = [] + for innertrain, innertest in innerkf.split(X_train): + + # Split data + X_innertrain = X_train[innertrain] + y_innertrain = y_train[innertrain] + X_innertest = X_train[innertest] + y_innertest = y_train[innertest] + + Xraw_innertrain = [Xraw[i] for i in innertrain] + Xraw_innertest = [Xraw[i] for i in innertest] + + # NB (if applicable) + if use_nb: + NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest) + X_innertrain = hstack((X_innertrain, NBtrain)) + X_innertest = hstack((X_innertest, NBtest)) + + # Train classifier + clf = LogisticRegression(C=s) + clf.fit(X_innertrain, y_innertrain) + acc = clf.score(X_innertest, y_innertest) + innerscores.append(acc) + print(s, acc) + + # Append mean score + scan_scores.append(np.mean(innerscores)) + + # Get the index of the best score + s_ind = np.argmax(scan_scores) + s = scan[s_ind] + print(scan_scores) + print(s) + + # NB (if applicable) + if use_nb: + NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test) + X_train = hstack((X_train, NBtrain)) + X_test = hstack((X_test, NBtest)) + + # Train classifier + clf = LogisticRegression(C=s) + clf.fit(X_train, y_train) + + time_stamp = time.strftime("%Y%m%d_%H%M%S") + model_file_name = 'best_logit.' + time_stamp + '.joblib.gz' + model_full_path = join(model_loc, model_file_name) + + dump(clf, model_full_path) + + # Evaluate + acc = clf.score(X_test, y_test) + scores.append(acc) + print(scores) + + end_time = process_time() + + print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format( + end_time - start_time)) + + clf = load(model_full_path) + acc = clf.score(features, z['labels']) + + return acc
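
Two reference sketches follow; neither is part of the patches above. First, the reason patch 1 adds allow_pickle=True and encoding='latin1' in load_tables: utable.npy and btable.npy were saved under Python 2 and hold pickled object arrays, which numpy on Python 3 refuses to load without an explicit opt-in and a byte-string encoding. A minimal sketch under that assumption, reusing the paths set at the top of skipthoughts.py (the encoding argument to pickle.load is an addition here, not something the patch itself passes):

import pickle
import numpy

path_to_models = 'skip-thoughts/data/'
path_to_tables = 'skip-thoughts/tables/'

# Object arrays pickled under Python 2 need an explicit opt-in and encoding on Python 3
utable = numpy.load(path_to_tables + 'utable.npy', allow_pickle=True, encoding='latin1')
btable = numpy.load(path_to_tables + 'btable.npy', allow_pickle=True, encoding='latin1')

# The model option files were written with cPickle; the stdlib pickle reads them, and
# encoding='latin1' is the safe choice if they contain Python 2 byte strings
with open(path_to_models + 'uni_skip.npz.pkl', 'rb') as f:
    uoptions = pickle.load(f, encoding='latin1')

words = []
with open(path_to_tables + 'dictionary.txt', 'rb') as f:
    for line in f:
        words.append(line.decode('utf-8').strip())

Second, a self-contained sketch of the nested K-fold evaluation that eval_nested_kfold converges on across patches 2-8, written against sklearn.model_selection.KFold, the replacement for the removed sklearn.cross_validation.KFold. The random features, labels, fold counts, and seeds are placeholders rather than values from the repository; only the C grid mirrors the patched code. One caveat: KFold honours random_state only when shuffle=True, and the patched calls KFold(n_splits=k, random_state=seed) leave shuffle at its default of False, which recent scikit-learn releases reject and older ones warn about or ignore.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

features = np.random.randn(200, 64)         # placeholder for encoder.encode(text)
labels = np.random.randint(0, 2, size=200)  # placeholder for z['labels']
scan = [2 ** t for t in range(0, 3)]        # candidate C values, as in the patch

# n_splits is given at construction; the data only goes to .split()
outer = KFold(n_splits=5, shuffle=True, random_state=1234)

scores = []
for train_index, test_index in outer.split(features):
    X_train, y_train = features[train_index], labels[train_index]
    X_test, y_test = features[test_index], labels[test_index]

    # Inner loop: pick C by mean accuracy across the inner folds
    inner = KFold(n_splits=5, shuffle=True, random_state=1235)
    mean_scores = []
    for C in scan:
        fold_scores = []
        for inner_train, inner_test in inner.split(X_train):
            clf = LogisticRegression(C=C, max_iter=1000)
            clf.fit(X_train[inner_train], y_train[inner_train])
            fold_scores.append(clf.score(X_train[inner_test], y_train[inner_test]))
        mean_scores.append(np.mean(fold_scores))

    # Refit on the full outer training split with the best C and score the held-out fold
    best_C = scan[int(np.argmax(mean_scores))]
    clf = LogisticRegression(C=best_C, max_iter=1000)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

print(np.mean(scores))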