ryankiros · padames · Jun 8, 2020 · Jun 8, 2020 · Jun 8, 2020 · Jun 8, 2020
diff --git a/.gitignore b/.gitignore
@@ -58,3 +58,6 @@ target/
 
 # Downloaded data files
 data/
+
+# Downloaded tables
+tables/
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1,5 @@
+from . import skipthoughts
+from . import dataset_handler
+from . import nbsvm
+from . import eval_classification
+
diff --git a/dataset_handler.py b/dataset_handler.py
@@ -2,12 +2,16 @@
 
 import numpy as np
 from numpy.random import RandomState
-import os.path
+from nltk import sent_tokenize
+from nltk.tokenize import word_tokenize
+from os.path import join, isfile
+from os import listdir
+
 
 
 def load_data(encoder, name, loc='./data/', seed=1234):
     """
-    Load one of MR, CR, SUBJ or MPQA
+    Load one of MR, CR, SUBJ, MPQA or ACLIMBD
     """
     z = {}
     if name == 'MR':
@@ -18,13 +22,15 @@ def load_data(encoder, name, loc='./data/', seed=1234):
         pos, neg = load_cr(loc=loc)
     elif name == 'MPQA':
         pos, neg = load_mpqa(loc=loc)
+    elif name == 'ACLIMBD':
+        pos, neg = load_aclimbd(loc=loc)
 
     labels = compute_labels(pos, neg)
     text, labels = shuffle_data(pos+neg, labels, seed=seed)
     z['text'] = text
     z['labels'] = labels
-    print 'Computing skip-thought vectors...'
-    features = encoder.encode(text, verbose=False)
+    print( 'Computing skip-thought vectors...')
+    features = encoder.encode(text, verbose=False, use_eos=True)
     return z, features
 
 
@@ -33,14 +39,55 @@ def load_rt(loc='./data/'):
     Load the MR dataset
     """
     pos, neg = [], []
-    with open(os.path.join(loc, 'rt-polarity.pos'), 'rb') as f:
+    with open(join(loc, 'rt-polarity.pos'), 'rb') as f:
         for line in f:
             pos.append(line.decode('latin-1').strip())
-    with open(os.path.join(loc, 'rt-polarity.neg'), 'rb') as f:
+    with open(join(loc, 'rt-polarity.neg'), 'rb') as f:
         for line in f:
             neg.append(line.decode('latin-1').strip())
     return pos, neg
 
+def _load_review_sentences(dir_path):
+    """
+    Return the cleaned sentences of every regular file directly in dir_path.
+
+    Each line is decoded as latin-1, split into sentences and words with
+    nltk, lower-cased, stripped of the characters !?@#%&*"' (commas become
+    spaces) and re-joined with single spaces.
+    """
+    # Built once: ',' maps to a space, the third argument lists deletions.
+    table = str.maketrans(',', ' ', '!?@#%&*"\'')
+
+    # listdir order is arbitrary; sorting keeps the seeded shuffle reproducible.
+    f_names = sorted(f for f in listdir(dir_path) if isfile(join(dir_path, f)))
+
+    cleaned = []
+    for f_name in f_names:
+        with open(join(dir_path, f_name), 'rb') as f:
+            for line in f:
+                text = line.decode('latin-1').strip()
+                for s in sent_tokenize(text):
+                    words = [w.lower().translate(table) for w in word_tokenize(s)]
+                    # collapse the blanks left behind by translate()
+                    cleaned.append(" ".join(" ".join(words).split()))
+
+    return cleaned
+
+
+def load_aclimbd(loc='./data/'):
+    """
+    Load the ACL IMDB (aclImdb) dataset
+
+    loc must contain a 'pos' and a 'neg' directory of text files.
+    Returns (pos, neg): two lists of cleaned, lower-cased sentences, one
+    entry per sentence found in the corresponding directory.
+    The sentence, not the file, is the unit: a file holding several
+    sentences contributes several entries. Files are read in sorted order.
+    """
+    pos = _load_review_sentences(join(loc, 'pos'))
+    neg = _load_review_sentences(join(loc, 'neg'))
+    return pos, neg
+
 
 def load_subj(loc='./data/'):
     """

diff --git a/eval_classification.py b/eval_classification.py
@@ -1,17 +1,23 @@
 # Experiment scripts for binary classification benchmarks (e.g. MR, CR, MPQA, SUBJ)
 
+from time import process_time
+import time
 import numpy as np
-import sys
-import nbsvm
-import dataset_handler
+import pickle
+from joblib import dump, load
+import importlib
+from os import getcwd
+from os.path import join, isfile
+
+st = importlib.import_module("skip-thoughts")
 
 from scipy.sparse import hstack
 
 from sklearn.linear_model import LogisticRegression
-from sklearn.cross_validation import KFold
+from sklearn.model_selection import KFold
 
 
-def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=False):
+def eval_nested_kfold(encoder, name, loc='./data/', k=5, seed=1234, use_nb=False):
     """
     Evaluate features with nested K-fold cross validation
     Outer loop: Held-out evaluation
@@ -21,31 +27,43 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals
     Options for name are 'MR', 'CR', 'SUBJ' and 'MPQA'
     """
     # Load the dataset and extract features
-    z, features = dataset_handler.load_data(encoder, name, loc=loc, seed=seed)
+    z, features = st.dataset_handler.load_data(encoder, name, loc=loc, seed=seed)
+
+    time_stamp = time.strftime("%Y%m%d_%H%M%S")
+    file_name = 'sentence_embeddings_' + time_stamp + '.dat'
+    cwd = getcwd()
+    file_name_full_path = join(cwd, 'skip-thoughts', 'data', file_name)
+    print("Saving embeddings to file {0}".format(file_name_full_path))
+    with open(file_name_full_path, 'wb') as f:
+        pickle.dump(z, f, protocol=pickle.HIGHEST_PROTOCOL)
+        pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    scan = [2 ** t for t in range(0, 3, 1)]
+    # npts = len(z['text'])
+    # kf = KFold(npts, n_folds=k, random_state=seed)
+    kf = KFold(n_splits=k, random_state=seed)
+
+    start_time = process_time()
+    print("Started 'eval_nested_kfold'".format(start_time))
 
-    scan = [2**t for t in range(0,9,1)]
-    npts = len(z['text'])
-    kf = KFold(npts, n_folds=k, random_state=seed)
     scores = []
-    for train, test in kf:
+    for train_index, test_index in kf.split(features):
 
         # Split data
-        X_train = features[train]
-        y_train = z['labels'][train]
-        X_test = features[test]
-        y_test = z['labels'][test]
+        X_train, y_train = features[train_index], z['labels'][train_index]
+        X_test, y_test = features[test_index], z['labels'][test_index]
 
-        Xraw = [z['text'][i] for i in train]
-        Xraw_test = [z['text'][i] for i in test]
+        Xraw = [z['text'][i] for i in train_index]
+        Xraw_test = [z['text'][i] for i in test_index]
 
         scanscores = []
         for s in scan:
 
             # Inner KFold
-            innerkf = KFold(len(X_train), n_folds=k, random_state=seed+1)
+            innerkf = KFold(n_splits=k, random_state=seed + 1)
             innerscores = []
-            for innertrain, innertest in innerkf:
-        
+            for innertrain, innertest in innerkf.split(X_train):
+
                 # Split data
                 X_innertrain = X_train[innertrain]
                 y_innertrain = y_train[innertrain]
@@ -66,31 +84,39 @@ def eval_nested_kfold(encoder, name, loc='./data/', k=10, seed=1234, use_nb=Fals
                 clf.fit(X_innertrain, y_innertrain)
                 acc = clf.score(X_innertest, y_innertest)
                 innerscores.append(acc)
-                print (s, acc)
+                print(s, acc)
 
             # Append mean score
             scanscores.append(np.mean(innerscores))
 
         # Get the index of the best score
         s_ind = np.argmax(scanscores)
         s = scan[s_ind]
-        print scanscores
-        print s
- 
+        print(scanscores)
+        print(s)
+
         # NB (if applicable)
         if use_nb:
             NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test)
             X_train = hstack((X_train, NBtrain))
             X_test = hstack((X_test, NBtest))
-       
+
         # Train classifier
         clf = LogisticRegression(C=s)
         clf.fit(X_train, y_train)
 
+        cwd = getcwd()
+        dump(clf, join(cwd, 'skip-thoughts/models', 'best_logit.joblib.gz'))
+
         # Evaluate
         acc = clf.score(X_test, y_test)
         scores.append(acc)
-        print scores
+        print(scores)
+
+        end_time = process_time()
+
+        print("Elapsed time in seconds for 5-fold CV and hyperparameter tuning on Logit: {0:4.2f}".format(
+            end_time - start_time))
 
     return scores
 
@@ -102,12 +128,137 @@ def compute_nb(X, y, Z):
     labels = [int(t) for t in y]
     ptrain = [X[i] for i in range(len(labels)) if labels[i] == 0]
     ntrain = [X[i] for i in range(len(labels)) if labels[i] == 1]
-    poscounts = nbsvm.build_dict(ptrain, [1,2])
-    negcounts = nbsvm.build_dict(ntrain, [1,2])
-    dic, r = nbsvm.compute_ratio(poscounts, negcounts)
-    trainX = nbsvm.process_text(X, dic, r, [1,2])
-    devX = nbsvm.process_text(Z, dic, r, [1,2])
+    poscounts = st.nbsvm.build_dict(ptrain, [1, 2])
+    negcounts = st.nbsvm.build_dict(ntrain, [1, 2])
+    dic, r = st.nbsvm.compute_ratio(poscounts, negcounts)
+    trainX = st.nbsvm.process_text(X, dic, r, [1, 2])
+    devX = st.nbsvm.process_text(Z, dic, r, [1, 2])
     return trainX, devX
 
 
+def eval_test_data(encoder, name, loc='./data/'):
+    """
+    Score the saved logistic regression model on the ACLIMBD test data.
+    Returns 0.0 for any other name or when no saved model file is found.
+    """
+    if name != 'ACLIMBD':
+        return 0.0
+    model_path = join(loc, 'skip-thoughts', 'models', 'best_logit.joblib.gz')
+    if not isfile(model_path):
+        return 0.0
+    test_dir = join(loc, 'skip-thoughts', 'data', 'aclImdb', 'test')
+    z, features = st.dataset_handler.load_data(encoder, name, loc=test_dir)
+    classifier = load(model_path)
+    return classifier.score(features, z['labels'])
+
+
+def eval_test_data_from_saved_embeddings_classification(embeddings_file_name,
+                                                        k=3,
+                                                        use_nb=False,
+                                                        embeddings_loc='./data/',
+                                                        model_loc='./data/',
+                                                        seed=1234):
+    """
+    Run nested K-fold cross validation on previously saved embeddings.
+
+    Loads the pickled (z, features) pair from
+    embeddings_loc/data/embeddings_file_name, picks the logistic regression C
+    from [1, 2, 4] in the inner loop, fits one model per outer fold and dumps
+    each one to a time-stamped 'best_logit.*.joblib.gz' file in model_loc.
+
+    k is the number of outer and inner folds, use_nb appends the compute_nb
+    features, and seed is unused (kept so existing callers keep working).
+
+    Returns the accuracy of the last dumped model on ALL loaded samples,
+    including the ones it was fitted on. Exits with status 1 if the embeddings
+    file does not exist.
+    """
+    full_path_embeddings = join(embeddings_loc, 'data', embeddings_file_name)
+    if not isfile(full_path_embeddings):
+        print("Embeddings file not found")
+        # Same SystemExit as exit(1), without needing the site-provided builtin
+        raise SystemExit(1)
+
+    print("Loading embeddings from file {0}".format(full_path_embeddings))
+    # NOTE: unpickling can run arbitrary code; only load files you created
+    with open(full_path_embeddings, 'rb') as f:
+        z = pickle.load(f)
+        features = pickle.load(f)
 
+    scan = [2 ** t for t in range(0, 3, 1)]
+    # No random_state: it has no effect without shuffle=True, and
+    # scikit-learn >= 0.24 raises ValueError when it is passed anyway.
+    kf = KFold(n_splits=k)
+
+    start_time = process_time()
+    print("Started 'eval_test_data_from_saved_embeddings_classification'")
+
+    model_full_path = ''
+    scores = []
+    for train_index, test_index in kf.split(features):
+        X_train, y_train = features[train_index], z['labels'][train_index]
+        X_test, y_test = features[test_index], z['labels'][test_index]
+
+        Xraw = [z['text'][i] for i in train_index]
+        Xraw_test = [z['text'][i] for i in test_index]
+
+        scan_scores = []
+        for s in scan:
+            # Inner KFold (unshuffled, same reason as for kf)
+            innerkf = KFold(n_splits=k)
+            innerscores = []
+            for innertrain, innertest in innerkf.split(X_train):
+                X_innertrain = X_train[innertrain]
+                y_innertrain = y_train[innertrain]
+                X_innertest = X_train[innertest]
+                y_innertest = y_train[innertest]
+                Xraw_innertrain = [Xraw[i] for i in innertrain]
+                Xraw_innertest = [Xraw[i] for i in innertest]
+
+                # NB (if applicable)
+                if use_nb:
+                    NBtrain, NBtest = compute_nb(Xraw_innertrain, y_innertrain, Xraw_innertest)
+                    X_innertrain = hstack((X_innertrain, NBtrain))
+                    X_innertest = hstack((X_innertest, NBtest))
+
+                clf = LogisticRegression(C=s)
+                clf.fit(X_innertrain, y_innertrain)
+                acc = clf.score(X_innertest, y_innertest)
+                innerscores.append(acc)
+                print(s, acc)
+
+            # Append mean score
+            scan_scores.append(np.mean(innerscores))
+
+        # Pick the C with the best mean inner score
+        s = scan[np.argmax(scan_scores)]
+        print(scan_scores)
+        print(s)
+
+        # NB (if applicable)
+        if use_nb:
+            NBtrain, NBtest = compute_nb(Xraw, y_train, Xraw_test)
+            X_train = hstack((X_train, NBtrain))
+            X_test = hstack((X_test, NBtest))
+
+        clf = LogisticRegression(C=s)
+        clf.fit(X_train, y_train)
+
+        time_stamp = time.strftime("%Y%m%d_%H%M%S")
+        model_file_name = 'best_logit.' + time_stamp + '.joblib.gz'
+        model_full_path = join(model_loc, model_file_name)
+        dump(clf, model_full_path)
+
+        # Evaluate
+        acc = clf.score(X_test, y_test)
+        scores.append(acc)
+        print(scores)
+
+        end_time = process_time()
+        print("Elapsed time in seconds for {0}-fold CV and hyperparameter tuning on Logit: {1:4.2f}".format(
+            k, end_time - start_time))
+
+    # NOTE(review): in-sample score; with use_nb=True the model was fitted on
+    # extra NB columns that `features` lacks -- confirm this call still works.
+    clf = load(model_full_path)
+    return clf.score(features, z['labels'])