commit 9229d20 (0 parents)
Showing 14 changed files with 1,141 additions and 0 deletions.
@@ -0,0 +1,2 @@
rm -f assignment1.zip
zip -r assignment1.zip *.py *.png saved_params_40000.npy
Empty file.
@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cPickle as pickle
import numpy as np
import os
import random

class StanfordSentiment:
    def __init__(self, path=None, tablesize=1000000):
        if not path:
            path = "cs224d/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize

    def tokens(self):
        if hasattr(self, "_tokens") and self._tokens:
            return self._tokens

        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0

        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                if w not in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1

        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        return self._tokens

    def sentences(self):
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower().decode("utf-8").encode('latin1') for w in splitted]]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def allSentences(self):
        if hasattr(self, "_allsentences") and self._allsentences:
            return self._allsentences

        sentences = self.sentences()
        rejectProb = self.rejectProb()
        tokens = self.tokens()
        allsentences = [[w for w in s
                         if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
                        for s in sentences * 30]

        allsentences = [s for s in allsentences if len(s) > 1]

        self._allsentences = allsentences

        return self._allsentences

    def getRandomContext(self, C=5):
        allsent = self.allSentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID + 1 < len(sent):
            context += sent[wordID + 1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def sent_labels(self):
        if hasattr(self, "_sent_labels") and self._sent_labels:
            return self._sent_labels

        dictionary = dict()
        phrases = 0
        with open(self.path + "/dictionary.txt", "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                dictionary[splitted[0].lower()] = int(splitted[1])
                phrases += 1

        labels = [0.0] * phrases
        with open(self.path + "/sentiment_labels.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                labels[int(splitted[0])] = float(splitted[1])

        sent_labels = [0.0] * self.numSentences()
        sentences = self.sentences()
        for i in xrange(self.numSentences()):
            sentence = sentences[i]
            full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
            sent_labels[i] = labels[dictionary[full_sent]]

        self._sent_labels = sent_labels
        return self._sent_labels

    def dataset_split(self):
        if hasattr(self, "_split") and self._split:
            return self._split

        split = [[] for i in xrange(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        return self._split

    def getRandomTrainSentence(self):
        split = self.dataset_split()
        sentId = split[0][random.randint(0, len(split[0]) - 1)]
        return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])

    def categorify(self, label):
        if label <= 0.2:
            return 0
        elif label <= 0.4:
            return 1
        elif label <= 0.6:
            return 2
        elif label <= 0.8:
            return 3
        else:
            return 4

    def getDevSentences(self):
        return self.getSplitSentences(2)

    def getTestSentences(self):
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        nTokens = len(self.tokens())
        samplingFreq = np.zeros((nTokens,))
        self.allSentences()
        for i in xrange(nTokens):
            w = self._revtokens[i]
            if w in self._tokenfreq:
                freq = 1.0 * self._tokenfreq[w]
                # Reweigh
                freq = freq ** 0.75
            else:
                freq = 0.0
            samplingFreq[i] = freq

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        self._sampleTable = [0] * self.tablesize

        j = 0
        for i in xrange(self.tablesize):
            while i > samplingFreq[j]:
                j += 1
            self._sampleTable[i] = j

        return self._sampleTable

    def rejectProb(self):
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        # Call tokens() first so _wordcount, _tokenfreq and _revtokens exist
        # even if this method is reached before tokens() has been used.
        nTokens = len(self.tokens())
        threshold = 1e-5 * self._wordcount

        rejectProb = np.zeros((nTokens,))
        for i in xrange(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        return self.sampleTable()[random.randint(0, self.tablesize - 1)]
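For orientation, a minimal usage sketch of the class above follows. It assumes the treebank files have already been downloaded into cs224d/datasets/stanfordSentimentTreebank (the download script appears below) and that this file is importable as data_utils; that module name and the variable names are illustrative only, not taken from this commit. Internally, rejectProb computes the word2vec-style subsampling probability max(0, 1 - sqrt(t / f(w))) with t = 1e-5 times the total word count, and sampleTable builds a lookup table in which each token appears in proportion to its frequency raised to the 0.75 power, which sampleTokenIdx then draws from.

# Usage sketch only (Python 2, matching the file above); the module name
# data_utils and these variable names are assumptions for illustration.
from data_utils import StanfordSentiment

dataset = StanfordSentiment()
tokens = dataset.tokens()          # word -> index map, with a trailing "UNK" entry
print "vocabulary size:", len(tokens)

# One (center word, context) pair with window size C=5, drawn from the
# subsampled sentences produced by allSentences().
centerword, context = dataset.getRandomContext(5)
print centerword, context

# Index of a token drawn from the freq**0.75 sampling table.
print dataset.sampleTokenIdx()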
@@ -0,0 +1,4 @@
# Get Stanford Sentiment Treebank
wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
@@ -0,0 +1,66 @@
import numpy as np
import random

def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.
    You might find the numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one-dimensional
    inputs (treat the vector as a row); you might find that helpful for
    your later problems.

    You must implement the optimization in problem 1(a) of the
    written assignment!
    """

    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

    return x

def test_softmax_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print "Running basic tests..."
    test1 = softmax(np.array([1, 2]))
    print test1
    assert np.amax(np.fabs(test1 - np.array(
        [0.26894142, 0.73105858]))) <= 1e-6

    test2 = softmax(np.array([[1001, 1002], [3, 4]]))
    print test2
    assert np.amax(np.fabs(test2 - np.array(
        [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6

    test3 = softmax(np.array([[-1001, -1002]]))
    print test3
    assert np.amax(np.fabs(test3 - np.array(
        [0.73105858, 0.26894142]))) <= 1e-6

    print "You should verify these results!\n"

def test_softmax():
    """
    Use this space to test your softmax implementation by running:
        python q1_softmax.py
    This function will not be called by the autograder, nor will
    your tests be graded.
    """
    print "Running your tests..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    test_softmax_basic()
    test_softmax()
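For reference, one common way to fill in the body above is the numerical-stability trick the docstring alludes to: subtract the per-row maximum before exponentiating, since softmax is invariant to adding a constant to every entry of a row. The sketch below is one possible implementation, assuming x is a 1-D or 2-D numpy array as in the tests; it is not the assignment's reference solution, and the name softmax_sketch is made up for this example.

import numpy as np

def softmax_sketch(x):
    # Sketch only: shift each row by its max so np.exp cannot overflow,
    # then normalize by the row sum. The shift does not change the result.
    orig_shape = x.shape
    x2 = np.atleast_2d(x).astype(float)          # treat a 1-D input as a single row
    x2 = x2 - np.max(x2, axis=1, keepdims=True)  # stability shift
    e = np.exp(x2)
    out = e / np.sum(e, axis=1, keepdims=True)
    return out.reshape(orig_shape)               # hand back the input's shape

The test2 case above, with entries around 1000, is exactly the input that overflows np.exp without the shift.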
@@ -0,0 +1,67 @@
import numpy as np
import random

# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x):
    """
    Gradient check for a function f
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at the original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### Try modifying x[ix] with the h defined above to compute numerical gradients.
        ### Make sure you call random.setstate(rndstate) before calling f(x) each time; this
        ### will make it possible to test cost functions with built-in randomness later.
        ### YOUR CODE HERE:
        raise NotImplementedError
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
            return

        it.iternext()  # Step to next dimension

    print "Gradient check passed!"

def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print "Running sanity checks..."
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4, 5))  # 2-D test
    print ""

def your_sanity_checks():
    """
    Use this space to add any additional sanity checks by running:
        python q2_gradcheck.py
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    sanity_check()
    your_sanity_checks()
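For reference, the step the YOUR CODE HERE block asks for is a centered finite difference, numgrad = (f(x + h) - f(x - h)) / (2h), evaluated separately for each entry x[ix], with random.setstate(rndstate) called before every evaluation of f so that cost functions with built-in randomness stay reproducible. The standalone sketch below illustrates that idea; the function name is made up for this example and it is not the graded solution.

import numpy as np
import random

def numerical_grad_sketch(f, x, h=1e-4):
    # Illustrative sketch: centered differences over every entry of x.
    # f is assumed to return (cost, gradient), as in gradcheck_naive above.
    rndstate = random.getstate()
    numgrad = np.zeros_like(np.asarray(x, dtype=float))
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old_value = x[ix]

        x[ix] = old_value + h
        random.setstate(rndstate)      # keep any randomness inside f reproducible
        fx_plus, _ = f(x)

        x[ix] = old_value - h
        random.setstate(rndstate)
        fx_minus, _ = f(x)

        x[ix] = old_value              # restore the entry before moving on
        numgrad[ix] = (fx_plus - fx_minus) / (2 * h)
        it.iternext()
    return numgrad

Inside gradcheck_naive, the same pair of perturbed evaluations would produce the numgrad that is compared against grad[ix] via reldiff.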