diff --git a/collectSubmission.sh b/collectSubmission.sh new file mode 100644 index 0000000..3f6cef7 --- /dev/null +++ b/collectSubmission.sh @@ -0,0 +1,2 @@ +rm -f assignment1.zip +zip -r assignment1.zip *.py *.png saved_params_40000.npy diff --git a/cs224d/__init__.py b/cs224d/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cs224d/data_utils.py b/cs224d/data_utils.py new file mode 100644 index 0000000..b1fc52f --- /dev/null +++ b/cs224d/data_utils.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import cPickle as pickle +import numpy as np +import os +import random + +class StanfordSentiment: + def __init__(self, path=None, tablesize = 1000000): + if not path: + path = "cs224d/datasets/stanfordSentimentTreebank" + + self.path = path + self.tablesize = tablesize + + def tokens(self): + if hasattr(self, "_tokens") and self._tokens: + return self._tokens + + tokens = dict() + tokenfreq = dict() + wordcount = 0 + revtokens = [] + idx = 0 + + for sentence in self.sentences(): + for w in sentence: + wordcount += 1 + if not w in tokens: + tokens[w] = idx + revtokens += [w] + tokenfreq[w] = 1 + idx += 1 + else: + tokenfreq[w] += 1 + + tokens["UNK"] = idx + revtokens += ["UNK"] + tokenfreq["UNK"] = 1 + wordcount += 1 + + self._tokens = tokens + self._tokenfreq = tokenfreq + self._wordcount = wordcount + self._revtokens = revtokens + return self._tokens + + def sentences(self): + if hasattr(self, "_sentences") and self._sentences: + return self._sentences + + sentences = [] + with open(self.path + "/datasetSentences.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + splitted = line.strip().split()[1:] + # Deal with some peculiar encoding issues with this file + sentences += [[w.lower().decode("utf-8").encode('latin1') for w in splitted]] + + self._sentences = sentences + self._sentlengths = np.array([len(s) for s in sentences]) + self._cumsentlen = np.cumsum(self._sentlengths) + + return self._sentences + + def numSentences(self): + if hasattr(self, "_numSentences") and self._numSentences: + return self._numSentences + else: + self._numSentences = len(self.sentences()) + return self._numSentences + + def allSentences(self): + if hasattr(self, "_allsentences") and self._allsentences: + return self._allsentences + + sentences = self.sentences() + rejectProb = self.rejectProb() + tokens = self.tokens() + allsentences = [[w for w in s + if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]] + for s in sentences * 30] + + allsentences = [s for s in allsentences if len(s) > 1] + + self._allsentences = allsentences + + return self._allsentences + + def getRandomContext(self, C=5): + allsent = self.allSentences() + sentID = random.randint(0, len(allsent) - 1) + sent = allsent[sentID] + wordID = random.randint(0, len(sent) - 1) + + context = sent[max(0, wordID - C):wordID] + if wordID+1 < len(sent): + context += sent[wordID+1:min(len(sent), wordID + C + 1)] + + centerword = sent[wordID] + context = [w for w in context if w != centerword] + + if len(context) > 0: + return centerword, context + else: + return self.getRandomContext(C) + + def sent_labels(self): + if hasattr(self, "_sent_labels") and self._sent_labels: + return self._sent_labels + + dictionary = dict() + phrases = 0 + with open(self.path + "/dictionary.txt", "r") as f: + for line in f: + line = line.strip() + if not line: continue + splitted = line.split("|") + dictionary[splitted[0].lower()] = int(splitted[1]) + phrases += 1 + + labels = 
[0.0] * phrases + with open(self.path + "/sentiment_labels.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + line = line.strip() + if not line: continue + splitted = line.split("|") + labels[int(splitted[0])] = float(splitted[1]) + + sent_labels = [0.0] * self.numSentences() + sentences = self.sentences() + for i in xrange(self.numSentences()): + sentence = sentences[i] + full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')') + sent_labels[i] = labels[dictionary[full_sent]] + + self._sent_labels = sent_labels + return self._sent_labels + + def dataset_split(self): + if hasattr(self, "_split") and self._split: + return self._split + + split = [[] for i in xrange(3)] + with open(self.path + "/datasetSplit.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + splitted = line.strip().split(",") + split[int(splitted[1]) - 1] += [int(splitted[0]) - 1] + + self._split = split + return self._split + + def getRandomTrainSentence(self): + split = self.dataset_split() + sentId = split[0][random.randint(0, len(split[0]) - 1)] + return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId]) + + def categorify(self, label): + if label <= 0.2: + return 0 + elif label <= 0.4: + return 1 + elif label <= 0.6: + return 2 + elif label <= 0.8: + return 3 + else: + return 4 + + def getDevSentences(self): + return self.getSplitSentences(2) + + def getTestSentences(self): + return self.getSplitSentences(1) + + def getTrainSentences(self): + return self.getSplitSentences(0) + + def getSplitSentences(self, split=0): + ds_split = self.dataset_split() + return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]] + + def sampleTable(self): + if hasattr(self, '_sampleTable') and self._sampleTable is not None: + return self._sampleTable + + nTokens = len(self.tokens()) + samplingFreq = np.zeros((nTokens,)) + self.allSentences() + i = 0 + for w in xrange(nTokens): + w = self._revtokens[i] + if w in self._tokenfreq: + freq = 1.0 * self._tokenfreq[w] + # Reweigh + freq = freq ** 0.75 + else: + freq = 0.0 + samplingFreq[i] = freq + i += 1 + + samplingFreq /= np.sum(samplingFreq) + samplingFreq = np.cumsum(samplingFreq) * self.tablesize + + self._sampleTable = [0] * self.tablesize + + j = 0 + for i in xrange(self.tablesize): + while i > samplingFreq[j]: + j += 1 + self._sampleTable[i] = j + + return self._sampleTable + + def rejectProb(self): + if hasattr(self, '_rejectProb') and self._rejectProb is not None: + return self._rejectProb + + threshold = 1e-5 * self._wordcount + + nTokens = len(self.tokens()) + rejectProb = np.zeros((nTokens,)) + for i in xrange(nTokens): + w = self._revtokens[i] + freq = 1.0 * self._tokenfreq[w] + # Reweigh + rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq)) + + self._rejectProb = rejectProb + return self._rejectProb + + def sampleTokenIdx(self): + return self.sampleTable()[random.randint(0, self.tablesize - 1)] \ No newline at end of file diff --git a/cs224d/datasets/get_datasets.sh b/cs224d/datasets/get_datasets.sh new file mode 100755 index 0000000..aff89c7 --- /dev/null +++ b/cs224d/datasets/get_datasets.sh @@ -0,0 +1,4 @@ +# Get Stanford Sentiment Treebank +wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip +unzip stanfordSentimentTreebank.zip +rm stanfordSentimentTreebank.zip diff --git a/q1_softmax.py b/q1_softmax.py new file mode 100644 index 0000000..9b409f1 --- /dev/null +++ b/q1_softmax.py @@ -0,0 +1,66 @@ +import 
numpy as np +import random + +def softmax(x): + """ + Compute the softmax function for each row of the input x. + + It is crucial that this function is optimized for speed because + it will be used frequently in later code. + You might find numpy functions np.exp, np.sum, np.reshape, + np.max, and numpy broadcasting useful for this task. (numpy + broadcasting documentation: + http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + + You should also make sure that your code works for one + dimensional inputs (treat the vector as a row), you might find + it helpful for your later problems. + + You must implement the optimization in problem 1(a) of the + written assignment! + """ + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return x + +def test_softmax_basic(): + """ + Some simple tests to get you started. + Warning: these are not exhaustive. + """ + print "Running basic tests..." + test1 = softmax(np.array([1,2])) + print test1 + assert np.amax(np.fabs(test1 - np.array( + [0.26894142, 0.73105858]))) <= 1e-6 + + test2 = softmax(np.array([[1001,1002],[3,4]])) + print test2 + assert np.amax(np.fabs(test2 - np.array( + [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6 + + test3 = softmax(np.array([[-1001,-1002]])) + print test3 + assert np.amax(np.fabs(test3 - np.array( + [0.73105858, 0.26894142]))) <= 1e-6 + + print "You should verify these results!\n" + +def test_softmax(): + """ + Use this space to test your softmax implementation by running: + python q1_softmax.py + This function will not be called by the autograder, nor will + your tests be graded. + """ + print "Running your tests..." + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + +if __name__ == "__main__": + test_softmax_basic() + test_softmax() \ No newline at end of file diff --git a/q2_gradcheck.py b/q2_gradcheck.py new file mode 100644 index 0000000..d0c83d4 --- /dev/null +++ b/q2_gradcheck.py @@ -0,0 +1,67 @@ +import numpy as np +import random + +# First implement a gradient checker by filling in the following functions +def gradcheck_naive(f, x): + """ + Gradient check for a function f + - f should be a function that takes a single argument and outputs the cost and its gradients + - x is the point (numpy array) to check the gradient at + """ + + rndstate = random.getstate() + random.setstate(rndstate) + fx, grad = f(x) # Evaluate function value at original point + h = 1e-4 + + # Iterate over all indexes in x + it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) + while not it.finished: + ix = it.multi_index + + ### try modifying x[ix] with h defined above to compute numerical gradients + ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it + ### possible to test cost functions with built in randomness later + ### YOUR CODE HERE: + raise NotImplementedError + ### END YOUR CODE + + # Compare gradients + reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix])) + if reldiff > 1e-5: + print "Gradient check failed." + print "First gradient error found at index %s" % str(ix) + print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad) + return + + it.iternext() # Step to next dimension + + print "Gradient check passed!" + +def sanity_check(): + """ + Some basic sanity checks. + """ + quad = lambda x: (np.sum(x ** 2), x * 2) + + print "Running sanity checks..." 
+    gradcheck_naive(quad, np.array(123.456))      # scalar test
+    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
+    gradcheck_naive(quad, np.random.randn(4,5))   # 2-D test
+    print ""
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q2_gradcheck.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
diff --git a/q2_neural.py b/q2_neural.py
new file mode 100644
index 0000000..488caee
--- /dev/null
+++ b/q2_neural.py
@@ -0,0 +1,76 @@
+import numpy as np
+import random
+
+from q1_softmax import softmax
+from q2_sigmoid import sigmoid, sigmoid_grad
+from q2_gradcheck import gradcheck_naive
+
+def forward_backward_prop(data, labels, params, dimensions):
+    """
+    Forward and backward propagation for a two-layer sigmoidal network
+
+    Compute the forward propagation and the cross entropy cost,
+    and the backward propagation for the gradients of all parameters.
+    """
+
+    ### Unpack network parameters (do not modify)
+    ofs = 0
+    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
+
+    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
+    ofs += Dx * H
+    b1 = np.reshape(params[ofs:ofs + H], (1, H))
+    ofs += H
+    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
+    ofs += H * Dy
+    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))
+
+    ### YOUR CODE HERE: forward propagation
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    ### YOUR CODE HERE: backward propagation
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    ### Stack gradients (do not modify)
+    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
+        gradW2.flatten(), gradb2.flatten()))
+
+    return cost, grad
+
+def sanity_check():
+    """
+    Set up fake data and parameters for the neural network, and test using
+    gradcheck.
+    """
+    print "Running sanity check..."
+
+    N = 20
+    dimensions = [10, 5, 10]
+    data = np.random.randn(N, dimensions[0])   # each row will be a datum
+    labels = np.zeros((N, dimensions[2]))
+    for i in xrange(N):
+        labels[i,random.randint(0,dimensions[2]-1)] = 1
+
+    params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
+        dimensions[1] + 1) * dimensions[2], )
+
+    gradcheck_naive(lambda params: forward_backward_prop(data, labels, params,
+        dimensions), params)
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q2_neural.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
\ No newline at end of file
diff --git a/q2_sigmoid.py b/q2_sigmoid.py
new file mode 100644
index 0000000..04cc2de
--- /dev/null
+++ b/q2_sigmoid.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+def sigmoid(x):
+    """
+    Compute the sigmoid function for the input here.
+    """
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return x
+
+def sigmoid_grad(f):
+    """
+    Compute the gradient for the sigmoid function here. Note that
+    for this implementation, the input f should be the sigmoid
+    function value of your original input x.
+ """ + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return f + +def test_sigmoid_basic(): + """ + Some simple tests to get you started. + Warning: these are not exhaustive. + """ + print "Running basic tests..." + x = np.array([[1, 2], [-1, -2]]) + f = sigmoid(x) + g = sigmoid_grad(f) + print f + assert np.amax(f - np.array([[0.73105858, 0.88079708], + [0.26894142, 0.11920292]])) <= 1e-6 + print g + assert np.amax(g - np.array([[0.19661193, 0.10499359], + [0.19661193, 0.10499359]])) <= 1e-6 + print "You should verify these results!\n" + +def test_sigmoid(): + """ + Use this space to test your sigmoid implementation by running: + python q2_sigmoid.py + This function will not be called by the autograder, nor will + your tests be graded. + """ + print "Running your tests..." + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + +if __name__ == "__main__": + test_sigmoid_basic(); + test_sigmoid() diff --git a/q3_run.py b/q3_run.py new file mode 100644 index 0000000..a000074 --- /dev/null +++ b/q3_run.py @@ -0,0 +1,57 @@ +import random +import numpy as np +from cs224d.data_utils import * +import matplotlib.pyplot as plt + +from q3_word2vec import * +from q3_sgd import * + +# Reset the random seed to make sure that everyone gets the same results +random.seed(314) +dataset = StanfordSentiment() +tokens = dataset.tokens() +nWords = len(tokens) + +# We are going to train 10-dimensional vectors for this assignment +dimVectors = 10 + +# Context size +C = 5 + +# Reset the random seed to make sure that everyone gets the same results +random.seed(31415) +np.random.seed(9265) +wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / \ + dimVectors, np.zeros((nWords, dimVectors))), axis=0) +wordVectors0 = sgd( + lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, + negSamplingCostAndGradient), + wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10) +print "sanity check: cost at convergence should be around or below 10" + +# sum the input and output word vectors +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) + +# Visualize the word vectors you trained +_, wordVectors0, _ = load_saved_params() +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) +visualizeWords = ["the", "a", "an", ",", ".", "?", "!", "``", "''", "--", + "good", "great", "cool", "brilliant", "wonderful", "well", "amazing", + "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb", + "annoying"] +visualizeIdx = [tokens[word] for word in visualizeWords] +visualizeVecs = wordVectors[visualizeIdx, :] +temp = (visualizeVecs - np.mean(visualizeVecs, axis=0)) +covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp) +U,S,V = np.linalg.svd(covariance) +coord = temp.dot(U[:,0:2]) + +for i in xrange(len(visualizeWords)): + plt.text(coord[i,0], coord[i,1], visualizeWords[i], + bbox=dict(facecolor='green', alpha=0.1)) + +plt.xlim((np.min(coord[:,0]), np.max(coord[:,0]))) +plt.ylim((np.min(coord[:,1]), np.max(coord[:,1]))) + +plt.savefig('q3_word_vectors.png') +plt.show() \ No newline at end of file diff --git a/q3_sgd.py b/q3_sgd.py new file mode 100644 index 0000000..8a0ca55 --- /dev/null +++ b/q3_sgd.py @@ -0,0 +1,128 @@ +# Save parameters every a few SGD iterations as fail-safe +SAVE_PARAMS_EVERY = 1000 + +import glob +import random +import numpy as np +import os.path as op +import cPickle as pickle + +def load_saved_params(): + """ A helper function that loads previously saved parameters and resets iteration start """ + st = 0 + 
    for f in glob.glob("saved_params_*.npy"):
+        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
+        if (iter > st):
+            st = iter
+
+    if st > 0:
+        with open("saved_params_%d.npy" % st, "r") as f:
+            params = pickle.load(f)
+            state = pickle.load(f)
+        return st, params, state
+    else:
+        return st, None, None
+
+def save_params(iter, params):
+    with open("saved_params_%d.npy" % iter, "w") as f:
+        pickle.dump(params, f)
+        pickle.dump(random.getstate(), f)
+
+def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10):
+    """ Stochastic Gradient Descent """
+    # Implement the stochastic gradient descent method in this
+    # function.
+
+    # Inputs:
+    # - f: the function to optimize, it should take a single
+    #      argument and yield two outputs, a cost and the gradient
+    #      with respect to the arguments
+    # - x0: the initial point to start SGD from
+    # - step: the step size for SGD
+    # - iterations: total iterations to run SGD for
+    # - postprocessing: postprocessing function for the parameters
+    #      if necessary. In the case of word2vec we will need to
+    #      normalize the word vectors to have unit length.
+    # - PRINT_EVERY: specifies how often (in iterations) to print progress
+
+    # Output:
+    # - x: the parameter value after SGD finishes
+
+    # Anneal learning rate every several iterations
+    ANNEAL_EVERY = 20000
+
+    if useSaved:
+        start_iter, oldx, state = load_saved_params()
+        if start_iter > 0:
+            x0 = oldx
+            step *= 0.5 ** (start_iter / ANNEAL_EVERY)
+
+        if state:
+            random.setstate(state)
+    else:
+        start_iter = 0
+
+    x = x0
+
+    if not postprocessing:
+        postprocessing = lambda x: x
+
+    expcost = None
+
+    for iter in xrange(start_iter + 1, iterations + 1):
+        ### Don't forget to apply the postprocessing after every iteration!
+        ### You might want to print the progress every few iterations.
+
+        cost = None
+        ### YOUR CODE HERE
+        raise NotImplementedError
+        ### END YOUR CODE
+
+        if iter % PRINT_EVERY == 0:
+            if not expcost:
+                expcost = cost
+            else:
+                expcost = .95 * expcost + .05 * cost
+            print "iter %d: %f" % (iter, expcost)
+
+        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
+            save_params(iter, x)
+
+        if iter % ANNEAL_EVERY == 0:
+            step *= 0.5
+
+    return x
+
+def sanity_check():
+    quad = lambda x: (np.sum(x ** 2), x * 2)
+
+    print "Running sanity checks..."
+    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
+    print "test 1 result:", t1
+    assert abs(t1) <= 1e-6
+
+    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
+    print "test 2 result:", t2
+    assert abs(t2) <= 1e-6
+
+    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
+    print "test 3 result:", t3
+    assert abs(t3) <= 1e-6
+
+    print ""
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q3_sgd.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
\ No newline at end of file
diff --git a/q3_word2vec.py b/q3_word2vec.py
new file mode 100644
index 0000000..df85c1d
--- /dev/null
+++ b/q3_word2vec.py
@@ -0,0 +1,198 @@
+import numpy as np
+import random
+
+from q1_softmax import softmax
+from q2_gradcheck import gradcheck_naive
+from q2_sigmoid import sigmoid, sigmoid_grad
+
+def normalizeRows(x):
+    """ Row normalization function """
+    # Implement a function that normalizes each row of a matrix to have unit length
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return x
+
+def test_normalize_rows():
+    print "Testing normalizeRows..."
+    x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))
+    # the result should be [[0.6, 0.8], [0.4472, 0.8944]]
+    print x
+    assert np.allclose(x, np.array([[0.6, 0.8], [0.4472, 0.8944]]), atol=1e-4)
+    print ""
+
+def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
+    """ Softmax cost function for word2vec models """
+
+    # Implement the cost and gradients for one predicted word vector
+    # and one target word vector as a building block for word2vec
+    # models, assuming the softmax prediction function and cross
+    # entropy loss.
+
+    # Inputs:
+    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
+    #      the written component or \hat{r} in an earlier version)
+    # - target: integer, the index of the target word
+    # - outputVectors: "output" vectors (as rows) for all tokens
+    # - dataset: needed for negative sampling, unused here.
+
+    # Outputs:
+    # - cost: cross entropy cost for the softmax word prediction
+    # - gradPred: the gradient with respect to the predicted word
+    #      vector
+    # - grad: the gradient with respect to all the other word
+    #      vectors
+
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradPred, grad
+
+def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
+    K=10):
+    """ Negative sampling cost function for word2vec models """
+
+    # Implement the cost and gradients for one predicted word vector
+    # and one target word vector as a building block for word2vec
+    # models, using the negative sampling technique. K is the sample
+    # size. You might want to use dataset.sampleTokenIdx() to sample
+    # a random word index.
+    #
+    # Note: See test_word2vec below for dataset's initialization.
+    #
+    # Input/Output Specifications: same as softmaxCostAndGradient
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradPred, grad
+
+
+def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
+    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
+    """ Skip-gram model in word2vec """
+
+    # Implement the skip-gram model in this function.
+
+    # Inputs:
+    # - currentWord: a string of the current center word
+    # - C: integer, context size
+    # - contextWords: list of no more than 2*C strings, the context words
+    # - tokens: a dictionary that maps words to their indices in
+    #      the word vector list
+    # - inputVectors: "input" word vectors (as rows) for all tokens
+    # - outputVectors: "output" word vectors (as rows) for all tokens
+    # - word2vecCostAndGradient: the cost and gradient function for
+    #      a prediction vector given the target word vectors,
+    #      could be one of the two cost functions you
+    #      implemented above
+
+    # Outputs:
+    # - cost: the cost function value for the skip-gram model
+    # - grad: the gradient with respect to the word vectors
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradIn, gradOut
+
+def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
+    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
+    """ CBOW model in word2vec """
+
+    # Implement the continuous bag-of-words model in this function.
+    # Input/Output specifications: same as the skip-gram model
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    #################################################################
+    # IMPLEMENTING CBOW IS EXTRA CREDIT, DERIVATIONS IN THE WRITTEN #
+    # ASSIGNMENT ARE NOT!                                           #
+    #################################################################
+
+    cost = 0
+    gradIn = np.zeros(inputVectors.shape)
+    gradOut = np.zeros(outputVectors.shape)
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradIn, gradOut
+
+#############################################
+# Testing functions below. DO NOT MODIFY!
# +############################################# + +def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient): + batchsize = 50 + cost = 0.0 + grad = np.zeros(wordVectors.shape) + N = wordVectors.shape[0] + inputVectors = wordVectors[:N/2,:] + outputVectors = wordVectors[N/2:,:] + for i in xrange(batchsize): + C1 = random.randint(1,C) + centerword, context = dataset.getRandomContext(C1) + + if word2vecModel == skipgram: + denom = 1 + else: + denom = 1 + + c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient) + cost += c / batchsize / denom + grad[:N/2, :] += gin / batchsize / denom + grad[N/2:, :] += gout / batchsize / denom + + return cost, grad + +def test_word2vec(): + # Interface to the dataset for negative sampling + dataset = type('dummy', (), {})() + def dummySampleTokenIdx(): + return random.randint(0, 4) + + def getRandomContext(C): + tokens = ["a", "b", "c", "d", "e"] + return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] \ + for i in xrange(2*C)] + dataset.sampleTokenIdx = dummySampleTokenIdx + dataset.getRandomContext = getRandomContext + + random.seed(31415) + np.random.seed(9265) + dummy_vectors = normalizeRows(np.random.randn(10,3)) + dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)]) + print "==== Gradient check for skip-gram ====" + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors) + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors) + print "\n==== Gradient check for CBOW ====" + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors) + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors) + + print "\n=== Results ===" + print skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset) + print skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient) + print cbow("a", 2, ["a", "b", "c", "a"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset) + print cbow("a", 2, ["a", "b", "a", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient) + +if __name__ == "__main__": + test_normalize_rows() + test_word2vec() \ No newline at end of file diff --git a/q4_sentiment.py b/q4_sentiment.py new file mode 100644 index 0000000..1adb4f0 --- /dev/null +++ b/q4_sentiment.py @@ -0,0 +1,114 @@ +import numpy as np +import matplotlib.pyplot as plt + +from cs224d.data_utils import * + +from q3_sgd import load_saved_params, sgd +from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper + +# Try different regularizations and pick the best! +# NOTE: fill in one more "your code here" below before running! 
+REGULARIZATION = None # Assign a list of floats in the block below +### YOUR CODE HERE +raise NotImplementedError +### END YOUR CODE + +# Load the dataset +dataset = StanfordSentiment() +tokens = dataset.tokens() +nWords = len(tokens) + +# Load the word vectors we trained earlier +_, wordVectors0, _ = load_saved_params() +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) +dimVectors = wordVectors.shape[1] + +# Load the train set +trainset = dataset.getTrainSentences() +nTrain = len(trainset) +trainFeatures = np.zeros((nTrain, dimVectors)) +trainLabels = np.zeros((nTrain,), dtype=np.int32) +for i in xrange(nTrain): + words, trainLabels[i] = trainset[i] + trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +# Prepare dev set features +devset = dataset.getDevSentences() +nDev = len(devset) +devFeatures = np.zeros((nDev, dimVectors)) +devLabels = np.zeros((nDev,), dtype=np.int32) +for i in xrange(nDev): + words, devLabels[i] = devset[i] + devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +# Try our regularization parameters +results = [] +for regularization in REGULARIZATION: + random.seed(3141) + np.random.seed(59265) + weights = np.random.randn(dimVectors, 5) + print "Training for reg=%f" % regularization + + # We will do batch optimization + weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels, + weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100) + + # Test on train set + _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights) + trainAccuracy = accuracy(trainLabels, pred) + print "Train accuracy (%%): %f" % trainAccuracy + + # Test on dev set + _, _, pred = softmaxRegression(devFeatures, devLabels, weights) + devAccuracy = accuracy(devLabels, pred) + print "Dev accuracy (%%): %f" % devAccuracy + + # Save the results and weights + results.append({ + "reg" : regularization, + "weights" : weights, + "train" : trainAccuracy, + "dev" : devAccuracy}) + +# Print the accuracies +print "" +print "=== Recap ===" +print "Reg\t\tTrain\t\tDev" +for result in results: + print "%E\t%f\t%f" % ( + result["reg"], + result["train"], + result["dev"]) +print "" + +# Pick the best regularization parameters +BEST_REGULARIZATION = None +BEST_WEIGHTS = None + +### YOUR CODE HERE +raise NotImplementedError +### END YOUR CODE + +# Test your findings on the test set +testset = dataset.getTestSentences() +nTest = len(testset) +testFeatures = np.zeros((nTest, dimVectors)) +testLabels = np.zeros((nTest,), dtype=np.int32) +for i in xrange(nTest): + words, testLabels[i] = testset[i] + testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS) +print "Best regularization value: %E" % BEST_REGULARIZATION +print "Test accuracy (%%): %f" % accuracy(testLabels, pred) + +# Make a plot of regularization vs accuracy +plt.plot(REGULARIZATION, [x["train"] for x in results]) +plt.plot(REGULARIZATION, [x["dev"] for x in results]) +plt.xscale('log') +plt.xlabel("regularization") +plt.ylabel("accuracy") +plt.legend(['train', 'dev'], loc='upper left') +plt.savefig("q4_reg_v_acc.png") +plt.show() + diff --git a/q4_softmaxreg.py b/q4_softmaxreg.py new file mode 100644 index 0000000..332b401 --- /dev/null +++ b/q4_softmaxreg.py @@ -0,0 +1,105 @@ +import numpy as np +import random + +from cs224d.data_utils import * + +from q1_softmax import softmax +from q2_gradcheck import gradcheck_naive +from q3_sgd import load_saved_params + +def getSentenceFeature(tokens, 
wordVectors, sentence): + """ Obtain the sentence feature for sentiment analysis by averaging its word vectors """ + # Implement computation for the sentence features given a sentence. + + # Inputs: + # - tokens: a dictionary that maps words to their indices in + # the word vector list + # - wordVectors: word vectors (each row) for all tokens + # - sentence: a list of words in the sentence of interest + + # Output: + # - sentVector: feature vector for the sentence + + sentVector = np.zeros((wordVectors.shape[1],)) + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return sentVector + +def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False): + """ Softmax Regression """ + # Implement softmax regression with weight regularization. + + # Inputs: + # - features: feature vectors, each row is a feature vector + # - labels: labels corresponding to the feature vectors + # - weights: weights of the regressor + # - regularization: L2 regularization constant + + # Output: + # - cost: cost of the regressor + # - grad: gradient of the regressor cost with respect to its + # weights + # - pred: label predictions of the regressor (you might find + # np.argmax helpful) + + prob = softmax(features.dot(weights)) + if len(features.shape) > 1: + N = features.shape[0] + else: + N = 1 + # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2 + cost = np.sum(-np.log(prob[range(N), labels])) / N + cost += 0.5 * regularization * np.sum(weights ** 2) + + ### YOUR CODE HERE: compute the gradients and predictions + raise NotImplementedError + ### END YOUR CODE + + if nopredictions: + return cost, grad + else: + return cost, grad, pred + +def accuracy(y, yhat): + """ Precision for classifier """ + assert(y.shape == yhat.shape) + return np.sum(y == yhat) * 100.0 / y.size + +def softmax_wrapper(features, labels, weights, regularization = 0.0): + cost, grad, _ = softmaxRegression(features, labels, weights, + regularization) + return cost, grad + +def sanity_check(): + """ + Run python q4_softmaxreg.py. + """ + random.seed(314159) + np.random.seed(265) + + dataset = StanfordSentiment() + tokens = dataset.tokens() + nWords = len(tokens) + + _, wordVectors0, _ = load_saved_params() + wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) + dimVectors = wordVectors.shape[1] + + dummy_weights = 0.1 * np.random.randn(dimVectors, 5) + dummy_features = np.zeros((10, dimVectors)) + dummy_labels = np.zeros((10,), dtype=np.int32) + for i in xrange(10): + words, dummy_labels[i] = dataset.getRandomTrainSentence() + dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words) + print "==== Gradient check for softmax regression ====" + gradcheck_naive(lambda weights: softmaxRegression(dummy_features, + dummy_labels, weights, 1.0, nopredictions = True), dummy_weights) + + print "\n=== Results ===" + print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0) + +if __name__ == "__main__": + sanity_check() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f515c43 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +Jinja2==2.7.3 +MarkupSafe==0.23 +backports.ssl-match-hostname==3.4.0.2 +certifi==14.05.14 +gnureadline==6.3.3 +ipython==2.3.1 +matplotlib==1.4.2 +mock==1.0.1 +nose==1.3.4 +numpy==1.9.1 +pyparsing==2.0.3 +python-dateutil==2.4.0 +pytz==2014.10 +pyzmq==14.4.1 +scipy==0.14.1 +six==1.9.0 +tornado==4.0.2 +wsgiref==0.1.2
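
Note (not part of the patch above): a minimal, standalone sketch of the unigram^0.75 sampling table that StanfordSentiment.sampleTable() in cs224d/data_utils.py builds for negative sampling. The token counts, table size, and snippet name are made up for illustration; only numpy is assumed.

# sampling_table_sketch.py -- illustrative only, not part of the assignment code
import numpy as np

tokenfreq = np.array([50.0, 10.0, 5.0, 1.0])   # toy unigram counts for 4 tokens
tablesize = 1000                               # much smaller than the 1e6 default above

weights = tokenfreq ** 0.75                    # dampen very frequent words
cum = np.cumsum(weights / np.sum(weights)) * tablesize

table = np.zeros(tablesize, dtype=int)
j = 0
for i in range(tablesize):
    while i > cum[j]:                          # advance to the token whose cumulative mass covers slot i
        j += 1
    table[i] = j

# Drawing a uniform random slot from `table` now returns token j with
# probability roughly proportional to tokenfreq[j] ** 0.75, which is what
# sampleTokenIdx() relies on when picking negative samples.
print(np.bincount(table, minlength=4))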