commit 9229d20 (0 parents)
Showing 14 changed files with 1,141 additions and 0 deletions.
@@ -0,0 +1,2 @@
rm -f assignment1.zip
zip -r assignment1.zip *.py *.png saved_params_40000.npy
Empty file.
@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cPickle as pickle
import numpy as np
import os
import random

class StanfordSentiment:
    def __init__(self, path=None, tablesize=1000000):
        if not path:
            path = "cs224d/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize

    def tokens(self):
        if hasattr(self, "_tokens") and self._tokens:
            return self._tokens

        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0

        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                if w not in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1

        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        return self._tokens

    def sentences(self):
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower().decode("utf-8").encode('latin1') for w in splitted]]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def allSentences(self):
        if hasattr(self, "_allsentences") and self._allsentences:
            return self._allsentences

        sentences = self.sentences()
        rejectProb = self.rejectProb()
        tokens = self.tokens()
        allsentences = [[w for w in s
                         if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
                        for s in sentences * 30]

        allsentences = [s for s in allsentences if len(s) > 1]

        self._allsentences = allsentences

        return self._allsentences

    def getRandomContext(self, C=5):
        allsent = self.allSentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID + 1 < len(sent):
            context += sent[wordID + 1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def sent_labels(self):
        if hasattr(self, "_sent_labels") and self._sent_labels:
            return self._sent_labels

        dictionary = dict()
        phrases = 0
        with open(self.path + "/dictionary.txt", "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                dictionary[splitted[0].lower()] = int(splitted[1])
                phrases += 1

        labels = [0.0] * phrases
        with open(self.path + "/sentiment_labels.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                labels[int(splitted[0])] = float(splitted[1])

        sent_labels = [0.0] * self.numSentences()
        sentences = self.sentences()
        for i in xrange(self.numSentences()):
            sentence = sentences[i]
            full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
            sent_labels[i] = labels[dictionary[full_sent]]

        self._sent_labels = sent_labels
        return self._sent_labels

    def dataset_split(self):
        if hasattr(self, "_split") and self._split:
            return self._split

        split = [[] for i in xrange(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        return self._split

    def getRandomTrainSentence(self):
        split = self.dataset_split()
        sentId = split[0][random.randint(0, len(split[0]) - 1)]
        return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])

    def categorify(self, label):
        if label <= 0.2:
            return 0
        elif label <= 0.4:
            return 1
        elif label <= 0.6:
            return 2
        elif label <= 0.8:
            return 3
        else:
            return 4

    def getDevSentences(self):
        return self.getSplitSentences(2)

    def getTestSentences(self):
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        nTokens = len(self.tokens())
        samplingFreq = np.zeros((nTokens,))
        self.allSentences()
        for i in xrange(nTokens):
            w = self._revtokens[i]
            if w in self._tokenfreq:
                freq = 1.0 * self._tokenfreq[w]
                # Reweigh
                freq = freq ** 0.75
            else:
                freq = 0.0
            samplingFreq[i] = freq

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        self._sampleTable = [0] * self.tablesize

        j = 0
        for i in xrange(self.tablesize):
            while i > samplingFreq[j]:
                j += 1
            self._sampleTable[i] = j

        return self._sampleTable

    def rejectProb(self):
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        # Call tokens() first so _wordcount, _tokenfreq and _revtokens exist
        # even if this method is reached before tokens() has been used.
        nTokens = len(self.tokens())
        threshold = 1e-5 * self._wordcount

        rejectProb = np.zeros((nTokens,))
        for i in xrange(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Reweigh
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        return self.sampleTable()[random.randint(0, self.tablesize - 1)]
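For orientation, a minimal usage sketch of the class above follows. It assumes the treebank files have already been downloaded into cs224d/datasets/stanfordSentimentTreebank (the download script appears below) and that this file is importable as data_utils; that module name and the variable names are illustrative only, not taken from this commit. Internally, rejectProb computes the word2vec-style subsampling probability max(0, 1 - sqrt(t / f(w))) with t = 1e-5 times the total word count, and sampleTable builds a lookup table in which each token appears in proportion to its frequency raised to the 0.75 power, which sampleTokenIdx then draws from.

# Usage sketch only (Python 2, matching the file above); the module name
# data_utils and these variable names are assumptions for illustration.
from data_utils import StanfordSentiment

dataset = StanfordSentiment()
tokens = dataset.tokens()          # word -> index map, with a trailing "UNK" entry
print "vocabulary size:", len(tokens)

# One (center word, context) pair with window size C=5, drawn from the
# subsampled sentences produced by allSentences().
centerword, context = dataset.getRandomContext(5)
print centerword, context

# Index of a token drawn from the freq**0.75 sampling table.
print dataset.sampleTokenIdx()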
@@ -0,0 +1,4 @@
# Get Stanford Sentiment Treebank
wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
@@ -0,0 +1,66 @@
import numpy as np
import random

def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.
    You might find the numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one-dimensional
    inputs (treat the vector as a row); you might find that helpful for
    your later problems.

    You must implement the optimization in problem 1(a) of the
    written assignment!
    """

    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

    return x

def test_softmax_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print "Running basic tests..."
    test1 = softmax(np.array([1, 2]))
    print test1
    assert np.amax(np.fabs(test1 - np.array(
        [0.26894142, 0.73105858]))) <= 1e-6

    test2 = softmax(np.array([[1001, 1002], [3, 4]]))
    print test2
    assert np.amax(np.fabs(test2 - np.array(
        [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6

    test3 = softmax(np.array([[-1001, -1002]]))
    print test3
    assert np.amax(np.fabs(test3 - np.array(
        [0.73105858, 0.26894142]))) <= 1e-6

    print "You should verify these results!\n"

def test_softmax():
    """
    Use this space to test your softmax implementation by running:
        python q1_softmax.py
    This function will not be called by the autograder, nor will
    your tests be graded.
    """
    print "Running your tests..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    test_softmax_basic()
    test_softmax()
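For reference, one common way to fill in the body above is the numerical-stability trick the docstring alludes to: subtract the per-row maximum before exponentiating, since softmax is invariant to adding a constant to every entry of a row. The sketch below is one possible implementation, assuming x is a 1-D or 2-D numpy array as in the tests; it is not the assignment's reference solution, and the name softmax_sketch is made up for this example.

import numpy as np

def softmax_sketch(x):
    # Sketch only: shift each row by its max so np.exp cannot overflow,
    # then normalize by the row sum. The shift does not change the result.
    orig_shape = x.shape
    x2 = np.atleast_2d(x).astype(float)          # treat a 1-D input as a single row
    x2 = x2 - np.max(x2, axis=1, keepdims=True)  # stability shift
    e = np.exp(x2)
    out = e / np.sum(e, axis=1, keepdims=True)
    return out.reshape(orig_shape)               # hand back the input's shape

The test2 case above, with entries around 1000, is exactly the input that overflows np.exp without the shift.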
@@ -0,0 +1,67 @@
import numpy as np
import random

# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x):
    """
    Gradient check for a function f
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at the original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### Try modifying x[ix] with the h defined above to compute numerical gradients.
        ### Make sure you call random.setstate(rndstate) before calling f(x) each time; this
        ### will make it possible to test cost functions with built-in randomness later.
        ### YOUR CODE HERE:
        raise NotImplementedError
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
            return

        it.iternext()  # Step to next dimension

    print "Gradient check passed!"

def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print "Running sanity checks..."
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4, 5))  # 2-D test
    print ""

def your_sanity_checks():
    """
    Use this space to add any additional sanity checks by running:
        python q2_gradcheck.py
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    sanity_check()
    your_sanity_checks()
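For reference, the step the YOUR CODE HERE block asks for is a centered finite difference, numgrad = (f(x + h) - f(x - h)) / (2h), evaluated separately for each entry x[ix], with random.setstate(rndstate) called before every evaluation of f so that cost functions with built-in randomness stay reproducible. The standalone sketch below illustrates that idea; the function name is made up for this example and it is not the graded solution.

import numpy as np
import random

def numerical_grad_sketch(f, x, h=1e-4):
    # Illustrative sketch: centered differences over every entry of x.
    # f is assumed to return (cost, gradient), as in gradcheck_naive above.
    rndstate = random.getstate()
    numgrad = np.zeros_like(np.asarray(x, dtype=float))
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old_value = x[ix]

        x[ix] = old_value + h
        random.setstate(rndstate)      # keep any randomness inside f reproducible
        fx_plus, _ = f(x)

        x[ix] = old_value - h
        random.setstate(rndstate)
        fx_minus, _ = f(x)

        x[ix] = old_value              # restore the entry before moving on
        numgrad[ix] = (fx_plus - fx_minus) / (2 * h)
        it.iternext()
    return numgrad

Inside gradcheck_naive, the same pair of perturbed evaluations would produce the numgrad that is compared against grad[ix] via reldiff.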