Commit 9229d20: Initial import
julien-c committed Nov 27, 2016, 0 parents
Showing 14 changed files with 1,141 additions and 0 deletions.
2 changes: 2 additions & 0 deletions collectSubmission.sh
@@ -0,0 +1,2 @@
rm -f assignment1.zip
zip -r assignment1.zip *.py *.png saved_params_40000.npy
Empty file added cs224d/__init__.py
Empty file.
248 changes: 248 additions & 0 deletions cs224d/data_utils.py
@@ -0,0 +1,248 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import cPickle as pickle
import numpy as np
import os
import random

class StanfordSentiment:
    def __init__(self, path=None, tablesize=1000000):
        if not path:
            path = "cs224d/datasets/stanfordSentimentTreebank"

        self.path = path
        self.tablesize = tablesize

    def tokens(self):
        if hasattr(self, "_tokens") and self._tokens:
            return self._tokens

        tokens = dict()
        tokenfreq = dict()
        wordcount = 0
        revtokens = []
        idx = 0

        for sentence in self.sentences():
            for w in sentence:
                wordcount += 1
                if w not in tokens:
                    tokens[w] = idx
                    revtokens += [w]
                    tokenfreq[w] = 1
                    idx += 1
                else:
                    tokenfreq[w] += 1

        tokens["UNK"] = idx
        revtokens += ["UNK"]
        tokenfreq["UNK"] = 1
        wordcount += 1

        self._tokens = tokens
        self._tokenfreq = tokenfreq
        self._wordcount = wordcount
        self._revtokens = revtokens
        return self._tokens

    def sentences(self):
        if hasattr(self, "_sentences") and self._sentences:
            return self._sentences

        sentences = []
        with open(self.path + "/datasetSentences.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    # Skip the header line
                    first = False
                    continue

                splitted = line.strip().split()[1:]
                # Deal with some peculiar encoding issues with this file
                sentences += [[w.lower().decode("utf-8").encode('latin1') for w in splitted]]

        self._sentences = sentences
        self._sentlengths = np.array([len(s) for s in sentences])
        self._cumsentlen = np.cumsum(self._sentlengths)

        return self._sentences

    def numSentences(self):
        if hasattr(self, "_numSentences") and self._numSentences:
            return self._numSentences
        else:
            self._numSentences = len(self.sentences())
            return self._numSentences

    def allSentences(self):
        if hasattr(self, "_allsentences") and self._allsentences:
            return self._allsentences

        sentences = self.sentences()
        rejectProb = self.rejectProb()
        tokens = self.tokens()
        # 30 passes over the corpus; each word survives with prob 1 - rejectProb
        allsentences = [[w for w in s
                         if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]]
                        for s in sentences * 30]

        allsentences = [s for s in allsentences if len(s) > 1]

        self._allsentences = allsentences

        return self._allsentences

    def getRandomContext(self, C=5):
        allsent = self.allSentences()
        sentID = random.randint(0, len(allsent) - 1)
        sent = allsent[sentID]
        wordID = random.randint(0, len(sent) - 1)

        context = sent[max(0, wordID - C):wordID]
        if wordID + 1 < len(sent):
            context += sent[wordID + 1:min(len(sent), wordID + C + 1)]

        centerword = sent[wordID]
        context = [w for w in context if w != centerword]

        if len(context) > 0:
            return centerword, context
        else:
            return self.getRandomContext(C)

    def sent_labels(self):
        if hasattr(self, "_sent_labels") and self._sent_labels:
            return self._sent_labels

        dictionary = dict()
        phrases = 0
        with open(self.path + "/dictionary.txt", "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                dictionary[splitted[0].lower()] = int(splitted[1])
                phrases += 1

        labels = [0.0] * phrases
        with open(self.path + "/sentiment_labels.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                line = line.strip()
                if not line: continue
                splitted = line.split("|")
                labels[int(splitted[0])] = float(splitted[1])

        sent_labels = [0.0] * self.numSentences()
        sentences = self.sentences()
        for i in xrange(self.numSentences()):
            sentence = sentences[i]
            full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')')
            sent_labels[i] = labels[dictionary[full_sent]]

        self._sent_labels = sent_labels
        return self._sent_labels

    def dataset_split(self):
        if hasattr(self, "_split") and self._split:
            return self._split

        split = [[] for i in xrange(3)]
        with open(self.path + "/datasetSplit.txt", "r") as f:
            first = True
            for line in f:
                if first:
                    first = False
                    continue

                splitted = line.strip().split(",")
                split[int(splitted[1]) - 1] += [int(splitted[0]) - 1]

        self._split = split
        return self._split

    def getRandomTrainSentence(self):
        split = self.dataset_split()
        sentId = split[0][random.randint(0, len(split[0]) - 1)]
        return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId])

    def categorify(self, label):
        # Bucket a real-valued sentiment in [0, 1] into 5 classes
        if label <= 0.2:
            return 0
        elif label <= 0.4:
            return 1
        elif label <= 0.6:
            return 2
        elif label <= 0.8:
            return 3
        else:
            return 4

    def getDevSentences(self):
        return self.getSplitSentences(2)

    def getTestSentences(self):
        return self.getSplitSentences(1)

    def getTrainSentences(self):
        return self.getSplitSentences(0)

    def getSplitSentences(self, split=0):
        ds_split = self.dataset_split()
        return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]]

    def sampleTable(self):
        if hasattr(self, '_sampleTable') and self._sampleTable is not None:
            return self._sampleTable

        nTokens = len(self.tokens())
        samplingFreq = np.zeros((nTokens,))
        self.allSentences()
        for i in xrange(nTokens):
            w = self._revtokens[i]
            if w in self._tokenfreq:
                freq = 1.0 * self._tokenfreq[w]
                # Reweigh: raise counts to the 3/4 power, as in word2vec
                freq = freq ** 0.75
            else:
                freq = 0.0
            samplingFreq[i] = freq

        samplingFreq /= np.sum(samplingFreq)
        samplingFreq = np.cumsum(samplingFreq) * self.tablesize

        self._sampleTable = [0] * self.tablesize

        j = 0
        for i in xrange(self.tablesize):
            while i > samplingFreq[j]:
                j += 1
            self._sampleTable[i] = j

        return self._sampleTable

    def rejectProb(self):
        if hasattr(self, '_rejectProb') and self._rejectProb is not None:
            return self._rejectProb

        # Make sure token counts exist before reading self._wordcount
        nTokens = len(self.tokens())
        threshold = 1e-5 * self._wordcount

        rejectProb = np.zeros((nTokens,))
        for i in xrange(nTokens):
            w = self._revtokens[i]
            freq = 1.0 * self._tokenfreq[w]
            # Subsample frequent words, word2vec-style
            rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq))

        self._rejectProb = rejectProb
        return self._rejectProb

    def sampleTokenIdx(self):
        return self.sampleTable()[random.randint(0, self.tablesize - 1)]
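
A minimal usage sketch (not part of the commit), assuming the treebank has already been fetched via cs224d/datasets/get_datasets.sh and the snippet runs from the assignment root:

from cs224d.data_utils import StanfordSentiment

dataset = StanfordSentiment()
tokens = dataset.tokens()  # word -> index, with "UNK" appended last
print "vocabulary size:", len(tokens)

# Draw a (centerword, context) pair for word2vec-style training
centerword, context = dataset.getRandomContext(C=5)
print centerword, context

# Draw a negative-sample token index from the unigram^0.75 table
print dataset.sampleTokenIdx()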
4 changes: 4 additions & 0 deletions cs224d/datasets/get_datasets.sh
@@ -0,0 +1,4 @@
# Get Stanford Sentiment Treebank
wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
unzip stanfordSentimentTreebank.zip
rm stanfordSentimentTreebank.zip
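Note: data_utils.py expects the extracted folder at cs224d/datasets/stanfordSentimentTreebank, so this script is presumably meant to be run from inside cs224d/datasets/.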
66 changes: 66 additions & 0 deletions q1_softmax.py
@@ -0,0 +1,66 @@
import numpy as np
import random

def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.

    You might find numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one-dimensional
    inputs (treat the vector as a row); you might find it helpful for
    your later problems.

    You must implement the optimization in problem 1(a) of the
    written assignment!
    """

    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

    return x

def test_softmax_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print "Running basic tests..."
    test1 = softmax(np.array([1, 2]))
    print test1
    assert np.amax(np.fabs(test1 - np.array(
        [0.26894142, 0.73105858]))) <= 1e-6

    test2 = softmax(np.array([[1001, 1002], [3, 4]]))
    print test2
    assert np.amax(np.fabs(test2 - np.array(
        [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6

    test3 = softmax(np.array([[-1001, -1002]]))
    print test3
    assert np.amax(np.fabs(test3 - np.array(
        [0.73105858, 0.26894142]))) <= 1e-6

    print "You should verify these results!\n"

def test_softmax():
    """
    Use this space to test your softmax implementation by running:
        python q1_softmax.py
    This function will not be called by the autograder, nor will
    your tests be graded.
    """
    print "Running your tests..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    test_softmax_basic()
    test_softmax()
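
One possible implementation (a sketch, not the posted solution): exploit the invariance softmax(x) = softmax(x + c) and shift each row by its maximum before exponentiating, which is the problem 1(a) optimization the docstring refers to. The name softmax_sketch is ours:

import numpy as np

def softmax_sketch(x):
    """Row-wise softmax with the max-subtraction stability trick."""
    if x.ndim == 1:
        shifted = x - np.max(x)                     # guard against overflow
        e = np.exp(shifted)
        return e / np.sum(e)
    shifted = x - np.max(x, axis=1, keepdims=True)  # row-wise shift
    e = np.exp(shifted)
    return e / np.sum(e, axis=1, keepdims=True)

print softmax_sketch(np.array([[1001, 1002], [3, 4]]))  # both rows ~[0.269, 0.731]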
67 changes: 67 additions & 0 deletions q2_gradcheck.py
@@ -0,0 +1,67 @@
import numpy as np
import random

# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x):
    """
    Gradient check for a function f.
    - f should be a function that takes a single argument and outputs
      the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### Try modifying x[ix] with h defined above to compute the numerical
        ### gradient (set numgrad). Make sure you call random.setstate(rndstate)
        ### before calling f(x) each time; this will make it possible to test
        ### cost functions with built-in randomness later.
        ### YOUR CODE HERE:
        raise NotImplementedError
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
            return

        it.iternext()  # Step to next dimension

    print "Gradient check passed!"

def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print "Running sanity checks..."
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4, 5))  # 2-D test
    print ""

def your_sanity_checks():
    """
    Use this space to add any additional sanity checks by running:
        python q2_gradcheck.py
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."
    ### YOUR CODE HERE
    raise NotImplementedError
    ### END YOUR CODE

if __name__ == "__main__":
    sanity_check()
    your_sanity_checks()
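
The missing loop body is typically a central difference, (f(x+h) - f(x-h)) / (2h), evaluated around the current entry. A sketch of what could go between YOUR CODE HERE and END YOUR CODE (not the posted solution):

old = x[ix]
x[ix] = old + h
random.setstate(rndstate)           # re-seed so f is deterministic
fxph, _ = f(x)                      # f(x + h)
x[ix] = old - h
random.setstate(rndstate)
fxmh, _ = f(x)                      # f(x - h)
x[ix] = old                         # restore the original value
numgrad = (fxph - fxmh) / (2 * h)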