diff --git a/collectSubmission.sh b/collectSubmission.sh new file mode 100644 index 0000000..3f6cef7 --- /dev/null +++ b/collectSubmission.sh @@ -0,0 +1,2 @@ +rm -f assignment1.zip +zip -r assignment1.zip *.py *.png saved_params_40000.npy diff --git a/cs224d/__init__.py b/cs224d/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cs224d/data_utils.py b/cs224d/data_utils.py new file mode 100644 index 0000000..b1fc52f --- /dev/null +++ b/cs224d/data_utils.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import cPickle as pickle +import numpy as np +import os +import random + +class StanfordSentiment: + def __init__(self, path=None, tablesize = 1000000): + if not path: + path = "cs224d/datasets/stanfordSentimentTreebank" + + self.path = path + self.tablesize = tablesize + + def tokens(self): + if hasattr(self, "_tokens") and self._tokens: + return self._tokens + + tokens = dict() + tokenfreq = dict() + wordcount = 0 + revtokens = [] + idx = 0 + + for sentence in self.sentences(): + for w in sentence: + wordcount += 1 + if not w in tokens: + tokens[w] = idx + revtokens += [w] + tokenfreq[w] = 1 + idx += 1 + else: + tokenfreq[w] += 1 + + tokens["UNK"] = idx + revtokens += ["UNK"] + tokenfreq["UNK"] = 1 + wordcount += 1 + + self._tokens = tokens + self._tokenfreq = tokenfreq + self._wordcount = wordcount + self._revtokens = revtokens + return self._tokens + + def sentences(self): + if hasattr(self, "_sentences") and self._sentences: + return self._sentences + + sentences = [] + with open(self.path + "/datasetSentences.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + splitted = line.strip().split()[1:] + # Deal with some peculiar encoding issues with this file + sentences += [[w.lower().decode("utf-8").encode('latin1') for w in splitted]] + + self._sentences = sentences + self._sentlengths = np.array([len(s) for s in sentences]) + self._cumsentlen = np.cumsum(self._sentlengths) + + return self._sentences + + def numSentences(self): + if hasattr(self, "_numSentences") and self._numSentences: + return self._numSentences + else: + self._numSentences = len(self.sentences()) + return self._numSentences + + def allSentences(self): + if hasattr(self, "_allsentences") and self._allsentences: + return self._allsentences + + sentences = self.sentences() + rejectProb = self.rejectProb() + tokens = self.tokens() + allsentences = [[w for w in s + if 0 >= rejectProb[tokens[w]] or random.random() >= rejectProb[tokens[w]]] + for s in sentences * 30] + + allsentences = [s for s in allsentences if len(s) > 1] + + self._allsentences = allsentences + + return self._allsentences + + def getRandomContext(self, C=5): + allsent = self.allSentences() + sentID = random.randint(0, len(allsent) - 1) + sent = allsent[sentID] + wordID = random.randint(0, len(sent) - 1) + + context = sent[max(0, wordID - C):wordID] + if wordID+1 < len(sent): + context += sent[wordID+1:min(len(sent), wordID + C + 1)] + + centerword = sent[wordID] + context = [w for w in context if w != centerword] + + if len(context) > 0: + return centerword, context + else: + return self.getRandomContext(C) + + def sent_labels(self): + if hasattr(self, "_sent_labels") and self._sent_labels: + return self._sent_labels + + dictionary = dict() + phrases = 0 + with open(self.path + "/dictionary.txt", "r") as f: + for line in f: + line = line.strip() + if not line: continue + splitted = line.split("|") + dictionary[splitted[0].lower()] = int(splitted[1]) + phrases += 1 + + labels = 
[0.0] * phrases + with open(self.path + "/sentiment_labels.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + line = line.strip() + if not line: continue + splitted = line.split("|") + labels[int(splitted[0])] = float(splitted[1]) + + sent_labels = [0.0] * self.numSentences() + sentences = self.sentences() + for i in xrange(self.numSentences()): + sentence = sentences[i] + full_sent = " ".join(sentence).replace('-lrb-', '(').replace('-rrb-', ')') + sent_labels[i] = labels[dictionary[full_sent]] + + self._sent_labels = sent_labels + return self._sent_labels + + def dataset_split(self): + if hasattr(self, "_split") and self._split: + return self._split + + split = [[] for i in xrange(3)] + with open(self.path + "/datasetSplit.txt", "r") as f: + first = True + for line in f: + if first: + first = False + continue + + splitted = line.strip().split(",") + split[int(splitted[1]) - 1] += [int(splitted[0]) - 1] + + self._split = split + return self._split + + def getRandomTrainSentence(self): + split = self.dataset_split() + sentId = split[0][random.randint(0, len(split[0]) - 1)] + return self.sentences()[sentId], self.categorify(self.sent_labels()[sentId]) + + def categorify(self, label): + if label <= 0.2: + return 0 + elif label <= 0.4: + return 1 + elif label <= 0.6: + return 2 + elif label <= 0.8: + return 3 + else: + return 4 + + def getDevSentences(self): + return self.getSplitSentences(2) + + def getTestSentences(self): + return self.getSplitSentences(1) + + def getTrainSentences(self): + return self.getSplitSentences(0) + + def getSplitSentences(self, split=0): + ds_split = self.dataset_split() + return [(self.sentences()[i], self.categorify(self.sent_labels()[i])) for i in ds_split[split]] + + def sampleTable(self): + if hasattr(self, '_sampleTable') and self._sampleTable is not None: + return self._sampleTable + + nTokens = len(self.tokens()) + samplingFreq = np.zeros((nTokens,)) + self.allSentences() + i = 0 + for w in xrange(nTokens): + w = self._revtokens[i] + if w in self._tokenfreq: + freq = 1.0 * self._tokenfreq[w] + # Reweigh + freq = freq ** 0.75 + else: + freq = 0.0 + samplingFreq[i] = freq + i += 1 + + samplingFreq /= np.sum(samplingFreq) + samplingFreq = np.cumsum(samplingFreq) * self.tablesize + + self._sampleTable = [0] * self.tablesize + + j = 0 + for i in xrange(self.tablesize): + while i > samplingFreq[j]: + j += 1 + self._sampleTable[i] = j + + return self._sampleTable + + def rejectProb(self): + if hasattr(self, '_rejectProb') and self._rejectProb is not None: + return self._rejectProb + + threshold = 1e-5 * self._wordcount + + nTokens = len(self.tokens()) + rejectProb = np.zeros((nTokens,)) + for i in xrange(nTokens): + w = self._revtokens[i] + freq = 1.0 * self._tokenfreq[w] + # Reweigh + rejectProb[i] = max(0, 1 - np.sqrt(threshold / freq)) + + self._rejectProb = rejectProb + return self._rejectProb + + def sampleTokenIdx(self): + return self.sampleTable()[random.randint(0, self.tablesize - 1)] \ No newline at end of file diff --git a/cs224d/datasets/get_datasets.sh b/cs224d/datasets/get_datasets.sh new file mode 100755 index 0000000..aff89c7 --- /dev/null +++ b/cs224d/datasets/get_datasets.sh @@ -0,0 +1,4 @@ +# Get Stanford Sentiment Treebank +wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip +unzip stanfordSentimentTreebank.zip +rm stanfordSentimentTreebank.zip diff --git a/q1_softmax.py b/q1_softmax.py new file mode 100644 index 0000000..9b409f1 --- /dev/null +++ b/q1_softmax.py @@ -0,0 +1,66 @@ +import 
numpy as np +import random + +def softmax(x): + """ + Compute the softmax function for each row of the input x. + + It is crucial that this function is optimized for speed because + it will be used frequently in later code. + You might find numpy functions np.exp, np.sum, np.reshape, + np.max, and numpy broadcasting useful for this task. (numpy + broadcasting documentation: + http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + + You should also make sure that your code works for one + dimensional inputs (treat the vector as a row), you might find + it helpful for your later problems. + + You must implement the optimization in problem 1(a) of the + written assignment! + """ + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return x + +def test_softmax_basic(): + """ + Some simple tests to get you started. + Warning: these are not exhaustive. + """ + print "Running basic tests..." + test1 = softmax(np.array([1,2])) + print test1 + assert np.amax(np.fabs(test1 - np.array( + [0.26894142, 0.73105858]))) <= 1e-6 + + test2 = softmax(np.array([[1001,1002],[3,4]])) + print test2 + assert np.amax(np.fabs(test2 - np.array( + [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6 + + test3 = softmax(np.array([[-1001,-1002]])) + print test3 + assert np.amax(np.fabs(test3 - np.array( + [0.73105858, 0.26894142]))) <= 1e-6 + + print "You should verify these results!\n" + +def test_softmax(): + """ + Use this space to test your softmax implementation by running: + python q1_softmax.py + This function will not be called by the autograder, nor will + your tests be graded. + """ + print "Running your tests..." + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + +if __name__ == "__main__": + test_softmax_basic() + test_softmax() \ No newline at end of file diff --git a/q2_gradcheck.py b/q2_gradcheck.py new file mode 100644 index 0000000..d0c83d4 --- /dev/null +++ b/q2_gradcheck.py @@ -0,0 +1,67 @@ +import numpy as np +import random + +# First implement a gradient checker by filling in the following functions +def gradcheck_naive(f, x): + """ + Gradient check for a function f + - f should be a function that takes a single argument and outputs the cost and its gradients + - x is the point (numpy array) to check the gradient at + """ + + rndstate = random.getstate() + random.setstate(rndstate) + fx, grad = f(x) # Evaluate function value at original point + h = 1e-4 + + # Iterate over all indexes in x + it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) + while not it.finished: + ix = it.multi_index + + ### try modifying x[ix] with h defined above to compute numerical gradients + ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it + ### possible to test cost functions with built in randomness later + ### YOUR CODE HERE: + raise NotImplementedError + ### END YOUR CODE + + # Compare gradients + reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix])) + if reldiff > 1e-5: + print "Gradient check failed." + print "First gradient error found at index %s" % str(ix) + print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad) + return + + it.iternext() # Step to next dimension + + print "Gradient check passed!" + +def sanity_check(): + """ + Some basic sanity checks. + """ + quad = lambda x: (np.sum(x ** 2), x * 2) + + print "Running sanity checks..." 
+    gradcheck_naive(quad, np.array(123.456))      # scalar test
+    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
+    gradcheck_naive(quad, np.random.randn(4,5))   # 2-D test
+    print ""
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q2_gradcheck.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
diff --git a/q2_neural.py b/q2_neural.py
new file mode 100644
index 0000000..488caee
--- /dev/null
+++ b/q2_neural.py
@@ -0,0 +1,76 @@
+import numpy as np
+import random
+
+from q1_softmax import softmax
+from q2_sigmoid import sigmoid, sigmoid_grad
+from q2_gradcheck import gradcheck_naive
+
+def forward_backward_prop(data, labels, params, dimensions):
+    """
+    Forward and backward propagation for a two-layer sigmoidal network
+
+    Compute the forward propagation and the cross entropy cost,
+    and the backward propagation for the gradients of all parameters.
+    """
+
+    ### Unpack network parameters (do not modify)
+    ofs = 0
+    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
+
+    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
+    ofs += Dx * H
+    b1 = np.reshape(params[ofs:ofs + H], (1, H))
+    ofs += H
+    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
+    ofs += H * Dy
+    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))
+
+    ### YOUR CODE HERE: forward propagation
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    ### YOUR CODE HERE: backward propagation
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    ### Stack gradients (do not modify)
+    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
+        gradW2.flatten(), gradb2.flatten()))
+
+    return cost, grad
+
+def sanity_check():
+    """
+    Set up fake data and parameters for the neural network, and test using
+    gradcheck.
+    """
+    print "Running sanity check..."
+
+    N = 20
+    dimensions = [10, 5, 10]
+    data = np.random.randn(N, dimensions[0])   # each row will be a datum
+    labels = np.zeros((N, dimensions[2]))
+    for i in xrange(N):
+        labels[i,random.randint(0,dimensions[2]-1)] = 1
+
+    params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
+        dimensions[1] + 1) * dimensions[2], )
+
+    gradcheck_naive(lambda params: forward_backward_prop(data, labels, params,
+        dimensions), params)
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q2_neural.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
\ No newline at end of file
diff --git a/q2_sigmoid.py b/q2_sigmoid.py
new file mode 100644
index 0000000..04cc2de
--- /dev/null
+++ b/q2_sigmoid.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+def sigmoid(x):
+    """
+    Compute the sigmoid function for the input here.
+    """
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return x
+
+def sigmoid_grad(f):
+    """
+    Compute the gradient for the sigmoid function here. Note that
+    for this implementation, the input f should be the sigmoid
+    function value of your original input x.
+ """ + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return f + +def test_sigmoid_basic(): + """ + Some simple tests to get you started. + Warning: these are not exhaustive. + """ + print "Running basic tests..." + x = np.array([[1, 2], [-1, -2]]) + f = sigmoid(x) + g = sigmoid_grad(f) + print f + assert np.amax(f - np.array([[0.73105858, 0.88079708], + [0.26894142, 0.11920292]])) <= 1e-6 + print g + assert np.amax(g - np.array([[0.19661193, 0.10499359], + [0.19661193, 0.10499359]])) <= 1e-6 + print "You should verify these results!\n" + +def test_sigmoid(): + """ + Use this space to test your sigmoid implementation by running: + python q2_sigmoid.py + This function will not be called by the autograder, nor will + your tests be graded. + """ + print "Running your tests..." + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + +if __name__ == "__main__": + test_sigmoid_basic(); + test_sigmoid() diff --git a/q3_run.py b/q3_run.py new file mode 100644 index 0000000..a000074 --- /dev/null +++ b/q3_run.py @@ -0,0 +1,57 @@ +import random +import numpy as np +from cs224d.data_utils import * +import matplotlib.pyplot as plt + +from q3_word2vec import * +from q3_sgd import * + +# Reset the random seed to make sure that everyone gets the same results +random.seed(314) +dataset = StanfordSentiment() +tokens = dataset.tokens() +nWords = len(tokens) + +# We are going to train 10-dimensional vectors for this assignment +dimVectors = 10 + +# Context size +C = 5 + +# Reset the random seed to make sure that everyone gets the same results +random.seed(31415) +np.random.seed(9265) +wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - .5) / \ + dimVectors, np.zeros((nWords, dimVectors))), axis=0) +wordVectors0 = sgd( + lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, + negSamplingCostAndGradient), + wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10) +print "sanity check: cost at convergence should be around or below 10" + +# sum the input and output word vectors +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) + +# Visualize the word vectors you trained +_, wordVectors0, _ = load_saved_params() +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) +visualizeWords = ["the", "a", "an", ",", ".", "?", "!", "``", "''", "--", + "good", "great", "cool", "brilliant", "wonderful", "well", "amazing", + "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb", + "annoying"] +visualizeIdx = [tokens[word] for word in visualizeWords] +visualizeVecs = wordVectors[visualizeIdx, :] +temp = (visualizeVecs - np.mean(visualizeVecs, axis=0)) +covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp) +U,S,V = np.linalg.svd(covariance) +coord = temp.dot(U[:,0:2]) + +for i in xrange(len(visualizeWords)): + plt.text(coord[i,0], coord[i,1], visualizeWords[i], + bbox=dict(facecolor='green', alpha=0.1)) + +plt.xlim((np.min(coord[:,0]), np.max(coord[:,0]))) +plt.ylim((np.min(coord[:,1]), np.max(coord[:,1]))) + +plt.savefig('q3_word_vectors.png') +plt.show() \ No newline at end of file diff --git a/q3_sgd.py b/q3_sgd.py new file mode 100644 index 0000000..8a0ca55 --- /dev/null +++ b/q3_sgd.py @@ -0,0 +1,128 @@ +# Save parameters every a few SGD iterations as fail-safe +SAVE_PARAMS_EVERY = 1000 + +import glob +import random +import numpy as np +import os.path as op +import cPickle as pickle + +def load_saved_params(): + """ A helper function that loads previously saved parameters and resets iteration start """ + st = 0 + 
    for f in glob.glob("saved_params_*.npy"):
+        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
+        if (iter > st):
+            st = iter
+
+    if st > 0:
+        with open("saved_params_%d.npy" % st, "r") as f:
+            params = pickle.load(f)
+            state = pickle.load(f)
+        return st, params, state
+    else:
+        return st, None, None
+
+def save_params(iter, params):
+    with open("saved_params_%d.npy" % iter, "w") as f:
+        pickle.dump(params, f)
+        pickle.dump(random.getstate(), f)
+
+def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10):
+    """ Stochastic Gradient Descent """
+    # Implement the stochastic gradient descent method in this
+    # function.
+
+    # Inputs:
+    # - f: the function to optimize, it should take a single
+    #      argument and yield two outputs, a cost and the gradient
+    #      with respect to the arguments
+    # - x0: the initial point to start SGD from
+    # - step: the step size for SGD
+    # - iterations: total iterations to run SGD for
+    # - postprocessing: postprocessing function for the parameters
+    #      if necessary. In the case of word2vec we will need to
+    #      normalize the word vectors to have unit length.
+    # - PRINT_EVERY: specifies how often (in iterations) to print progress
+
+    # Output:
+    # - x: the parameter value after SGD finishes
+
+    # Anneal learning rate every several iterations
+    ANNEAL_EVERY = 20000
+
+    if useSaved:
+        start_iter, oldx, state = load_saved_params()
+        if start_iter > 0:
+            x0 = oldx
+            step *= 0.5 ** (start_iter / ANNEAL_EVERY)
+
+        if state:
+            random.setstate(state)
+    else:
+        start_iter = 0
+
+    x = x0
+
+    if not postprocessing:
+        postprocessing = lambda x: x
+
+    expcost = None
+
+    for iter in xrange(start_iter + 1, iterations + 1):
+        ### Don't forget to apply the postprocessing after every iteration!
+        ### You might want to print the progress every few iterations.
+
+        cost = None
+        ### YOUR CODE HERE
+        raise NotImplementedError
+        ### END YOUR CODE
+
+        if iter % PRINT_EVERY == 0:
+            if not expcost:
+                expcost = cost
+            else:
+                expcost = .95 * expcost + .05 * cost
+            print "iter %d: %f" % (iter, expcost)
+
+        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
+            save_params(iter, x)
+
+        if iter % ANNEAL_EVERY == 0:
+            step *= 0.5
+
+    return x
+
+def sanity_check():
+    quad = lambda x: (np.sum(x ** 2), x * 2)
+
+    print "Running sanity checks..."
+    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
+    print "test 1 result:", t1
+    assert abs(t1) <= 1e-6
+
+    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
+    print "test 2 result:", t2
+    assert abs(t2) <= 1e-6
+
+    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
+    print "test 3 result:", t3
+    assert abs(t3) <= 1e-6
+
+    print ""
+
+def your_sanity_checks():
+    """
+    Use this space to add any additional sanity checks by running:
+    python q3_sgd.py
+    This function will not be called by the autograder, nor will
+    your additional tests be graded.
+    """
+    print "Running your sanity checks..."
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+if __name__ == "__main__":
+    sanity_check()
+    your_sanity_checks()
\ No newline at end of file
diff --git a/q3_word2vec.py b/q3_word2vec.py
new file mode 100644
index 0000000..df85c1d
--- /dev/null
+++ b/q3_word2vec.py
@@ -0,0 +1,198 @@
+import numpy as np
+import random
+
+from q1_softmax import softmax
+from q2_gradcheck import gradcheck_naive
+from q2_sigmoid import sigmoid, sigmoid_grad
+
+def normalizeRows(x):
+    """ Row normalization function """
+    # Implement a function that normalizes each row of a matrix to have unit length
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return x
+
+def test_normalize_rows():
+    print "Testing normalizeRows..."
+    x = normalizeRows(np.array([[3.0,4.0],[1, 2]]))
+    # the result should be [[0.6, 0.8], [0.4472, 0.8944]]
+    print x
+    assert np.allclose(x, np.array([[0.6, 0.8], [0.4472, 0.8944]]), atol=1e-4)
+    print ""
+
+def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
+    """ Softmax cost function for word2vec models """
+
+    # Implement the cost and gradients for one predicted word vector
+    # and one target word vector as a building block for word2vec
+    # models, assuming the softmax prediction function and cross
+    # entropy loss.
+
+    # Inputs:
+    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
+    #      the written component or \hat{r} in an earlier version)
+    # - target: integer, the index of the target word
+    # - outputVectors: "output" vectors (as rows) for all tokens
+    # - dataset: needed for negative sampling, unused here.
+
+    # Outputs:
+    # - cost: cross entropy cost for the softmax word prediction
+    # - gradPred: the gradient with respect to the predicted word
+    #      vector
+    # - grad: the gradient with respect to all the other word
+    #      vectors
+
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradPred, grad
+
+def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
+    K=10):
+    """ Negative sampling cost function for word2vec models """
+
+    # Implement the cost and gradients for one predicted word vector
+    # and one target word vector as a building block for word2vec
+    # models, using the negative sampling technique. K is the sample
+    # size. You might want to use dataset.sampleTokenIdx() to sample
+    # a random word index.
+    #
+    # Note: See test_word2vec below for dataset's initialization.
+    #
+    # Input/Output Specifications: same as softmaxCostAndGradient
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradPred, grad
+
+
+def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
+    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
+    """ Skip-gram model in word2vec """
+
+    # Implement the skip-gram model in this function.
+
+    # Inputs:
+    # - currentWord: a string of the current center word
+    # - C: integer, context size
+    # - contextWords: list of no more than 2*C strings, the context words
+    # - tokens: a dictionary that maps words to their indices in
+    #      the word vector list
+    # - inputVectors: "input" word vectors (as rows) for all tokens
+    # - outputVectors: "output" word vectors (as rows) for all tokens
+    # - word2vecCostAndGradient: the cost and gradient function for
+    #      a prediction vector given the target word vectors,
+    #      could be one of the two cost functions you
+    #      implemented above
+
+    # Outputs:
+    # - cost: the cost function value for the skip-gram model
+    # - grad: the gradient with respect to the word vectors
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradIn, gradOut
+
+def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
+    dataset, word2vecCostAndGradient = softmaxCostAndGradient):
+    """ CBOW model in word2vec """
+
+    # Implement the continuous bag-of-words model in this function.
+    # Input/Output specifications: same as the skip-gram model
+    # We will not provide starter code for this function, but feel
+    # free to reference the code you previously wrote for this
+    # assignment!
+
+    #################################################################
+    # IMPLEMENTING CBOW IS EXTRA CREDIT, DERIVATIONS IN THE WRITTEN #
+    # ASSIGNMENT ARE NOT!                                           #
+    #################################################################
+
+    cost = 0
+    gradIn = np.zeros(inputVectors.shape)
+    gradOut = np.zeros(outputVectors.shape)
+
+    ### YOUR CODE HERE
+    raise NotImplementedError
+    ### END YOUR CODE
+
+    return cost, gradIn, gradOut
+
+#############################################
+# Testing functions below. DO NOT MODIFY!
# +############################################# + +def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C, word2vecCostAndGradient = softmaxCostAndGradient): + batchsize = 50 + cost = 0.0 + grad = np.zeros(wordVectors.shape) + N = wordVectors.shape[0] + inputVectors = wordVectors[:N/2,:] + outputVectors = wordVectors[N/2:,:] + for i in xrange(batchsize): + C1 = random.randint(1,C) + centerword, context = dataset.getRandomContext(C1) + + if word2vecModel == skipgram: + denom = 1 + else: + denom = 1 + + c, gin, gout = word2vecModel(centerword, C1, context, tokens, inputVectors, outputVectors, dataset, word2vecCostAndGradient) + cost += c / batchsize / denom + grad[:N/2, :] += gin / batchsize / denom + grad[N/2:, :] += gout / batchsize / denom + + return cost, grad + +def test_word2vec(): + # Interface to the dataset for negative sampling + dataset = type('dummy', (), {})() + def dummySampleTokenIdx(): + return random.randint(0, 4) + + def getRandomContext(C): + tokens = ["a", "b", "c", "d", "e"] + return tokens[random.randint(0,4)], [tokens[random.randint(0,4)] \ + for i in xrange(2*C)] + dataset.sampleTokenIdx = dummySampleTokenIdx + dataset.getRandomContext = getRandomContext + + random.seed(31415) + np.random.seed(9265) + dummy_vectors = normalizeRows(np.random.randn(10,3)) + dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)]) + print "==== Gradient check for skip-gram ====" + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5), dummy_vectors) + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors) + print "\n==== Gradient check for CBOW ====" + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5), dummy_vectors) + gradcheck_naive(lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient), dummy_vectors) + + print "\n=== Results ===" + print skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset) + print skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient) + print cbow("a", 2, ["a", "b", "c", "a"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset) + print cbow("a", 2, ["a", "b", "a", "c"], dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset, negSamplingCostAndGradient) + +if __name__ == "__main__": + test_normalize_rows() + test_word2vec() \ No newline at end of file diff --git a/q4_sentiment.py b/q4_sentiment.py new file mode 100644 index 0000000..1adb4f0 --- /dev/null +++ b/q4_sentiment.py @@ -0,0 +1,114 @@ +import numpy as np +import matplotlib.pyplot as plt + +from cs224d.data_utils import * + +from q3_sgd import load_saved_params, sgd +from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper + +# Try different regularizations and pick the best! +# NOTE: fill in one more "your code here" below before running! 
+REGULARIZATION = None # Assign a list of floats in the block below +### YOUR CODE HERE +raise NotImplementedError +### END YOUR CODE + +# Load the dataset +dataset = StanfordSentiment() +tokens = dataset.tokens() +nWords = len(tokens) + +# Load the word vectors we trained earlier +_, wordVectors0, _ = load_saved_params() +wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) +dimVectors = wordVectors.shape[1] + +# Load the train set +trainset = dataset.getTrainSentences() +nTrain = len(trainset) +trainFeatures = np.zeros((nTrain, dimVectors)) +trainLabels = np.zeros((nTrain,), dtype=np.int32) +for i in xrange(nTrain): + words, trainLabels[i] = trainset[i] + trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +# Prepare dev set features +devset = dataset.getDevSentences() +nDev = len(devset) +devFeatures = np.zeros((nDev, dimVectors)) +devLabels = np.zeros((nDev,), dtype=np.int32) +for i in xrange(nDev): + words, devLabels[i] = devset[i] + devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +# Try our regularization parameters +results = [] +for regularization in REGULARIZATION: + random.seed(3141) + np.random.seed(59265) + weights = np.random.randn(dimVectors, 5) + print "Training for reg=%f" % regularization + + # We will do batch optimization + weights = sgd(lambda weights: softmax_wrapper(trainFeatures, trainLabels, + weights, regularization), weights, 3.0, 10000, PRINT_EVERY=100) + + # Test on train set + _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights) + trainAccuracy = accuracy(trainLabels, pred) + print "Train accuracy (%%): %f" % trainAccuracy + + # Test on dev set + _, _, pred = softmaxRegression(devFeatures, devLabels, weights) + devAccuracy = accuracy(devLabels, pred) + print "Dev accuracy (%%): %f" % devAccuracy + + # Save the results and weights + results.append({ + "reg" : regularization, + "weights" : weights, + "train" : trainAccuracy, + "dev" : devAccuracy}) + +# Print the accuracies +print "" +print "=== Recap ===" +print "Reg\t\tTrain\t\tDev" +for result in results: + print "%E\t%f\t%f" % ( + result["reg"], + result["train"], + result["dev"]) +print "" + +# Pick the best regularization parameters +BEST_REGULARIZATION = None +BEST_WEIGHTS = None + +### YOUR CODE HERE +raise NotImplementedError +### END YOUR CODE + +# Test your findings on the test set +testset = dataset.getTestSentences() +nTest = len(testset) +testFeatures = np.zeros((nTest, dimVectors)) +testLabels = np.zeros((nTest,), dtype=np.int32) +for i in xrange(nTest): + words, testLabels[i] = testset[i] + testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words) + +_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS) +print "Best regularization value: %E" % BEST_REGULARIZATION +print "Test accuracy (%%): %f" % accuracy(testLabels, pred) + +# Make a plot of regularization vs accuracy +plt.plot(REGULARIZATION, [x["train"] for x in results]) +plt.plot(REGULARIZATION, [x["dev"] for x in results]) +plt.xscale('log') +plt.xlabel("regularization") +plt.ylabel("accuracy") +plt.legend(['train', 'dev'], loc='upper left') +plt.savefig("q4_reg_v_acc.png") +plt.show() + diff --git a/q4_softmaxreg.py b/q4_softmaxreg.py new file mode 100644 index 0000000..332b401 --- /dev/null +++ b/q4_softmaxreg.py @@ -0,0 +1,105 @@ +import numpy as np +import random + +from cs224d.data_utils import * + +from q1_softmax import softmax +from q2_gradcheck import gradcheck_naive +from q3_sgd import load_saved_params + +def getSentenceFeature(tokens, 
wordVectors, sentence): + """ Obtain the sentence feature for sentiment analysis by averaging its word vectors """ + # Implement computation for the sentence features given a sentence. + + # Inputs: + # - tokens: a dictionary that maps words to their indices in + # the word vector list + # - wordVectors: word vectors (each row) for all tokens + # - sentence: a list of words in the sentence of interest + + # Output: + # - sentVector: feature vector for the sentence + + sentVector = np.zeros((wordVectors.shape[1],)) + + ### YOUR CODE HERE + raise NotImplementedError + ### END YOUR CODE + + return sentVector + +def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False): + """ Softmax Regression """ + # Implement softmax regression with weight regularization. + + # Inputs: + # - features: feature vectors, each row is a feature vector + # - labels: labels corresponding to the feature vectors + # - weights: weights of the regressor + # - regularization: L2 regularization constant + + # Output: + # - cost: cost of the regressor + # - grad: gradient of the regressor cost with respect to its + # weights + # - pred: label predictions of the regressor (you might find + # np.argmax helpful) + + prob = softmax(features.dot(weights)) + if len(features.shape) > 1: + N = features.shape[0] + else: + N = 1 + # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2 + cost = np.sum(-np.log(prob[range(N), labels])) / N + cost += 0.5 * regularization * np.sum(weights ** 2) + + ### YOUR CODE HERE: compute the gradients and predictions + raise NotImplementedError + ### END YOUR CODE + + if nopredictions: + return cost, grad + else: + return cost, grad, pred + +def accuracy(y, yhat): + """ Precision for classifier """ + assert(y.shape == yhat.shape) + return np.sum(y == yhat) * 100.0 / y.size + +def softmax_wrapper(features, labels, weights, regularization = 0.0): + cost, grad, _ = softmaxRegression(features, labels, weights, + regularization) + return cost, grad + +def sanity_check(): + """ + Run python q4_softmaxreg.py. + """ + random.seed(314159) + np.random.seed(265) + + dataset = StanfordSentiment() + tokens = dataset.tokens() + nWords = len(tokens) + + _, wordVectors0, _ = load_saved_params() + wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) + dimVectors = wordVectors.shape[1] + + dummy_weights = 0.1 * np.random.randn(dimVectors, 5) + dummy_features = np.zeros((10, dimVectors)) + dummy_labels = np.zeros((10,), dtype=np.int32) + for i in xrange(10): + words, dummy_labels[i] = dataset.getRandomTrainSentence() + dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words) + print "==== Gradient check for softmax regression ====" + gradcheck_naive(lambda weights: softmaxRegression(dummy_features, + dummy_labels, weights, 1.0, nopredictions = True), dummy_weights) + + print "\n=== Results ===" + print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0) + +if __name__ == "__main__": + sanity_check() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f515c43 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +Jinja2==2.7.3 +MarkupSafe==0.23 +backports.ssl-match-hostname==3.4.0.2 +certifi==14.05.14 +gnureadline==6.3.3 +ipython==2.3.1 +matplotlib==1.4.2 +mock==1.0.1 +nose==1.3.4 +numpy==1.9.1 +pyparsing==2.0.3 +python-dateutil==2.4.0 +pytz==2014.10 +pyzmq==14.4.1 +scipy==0.14.1 +six==1.9.0 +tornado==4.0.2 +wsgiref==0.1.2
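
Note (not part of the patch above): a minimal, standalone sketch of the unigram^0.75 sampling table that StanfordSentiment.sampleTable() in cs224d/data_utils.py builds for negative sampling. The token counts, table size, and snippet name are made up for illustration; only numpy is assumed.

# sampling_table_sketch.py -- illustrative only, not part of the assignment code
import numpy as np

tokenfreq = np.array([50.0, 10.0, 5.0, 1.0])   # toy unigram counts for 4 tokens
tablesize = 1000                               # much smaller than the 1e6 default above

weights = tokenfreq ** 0.75                    # dampen very frequent words
cum = np.cumsum(weights / np.sum(weights)) * tablesize

table = np.zeros(tablesize, dtype=int)
j = 0
for i in range(tablesize):
    while i > cum[j]:                          # advance to the token whose cumulative mass covers slot i
        j += 1
    table[i] = j

# Drawing a uniform random slot from `table` now returns token j with
# probability roughly proportional to tokenfreq[j] ** 0.75, which is what
# sampleTokenIdx() relies on when picking negative samples.
print(np.bincount(table, minlength=4))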