-
Notifications
You must be signed in to change notification settings - Fork 0
/
q4_sentiment.py
116 lines (97 loc) · 3.66 KB
/
q4_sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import random

import matplotlib.pyplot as plt
import numpy as np

from cs224d.data_utils import *
from q3_sgd import load_saved_params, sgd
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper
# Candidate regularization strengths for the sweep. Each value is handed to
# softmax_wrapper during training, and the one with the best dev accuracy is
# kept. The list is in ascending order, so results and plots follow that order.
### YOUR CODE HERE
REGULARIZATION = [
    0.,
    1e-5, 3e-5,
    1e-4, 3e-4,
    1e-3, 3e-3,
    1e-2, 3e-2,
    1e-1, 3e-1,
]
REGULARIZATION.sort()
### END YOUR CODE
# Build the sentiment dataset and fetch its vocabulary
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Restore the word vectors trained earlier. The first nWords rows are added
# element-wise to the rows from nWords onward, giving one vector per word
# (presumably two stacked embedding tables -- confirm against the training
# script that saved these parameters).
_, wordVectors0, _ = load_saved_params()
leadingRows = wordVectors0[:nWords, :]
trailingRows = wordVectors0[nWords:, :]
wordVectors = leadingRows + trailingRows
dimVectors = wordVectors.shape[1]
# Convert every training sentence into one feature row and one integer label
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros(shape=(nTrain, dimVectors))
trainLabels = np.zeros(shape=(nTrain,), dtype=np.int32)
for i, example in enumerate(trainset):
    # Each entry unpacks into (words, label); the label goes straight into
    # the preallocated label array.
    words, trainLabels[i] = example
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
# Convert every dev sentence into one feature row and one integer label,
# using the same featurization as the training set
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros(shape=(nDev, dimVectors))
devLabels = np.zeros(shape=(nDev,), dtype=np.int32)
for i, example in enumerate(devset):
    # Each entry unpacks into (words, label)
    words, devLabels[i] = example
    devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)
# Sweep over the regularization values, training one classifier per value and
# recording its weights together with its train/dev accuracy.
results = []
for regularization in REGULARIZATION:
    # Re-seed `random` and NumPy before every run. The NumPy seed makes the
    # initial weights identical for every regularization value, so runs differ
    # only in the regularization strength.
    # `random` is imported explicitly at the top of the file rather than
    # relying on the wildcard import to re-export it.
    random.seed(3141)
    np.random.seed(59265)
    # One weight column per output class (5 columns; presumably the five
    # sentiment classes -- confirm against the dataset labels).
    weights = np.random.randn(dimVectors, 5)
    print("Training for reg=%f" % regularization)

    # We will do batch optimization: every objective evaluation is given the
    # complete training set. The lambda parameter is named `w` so it does not
    # shadow the outer `weights` variable.
    # The positional 3.0 and 10000 are presumably the step size and iteration
    # count -- confirm against q3_sgd.sgd.
    weights = sgd(
        lambda w: softmax_wrapper(trainFeatures, trainLabels, w, regularization),
        weights, 3.0, 10000, PRINT_EVERY=100)

    # Test on train set
    _, _, pred = softmaxRegression(trainFeatures, trainLabels, weights)
    trainAccuracy = accuracy(trainLabels, pred)
    print("Train accuracy (%%): %f" % trainAccuracy)

    # Test on dev set
    _, _, pred = softmaxRegression(devFeatures, devLabels, weights)
    devAccuracy = accuracy(devLabels, pred)
    print("Dev accuracy (%%): %f" % devAccuracy)

    # Save the results and weights
    results.append({
        "reg": regularization,
        "weights": weights,
        "train": trainAccuracy,
        "dev": devAccuracy,
    })
# Summarize train and dev accuracy for every regularization value tried,
# one tab-separated row per run
print("")
print("=== Recap ===")
print("Reg\t\tTrain\t\tDev")
for entry in results:
    row = (entry["reg"], entry["train"], entry["dev"])
    print("%E\t%f\t%f" % row)
print("")
# Select the run with the highest dev accuracy; when several runs tie, the
# earliest one in `results` is kept.
BEST_REGULARIZATION = None
BEST_WEIGHTS = None
### YOUR CODE HERE
best_index = max(range(len(results)), key=lambda k: results[k]["dev"])
best_result = results[best_index]
BEST_REGULARIZATION, BEST_WEIGHTS = best_result["reg"], best_result["weights"]
### END YOUR CODE
# Evaluate the selected weights on the test sentences, featurized the same
# way as the train and dev sets
testset = dataset.getTestSentences()
nTest = len(testset)
testFeatures = np.zeros(shape=(nTest, dimVectors))
testLabels = np.zeros(shape=(nTest,), dtype=np.int32)
for i, example in enumerate(testset):
    # Each entry unpacks into (words, label)
    words, testLabels[i] = example
    testFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Predict with the best weights and report the chosen setting and its accuracy
_, _, pred = softmaxRegression(testFeatures, testLabels, BEST_WEIGHTS)
print("Best regularization value: %E" % BEST_REGULARIZATION)
testAccuracy = accuracy(testLabels, pred)
print("Test accuracy (%%): %f" % testAccuracy)
# Plot train and dev accuracy against regularization strength, save the
# figure to disk, then display it.
# NOTE(review): REGULARIZATION contains 0., which has no position on a log
# axis -- confirm how matplotlib renders that point.
for split in ("train", "dev"):
    plt.plot(REGULARIZATION, [run[split] for run in results])
plt.xscale('log')
plt.xlabel("regularization")
plt.ylabel("accuracy")
plt.legend(['train', 'dev'], loc='upper left')
plt.savefig("q4_reg_v_acc.png")
plt.show()