Skip to content

Commit eb1f498

Browse files
committed
added more fun stuff
1 parent 12221f4 commit eb1f498

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed

data/helpX

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
cnumpy.core.multiarray
2+
_reconstruct
3+
p1
4+
(cnumpy
5+
ndarray
6+
p2
7+
(I0
8+
tS'b'
9+
tRp3
10+
(I1
11+
(I14216263
12+
I10
13+
I64
14+
tcnumpy
15+
dtype
16+
p4
17+
(S'f8'
18+
I0
19+
I1
20+
tRp5
21+
(I3
22+
S'<'
23+
NNNI-1
24+
I-1
25+
I0
26+
tbI00

src/models/tweetnetHelpful.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import sys
2+
import os
3+
from os.path import expanduser
4+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..","utils")))
5+
from loadKaggleHelpful import loadTrain, loadTest
6+
from ReducedAsciiDictionary import ReducedAsciiDictionary
7+
from numpy import random
8+
from random import shuffle
9+
import cPickle as pickle
10+
import numpy
11+
12+
def heuristic(text, helpful, outOf):
    """Drop records whose ``outOf`` vote count is zero.

    Walks the three parallel lists in lockstep and keeps only the entries
    where ``outOf`` is non-zero, preserving the original order.

    Returns a (text, helpful, outOf) tuple of the filtered parallel lists.
    """
    keptText = []
    keptHelpful = []
    keptOutOf = []
    for blockText, blockHelpful, blockOutOf in zip(text, helpful, outOf):
        # Zero total votes means the helpfulness rate is undefined; skip it.
        if blockOutOf == 0:
            continue
        keptText.append(blockText)
        keptHelpful.append(blockHelpful)
        keptOutOf.append(blockOutOf)
    return keptText, keptHelpful, keptOutOf
23+
24+
def clean(text, dictionary, charLimit):
    """Truncate each text block to ``charLimit`` chars and filter its characters.

    A character is kept if it is a key of ``dictionary`` or an uppercase
    ASCII letter; everything else is discarded.  Each input block becomes a
    list of kept characters in the returned list.
    """
    cleanedBlocks = []
    for rawBlock in text:
        # Slicing past the end is a no-op, so truncate unconditionally.
        truncated = rawBlock[:charLimit]
        keptChars = []
        for ch in truncated:
            if ch in dictionary:
                keptChars.append(ch)
            elif 'A' <= ch <= 'Z':
                # NOTE(review): the original computed
                # chr(ord(ch) - ord('A') + 1 + 64), which is the identity
                # mapping (-65 + 1 + 64 == 0).  Possibly lowercasing was
                # intended; the pass-through behavior is preserved here.
                # Kept uppercase chars may be absent from `dictionary` —
                # downstream lookups should account for that.
                keptChars.append(ch)
        cleanedBlocks.append(keptChars)
    return cleanedBlocks
40+
41+
#print("Loading word2vec dictionary")
#word2vecDict = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"),"rb"))
#print("Finished loading word2vec dictionary")

# Load the character -> one-hot-index mapping (used below by dictionary.get).
#load reduced ascii dictionary
print("Loading reduced ascii dictionary")
dictionary = ReducedAsciiDictionary({},numpy.array([])).dictionary

# Load the Kaggle helpfulness data and drop records with zero total votes
# (heuristic), so the helpfulness ratio below is well-defined.
#get review data and metadata
print("Loading Training Data")
trainText, trainHelpful, trainOutOf, trainUserID, trainItemID = loadTrain()
trainText, trainHelpful, trainOutOf = heuristic(trainText, trainHelpful, trainOutOf)
print("Loading Testing Data")
testText, testOutOf, testUserID, testItemID = loadTest()

# Truncate each review to 300 chars and keep only dictionary / uppercase chars.
#clean text data
print("Cleaning text")
trainText = clean(trainText, dictionary,300)
testText = clean(testText, dictionary,300)

# Slide a window of sequenceLength characters over each review; every window
# becomes one training example labeled with the review's helpfulness ratio.
#set up train sequence and labels
trainInput = []
#trainInputContext = []
trainLabel = []
sequenceLength = 10

for i in range(len(trainText)):
    textBlock = trainText[i]
    helpfulBlock = trainHelpful[i]
    outOfBlock = trainOutOf[i]
    # Defensive guard against division by zero; heuristic() already removed
    # outOf == 0 records, so this branch should not fire.
    if(outOfBlock == 0):
        outOfBlock = 1
    # *1.0 forces float division under Python 2 (cPickle import implies py2).
    helpfulnessRate = helpfulBlock*1.0/outOfBlock

    #reviewerID = trainUserID[i]
    # Reviews shorter than sequenceLength contribute no windows (empty range).
    for c in range(0, len(textBlock) - sequenceLength):
        trainInput.append(textBlock[c:c+sequenceLength])
        trainLabel.append(helpfulnessRate)
        #trainInputContext.append(reviewerID)
print('Number of sequences in training set: ', len(trainInput))

# One-hot encode the windows.  NOTE(review): trainX is allocated for ALL
# windows even though only ~numExamples rows get filled below — for the full
# dataset this is an enormous float64 allocation; consider sizing to
# numExamples.  TODO confirm intended cap behavior.
trainX = numpy.zeros((len(trainInput), sequenceLength, len(dictionary)), dtype=numpy.float64)
trainY = numpy.zeros(len(trainInput))
numExamples = 1000000
for i, seq in enumerate(trainInput):
    # The cap/progress check runs only every 1000 iterations, so up to 999
    # extra examples past numExamples may be encoded before the break.
    if i % 1000 == 0:
        if i > numExamples:
            break
        print("loading review ", i)
    for j, ch in enumerate(seq):
        # NOTE(review): dictionary.get(ch) returns None for characters not in
        # the dictionary (clean() can pass through uppercase letters); a None
        # index acts as numpy.newaxis and would set the whole [i, j, :] row —
        # verify every kept character has a dictionary entry.
        oneHotIndex = dictionary.get(ch)
        trainX[i,j,oneHotIndex] = 1
    trainY[i] = trainLabel[i]

# Persist the encoded tensors; file handles are left to the interpreter to
# close (no with-block in the original).
pickle.dump(trainX, open(expanduser("~/tweetnet/data/helpX"), "wb"))
pickle.dump(trainY, open(expanduser("~/tweetnet/data/helpY"),"wb"))

0 commit comments

Comments
 (0)