-
Notifications
You must be signed in to change notification settings - Fork 0
/
modules.py
66 lines (48 loc) · 1.89 KB
/
modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import time
import pickle
import data_load
# Loaded once, at import time; tokenize() below reads this module-level
# collection and indexes each review by its 'text' and 'stars' keys.
reviews = data_load.load_data()
def tokenize(*, limit=200000, num_words=20000, maxlen=300,
             tokenizer_path="keras_tokenizer.pickle"):
    """Sample the reviews per class and convert them to padded sequences.

    Reviews are read from the module-level ``reviews`` collection. The star
    rating is collapsed into two labels (``0`` when ``stars <= 3``, ``1``
    otherwise) and at most ``limit`` reviews per label are kept, in their
    original order. The sample is only balanced when both labels actually
    reach ``limit``.

    A Keras ``Tokenizer`` is fitted on the sampled texts, the texts are
    converted to integer sequences and passed through ``pad_sequences``, and
    the fitted tokenizer is pickled to ``tokenizer_path``.

    Args:
        limit: Maximum number of reviews kept per label. Change this to
            grow/shrink the dataset.
        num_words: Forwarded to ``Tokenizer(num_words=...)``.
        maxlen: Forwarded to ``pad_sequences(maxlen=...)``.
        tokenizer_path: File the fitted tokenizer is pickled to; an existing
            file is overwritten.

    Returns:
        A ``(balanced_labels, tokenizer, data)`` tuple: the list of 0/1
        labels, the fitted tokenizer, and the result of ``pad_sequences``.
    """
    balanced_texts = []
    balanced_labels = []
    neg_pos_counts = [0, 0]  # index 0 = negative, index 1 = positive

    for review in reviews:
        # Convert the 5 star classes into 2 (negative or positive).
        polarity = 0 if review['stars'] <= 3 else 1
        if neg_pos_counts[polarity] >= limit:
            continue
        balanced_texts.append(review['text'])
        balanced_labels.append(polarity)
        neg_pos_counts[polarity] += 1
        # Nothing further can be added once both classes are full, so stop
        # scanning the remaining reviews.
        if min(neg_pos_counts) >= limit:
            break

    # With num_words set, infrequent words get no index in the generated
    # sequences (e.g. with num_words=5 on a toy corpus, the rare word
    # "discombobulation" is simply dropped from its sentence).
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(balanced_texts)
    sequences = tokenizer.texts_to_sequences(balanced_texts)
    data = pad_sequences(sequences, maxlen=maxlen)

    # Persist the fitted tokenizer so the same vocabulary can be reused later.
    with open(tokenizer_path, "wb") as f:
        pickle.dump(tokenizer, f)

    return balanced_labels, tokenizer, data
#print(tokenize())
if __name__ == '__main__':
    # Script entry point: run the tokenization once and report how long it took.
    # perf_counter() is monotonic, so the measurement is unaffected by
    # wall-clock adjustments while the job runs.
    start_time = time.perf_counter()
    print("Tokenize start", flush=True)
    tokenize()
    elapsed = time.perf_counter() - start_time
    print("Tokenize done.", flush=True)
    print("Elapsed: {:.1f}s".format(elapsed), flush=True)