# 10May-Sentiment2.py
# Sentiment analysis of the aclImdb movie review dataset:
# bag-of-words / tf-idf features + logistic regression.
import pyprind
import pandas as pd
import os

print('imported done')

# Read the 50,000 aclImdb reviews (train + test, pos + neg) into a DataFrame.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
basepath = r'C:\Users\Harry Muzart\Documents\aclImdb'
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
# Build the DataFrame once at the end; calling df.append() inside the loop
# is slow and the method was removed in pandas 2.0.
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
print('appended to dataframe done')
import numpy as np

# Shuffle the rows (reviews were read in label order) and persist to CSV.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv(r'C:\Users\Harry Muzart\Documents\SAfiles\movie_data.csv',
          index=False, encoding='utf-8')
print('permutation done')
# Reload from disk (the script can be restarted from this point).
df = pd.read_csv(r'C:\Users\Harry Muzart\Documents\SAfiles\movie_data.csv',
                 encoding='utf-8')  # explicit encoding avoids the unicode error on read
print(df.head(3))
print('read csv done')
# Transforming documents into feature vectors
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())
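# Side note (a sketch, not in the original script): ngram_range controls the
# token span; (2, 2) builds a bag of word pairs instead of single words.
count_2gram = CountVectorizer(ngram_range=(2, 2))
print(count_2gram.fit_transform(docs).toarray())
print(count_2gram.vocabulary_)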
# Assessing word relevancy via term frequency-inverse document frequency
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
# Manual check for the term "is" in the third document: tf = 3, and with
# smooth_idf the idf is log((n_docs + 1) / (df + 1)); sklearn then adds 1.
tf_is = 3
n_docs = 3
idf_is = np.log((n_docs + 1) / (3 + 1))
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)

tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]
print(raw_tfidf)
l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
print(l2_tfidf)
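# Sanity check (illustrative): the manually L2-normalized row should match
# what TfidfTransformer(norm='l2') returns for the third document.
print(np.allclose(
    l2_tfidf,
    TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    .fit_transform(count.fit_transform(docs)).toarray()[-1]))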
# Cleaning text data
print(df.loc[0, 'review'][-50:])
import re

def preprocessor(text):
    # Strip HTML tags, keep emoticons, lowercase, drop remaining non-word chars.
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor("</a>This :) is :( a test :-)!"))
df['review'] = df['review'].apply(preprocessor)
# Processing documents into tokens
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

print(tokenizer('runners like running and thus they run'))

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

print(tokenizer_porter('runners like running and thus they run'))

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
print([w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
       if w not in stop])
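# Convenience sketch (a hypothetical helper, not used by the pipeline below):
# stemming and stop-word removal combined into one tokenizer.
def tokenizer_porter_nostop(text):
    return [w for w in tokenizer_porter(text) if w not in stop]

print(tokenizer_porter_nostop('a runner likes running and runs a lot'))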
# Training a logistic regression model for document classification
# .loc slicing is end-inclusive, so the training slice stops at 24999 to keep
# the 25,000/25,000 train/test split disjoint.
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# sklearn.grid_search was removed in scikit-learn 0.20; import from
# model_selection (the old Version(sklearn_version) fallback is obsolete).
from sklearn.model_selection import GridSearchCV
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]
# liblinear supports both penalties searched above ('l1' and 'l2'); newer
# scikit-learn defaults to lbfgs, which rejects 'l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
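# Quick usage sketch (illustrative, not in the original script): the
# best_estimator_ is a full Pipeline, so it can score raw review strings.
example = ['I loved this movie, the acting was brilliant']
print(clf.predict(example))        # expected 1 (positive) for this review
print(clf.predict_proba(example))  # per-class probabilities from LogisticRegression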
# Small sanity demo on synthetic data: GridSearchCV with an empty parameter
# grid reproduces plain cross-validation on the same folds.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)
# random_state is only meaningful with shuffle=True, so it is dropped here.
cv5_idx = list(StratifiedKFold(n_splits=5, shuffle=False).split(X, y))
print(cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx))

gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)
print(gs.best_score_)
print(cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean())
# Working with bigger data - online algorithms and out-of-core learning
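# The heading above is a placeholder in the original file; below is a minimal
# hedged sketch of the standard out-of-core approach (HashingVectorizer +
# SGDClassifier.partial_fit). stream_docs/get_minibatch are illustrative
# helpers, not part of the original script.
import csv
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

def stream_docs(path):
    """Yield one (text, label) pair at a time from movie_data.csv."""
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            yield row[0], int(row[1])

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        pass
    return docs, y

# Hashing needs no fitted vocabulary, so it works on a stream of documents.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21,
                         preprocessor=None, tokenizer=tokenizer)
sgd = SGDClassifier(loss='log_loss', random_state=1)  # use loss='log' on sklearn < 1.1
doc_stream = stream_docs(r'C:\Users\Harry Muzart\Documents\SAfiles\movie_data.csv')
classes = np.array([0, 1])
for _ in range(45):  # 45 mini-batches of 1,000 reviews for training
    X_batch, y_batch = get_minibatch(doc_stream, size=1000)
    if not X_batch:
        break
    sgd.partial_fit(vect.transform(X_batch), y_batch, classes=classes)
X_rest, y_rest = get_minibatch(doc_stream, size=5000)  # remaining 5,000 held out
print('OOC Accuracy: %.3f' % sgd.score(vect.transform(X_rest), y_rest))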
# Ch9 - Serializing fitted scikit-learn estimators
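# Again a placeholder heading; a minimal sketch of persisting the fitted
# grid-search winner with pickle (the 'pkl_objects' subfolder is an assumption).
import pickle

dest = os.path.join(r'C:\Users\Harry Muzart\Documents\SAfiles', 'pkl_objects')
os.makedirs(dest, exist_ok=True)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
# Reloading requires tokenizer/tokenizer_porter to be importable, since the
# pickled pipeline references them:
# clf = pickle.load(open(os.path.join(dest, 'classifier.pkl'), 'rb'))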