Skip to content

Commit 25cb36d

Browse files
Updates to Chapter 7 notebooks
1 parent 3a11883 commit 25cb36d

File tree

2 files changed

+152
-119
lines changed

2 files changed

+152
-119
lines changed

chapter_7/0702_TextClassification_With_TextBlob.ipynb

Lines changed: 105 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
{
99
"data": {
1010
"text/plain": [
11-
"['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']"
11+
"['DESCR', 'data', 'filenames', 'target', 'target_names']"
1212
]
1313
},
1414
"execution_count": 1,
@@ -33,13 +33,13 @@
3333
"output_type": "stream",
3434
"text": [
3535
"11314\n",
36-
"[7 4 4 ..., 3 1 8]\n",
36+
"[7 4 4 ... 3 1 8]\n",
3737
"['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
3838
]
3939
}
4040
],
4141
"source": [
42-
"# 11,314\n",
42+
"# 11,314 posts\n",
4343
"print(len(texts.target))\n",
4444
"print(texts.target)\n",
4545
"print(texts.target_names)"
@@ -123,15 +123,15 @@
123123
},
124124
{
125125
"cell_type": "code",
126-
"execution_count": null,
126+
"execution_count": 6,
127127
"metadata": {},
128128
"outputs": [
129129
{
130130
"name": "stdout",
131131
"output_type": "stream",
132132
"text": [
133-
"The size of vocabulary is: 18230\n",
134-
"['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?', 'it']\n",
133+
"The size of vocabulary is: 18279\n",
134+
"['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', '*', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?']\n",
135135
"(\"from : @ wam.umd.edu ( where 's my thing ) subject : what car is this ! ? nntp-posting-host : organization : university of maryland , college park lines : 15 i was wondering if anyone out there could enlighten me on this car i saw the other day . it was a sports car , looked to be from the late early 70s . it was called a . the doors were really small . in addition , the front bumper was separate from the rest of the body . this is all i know . if anyone can a model name , engine specs , years of production , where this car is made , history , or whatever info you have on this looking car , please e-mail . thanks , - il -- -- brought to you by your neighborhood -- --\", 'rec.autos')\n"
136136
]
137137
}
@@ -141,12 +141,13 @@
141141
"num_testing = 300\n",
142142
"\n",
143143
"# first get vocabulary. We are creating a vocabulary to limit the features,\n",
144-
"# since each word will eventually be a feature\n",
144+
"# since each word will eventually be a feature.\n",
145+
"# https://docs.python.org/3/library/collections.html#collections.Counter\n",
145146
"all_text = ''\n",
146147
"for i in range(num_training):\n",
147148
" all_text += texts.data[i].lower()\n",
148-
" \n",
149-
"# make a list of words, we need to tokenzie ourselves to get this list\n",
149+
"\n",
150+
"# make a list of words; we need to tokenize the text ourselves to get this list\n",
150151
"from nltk.tokenize import word_tokenize\n",
151152
"tokens = word_tokenize(all_text)\n",
152153
"tokens = [token.lower() for token in tokens]\n",
@@ -167,23 +168,18 @@
167168
" tokens = word_tokenize(texts.data[i])\n",
168169
" item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
169170
" training_data.append((item_text, texts.target_names[texts.target[i]]))\n",
170-
"\n",
171171
"testing_data = []\n",
172172
"for i in range(num_training, num_training + num_testing):\n",
173173
" tokens = word_tokenize(texts.data[i])\n",
174174
" item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
175-
" training_data.append((item_text, texts.target_names[texts.target[i]]))\n",
176-
" \n",
177-
"print(training_data[0]) \n",
178-
" "
175+
" testing_data.append((item_text, texts.target_names[texts.target[i]]))\n",
176+
"print(training_data[0])"
179177
]
180178
},
181179
{
182180
"cell_type": "code",
183-
"execution_count": null,
184-
"metadata": {
185-
"collapsed": true
186-
},
181+
"execution_count": 7,
182+
"metadata": {},
187183
"outputs": [],
188184
"source": [
189185
"# The standard TextBlob Naive Bayes Classifier re-parses the whole text of the corpus for each record.\n",
@@ -196,11 +192,37 @@
196192
},
197193
{
198194
"cell_type": "code",
199-
"execution_count": null,
200-
"metadata": {
201-
"collapsed": true
202-
},
203-
"outputs": [],
195+
"execution_count": 8,
196+
"metadata": {},
197+
"outputs": [
198+
{
199+
"name": "stdout",
200+
"output_type": "stream",
201+
"text": [
202+
"Most Informative Features\n",
203+
" contains(windows) = True comp.o : rec.sp = 235.0 : 1.0\n",
204+
" contains(sale) = True misc.f : comp.w = 208.4 : 1.0\n",
205+
" contains(car) = True rec.au : comp.w = 194.3 : 1.0\n",
206+
" contains(dod) = True rec.mo : comp.w = 186.6 : 1.0\n",
207+
"contains(nntp-posting-host) = True talk.p : soc.re = 180.7 : 1.0\n",
208+
" contains(clipper) = True sci.cr : misc.f = 180.2 : 1.0\n",
209+
" contains(chip) = True sci.cr : sci.sp = 164.7 : 1.0\n",
210+
" contains(bike) = True rec.mo : rec.sp = 162.2 : 1.0\n",
211+
" contains(encryption) = True sci.cr : sci.el = 155.4 : 1.0\n",
212+
" contains(team) = True rec.sp : rec.au = 151.0 : 1.0\n",
213+
" contains(gun) = True talk.p : rec.sp = 149.1 : 1.0\n",
214+
" contains(israel) = True talk.p : comp.w = 139.8 : 1.0\n",
215+
" contains(game) = True rec.sp : sci.me = 134.7 : 1.0\n",
216+
" contains(mac) = True comp.s : rec.sp = 127.8 : 1.0\n",
217+
" contains(government) = True sci.cr : comp.w = 126.8 : 1.0\n",
218+
" contains(israeli) = True talk.p : soc.re = 124.2 : 1.0\n",
219+
" contains(christian) = True talk.r : sci.me = 122.1 : 1.0\n",
220+
" contains(baseball) = True rec.sp : rec.mo = 118.8 : 1.0\n",
221+
" contains(god) = True soc.re : sci.el = 117.8 : 1.0\n",
222+
" contains(christians) = True soc.re : sci.sp = 117.4 : 1.0\n"
223+
]
224+
}
225+
],
204226
"source": [
205227
"# Shows what the features look like and what the important ones are\n",
206228
"# Very helpful for debugging and understanding data\n",
@@ -209,56 +231,87 @@
209231
},
210232
{
211233
"cell_type": "code",
212-
"execution_count": null,
213-
"metadata": {
214-
"collapsed": true
215-
},
216-
"outputs": [],
234+
"execution_count": 9,
235+
"metadata": {},
236+
"outputs": [
237+
{
238+
"name": "stdout",
239+
"output_type": "stream",
240+
"text": [
241+
"Accuracy: 0.7766666666666666\n"
242+
]
243+
}
244+
],
217245
"source": [
218246
"# Pretty good, baseline is 5% because we have 20 classes\n",
219-
"print(\"Accuracy: \"m cl.accuracy(testing_data))"
247+
"print(\"Accuracy: \", float(cl.accuracy(testing_data)))"
220248
]
221249
},
222250
{
223251
"cell_type": "code",
224-
"execution_count": null,
225-
"metadata": {
226-
"collapsed": true
227-
},
228-
"outputs": [],
252+
"execution_count": 10,
253+
"metadata": {},
254+
"outputs": [
255+
{
256+
"data": {
257+
"text/plain": [
258+
"'misc.forsale'"
259+
]
260+
},
261+
"execution_count": 10,
262+
"metadata": {},
263+
"output_type": "execute_result"
264+
}
265+
],
229266
"source": [
230-
"# Weird thing, ite doesn't work well for short sentences.\n",
267+
"# Weird thing, it doesn't work well for short sentences.\n",
231268
"# Maybe it can't overcome the prior because it was trained on longer texts.\n",
232269
"cl.classify('god christians jesus lord christian savior church')"
233270
]
234271
},
235272
{
236273
"cell_type": "code",
237-
"execution_count": null,
238-
"metadata": {
239-
"collapsed": true
240-
},
241-
"outputs": [],
274+
"execution_count": 11,
275+
"metadata": {},
276+
"outputs": [
277+
{
278+
"name": "stdout",
279+
"output_type": "stream",
280+
"text": [
281+
"from : @ ( robert ) subject : re : sho and sc nntp-posting-host : organization :\n",
282+
"Predicted: rec.autos, Actual: rec.autos\n",
283+
"from : @ magnus.acs.ohio-state.edu ( kim richard man ) subject : syquest forsale\n",
284+
"Predicted: misc.forsale, Actual: misc.forsale\n",
285+
"from : @ casbah.acns.nwu.edu ( wilson ) subject : office package article-i.d . :\n",
286+
"Predicted: comp.sys.mac.hardware, Actual: comp.os.ms-windows.misc\n",
287+
"subject : re : do n't more innocents die without the death penalty ? from : bobb\n",
288+
"Predicted: alt.atheism, Actual: alt.atheism\n",
289+
"from : livesey @ solntze.wpd.sgi.com ( jon livesey ) subject : re : genocide is \n",
290+
"Predicted: alt.atheism, Actual: alt.atheism\n",
291+
"from : @ ( david silver ) subject : re : fractal generation of clouds organizati\n",
292+
"Predicted: comp.graphics, Actual: comp.graphics\n",
293+
"subject : re : mike 's 1993 predictions from : gajarsky @ pilot.njin.net ( bob g\n",
294+
"Predicted: rec.sport.baseball, Actual: rec.sport.baseball\n",
295+
"from : jet @ ( j. eric ) subject : re : insurance and lotsa points ... in-reply-\n",
296+
"Predicted: rec.motorcycles, Actual: rec.motorcycles\n",
297+
"from : gld @ cunixb.cc.columbia.edu ( gary l dare ) subject : re : abc coverage \n",
298+
"Predicted: rec.sport.hockey, Actual: rec.sport.hockey\n",
299+
"from : sehari @ iastate.edu ( babak sehari ) subject : re : how to the disks cop\n",
300+
"Predicted: sci.crypt, Actual: sci.electronics\n"
301+
]
302+
}
303+
],
242304
"source": [
243305
"# We see it does better with the full text\n",
244306
"for t in testing_data[:10]:\n",
245307
" print(t[0][:80])\n",
246308
" print(\"Predicted: {}, Actual: {}\".format(cl.classify(t[0]), t[1]))"
247309
]
248-
},
249-
{
250-
"cell_type": "code",
251-
"execution_count": null,
252-
"metadata": {
253-
"collapsed": true
254-
},
255-
"outputs": [],
256-
"source": []
257310
}
258311
],
259312
"metadata": {
260313
"kernelspec": {
261-
"display_name": "Python 3",
314+
"display_name": "Python 3 (ipykernel)",
262315
"language": "python",
263316
"name": "python3"
264317
},
@@ -272,9 +325,9 @@
272325
"name": "python",
273326
"nbconvert_exporter": "python",
274327
"pygments_lexer": "ipython3",
275-
"version": "3.5.2"
328+
"version": "3.8.10"
276329
}
277330
},
278331
"nbformat": 4,
279-
"nbformat_minor": 2
332+
"nbformat_minor": 4
280333
}

0 commit comments

Comments
 (0)