|
8 | 8 | {
|
9 | 9 | "data": {
|
10 | 10 | "text/plain": [
|
11 |
| - "['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']" |
| 11 | + "['DESCR', 'data', 'filenames', 'target', 'target_names']" |
12 | 12 | ]
|
13 | 13 | },
|
14 | 14 | "execution_count": 1,
|
|
33 | 33 | "output_type": "stream",
|
34 | 34 | "text": [
|
35 | 35 | "11314\n",
|
36 |
| - "[7 4 4 ..., 3 1 8]\n", |
| 36 | + "[7 4 4 ... 3 1 8]\n", |
37 | 37 | "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
|
38 | 38 | ]
|
39 | 39 | }
|
40 | 40 | ],
|
41 | 41 | "source": [
|
42 |
| - "# 11,314\n", |
| 42 | + "# 11,314 posts\n", |
43 | 43 | "print(len(texts.target))\n",
|
44 | 44 | "print(texts.target)\n",
|
45 | 45 | "print(texts.target_names)"
|
|
123 | 123 | },
|
124 | 124 | {
|
125 | 125 | "cell_type": "code",
|
126 |
| - "execution_count": null, |
| 126 | + "execution_count": 6, |
127 | 127 | "metadata": {},
|
128 | 128 | "outputs": [
|
129 | 129 | {
|
130 | 130 | "name": "stdout",
|
131 | 131 | "output_type": "stream",
|
132 | 132 | "text": [
|
133 |
| - "The size of vocabulary is: 18230\n", |
134 |
| - "['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?', 'it']\n", |
| 133 | + "The size of vocabulary is: 18279\n", |
| 134 | + "['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', '*', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?']\n", |
135 | 135 | "(\"from : @ wam.umd.edu ( where 's my thing ) subject : what car is this ! ? nntp-posting-host : organization : university of maryland , college park lines : 15 i was wondering if anyone out there could enlighten me on this car i saw the other day . it was a sports car , looked to be from the late early 70s . it was called a . the doors were really small . in addition , the front bumper was separate from the rest of the body . this is all i know . if anyone can a model name , engine specs , years of production , where this car is made , history , or whatever info you have on this looking car , please e-mail . thanks , - il -- -- brought to you by your neighborhood -- --\", 'rec.autos')\n"
|
136 | 136 | ]
|
137 | 137 | }
|
|
141 | 141 | "num_testing = 300\n",
|
142 | 142 | "\n",
|
143 | 143 | "# first get vocabulary. We are creating a vocabulary to limit the features,\n",
|
144 |
| - "# since each word will eventually be a feature\n", |
| 144 | + "# since each word will eventually be a feature.\n", |
| 145 | + "# https://docs.python.org/3/library/collections.html#collections.Counter\n", |
145 | 146 | "all_text = ''\n",
|
146 | 147 | "for i in range(num_training):\n",
|
147 | 148 | " all_text += texts.data[i].lower()\n",
|
148 |
| - " \n", |
149 |
| - "# make a list of words, we need to tokenzie ourselves to get this list\n", |
| 149 | + "\n", |
| 150 | + "# make a list of words, we need to tokenize ourselves to get this list\n", |
150 | 151 | "from nltk.tokenize import word_tokenize\n",
|
151 | 152 | "tokens = word_tokenize(all_text)\n",
|
152 | 153 | "tokens = [token.lower() for token in tokens]\n",
|
|
167 | 168 | " tokens = word_tokenize(texts.data[i])\n",
|
168 | 169 | " item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
|
169 | 170 | " training_data.append((item_text, texts.target_names[texts.target[i]]))\n",
|
170 |
| - "\n", |
171 | 171 | "testing_data = []\n",
|
172 | 172 | "for i in range(num_training, num_training + num_testing):\n",
|
173 | 173 | " tokens = word_tokenize(texts.data[i])\n",
|
174 | 174 | " item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
|
175 |
| - " training_data.append((item_text, texts.target_names[texts.target[i]]))\n", |
176 |
| - " \n", |
177 |
| - "print(training_data[0]) \n", |
178 |
| - " " |
| 175 | + " testing_data.append((item_text, texts.target_names[texts.target[i]]))\n", |
| 176 | + "print(training_data[0])" |
179 | 177 | ]
|
180 | 178 | },
|
181 | 179 | {
|
182 | 180 | "cell_type": "code",
|
183 |
| - "execution_count": null, |
184 |
| - "metadata": { |
185 |
| - "collapsed": true |
186 |
| - }, |
| 181 | + "execution_count": 7, |
| 182 | + "metadata": {}, |
187 | 183 | "outputs": [],
|
188 | 184 | "source": [
|
189 | 185 | "# The standard TextBlob Naive Bayes Classifier re-parses the whole text of the corpus for each record.\n",
|
|
196 | 192 | },
|
197 | 193 | {
|
198 | 194 | "cell_type": "code",
|
199 |
| - "execution_count": null, |
200 |
| - "metadata": { |
201 |
| - "collapsed": true |
202 |
| - }, |
203 |
| - "outputs": [], |
| 195 | + "execution_count": 8, |
| 196 | + "metadata": {}, |
| 197 | + "outputs": [ |
| 198 | + { |
| 199 | + "name": "stdout", |
| 200 | + "output_type": "stream", |
| 201 | + "text": [ |
| 202 | + "Most Informative Features\n", |
| 203 | + " contains(windows) = True comp.o : rec.sp = 235.0 : 1.0\n", |
| 204 | + " contains(sale) = True misc.f : comp.w = 208.4 : 1.0\n", |
| 205 | + " contains(car) = True rec.au : comp.w = 194.3 : 1.0\n", |
| 206 | + " contains(dod) = True rec.mo : comp.w = 186.6 : 1.0\n", |
| 207 | + "contains(nntp-posting-host) = True talk.p : soc.re = 180.7 : 1.0\n", |
| 208 | + " contains(clipper) = True sci.cr : misc.f = 180.2 : 1.0\n", |
| 209 | + " contains(chip) = True sci.cr : sci.sp = 164.7 : 1.0\n", |
| 210 | + " contains(bike) = True rec.mo : rec.sp = 162.2 : 1.0\n", |
| 211 | + " contains(encryption) = True sci.cr : sci.el = 155.4 : 1.0\n", |
| 212 | + " contains(team) = True rec.sp : rec.au = 151.0 : 1.0\n", |
| 213 | + " contains(gun) = True talk.p : rec.sp = 149.1 : 1.0\n", |
| 214 | + " contains(israel) = True talk.p : comp.w = 139.8 : 1.0\n", |
| 215 | + " contains(game) = True rec.sp : sci.me = 134.7 : 1.0\n", |
| 216 | + " contains(mac) = True comp.s : rec.sp = 127.8 : 1.0\n", |
| 217 | + " contains(government) = True sci.cr : comp.w = 126.8 : 1.0\n", |
| 218 | + " contains(israeli) = True talk.p : soc.re = 124.2 : 1.0\n", |
| 219 | + " contains(christian) = True talk.r : sci.me = 122.1 : 1.0\n", |
| 220 | + " contains(baseball) = True rec.sp : rec.mo = 118.8 : 1.0\n", |
| 221 | + " contains(god) = True soc.re : sci.el = 117.8 : 1.0\n", |
| 222 | + " contains(christians) = True soc.re : sci.sp = 117.4 : 1.0\n" |
| 223 | + ] |
| 224 | + } |
| 225 | + ], |
204 | 226 | "source": [
|
205 | 227 | "# Shows what the features look like and what the important ones are\n",
|
206 | 228 | "# Very helpful for debugging and understanding data\n",
|
|
209 | 231 | },
|
210 | 232 | {
|
211 | 233 | "cell_type": "code",
|
212 |
| - "execution_count": null, |
213 |
| - "metadata": { |
214 |
| - "collapsed": true |
215 |
| - }, |
216 |
| - "outputs": [], |
| 234 | + "execution_count": 9, |
| 235 | + "metadata": {}, |
| 236 | + "outputs": [ |
| 237 | + { |
| 238 | + "name": "stdout", |
| 239 | + "output_type": "stream", |
| 240 | + "text": [ |
| 241 | + "Accuracy: 0.7766666666666666\n" |
| 242 | + ] |
| 243 | + } |
| 244 | + ], |
217 | 245 | "source": [
|
218 | 246 | "# Pretty good, baseline is 5% because we have 20 classes\n",
|
219 |
| - "print(\"Accuracy: \"m cl.accuracy(testing_data))" |
| 247 | + "print(\"Accuracy: \", float(cl.accuracy(testing_data)))" |
220 | 248 | ]
|
221 | 249 | },
|
222 | 250 | {
|
223 | 251 | "cell_type": "code",
|
224 |
| - "execution_count": null, |
225 |
| - "metadata": { |
226 |
| - "collapsed": true |
227 |
| - }, |
228 |
| - "outputs": [], |
| 252 | + "execution_count": 10, |
| 253 | + "metadata": {}, |
| 254 | + "outputs": [ |
| 255 | + { |
| 256 | + "data": { |
| 257 | + "text/plain": [ |
| 258 | + "'misc.forsale'" |
| 259 | + ] |
| 260 | + }, |
| 261 | + "execution_count": 10, |
| 262 | + "metadata": {}, |
| 263 | + "output_type": "execute_result" |
| 264 | + } |
| 265 | + ], |
229 | 266 | "source": [
|
230 |
| - "# Weird thing, ite doesn't work well for short sentences.\n", |
| 267 | + "# Weird thing, it doesn't work well for short sentences.\n", |
231 | 268 | "# Maybe it can't overcome the prior because it was trained on longer texts.\n",
|
232 | 269 | "cl.classify('god christians jesus lord christian savior church')"
|
233 | 270 | ]
|
234 | 271 | },
|
235 | 272 | {
|
236 | 273 | "cell_type": "code",
|
237 |
| - "execution_count": null, |
238 |
| - "metadata": { |
239 |
| - "collapsed": true |
240 |
| - }, |
241 |
| - "outputs": [], |
| 274 | + "execution_count": 11, |
| 275 | + "metadata": {}, |
| 276 | + "outputs": [ |
| 277 | + { |
| 278 | + "name": "stdout", |
| 279 | + "output_type": "stream", |
| 280 | + "text": [ |
| 281 | + "from : @ ( robert ) subject : re : sho and sc nntp-posting-host : organization :\n", |
| 282 | + "Predicted: rec.autos, Actual: rec.autos\n", |
| 283 | + "from : @ magnus.acs.ohio-state.edu ( kim richard man ) subject : syquest forsale\n", |
| 284 | + "Predicted: misc.forsale, Actual: misc.forsale\n", |
| 285 | + "from : @ casbah.acns.nwu.edu ( wilson ) subject : office package article-i.d . :\n", |
| 286 | + "Predicted: comp.sys.mac.hardware, Actual: comp.os.ms-windows.misc\n", |
| 287 | + "subject : re : do n't more innocents die without the death penalty ? from : bobb\n", |
| 288 | + "Predicted: alt.atheism, Actual: alt.atheism\n", |
| 289 | + "from : livesey @ solntze.wpd.sgi.com ( jon livesey ) subject : re : genocide is \n", |
| 290 | + "Predicted: alt.atheism, Actual: alt.atheism\n", |
| 291 | + "from : @ ( david silver ) subject : re : fractal generation of clouds organizati\n", |
| 292 | + "Predicted: comp.graphics, Actual: comp.graphics\n", |
| 293 | + "subject : re : mike 's 1993 predictions from : gajarsky @ pilot.njin.net ( bob g\n", |
| 294 | + "Predicted: rec.sport.baseball, Actual: rec.sport.baseball\n", |
| 295 | + "from : jet @ ( j. eric ) subject : re : insurance and lotsa points ... in-reply-\n", |
| 296 | + "Predicted: rec.motorcycles, Actual: rec.motorcycles\n", |
| 297 | + "from : gld @ cunixb.cc.columbia.edu ( gary l dare ) subject : re : abc coverage \n", |
| 298 | + "Predicted: rec.sport.hockey, Actual: rec.sport.hockey\n", |
| 299 | + "from : sehari @ iastate.edu ( babak sehari ) subject : re : how to the disks cop\n", |
| 300 | + "Predicted: sci.crypt, Actual: sci.electronics\n" |
| 301 | + ] |
| 302 | + } |
| 303 | + ], |
242 | 304 | "source": [
|
243 | 305 | "# We see it does better with the full text\n",
|
244 | 306 | "for t in testing_data[:10]:\n",
|
245 | 307 | " print(t[0][:80])\n",
|
246 | 308 | " print(\"Predicted: {}, Actual: {}\".format(cl.classify(t[0]), t[1]))"
|
247 | 309 | ]
|
248 |
| - }, |
249 |
| - { |
250 |
| - "cell_type": "code", |
251 |
| - "execution_count": null, |
252 |
| - "metadata": { |
253 |
| - "collapsed": true |
254 |
| - }, |
255 |
| - "outputs": [], |
256 |
| - "source": [] |
257 | 310 | }
|
258 | 311 | ],
|
259 | 312 | "metadata": {
|
260 | 313 | "kernelspec": {
|
261 |
| - "display_name": "Python 3", |
| 314 | + "display_name": "Python 3 (ipykernel)", |
262 | 315 | "language": "python",
|
263 | 316 | "name": "python3"
|
264 | 317 | },
|
|
272 | 325 | "name": "python",
|
273 | 326 | "nbconvert_exporter": "python",
|
274 | 327 | "pygments_lexer": "ipython3",
|
275 |
| - "version": "3.5.2" |
| 328 | + "version": "3.8.10" |
276 | 329 | }
|
277 | 330 | },
|
278 | 331 | "nbformat": 4,
|
279 |
| - "nbformat_minor": 2 |
| 332 | + "nbformat_minor": 4 |
280 | 333 | }
|
0 commit comments