ScottMcCormack
diff --git a/‎chapter_7/0702_TextClassification_With_TextBlob.ipynb
Lines changed: 105 additions & 52 deletions b/‎chapter_7/0702_TextClassification_With_TextBlob.ipynb
Lines changed: 105 additions & 52 deletions
@@ -8,7 +8,7 @@
     {
      "data": {
       "text/plain": [
-       "['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']"
+       "['DESCR', 'data', 'filenames', 'target', 'target_names']"
       ]
      },
      "execution_count": 1,
@@ -33,13 +33,13 @@
      "output_type": "stream",
      "text": [
       "11314\n",
-      "[7 4 4 ..., 3 1 8]\n",
+      "[7 4 4 ... 3 1 8]\n",
       "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n"
      ]
     }
    ],
    "source": [
-    "# 11,314\n",
+    "# 11,314 posts\n",
     "print(len(texts.target))\n",
     "print(texts.target)\n",
     "print(texts.target_names)"
@@ -123,15 +123,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The size of vocabulary is:  18230\n",
-      "['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?', 'it']\n",
+      "The size of vocabulary is:  18279\n",
+      "['>', ',', 'the', '.', '--', ':', 'to', '(', ')', \"'ax\", 'of', '*', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?']\n",
       "(\"from : @ wam.umd.edu ( where 's my thing ) subject : what car is this ! ? nntp-posting-host : organization : university of maryland , college park lines : 15 i was wondering if anyone out there could enlighten me on this car i saw the other day . it was a sports car , looked to be from the late early 70s . it was called a . the doors were really small . in addition , the front bumper was separate from the rest of the body . this is all i know . if anyone can a model name , engine specs , years of production , where this car is made , history , or whatever info you have on this looking car , please e-mail . thanks , - il -- -- brought to you by your neighborhood -- --\", 'rec.autos')\n"
      ]
     }
@@ -141,12 +141,13 @@
     "num_testing = 300\n",
     "\n",
     "# first get vocabulary. We are creating a vocabulary to limit the features,\n",
-    "# since each word will eventually be a feature\n",
+    "# since each word will eventually be a feature.\n",
+    "# https://docs.python.org/2/library/collections.html#collections.Counter\n",
     "all_text = ''\n",
     "for i in range(num_training):\n",
     "    all_text += texts.data[i].lower()\n",
-    "    \n",
-    "# make a list of words, we need to tokenzie ourselves to get this list\n",
+    "\n",
+    "# make a list of words, we need to tokenize ourselves to get this list\n",
     "from nltk.tokenize import word_tokenize\n",
     "tokens = word_tokenize(all_text)\n",
     "tokens = [token.lower() for token in tokens]\n",
@@ -167,23 +168,18 @@
     "    tokens = word_tokenize(texts.data[i])\n",
     "    item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
     "    training_data.append((item_text, texts.target_names[texts.target[i]]))\n",
-    "\n",
     "testing_data = []\n",
     "for i in range(num_training, num_training + num_testing):\n",
     "    tokens = word_tokenize(texts.data[i])\n",
     "    item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])\n",
-    "    training_data.append((item_text, texts.target_names[texts.target[i]]))\n",
-    "    \n",
-    "print(training_data[0])  \n",
-    "    "
+    "    testing_data.append((item_text, texts.target_names[texts.target[i]]))\n",
+    "print(training_data[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# The standard TextBlob Naive Bayes Classifier re-parses the whole text of the corpus for each record.\n",
@@ -196,11 +192,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most Informative Features\n",
+      "       contains(windows) = True           comp.o : rec.sp =    235.0 : 1.0\n",
+      "          contains(sale) = True           misc.f : comp.w =    208.4 : 1.0\n",
+      "           contains(car) = True           rec.au : comp.w =    194.3 : 1.0\n",
+      "           contains(dod) = True           rec.mo : comp.w =    186.6 : 1.0\n",
+      "contains(nntp-posting-host) = True           talk.p : soc.re =    180.7 : 1.0\n",
+      "       contains(clipper) = True           sci.cr : misc.f =    180.2 : 1.0\n",
+      "          contains(chip) = True           sci.cr : sci.sp =    164.7 : 1.0\n",
+      "          contains(bike) = True           rec.mo : rec.sp =    162.2 : 1.0\n",
+      "    contains(encryption) = True           sci.cr : sci.el =    155.4 : 1.0\n",
+      "          contains(team) = True           rec.sp : rec.au =    151.0 : 1.0\n",
+      "           contains(gun) = True           talk.p : rec.sp =    149.1 : 1.0\n",
+      "        contains(israel) = True           talk.p : comp.w =    139.8 : 1.0\n",
+      "          contains(game) = True           rec.sp : sci.me =    134.7 : 1.0\n",
+      "           contains(mac) = True           comp.s : rec.sp =    127.8 : 1.0\n",
+      "    contains(government) = True           sci.cr : comp.w =    126.8 : 1.0\n",
+      "       contains(israeli) = True           talk.p : soc.re =    124.2 : 1.0\n",
+      "     contains(christian) = True           talk.r : sci.me =    122.1 : 1.0\n",
+      "      contains(baseball) = True           rec.sp : rec.mo =    118.8 : 1.0\n",
+      "           contains(god) = True           soc.re : sci.el =    117.8 : 1.0\n",
+      "    contains(christians) = True           soc.re : sci.sp =    117.4 : 1.0\n"
+     ]
+    }
+   ],
    "source": [
     "# Shows what the features look like and what the important ones are\n",
     "# Very helpful for debugging and understanding data\n",
@@ -209,56 +231,87 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy:  0.7766666666666666\n"
+     ]
+    }
+   ],
    "source": [
     "# Pretty good, baseline is 5% because we have 20 classes\n",
-    "print(\"Accuracy: \"m cl.accuracy(testing_data))"
+    "print(\"Accuracy: \", float(cl.accuracy(testing_data)))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'misc.forsale'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# Weird thing, ite doesn't work well for short sentences.\n",
+    "# Weird thing, it doesn't work well for short sentences.\n",
     "# Maybe can't overcome prior because it was training on longer texts.\n",
     "cl.classify('god christians jesus lord christian savior church')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "from : @ ( robert ) subject : re : sho and sc nntp-posting-host : organization :\n",
+      "Predicted: rec.autos, Actual: rec.autos\n",
+      "from : @ magnus.acs.ohio-state.edu ( kim richard man ) subject : syquest forsale\n",
+      "Predicted: misc.forsale, Actual: misc.forsale\n",
+      "from : @ casbah.acns.nwu.edu ( wilson ) subject : office package article-i.d . :\n",
+      "Predicted: comp.sys.mac.hardware, Actual: comp.os.ms-windows.misc\n",
+      "subject : re : do n't more innocents die without the death penalty ? from : bobb\n",
+      "Predicted: alt.atheism, Actual: alt.atheism\n",
+      "from : livesey @ solntze.wpd.sgi.com ( jon livesey ) subject : re : genocide is \n",
+      "Predicted: alt.atheism, Actual: alt.atheism\n",
+      "from : @ ( david silver ) subject : re : fractal generation of clouds organizati\n",
+      "Predicted: comp.graphics, Actual: comp.graphics\n",
+      "subject : re : mike 's 1993 predictions from : gajarsky @ pilot.njin.net ( bob g\n",
+      "Predicted: rec.sport.baseball, Actual: rec.sport.baseball\n",
+      "from : jet @ ( j. eric ) subject : re : insurance and lotsa points ... in-reply-\n",
+      "Predicted: rec.motorcycles, Actual: rec.motorcycles\n",
+      "from : gld @ cunixb.cc.columbia.edu ( gary l dare ) subject : re : abc coverage \n",
+      "Predicted: rec.sport.hockey, Actual: rec.sport.hockey\n",
+      "from : sehari @ iastate.edu ( babak sehari ) subject : re : how to the disks cop\n",
+      "Predicted: sci.crypt, Actual: sci.electronics\n"
+     ]
+    }
+   ],
    "source": [
     "# We see it does better with the full text\n",
     "for t in testing_data[:10]:\n",
     "    print(t[0][:80])\n",
     "    print(\"Predicted: {}, Actual: {}\".format(cl.classify(t[0]), t[1]))"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -272,9 +325,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.2"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }