From 9f835f4734b902bef47e72e8272a50b66bb1b2a7 Mon Sep 17 00:00:00 2001 From: "diego.esteves" Date: Wed, 6 May 2020 17:57:08 +0100 Subject: [PATCH] #51 - text features dict - feat extraction - bash update (spacy models) --- ...horus-training-news-classifiers-wiki.ipynb | 36 - {src/training => training}/README.md | 0 {src/training => training}/__init__.py | 0 .../notebooks/__init__.py | 0 training/notebooks/horus_v1/.gitignore | 1 + ...orus-training-image-classifiers-sift.ipynb | 0 ...horus-training-news-classifiers-wiki.ipynb | 4424 +++++++++++++++++ .../horus_v1/03-horus-training-ner-crf.ipynb | 0 .../notebooks/horus_v1/__init__.py | 0 .../notebooks/horus_v2/__init__.py | 0 .../training => training}/scripts/__init__.py | 0 11 files changed, 4425 insertions(+), 36 deletions(-) delete mode 100644 src/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb rename {src/training => training}/README.md (100%) rename {src/training => training}/__init__.py (100%) rename {src/training => training}/notebooks/__init__.py (100%) create mode 100644 training/notebooks/horus_v1/.gitignore rename {src/training => training}/notebooks/horus_v1/01-horus-training-image-classifiers-sift.ipynb (100%) create mode 100644 training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb rename {src/training => training}/notebooks/horus_v1/03-horus-training-ner-crf.ipynb (100%) rename {src/training => training}/notebooks/horus_v1/__init__.py (100%) rename {src/training => training}/notebooks/horus_v2/__init__.py (100%) rename {src/training => training}/scripts/__init__.py (100%) diff --git a/src/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb b/src/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb deleted file mode 100644 index e8a667c..0000000 --- a/src/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb +++ /dev/null @@ -1,36 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file diff --git a/src/training/README.md b/training/README.md similarity index 100% rename from src/training/README.md rename to training/README.md diff --git a/src/training/__init__.py b/training/__init__.py similarity index 100% rename from src/training/__init__.py rename to training/__init__.py diff --git a/src/training/notebooks/__init__.py b/training/notebooks/__init__.py similarity index 100% rename from src/training/notebooks/__init__.py rename to training/notebooks/__init__.py diff --git a/training/notebooks/horus_v1/.gitignore b/training/notebooks/horus_v1/.gitignore new file mode 100644 index 0000000..0bc9644 --- /dev/null +++ b/training/notebooks/horus_v1/.gitignore @@ -0,0 +1 @@ +data/*.* diff --git a/src/training/notebooks/horus_v1/01-horus-training-image-classifiers-sift.ipynb b/training/notebooks/horus_v1/01-horus-training-image-classifiers-sift.ipynb similarity index 100% rename from src/training/notebooks/horus_v1/01-horus-training-image-classifiers-sift.ipynb rename to training/notebooks/horus_v1/01-horus-training-image-classifiers-sift.ipynb diff --git a/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb b/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb new file mode 100644 index 0000000..6fbf1f5 --- /dev/null +++ b/training/notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb @@ -0,0 +1,4424 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": true + } + }, + "source": [ + "## HORUS-NER: Text Classification (v1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get training data from DBPedia" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import json\n", + "import csv\n", + "import pandas as pd\n", + "import io\n", + "from SPARQLWrapper import SPARQLWrapper, JSON, CSV, TSV\n", + "import os\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.feature_selection import chi2\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.model_selection import cross_val_score\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix\n", + "from IPython.display import display\n", + "from sklearn import metrics\n", + "from sklearn.svm import SVC\n", + "from sklearn.calibration import CalibratedClassifierCV\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining SPARQL queries" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "endpoint = 'https://dbpedia.org/sparql'\n", + "\n", + "query_sufix = \"\"\"\n", + "offset \n", + "limit 10000\n", + "\"\"\"\n", + "query_namespaces = \"\"\"\n", + "PREFIX dbo: \n", + "PREFIX dbr: \n", + "PREFIX rdf: \n", + "PREFIX rdfs: \n", + "\n", + "\"\"\"\n", + "query_per = \"\"\"\n", + "SELECT ?s, ?label, dbo:Person as ?type, ?abstract\n", + "FROM \n", + "WHERE {\n", + " ?s rdf:type dbo:Person .\n", + " ?s rdfs:label ?label .\n", + " ?s dbo:abstract ?abstract .\n", + " FILTER (lang(?label) = 'en') .\n", + " FILTER (lang(?abstract) = 'en')\n", + "}\n", + "\"\"\"\n", + "\n", + "query_loc = \"\"\"\n", + "SELECT ?s, ?label, ?type, ?abstract\n", + "FROM \n", + "WHERE {\n", + " ?s rdf:type ?type . \n", + " ?s rdfs:label ?label .\n", + " ?s dbo:abstract ?abstract .\n", + " FILTER (lang(?abstract) = 'en')\n", + " FILTER (lang(?label) = 'en')\n", + " FILTER (?type IN (dbo:City, dbo:Country, dbo:Towns, dbo:Municipality, dbo:Hill_station, dbo:Village, dbo:Suburb, dbo:Neighborhood, dbo:NaturalPlace, dbo:Urban_areas, dbo:Tourist_attraction)) \n", + "}\n", + "\"\"\"\n", + "\n", + "query_org = \"\"\"\n", + "SELECT ?s, ?label, ?type, ?abstract\n", + "FROM \n", + "WHERE { ?s rdf:type ?type .\n", + " ?s rdfs:label ?label .\n", + " ?s dbo:abstract ?abstract .\n", + " FILTER (lang(?abstract) = 'en')\n", + " FILTER (lang(?label) = 'en')\n", + " FILTER (?type IN (dbo:Organisation, dbo:Company)) \n", + "}\n", + "\"\"\" \n", + "\n", + "query_others = \"\"\"\n", + "SELECT ?s, ?label, ?type, ?abstract\n", + "FROM \n", + "WHERE {\n", + " ?s rdf:type ?type . \n", + " ?s rdfs:label ?label .\n", + " ?s dbo:abstract ?abstract .\n", + " FILTER (lang(?abstract) = 'en')\n", + " FILTER (lang(?label) = 'en')\n", + " FILTER (?type IN (dbo:WrittenWork, dbo:Species, dbo:CelestialBody, dbo:Food, dbo:Disease, dbo:Game, dbo:MeanOfTransportation)) \n", + "}\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving data from DBPedia" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "person 0\n", + "person 10000\n", + "person 20000\n", + "person 30000\n", + "person 40000\n", + "person 50000\n", + "person 60000\n", + "person 70000\n", + "person 80000\n", + "person 90000\n", + "person 100000\n", + "location 0\n", + "location 10000\n", + "location 20000\n", + "location 30000\n", + "location 40000\n", + "location 50000\n", + "location 60000\n", + "location 70000\n", + "location 80000\n", + "location 90000\n", + "location 100000\n", + "organisation 0\n", + "organisation 10000\n", + "organisation 20000\n", + "organisation 30000\n", + "organisation 40000\n", + "organisation 50000\n", + "organisation 60000\n", + "organisation 70000\n", + "organisation 80000\n", + "organisation 90000\n", + "organisation 100000\n", + "other 0\n", + "other 10000\n", + "other 20000\n", + "other 30000\n", + "other 40000\n", + "other 50000\n", + "other 60000\n", + "other 70000\n", + "other 80000\n", + "other 90000\n", + "other 100000\n" + ] + } + ], + "source": [ + "# limit is fixed to its default: 10000\n", + "offsets = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000]\n", + "\n", + "sparql = SPARQLWrapper(\"http://dbpedia.org/sparql\")\n", + "Q = [('person', query_per), ('location', query_loc), ('organisation', query_org), ('other', query_others)]\n", + "for (label, query) in Q:\n", + " for offset in offsets:\n", + " print(label, offset)\n", + " file_sufix = '0' + str(offset)[0]\n", + " if len(str(offset))==6: \n", + " file_sufix = str(offset)[0:2]\n", + " path = f'./data/raw/dump_dbpedia_{label}_{file_sufix}.csv'\n", + " if not os.path.isfile(path):\n", + " sparql.setQuery(query_namespaces + query + query_sufix.replace('', str(offset)))\n", + " sparql.setReturnFormat(TSV)\n", + " results = sparql.query().convert()\n", + " df = pd.read_csv(io.BytesIO(results), sep=\"\\t\") \n", + " df.to_csv(path, sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\ndf_other2 = pd.read_csv('./data/raw/dump_dbpedia_other_02.csv', sep='\\t', index_col=0)\\ndf_other3 = pd.read_csv('./data/raw/dump_dbpedia_other_03.csv', sep='\\t', index_col=0)\\ndf_other4 = pd.read_csv('./data/raw/dump_dbpedia_other_04.csv', sep='\\t', index_col=0)\\ndf_other5 = pd.read_csv('./data/raw/dump_dbpedia_other_05.csv', sep='\\t', index_col=0)\\ndf_other6 = pd.read_csv('./data/raw/dump_dbpedia_other_06.csv', sep='\\t', index_col=0)\\ndf_other7 = pd.read_csv('./data/raw/dump_dbpedia_other_07.csv', sep='\\t', index_col=0)\\ndf_other8 = pd.read_csv('./data/raw/dump_dbpedia_other_08.csv', sep='\\t', index_col=0)\\ndf_other9 = pd.read_csv('./data/raw/dump_dbpedia_other_09.csv', sep='\\t', index_col=0)\\ndf_other10 = pd.read_csv('./data/raw/dump_dbpedia_other_10.csv', sep='\\t', index_col=0)\\n\"" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_per0 = pd.read_csv('./data/raw/dump_dbpedia_person_00.csv', sep='\\t', index_col=0)\n", + "df_per1 = pd.read_csv('./data/raw/dump_dbpedia_person_01.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "df_per2 = pd.read_csv('./data/raw/dump_dbpedia_person_02.csv', sep='\\t', index_col=0)\n", + "df_per3 = pd.read_csv('./data/raw/dump_dbpedia_person_03.csv', sep='\\t', index_col=0)\n", + "df_per4 = pd.read_csv('./data/raw/dump_dbpedia_person_04.csv', sep='\\t', index_col=0)\n", + "df_per5 = pd.read_csv('./data/raw/dump_dbpedia_person_05.csv', sep='\\t', index_col=0)\n", + "df_per6 = pd.read_csv('./data/raw/dump_dbpedia_person_06.csv', sep='\\t', index_col=0)\n", + "df_per7 = pd.read_csv('./data/raw/dump_dbpedia_person_07.csv', sep='\\t', index_col=0)\n", + "df_per8 = pd.read_csv('./data/raw/dump_dbpedia_person_08.csv', sep='\\t', index_col=0)\n", + "df_per9 = pd.read_csv('./data/raw/dump_dbpedia_person_09.csv', sep='\\t', index_col=0)\n", + "df_per10 = pd.read_csv('./data/raw/dump_dbpedia_person_10.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "\n", + "df_org0 = pd.read_csv('./data/raw/dump_dbpedia_organisation_00.csv', sep='\\t', index_col=0)\n", + "df_org1 = pd.read_csv('./data/raw/dump_dbpedia_organisation_01.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "df_org2 = pd.read_csv('./data/raw/dump_dbpedia_organisation_02.csv', sep='\\t', index_col=0)\n", + "df_org3 = pd.read_csv('./data/raw/dump_dbpedia_organisation_03.csv', sep='\\t', index_col=0)\n", + "df_org4 = pd.read_csv('./data/raw/dump_dbpedia_organisation_04.csv', sep='\\t', index_col=0)\n", + "df_org5 = pd.read_csv('./data/raw/dump_dbpedia_organisation_05.csv', sep='\\t', index_col=0)\n", + "df_org6 = pd.read_csv('./data/raw/dump_dbpedia_organisation_06.csv', sep='\\t', index_col=0)\n", + "df_org7 = pd.read_csv('./data/raw/dump_dbpedia_organisation_07.csv', sep='\\t', index_col=0)\n", + "df_org8 = pd.read_csv('./data/raw/dump_dbpedia_organisation_08.csv', sep='\\t', index_col=0)\n", + "df_org9 = pd.read_csv('./data/raw/dump_dbpedia_organisation_09.csv', sep='\\t', index_col=0)\n", + "df_org10 = pd.read_csv('./data/raw/dump_dbpedia_organisation_10.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "\n", + "df_loc0 = pd.read_csv('./data/raw/dump_dbpedia_location_00.csv', sep='\\t', index_col=0)\n", + "df_loc1 = pd.read_csv('./data/raw/dump_dbpedia_location_01.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "df_loc2 = pd.read_csv('./data/raw/dump_dbpedia_location_02.csv', sep='\\t', index_col=0)\n", + "df_loc3 = pd.read_csv('./data/raw/dump_dbpedia_location_03.csv', sep='\\t', index_col=0)\n", + "df_loc4 = pd.read_csv('./data/raw/dump_dbpedia_location_04.csv', sep='\\t', index_col=0)\n", + "df_loc5 = pd.read_csv('./data/raw/dump_dbpedia_location_05.csv', sep='\\t', index_col=0)\n", + "df_loc6 = pd.read_csv('./data/raw/dump_dbpedia_location_06.csv', sep='\\t', index_col=0)\n", + "df_loc7 = pd.read_csv('./data/raw/dump_dbpedia_location_07.csv', sep='\\t', index_col=0)\n", + "df_loc8 = pd.read_csv('./data/raw/dump_dbpedia_location_08.csv', sep='\\t', index_col=0)\n", + "df_loc9 = pd.read_csv('./data/raw/dump_dbpedia_location_09.csv', sep='\\t', index_col=0)\n", + "df_loc10 = pd.read_csv('./data/raw/dump_dbpedia_location_10.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "df_other0 = pd.read_csv('./data/raw/dump_dbpedia_other_00.csv', sep='\\t', index_col=0)\n", + "df_other1 = pd.read_csv('./data/raw/dump_dbpedia_other_01.csv', sep='\\t', index_col=0)\n", + "\"\"\"\n", + "df_other2 = pd.read_csv('./data/raw/dump_dbpedia_other_02.csv', sep='\\t', index_col=0)\n", + "df_other3 = pd.read_csv('./data/raw/dump_dbpedia_other_03.csv', sep='\\t', index_col=0)\n", + "df_other4 = pd.read_csv('./data/raw/dump_dbpedia_other_04.csv', sep='\\t', index_col=0)\n", + "df_other5 = pd.read_csv('./data/raw/dump_dbpedia_other_05.csv', sep='\\t', index_col=0)\n", + "df_other6 = pd.read_csv('./data/raw/dump_dbpedia_other_06.csv', sep='\\t', index_col=0)\n", + "df_other7 = pd.read_csv('./data/raw/dump_dbpedia_other_07.csv', sep='\\t', index_col=0)\n", + "df_other8 = pd.read_csv('./data/raw/dump_dbpedia_other_08.csv', sep='\\t', index_col=0)\n", + "df_other9 = pd.read_csv('./data/raw/dump_dbpedia_other_09.csv', sep='\\t', index_col=0)\n", + "df_other10 = pd.read_csv('./data/raw/dump_dbpedia_other_10.csv', sep='\\t', index_col=0)\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
slabeltypeabstract
0http://dbpedia.org/resource/Andreas_EkbergAndreas Ekberghttp://dbpedia.org/ontology/PersonAndreas Ekberg (born 2 January 1985) is a Swed...
1http://dbpedia.org/resource/Danilo_TognonDanilo Tognonhttp://dbpedia.org/ontology/PersonThe Canoeist Danilo Tognon (born October 9, 19...
2http://dbpedia.org/resource/Lorine_Livington_P...Lorine Livington Pruettehttp://dbpedia.org/ontology/PersonLorine Livington Pruette (1896–1977) was an Am...
3http://dbpedia.org/resource/Megan_LawrenceMegan Lawrencehttp://dbpedia.org/ontology/PersonMegan Lawrence (born 1972) is an American actr...
4http://dbpedia.org/resource/Nikolaos_VentourasNikolaos Ventourashttp://dbpedia.org/ontology/PersonNikos Ventouras (August 31, 1899 – April 1, 19...
\n", + "
" + ], + "text/plain": [ + " s \\\n", + "0 http://dbpedia.org/resource/Andreas_Ekberg \n", + "1 http://dbpedia.org/resource/Danilo_Tognon \n", + "2 http://dbpedia.org/resource/Lorine_Livington_P... \n", + "3 http://dbpedia.org/resource/Megan_Lawrence \n", + "4 http://dbpedia.org/resource/Nikolaos_Ventouras \n", + "\n", + " label type \\\n", + "0 Andreas Ekberg http://dbpedia.org/ontology/Person \n", + "1 Danilo Tognon http://dbpedia.org/ontology/Person \n", + "2 Lorine Livington Pruette http://dbpedia.org/ontology/Person \n", + "3 Megan Lawrence http://dbpedia.org/ontology/Person \n", + "4 Nikolaos Ventouras http://dbpedia.org/ontology/Person \n", + "\n", + " abstract \n", + "0 Andreas Ekberg (born 2 January 1985) is a Swed... \n", + "1 The Canoeist Danilo Tognon (born October 9, 19... \n", + "2 Lorine Livington Pruette (1896–1977) was an Am... \n", + "3 Megan Lawrence (born 1972) is an American actr... \n", + "4 Nikos Ventouras (August 31, 1899 – April 1, 19... " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_per0.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training data" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PERSON (20000, 4)\n", + "(20000, 4)\n", + "ORGANISATION (20000, 4)\n", + "(12292, 4)\n", + "LOCATION (20000, 4)\n", + "(20000, 4)\n", + "OTHER (20000, 4)\n", + "(19970, 4)\n" + ] + } + ], + "source": [ + "#aux = [df_per0, df_per1, df_per2, df_per3, df_per4, df_per5, df_per6, df_per7, df_per8, df_per9, df_per10]\n", + "aux = [df_per0, df_per1]\n", + "df_per = pd.concat(aux)\n", + "print('PERSON', df_per.shape)\n", + "df_per.drop_duplicates(subset =\"label\", keep = False, inplace = True) \n", + "print(df_per.shape)\n", + "df_per.to_csv('./data/processed/dump_dbpedia_person.csv', sep='\\t')\n", + "\n", + "#aux = [df_org0, df_org1, df_org2, df_org3, df_org4, df_org5, df_org6, df_org7, df_org8, df_org9, df_org10]\n", + "aux = [df_org0, df_org1]\n", + "df_org = pd.concat(aux)\n", + "print('ORGANISATION', df_org.shape)\n", + "df_org.drop_duplicates(subset =\"label\", keep = False, inplace = True) \n", + "print(df_org.shape)\n", + "df_org.to_csv('./data/processed/dump_dbpedia_organisation.csv', sep='\\t')\n", + "\n", + "#aux = [df_loc0, df_loc1, df_loc2, df_loc3, df_loc4, df_loc5, df_loc6, df_loc7, df_loc8, df_loc9, df_loc10]\n", + "aux = [df_loc0, df_loc1]\n", + "df_loc = pd.concat(aux)\n", + "print('LOCATION', df_loc.shape)\n", + "df_loc.drop_duplicates(subset =\"label\", keep = False, inplace = True) \n", + "print(df_loc.shape)\n", + "df_loc.to_csv('./data/processed/dump_dbpedia_location.csv', sep='\\t')\n", + "\n", + "#aux = [df_other0, df_other1, df_other2, df_other3, df_other4, df_other5, df_other6, df_other7, df_other8, df_other9, df_other10]\n", + "aux = [df_other0, df_other1]\n", + "df_other = pd.concat(aux)\n", + "print('OTHER', df_other.shape)\n", + "df_other.drop_duplicates(subset =\"label\", keep = False, inplace = True) \n", + "print(df_other.shape)\n", + "df_other.to_csv('./data/processed/dump_dbpedia_other.csv', sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# adding the label class\n", + "df_per['category'] = 'PER'\n", + "df_org['category'] = 'ORG'\n", + "df_loc['category'] = 'LOC'\n", + "df_other['category'] = 'OTHER'" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "category\n", + "LOC 20000\n", + "ORG 12292\n", + "OTHER 19970\n", + "PER 20000\n", + "Name: s, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_data = [df_per, df_org, df_loc, df_other]\n", + "df_train = pd.concat(training_data, ignore_index=True, keys=[\"s\", \"label\", \"type\", \"abstract\", \"category\"])\n", + "df_train.reset_index(drop=True)\n", + "df_train.groupby('category').s.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "df_train['category_id'] = df_train['category'].factorize()[0]\n", + "#df_train.set_index(['s', 'y'])\n", + "#columns = [\"s\", \"label\", \"type\", \"abstract\", \"y\"]\n", + "#df_train = df_train.reindex(columns=columns)\n", + "#df_train[columns] = df_train[columns].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "category_id_df = df_train[['category', 'category_id']].drop_duplicates().sort_values('category_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorycategory_id
0PER0
20000ORG1
32292LOC2
52292OTHER3
\n", + "
" + ], + "text/plain": [ + " category category_id\n", + "0 PER 0\n", + "20000 ORG 1\n", + "32292 LOC 2\n", + "52292 OTHER 3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "category_id_df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
slabeltypeabstractcategorycategory_id
0http://dbpedia.org/resource/Andreas_EkbergAndreas Ekberghttp://dbpedia.org/ontology/PersonAndreas Ekberg (born 2 January 1985) is a Swed...PER0
1http://dbpedia.org/resource/Danilo_TognonDanilo Tognonhttp://dbpedia.org/ontology/PersonThe Canoeist Danilo Tognon (born October 9, 19...PER0
2http://dbpedia.org/resource/Lorine_Livington_P...Lorine Livington Pruettehttp://dbpedia.org/ontology/PersonLorine Livington Pruette (1896–1977) was an Am...PER0
3http://dbpedia.org/resource/Megan_LawrenceMegan Lawrencehttp://dbpedia.org/ontology/PersonMegan Lawrence (born 1972) is an American actr...PER0
4http://dbpedia.org/resource/Nikolaos_VentourasNikolaos Ventourashttp://dbpedia.org/ontology/PersonNikos Ventouras (August 31, 1899 – April 1, 19...PER0
\n", + "
" + ], + "text/plain": [ + " s \\\n", + "0 http://dbpedia.org/resource/Andreas_Ekberg \n", + "1 http://dbpedia.org/resource/Danilo_Tognon \n", + "2 http://dbpedia.org/resource/Lorine_Livington_P... \n", + "3 http://dbpedia.org/resource/Megan_Lawrence \n", + "4 http://dbpedia.org/resource/Nikolaos_Ventouras \n", + "\n", + " label type \\\n", + "0 Andreas Ekberg http://dbpedia.org/ontology/Person \n", + "1 Danilo Tognon http://dbpedia.org/ontology/Person \n", + "2 Lorine Livington Pruette http://dbpedia.org/ontology/Person \n", + "3 Megan Lawrence http://dbpedia.org/ontology/Person \n", + "4 Nikolaos Ventouras http://dbpedia.org/ontology/Person \n", + "\n", + " abstract category category_id \n", + "0 Andreas Ekberg (born 2 January 1985) is a Swed... PER 0 \n", + "1 The Canoeist Danilo Tognon (born October 9, 19... PER 0 \n", + "2 Lorine Livington Pruette (1896–1977) was an Am... PER 0 \n", + "3 Megan Lawrence (born 1972) is an American actr... PER 0 \n", + "4 Nikos Ventouras (August 31, 1899 – April 1, 19... PER 0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "category_to_id = dict(category_id_df.values)\n", + "id_to_category = dict(category_id_df[['category_id', 'category']].values)\n", + "df_train.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfMAAAGLCAYAAADTfnvnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAdT0lEQVR4nO3df7TldV3v8edLEKOUC8qJRQw0qIOFZIPMQkrtkiQO9AMs40I3mYwcXcK9cm2tQltdTKNLatqljBbqBNwVIIYuJ8Vwoq5aN5DhhzCgxAEhZtYAo0NCWij2vn/sz8nNeH5xzmH2+ZzzfKy11/5+398f+73Zi3md7/f72d+dqkKSJPXraaNuQJIkzY9hLklS5wxzSZI6Z5hLktQ5w1ySpM7tOeoG5mr//fevlStXjroNSZJ2ixtvvPErVTU22bJuw3zlypVs3rx51G1IkrRbJLlvqmWeZpckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6tyMYZ7k4CR/m+SOJLcneXOrPzvJpiR3tef9Wj1JLkgynuTWJC8e2te6tv5dSdYN1Y9Kclvb5oIkeSrerCRJS9FsjswfB369qg4HjgHOTHI4cA5wbVWtAq5t8wAnAKvaYz1wIQzCHzgXeAlwNHDuxB8AbZ3XD223dv5vTZKk5WHGMK+q7VV1U5t+FPgicBBwEnBJW+0S4OQ2fRJwaQ1cB+yb5EDgVcCmqtpZVQ8Dm4C1bdk+VXVdVRVw6dC+JEnSDJ7UT6AmWQkcCVwPHFBV29uiB4AD2vRBwP1Dm21ttenqWyepT/b66xkc7XPIIYc8mdbnbeU5n9ytr7e73Xv+T4+6BWlS/r/Xt6X8+S2mz27WA+CSPBO4Cji7qh4ZXtaOqGuBe/suVXVRVa2pqjVjY5P+PrskScvOrMI8ydMZBPmfV9VHW/nBdoqc9vxQq28DDh7afEWrTVdfMUldkiTNwmxGswf4EPDFqnrv0KKNwMSI9HXAx4fqp7dR7ccAX2un468Bjk+yXxv4djxwTVv2SJJj2mudPrQvSZI0g9lcM38p8FrgtiS3tNrbgPOBK5OcAdwHnNKWXQ2cCIwD3wBeB1BVO5O8E7ihrfeOqtrZpt8EXAzsDXyqPSRJ0izMGOZV9XfAVN/7Pm6S9Qs4c4p9bQA2TFLfDBwxUy+SJOm7eQc4SZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdmzHMk2xI8lCSLUO1Dye5pT3uTXJLq69M8q9Dy/50aJujktyWZDzJBUnS6s9OsinJXe15v6fijUqStFTN5sj8YmDtcKGq/ktVra6q1cBVwEeHFt89sayq3jhUvxB4PbCqPSb2eQ5wbVWtAq5t85IkaZZmDPOq+iywc7Jl7ej6FODy6faR5EBgn6q6rqoKuBQ4uS0+CbikTV8yVJckSbMw32vmLwcerKq7hmqHJrk5yWeSvLzVDgK2Dq2ztdUADqiq7W36AeCAqV4syfokm5Ns3rFjxzxblyRpaZhvmJ/GE4/KtwOHVNWRwFuAy5LsM9udtaP2mmb5RVW1pqrWjI2NzbVnSZKWlD3numGSPYGfB46aqFXVY8BjbfrGJHcDhwHbgBVDm69oNYAHkxxYVdvb6fiH5tqTJEnL0XyOzH8K+FJV/cfp8yRjSfZo089lMNDtnnYa/ZEkx7Tr7KcDH2+bbQTWtel1Q3VJkjQLs/lq2uXAPwAvSLI1yRlt0al898C3nwBubV9V+wvgjVU1MXjuTcAHgXHgbuBTrX4+8MokdzH4A+H8ebwfSZKWnRlPs1fVaVPUf2WS2lUMvqo22fqbgSMmqX8VOG6mPiRJ0uS8A5wkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktS5GcM8yYYkDyXZMlR7e5JtSW5pjxOHlr01yXiSO5O8aqi+ttXGk5wzVD80yfWt/uEkey3kG5QkaambzZH5xcDaServq6rV7XE1QJLDgVOBF7Zt/iTJHkn2AN4PnAAcDpzW1gX4/bav5wMPA2fM5w1JkrTczBjmVfVZYOcs93cScEVVPVZVXwbGgaPbY7yq7qmqbwJXACclCfAK4C/a9pcAJz/J9yBJ0rI2n2vmZyW5tZ2G36/VDgLuH1pna6tNVX8O8M9V9fgu9UklWZ9kc5LNO3bsmEfrkiQtHXMN8wuB5wGrge3AHyxYR9Ooqouqak1VrRkbG9sdLylJ0qK351w2qqoHJ6aTfAD4RJvdBhw8tOqKVmOK+leBfZPs2Y7Oh9eXJEmzMKcj8yQHDs2+GpgY6b4RODXJM5IcCqwCPg/cAKxqI9f3YjBIbmNVFfC3wGva9uuAj8+lJ0mSlqsZj8yTXA4cC+yfZCtwLnBsktVAAfcCbwCoqtuTXAncATwOnFlV3277OQu4BtgD2FBVt7eX+E3giiS/C9wMfGjB3p0kScvAjGFeVadNUp4ycKvqPOC8SepXA1dPUr+HwWh3SZI0B94BTpKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS52YM8yQbkjyUZMtQ7d1JvpTk1iQfS7Jvq69M8q9JbmmPPx3a5qgktyUZT3JBkrT6s5NsSnJXe97vqXijkiQtVbM5Mr8YWLtLbRNwRFW9CPhH4K1Dy+6uqtXt8cah+oXA64FV7TGxz3OAa6tqFXBtm5ckSbM0Y5hX1WeBnbvUPl1Vj7fZ64AV0+0jyYHAPlV1XVUVcClwclt8EnBJm75kqC5JkmZhIa6Z/yrwqaH5Q5PcnOQzSV7eagcBW4fW2dpqAAdU1fY2/QBwwFQvlGR9ks1JNu/YsWMBWpckqX/zCvMkvwU8Dvx5K20HDqmqI4G3AJcl2We2+2tH7TXN8ouqak1VrRkbG5tH55IkLR17znXDJL8C/AxwXAthquox4LE2fWOSu4HDgG088VT8ilYDeDDJgVW1vZ2Of2iuPUmStBzN6cg8yVrgN4Cfq6pvDNXHkuzRpp/LYKDbPe00+iNJjmmj2E8HPt422wisa9PrhuqSJGkWZjwyT3I5cCywf5KtwLkMRq8/A9jUvmF2XRu5/hPAO5J8C/h34I1VNTF47k0MRsbvzeAa+8R19vOBK5OcAdwHnLIg70xqVp7zyVG38JS69/yfHnULkkZsxjCvqtMmKX9oinWvAq6aYtlm4IhJ6l8FjpupD0mSNDnvACdJUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOzCvMkG5I8lGTLUO3ZSTYluas979fqSXJBkvEktyZ58dA269r6dyVZN1Q/KsltbZsLkmQh36QkSUvZbI/MLwbW7lI7B7i2qlYB17Z5gBOAVe2xHrgQBuEPnAu8BDgaOHfiD4C2zuuHttv1tSRJ0hRmFeZV9Vlg5y7lk4BL2vQlwMlD9Utr4Dpg3yQHAq8CNlXVzqp6GNgErG3L9qmq66qqgEuH9iVJkmYwn2vmB1TV9jb9AHBAmz4IuH9ova2tNl196yT175JkfZLNSTbv2LFjHq1LkrR0LMgAuHZEXQuxrxle56KqWlNVa8bGxp7ql5MkqQvzCfMH2yly2vNDrb4NOHhovRWtNl19xSR1SZI0C/MJ843AxIj0dcDHh+qnt1HtxwBfa6fjrwGOT7JfG/h2PHBNW/ZIkmPaKPbTh/YlSZJmsOdsVkpyOXAssH+SrQxGpZ8PXJnkDOA+4JS2+tXAicA48A3gdQBVtTPJO4Eb2nrvqKqJQXVvYjBifm/gU+0hSZJmYVZhXlWnTbHouEnWLeDMKfazAdgwSX0zcMRsepEkSU/kHeAkSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHVuzmGe5AVJbhl6PJLk7CRvT7JtqH7i0DZvTTKe5M4krxqqr2218STnzPdNSZK0nOw51w2r6k5gNUCSPYBtwMeA1wHvq6r3DK+f5HDgVOCFwA8Af53ksLb4/cArga3ADUk2VtUdc+1NkqTlZM5hvovjgLur6r4kU61zEnBFVT0GfDnJOHB0WzZeVfcAJLmirWuYS5I0Cwt1zfxU4PKh+bOS3JpkQ5L9Wu0g4P6hdba22lT175JkfZLNSTbv2LFjgVqXJKlv8w7zJHsBPwd8pJUuBJ7H4BT8duAP5vsaE6rqoqpaU1VrxsbGFmq3kiR1bSFOs58A3FRVDwJMPAMk+QDwiTa7DTh4aLsVrcY0dUmSNIOFOM1+GkOn2JMcOLTs1cCWNr0RODXJM5IcCqwCPg/cAKxKcmg7yj+1rStJkmZhXkfmSb6PwSj0NwyV35VkNVDAvRPLqur2JFcyGNj2OHBmVX277ecs4BpgD2BDVd0+n74kSVpO5hXmVfV14Dm71F47zfrnAedNUr8auHo+vUiStFx5BzhJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ2bd5gnuTfJbUluSbK51Z6dZFOSu9rzfq2eJBckGU9ya5IXD+1nXVv/riTr5tuXJEnLxUIdmf9kVa2uqjVt/hzg2qpaBVzb5gFOAFa1x3rgQhiEP3Au8BLgaODciT8AJEnS9J6q0+wnAZe06UuAk4fql9bAdcC+SQ4EXgVsqqqdVfUwsAlY+xT1JknSkrIQYV7Ap5PcmGR9qx1QVdvb9APAAW36IOD+oW23ttpU9SdIsj7J5iSbd+zYsQCtS5LUvz0XYB8vq6ptSb4f2JTkS8MLq6qS1AK8DlV1EXARwJo1axZkn5Ik9W7eR+ZVta09PwR8jME17wfb6XPa80Nt9W3AwUObr2i1qeqSJGkG8wrzJN+X5FkT08DxwBZgIzAxIn0d8PE2vRE4vY1qPwb4Wjsdfw1wfJL92sC341tNkiTNYL6n2Q8APpZkYl+XVdVfJbkBuDLJGcB9wClt/auBE4Fx4BvA6wCqameSdwI3tPXeUVU759mbJEnLwrzCvKruAX50kvpXgeMmqRdw5hT72gBsmE8/kiQtR94BTpKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5+Yc5kkOTvK3Se5IcnuSN7f625NsS3JLe5w4tM1bk4wnuTPJq4bqa1ttPMk583tLkiQtL3vOY9vHgV+vqpuSPAu4Mcmmtux9VfWe4ZWTHA6cCrwQ+AHgr5Mc1ha/H3glsBW4IcnGqrpjHr1JkrRszDnMq2o7sL1NP5rki8BB02xyEnBFVT0GfDnJOHB0WzZeVfcAJLmirWuYS5I0CwtyzTzJSuBI4PpWOivJrUk2JNmv1Q4C7h/abGurTVWf7HXWJ9mcZPOOHTsWonVJkro37zBP8kzgKuDsqnoEuBB4HrCawZH7H8z3NSZU1UVVtaaq1oyNjS3UbiVJ6tp8rpmT5OkMgvzPq+qjAFX14NDyDwCfaLPbgIOHNl/RakxTlyRJM5jPaPYAHwK+WFXvHaofOLTaq4EtbXojcGqSZyQ5FFgFfB64AViV5NAkezEYJLdxrn1JkrTczOfI/KXAa4HbktzSam8DTkuyGijgXuANAFV1e5IrGQxsexw4s6q+DZDkLOAaYA9gQ1XdPo++JElaVuYzmv3vgEyy6OpptjkPOG+S+tXTbSdJkqbmHeAkSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOGeaSJHXOMJckqXOGuSRJnTPMJUnqnGEuSVLnDHNJkjpnmEuS1DnDXJKkzhnmkiR1zjCXJKlzhrkkSZ0zzCVJ6pxhLklS5wxzSZI6Z5hLktQ5w1ySpM4Z5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUucMc0mSOmeYS5LUOcNckqTOLZowT7I2yZ1JxpOcM+p+JEnqxaII8yR7AO8HTgAOB05Lcvhou5IkqQ+LIsyBo4Hxqrqnqr4JXAGcNOKeJEnqQqpq1D2Q5DXA2qr6tTb/WuAlVXXWLuutB9a32RcAd+7WRnev/YGvjLoJzYmfXd/8/Pq11D+7H6yqsckW7Lm7O5mPqroIuGjUfewOSTZX1ZpR96Enz8+ub35+/VrOn91iOc2+DTh4aH5Fq0mSpBksljC/AViV5NAkewGnAhtH3JMkSV1YFKfZq+rxJGcB1wB7ABuq6vYRtzVqy+JywhLlZ9c3P79+LdvPblEMgJMkSXO3WE6zS5KkOTLMJUnqnGEuSVLnDHNJkjpnmC8CScYmuxd9ksOTTHq3H0nSd0tyWJIPjLqP3c0wXxz+iMFtCHf1HOB/7+Ze9CQkWZHkZUPzb0nyP9vj+aPsTTNLskeS/Yfm90qyPskXR9mXZpbkRUk+nWRLkt9NcmCSq4C/Ae4YdX+7m2G+ODy/qj67a7GqPge8aAT9aPbeDew7NP8G4OtAAb8zko40K0lOBXYCtyb5TJLjgXsY/Hrjfx1pc5qNDwCXAb8A7ABuAe5m8O/p+0bZ2Cj4PfNFIMmdVfWCJ7tMo5fkpqp68dD8zVV1ZJv+XFW9fHTdaTpJtgAnV9V4khcD/wC8pqr+csStaRaS3FJVq4fm76mq546yp1FaFHeAE+NJTqyqq4eLSU5gcKSgxet7dpk/bmh6sksnWjy+WVXjAFV1U5K7DPKufE+SI4G0+ceG56vqppF1NgKG+eJwNvDJJKcAN7baGuDHgJ8ZWVeajUeTHFZV/whQVTsBkvwQ8OhIO9NMvj/JW4bm9x2er6r3jqAnzd4DwHunmC/gFbu9oxHyNPsikeQZwC8BR7TS7cBlVfVvo+tKM0myFrgAOA+YOBI4Cngb8Oaq+tSoetP0kpw73fKqcsyDumGYLyJJDgVe2GbvqCpPsXcgyRHAb/Cdz24L8O6q2jK6rqSlLclvVNW72vQvVtVHhpb9XlW9bXTd7X6G+SKQZB/ggwyO6G5hcM1nNYNT7mdU1SMjbE9zlOSQqvqnUfehySW5sqpOadO/X1W/ObTs01V1/Oi600yGB59OMhD1CfPLgV9NWxwuYPC9yFVV9QtV9fPA84DbgD8eaWeaUZIfS/KaJN/f5l+U5DLg70fcmqa3amj6lbss82ZNi1+mmJ5sfskzzBeHl1bV26vq3ycKNfAOBoPgtEgleTewgcF3XT+Z5HeBTwPX88Sw0OIz3WlJT1kufjXF9GTzS56j2Re/ZfcXZmd+Gjiyqv4tyX7A/cARVXXvaNvSLHxv+yrT04C9h77WFGDvkXam2fjRJI/QPq82TZvf9SujS57XzBeBJJcwuHPRO2voA0ny28BhVfXakTWnaU130xgtbkn+L9McwVXVT+6+bqT5McwXgTYA7kPAixkMgIPBALibGQyA+9qoetP0kvwzMHwr3p9o82FwteTnRtKYtMQl+R7gjcDzgVuBDVX1+Gi7Gh3DfBFJ8jxg4tfT7qiqu5OcXVV/OMq+NLUk/7lN7s3gGnkB48C/AlTVZ0bUmmaQ5OenW15VH91dvejJS/Jh4FvA5xjcT/++qnrzaLsaHcN8kUvyT1V1yKj70OSSPJ3BDWN+FZj4GtrBwMXA26rqWyNqTTNI8mdDsz8LDN/KtarqV3dzS3oSktxWVT/SpvcEPr/cvo42zAFwi58D4Ba3dwHPBA6tqkfhPy6bvIfBL6qdPcLeNI2qet3EdBvr8Lrp1tei8x9/KFfV48ny/qfSI/NFziPzxS3JXQwGKdYu9T2AL1WVX0/rwHK8yUjvknybwc8Nw3e+gfANvjNeZZ9R9TYKHpkvAkkeZfJRtX5FZvGrXYO8Fb+dxL+UpadIVe0x6h4WE8N8EaiqZ426B83ZHUlOr6pLh4tJfhn40oh60iwk+Uu+80f0c5NsHF7uNxHUE0+zS/OQ5CDgowxGrw//fO3ewKuratuoetP0hr6JMCm/iaCeGObSAkjyCp74i3fXjrIfzSzJxVX1K6PuQ1oIhrmkZclBb1pKvGYuabmauDf7pN9pqqqbdnM/0px5ZC5pWWrfIrmBycO8quoVu7klac48Mpe0XI0b2FoqDHNJy1r7wY7nt9nxqvq3UfYjzYWn2SUtS0mOB44DzgDuY3C6/WDgz4Df8r766snTRt2AJI3IicBzGNxX/6g2sv15wL4M7q0vdcMjc0nLkvfV11Likbmk5WrK++oz+W8lSIuWYS5pubojyem7Fr2vvnrkaXZJy5L31ddSYphLWta8r76WAsNckqTOec1ckqTOGeaSJHXOMJcEQJJjk/z4qPuQ9OQZ5pImHAs8pWGeAf/dkRaY/1NJS1yS05PcmuQLSf5Pkp9Ncn2Sm5P8dZIDkqwE3gj8jyS3JHl5krEkVyW5oT1e2vY3lmRTktuTfDDJfUn2b8vekmRLe5zdaiuT3JnkUmAL8NtJ/nCov9cned/u/u8iLSWOZpeWsCQvBD4G/HhVfSXJsxnc3eyfq6qS/Brww1X160neDvxLVb2nbXsZ8CdV9XdJDgGuqaofTvLHwLaq+l9J1gKfAsaAHwQuBo5h8KMl1wO/DDwM3NN6uC7JM4EvAD9UVd9K8v+AN1TVbbvpP4u05PgTqNLS9grgI1X1FYCq2pnkR4APJzkQ2Av48hTb/hRweJKJ+X1aEL8MeHXb318lebgtfxnwsar6OkCSjwIvBzYC91XVdW2bf0nyN8DPJPki8HSDXJofw1xafv4IeG9VbUxyLPD2KdZ7GnDMrr/vPRTuT8bXd5n/IPA2BrdN/bO57FDSd3jNXFra/gb4xSTPAWin2f8TMHGr0nVD6z4KPGto/tPAf5uYSbK6Tf49cEqrHQ/s1+qfA05O8r1Jvo/B0fvnJmuqqq5n8NvhvwRcPtc3J2nAMJeWsKq6HTgP+EySLwDvZXAk/pEkNwJfGVr9L4FXTwyAA/47sKYNnruDwQA5gN8Bjk+yBfhF4AHg0aq6icE1888zuF7+waq6eZr2rgT+vqoenmYdSbPgADhJT0qSZwDfrqrHk/wYcGFVrZ5pu0n28wngfd4LXZo/r5lLerIOAa5s3xf/JvD6J7Nxkn0ZHL1/wSCXFoZH5pIkdc5r5pIkdc4wlySpc4a5JEmdM8wlSeqcYS5JUuf+P+ShOXOq8A+KAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(8,6))\n", + "df_train.groupby('category').label.count().plot.bar(ylim=0)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(72262, 15000)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tfidf = TfidfVectorizer(sublinear_tf=True, \n", + " min_df=5, \n", + " norm='l2', \n", + " encoding='latin-1', \n", + " ngram_range=(1, 2), \n", + " stop_words='english',\n", + " max_features=15000)\n", + "features = tfidf.fit_transform(df_train.abstract.astype(str)).toarray()\n", + "labels = df_train.category_id\n", + "features.shape\n", + "# total of abstracts x nr. of features per abstract" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# 'LOC':\n", + " . -- Most correlated unigrams:\n", + ". population\n", + ". city\n", + " . -- Most correlated bigrams:\n", + ". states population\n", + ". 2010 census\n", + "# 'ORG':\n", + " . -- Most correlated unigrams:\n", + ". band\n", + ". school\n", + " . -- Most correlated bigrams:\n", + ". radio station\n", + ". high school\n", + "# 'OTHER':\n", + " . -- Most correlated unigrams:\n", + ". family\n", + ". species\n", + " . -- Most correlated bigrams:\n", + ". gastropod mollusk\n", + ". marine gastropod\n", + "# 'PER':\n", + " . -- Most correlated unigrams:\n", + ". played\n", + ". born\n", + " . -- Most correlated bigrams:\n", + ". states population\n", + ". 2010 census\n" + ] + } + ], + "source": [ + "N = 2\n", + "for category, category_id in sorted(category_to_id.items()):\n", + " features_chi2 = chi2(features, labels == category_id)\n", + " indices = np.argsort(features_chi2[0])\n", + " feature_names = np.array(tfidf.get_feature_names())[indices]\n", + " unigrams = [v for v in feature_names if len(v.split(' ')) == 1]\n", + " bigrams = [v for v in feature_names if len(v.split(' ')) == 2]\n", + " print(\"# '{}':\".format(category))\n", + " print(\" . -- Most correlated unigrams:\\n. {}\".format('\\n. '.join(unigrams[-N:])))\n", + " print(\" . -- Most correlated bigrams:\\n. {}\".format('\\n. '.join(bigrams[-N:])))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df_train['abstract'].astype(str), \n", + " df_train['category'].astype(str), \n", + " random_state = 0)\n", + "count_vect = CountVectorizer()\n", + "X_train_counts = count_vect.fit_transform(X_train)\n", + "tfidf_transformer = TfidfTransformer()\n", + "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "clf = MultinomialNB().fit(X_train_tfidf, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['PER']\n" + ] + } + ], + "source": [ + "print(clf.predict(count_vect.transform([\"Diego was born in Mars and currently lives in Venus, which is a city.\"])))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "72262" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "2\n", + "LinearSVC\n", + "3\n", + "4\n", + "4\n", + "4\n", + "CalibratedClassifierCV\n", + "3\n", + "4\n", + "4\n", + "4\n", + "LogisticRegression\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/diego.esteves/.local/share/virtualenvs/horus-MzGAWMx0/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:939: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html.\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n", + "/Users/diego.esteves/.local/share/virtualenvs/horus-MzGAWMx0/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:939: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html.\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n", + "/Users/diego.esteves/.local/share/virtualenvs/horus-MzGAWMx0/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:939: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html.\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n", + "4\n", + "4\n", + "4\n" + ] + } + ], + "source": [ + "models = [\n", + " #RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),\n", + " LinearSVC(),\n", + " CalibratedClassifierCV(LinearSVC()),\n", + " #LogisticRegression(random_state=0)\n", + " LogisticRegression(multi_class=\"multinomial\", solver=\"lbfgs\", C=5, random_state=0)\n", + "]\n", + "CV = 3\n", + "print(1)\n", + "cv_df = pd.DataFrame(index=range(CV * len(models)))\n", + "print(2)\n", + "entries = []\n", + "for model in models:\n", + " model_name = model.__class__.__name__\n", + " print(model_name)\n", + " accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)\n", + " print(3)\n", + " for fold_idx, accuracy in enumerate(accuracies):\n", + " print(4)\n", + " entries.append((model_name, fold_idx, accuracy))\n", + " cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEHCAYAAAC0pdErAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXwV9b3/8dfnZAMS1oQ9Ihawgha3aFtbC9XqD2/dcGm11tb6q9bb69JL7a1Wa60/16u1dbveWmut1YtVq15qEaQggksrmyiLSBDQsBl2IpDlnM/vj5mEkzAhJ5CTk+X9fDzyyMx3vjPnc+bMmc/Md+Z8x9wdERGRhmKZDkBERNomJQgREYmkBCEiIpGUIEREJJIShIiIRMrOdAAtpaioyIcOHZrpMERE2pV58+ZtdPe+UdM6TIIYOnQoc+fOzXQYIiLtipmtbmyamphERCSSEoSIiERSghARkUgd5hqESKaUlpYyZcoUNm3aRO/evTnllFMYOXJkpsMSOWBKECL7qaamhrvuuoupU6fWK3/uuecYM2YMN954I3l5eRmKTtKppqaG2bNnM3v2bHbu3MmQIUM4/fTTGTJkSKZDa1FKECL76YEHHmDq1KnkxRJ8fdBmRvX4lA92dOWvawt57bXX6NKlCzfccEOmw5QWtn79en7yk5+wevWem3/efPNNnn76aS6++GK+//3vY2YZjLDl6BqEyH7YtGkTkyZNIoZz39Er+NGhazh1wFauHLGO/zq2lBxL8Morr7B27dpMhyotqLq6ui45xLvH2fn5nVScXEHloZW4OX/605948cUXMx1mi9EZhEjo/vvvp7S0NKW6GzduJB6Pc0LRdkb13Flv2rCC3Yzpt42/b+jNhAkTqKqqAqC4uLjFYx4+fDhXX311iy9Xos2ePbsuOew4aweeFzwuoXpoNTUDa8h/LZ+nnnqKM844g+zs9r97bf/vQCQDEokEAMVdqyKnF3etBCAej7Nr165Wi6uzak5yb46ysjJgT3JftWoVAJWjKuuSQ62qYVV0md+FTz75hB/84Afk5+c3ufy2nuCVINqJnTt38sorrzBr1ix27dpVd1Hsc5/7XKZD6zCa80WdOXMmN910E/O2FOAODZuc523pDsD3vvc9pkyZAgQ7MWlfGib32gODRPfE3pUNEvkJsnZk1dVr75Qg2oFVq1bx4x//mPLy8rqyxYsX8/LLL3PmmWcyYcIEYjFdTmpNJ5xwAr1792bFFvjDyv58Z+gGsmMQd3jm4768ty2fbt26MXbs2LoEIemTrqPw2uXWJvcHHniAZ599lpyyHKoPrq5X13Yb2eXBLvX2229n4MCBaYmpNWmv0sZVVlbyH//xH5SXl1OV34+NI05nw+EXsm3wF/BYNpMmTeLpp5/OdJidTm5uLtdccw0Aj68awDffGsnP3h3KhW+N5OHSQQBceeWVdOvWLZNhSgs7/fTTAcj9IJecFTkQtjJZpdFtVjcsbhx33HEdIjmAziDavBkzZrB+/Xqquhax4Yhv41k5AFT2HEJV90H0ff95nnnmGc4//3xycnIyHG3nctJJJ5GTk8PDDz9MWVkZ5ZW5AAwcOJDLLruMr33taxmOUFraIYccwoUXXsjEiRMpmFlAfF6cREGC7E+ysbhRUFDAVVddlekwW4wSRAtKx4WylStXAlAx8Ji65FBrV+/hVHfpzebNm7niiitSuigWpa1fKEuWrouRB6KwsJCuXbtSXV1NdnY2+fn5TJo0iUmTJgGwfPlyIH3NIOnQnraJ1nbFFVdQVFTExIkT2bhxI1k7sgAoKSnhqquuoiM9dkAJoo1zD85h4zkRO38z4jndyNm9pcNcFGtKaWkpHyyaz5CCeKZDqScr/AOo3Fh/Wm510JK7e9WcyHkTDturjE9rYrhDXpbTKy9BToYagD+qyGq6UidmZpx//vmMHz+epUuXsmvXLoqLixk0aFCmQ2txShAtKB1HXA8//DATJ06k6+bl7Co8tN60WFUFeRXriMVi3H333RQVFbX467dFQwri3FhSkekwWsTqHVnc804BWyr3ZIOKGthaFePiz+7ka8XRt9Gm061zC1r9Nduj7OzsDn8XoRJEG3fGGWcwceJE8ssXUVUwkIr+R0Isi6zd2ygs/RvmCb705RM7TXLoSCqqjf9cUMC2qhjDCnZx9uBNFGTHee2Tnsws78Xj7+dTmOcc3be66YW1E22xiXBf2mPzILRcE6ESRBtXXFzMwIEDWbduHX1WTqPnx68Tzy0gZ+dGDKewsJArr7wy02HKfpi5JpdtVTEO7/Ep9x2zgtxY0Jx4cv+t/HHlLn6/ciCTVnXpUAmitLSUBYsXQK9MR5KisOV2wZoFmY2jOba23KKUINqB/v37k5OTQywWY9WqVWTV7CI7O5uxY8dy+eWXM2DAgEyH2GrKysr4dEdWh2gGWR1e3Lzw4E/qkkOtbwzZyFMf9WP5tmxufruA7Fa8HrF6Rxb54S+I06IXJMZ2jmtmmRCb2XIbS6dMEO31NHf48OEcdthhJBIJcnNz2bhxI7fffnuGo2uc7oTZt7gHP78e2GXv6wxdsxL0yqlhfTyLhO81WVqCA5+ArTWIA93Bhzqoh/Y6nTJBlJaWsuC9JSS69cl0KCmxqmAPMf/DDfuuGK/BanYD4LFsyM7buw+IVhLbuTktyy0uLmZ3zboOcZH6NwvzmVueyz839WBE9931pq3+NI/1u/PIizk3lFSQ14o3Ft06t4AuaehYsE3ZBbE3Y9jm+t8PX+T4UY4PU1aGTpogABLd+rB71OmZDqNl1FSRt2Im2TvW1ytO5HSl6pATifdu/YeYdFnyUqu/Znvz1cGVzC3P5cnV/RjZYyfH9gmSXnllNncsPQiALw6oatXk0CkkIDY7hm0zEl0TVB5aSSI/Qe5HueSU5WDzjXhuHA7KdKCZ12kTRIfhCbp8MJWsHRtIxHLYWXQY8Zx8um4pJXfnRvKWT2P3Yf9CokfH+Ol/RzK6sIbP96/inxty+fd3hvGZ/F10z4mzaFs+cTcKu8Q5d5h6gm1xa8C2GfGCODvO3IF3Dc4WqkZWkfduHt3mdCO2JEaiOAEd47k/+00Jop3L2vIxWTs2EM/JZ/3nLibepScA24Z8hd4rp9N9/Txyy+Z1nLOlDsQMfnj4pwzoFufvH+fx4addg3Kc4/pW8e3P7qR3Xsdq6igrK4NtLXshtdm2Bf8qj6isSw61Kg+vpMt7XYhtjxGbHmufe8itUOYtc5NBe3z7B6ysrIzYzm0dohkktmsLANsHHV+XHAAwY+vBJ5L/ybtk7VhPl0UvQqz1Pu7Yzk2UldW02uu1V1kxOH/Ybs4cupsPt2dTk4DigniHSwxtSrhq4z0jfo2fFXTlHdsdq6vXmaV1j2Fm44D7CHoheNTd72ww/WDgMaAvsBn4truXmdlXgV8nVT0MuMDdO86z/FqKB7cLVhXsfaurZ+VR3bWQvE/X19WTtikvC0b27vgJtbi4mHIrz+htrjbHsFVG9rpsaorrr3OrNLI2Bxd9El9KQDvsjDc2M0bx4Ja5ySBtCcLMsoCHgFOAMmCOmU1y9yVJ1e4BnnD3P5rZScAdwMXu/ipwVLicPkAp8EpLxVZcXMyGyuwO0eySt3w62ZtXkluxjsqe9S9GW00lObs2AVB56Kl4Xuv9dqDLkpcoLu48v8+Q9sMPcVgFXZZ0oeagGmoGhEmiBrq9EXTZ7f29XSaHlpbOM4jjgVJ3/xDAzJ4GzgKSE8QoYEI4/CoQdYZwHvCyu++MmNbp1RSNIHvzSnqseZtdfUZQ0zW8ddedXqtnEktUE+8+sFWTg0ibVgiJIQliH8Xo/rfu1PSrIZGfIHttNrHKGJ7lJEbrjBvSmyAGAx8njZcBn29QZyFwDkEz1Higu5kVuvumpDoXAPdGvYCZXQ5cDjBkSOvfytkWxHsVE+8xkKzt6xj4zmPsLDw0vItpBTm7t+AWo+qgYzMdpkjbYeDHOYmuCazUyP5kz27QezuJYxLtpyuQNMv0ReprgQfN7BJgFrCG4DeNAJjZQOBzwNSomd39EeARgJKSks55Scli7D70FPI+nE3W5pXkb1xaNymRm0/VISeS6K6mHpF6YuCjHR/pwa+pawzv4UFi6OS3tiZLZ4JYQ/2fmhSHZXXcfS3BGQRmVgCc6+7JXU19A3jB3TtOb2XpkJVL5YiTsd3bydr6EZaIk+jai3ivg8D0VFmRRuUAg8F1y1KkdCaIOcAIMzuEIDFcAHwruYKZFQGb3T0BXE9wR1OyC8NySYF36UHNgCMyHYaIdBBpO7x09xrgSoLmoaXAM+6+2MxuMbMzw2pjgWVm9gHQH7itdn4zG0pwBvJaumIUEZHGpfUahLtPBiY3KLspafg54LlG5l1FcKFbREQyQA3UIiISSQlCREQiKUGIiEgkJQgREYmkBCEiIpGUIEREJJIShIiIRFKCEBGRSEoQIiISSQlCREQiKUGIiEgkJQgREYmkBCEiIpGUIEREJJIShIiIRFKCEBGRSEoQIiISSQlCREQiKUGIiEgkJQgREYmkBCEiIpGUIEREJJIShIiIRFKCEBGRSEoQIiISSQlCREQiKUGIiEgkJQgREYmkBCEiIpHSmiDMbJyZLTOzUjO7LmL6wWY23czeNbOZZlacNG2Imb1iZkvNbImZDU1nrCIiUl/aEoSZZQEPAacBo4ALzWxUg2r3AE+4+2jgFuCOpGlPAHe7+0jgeOCTdMUqIiJ7S+cZxPFAqbt/6O5VwNPAWQ3qjAJmhMOv1k4PE0m2u08DcPcKd9+ZxlhFRKSBdCaIwcDHSeNlYVmyhcA54fB4oLuZFQKHAlvN7HkzW2Bmd4dnJPWY2eVmNtfM5paXl6fhLYiIdF6Zvkh9LTDGzBYAY4A1QBzIBk4Mpx8HfAa4pOHM7v6Iu5e4e0nfvn1bLWgRkc4gnQliDXBQ0nhxWFbH3de6+znufjRwQ1i2leBs452weaoGeBE4Jo2xiohIA+lMEHOAEWZ2iJnlAhcAk5IrmFmRmdXGcD3wWNK8vcys9rTgJGBJGmMVEZEG0pYgwiP/K4GpwFLgGXdfbGa3mNmZYbWxwDIz+wDoD9wWzhsnaF6abmbvAQb8Ll2xiojI3rLTuXB3nwxMblB2U9Lwc8Bzjcw7DRidzvhERKRxmb5ILSIibZQShIiIRFKCEBGRSEoQIiISSQlCREQiKUGIiEgkJQgREYmkBCEiIpGUIEREJJIShIiIRFKCEBGRSEoQIiISSQlCREQiKUGIiEgkJQgREYmUUoIws+fN7OtJT38TEZEOLtUd/n8B3wKWm9mdZvbZNMYkIiJtQEoJwt3/7u4XAccAq4C/m9mbZvY9M8tJZ4AiIpIZKTcZmVkhcAnwfWABcB9BwpiWlshERCSjUnomtZm9AHwW+BNwhruvCyf92czmpis4ERHJnJQSBHC/u78aNcHdS1owHhERaSNSbWIaZWa9akfMrLeZ/TBNMYmISBuQaoK4zN231o64+xbgsvSEJCIibUGqCSLLzKx2xMyygNz0hCQiIm1BqtcgphBckP5tOP6DsExERDqoVBPETwmSwr+G49OAR9MSkYiItAkpJQh3TwAPh38iItIJpPo7iBHAHcAooEttubt/Jk1xiYhIhqV6kfoPBGcPNcBXgSeAJ9MVlIiIZF6qCaKru08HzN1Xu/vNwNebmsnMxpnZMjMrNbPrIqYfbGbTzexdM5tpZsVJ0+Jm9k74NynVNyQiIi0j1YvUlWFX38vN7EpgDVCwrxnCW2EfAk4ByoA5ZjbJ3ZckVbsHeMLd/2hmJxE0Y10cTtvl7kc1472IiEgLSvUM4hqgG3A1cCzwbeC7TcxzPFDq7h+6exXwNHBWgzqjgBnh8KsR00VEJEOaTBDhmcA33b3C3cvc/Xvufq67/6OJWQcDHyeNl4VlyRYC54TD44HuYa+xAF3MbK6Z/cPMzm76rYiISEtqMkG4exz4cppe/1pgjJktAMYQNF3Fw2kHhx0Bfgv4jZkNazizmV0eJpG55eXlaQpRRKRzSvUaxILwQvGzwKe1he7+/D7mWQMclDReHJbVcfe1hGcQZlYAnFvb55O7rwn/f2hmM4GjgRUN5n8EeASgpKTEU3wvIiKSglQTRBdgE3BSUpkD+0oQc4ARZnYIQWK4gOBsoI6ZFQGbwx/iXQ88Fpb3Bna6e2VY50vAf6YYq3RwH1Vkcevcfd4j0aZs2BmcqPfvlshwJKn5qCKLQzMdhLQJqf6S+nvNXbC714R3PE0FsoDH3H2xmd0CzHX3ScBY4A4zc2AW8G/h7COB35pZgqAZ7M4Gdz8dsNjOzXRZ8lJLLjJtbPd2ALxLjwxHkrrYzs3AgBZf7vDhw1t8melWtXw5AF2GjshwJKk5lPa5nqXlpfpL6j8QnDHU4+6X7ms+d58MTG5QdlPS8HPAcxHzvQl8LpXY9kd72/iXL98BwIhhLb/DTZ8BaVnPV199dYsvM91qY77//vszHIlI86TaxJR8qN2F4I6jtS0fTutobzsZ7WBEJBNSbWL6S/K4mU0EXk9LRCIi0iak+kO5hkYA/VoyEBERaVtSvQaxg/rXINYTPCNCREQ6qFSbmLqnOxAR6SS2Qmzm/jZetLKK8H/7uasatrJ3nxX7KdUziPHADHffFo73Asa6+4stE4aIdAbt7w7C4BblEYPbxy3KAAxuufWc6l1Mv3D3F2pH3H2rmf0CUIIQkZTpDsL2JdXzvKh6qSYXERFph1JNEHPN7F4zGxb+3QvMS2dgIiKSWakmiKuAKuDPBM912M2ebjFERKQDSvUupk+BvR4ZKiIiHVdKZxBmNi28c6l2vLeZTU1fWCIikmmpNjEV1T6nAcDdt6BfUouIdGipJoiEmQ2pHTGzoUT07ioiIh1Hqreq3gC8bmavAQacCFyetqhERCTjUr1IPcXMSgiSwgKCH8jtSmdgIiKSWal2tfF94BqC50q/A3wBeIv6jyAVEZEOJNVrENcAxwGr3f2rwNEEXUKJiEgHlWqC2O3uuwHMLM/d3wc+m76wREQk01K9SF0W/g7iRWCamW0BVqcvLBERybRUL1KPDwdvNrNXgZ7AlLRFJSIiGdfsHlnd/bV0BCIiIm1LO3msk4iItDYlCBERiaQEISIikZQgREQkkhKEiIhEUoIQEZFIShAiIhJJCUJERCKlNUGY2TgzW2ZmpWa21zOtzexgM5tuZu+a2UwzK24wvYeZlZnZg+mMU0RE9pa2BGFmWcBDwGnAKOBCMxvVoNo9wBPuPhq4BbijwfT/B8xKV4wiItK4dJ5BHA+UuvuH7l4FPA2c1aDOKGBGOPxq8nQzOxboD7ySxhhFRKQR6UwQg4GPk8bLwrJkC4FzwuHxQHczKzSzGPAr4Np9vYCZXW5mc81sbnl5eQuFLSIikPmL1NcCY8xsATAGWAPEgR8Ck929bF8zu/sj7l7i7iV9+/ZNf7QiIp1Is3tzbYY1wEFJ48VhWR13X0t4BmFmBcC57r7VzL4InGhmPwQKgFwzq3D3vS50i4hIeqQzQcwBRpjZIQSJ4QLgW8kVzKwI2OzuCeB64DEAd78oqc4lQImSg4hI60pbE5O71wBXAlOBpcAz7r7YzG4xszPDamOBZWb2AcEF6dvSFY+IiDRPOs8gcPfJwOQGZTclDT8HPNfEMh4HHk9DeCIisg+ZvkgtIiJtlBKEiIhEUoIQEZFIShAiIhJJCUJERCIpQYiISCQlCBERiaQEISIikZQgREQkkhKEiIhEUoIQEZFIShAiIhJJCUJERCIpQYiISCQlCBERiaQEISIikZQgREQkkhKEiIhEUoIQEZFIShAiIhJJCUJERCIpQYiISCQlCBERiaQEISIikZQgREQkkhKEiIhEUoIQEZFIShAiIhJJCUJERCKlNUGY2TgzW2ZmpWZ2XcT0g81supm9a2Yzzaw4qXy+mb1jZovN7Ip0xikiIntLW4IwsyzgIeA0YBRwoZmNalDtHuAJdx8N3ALcEZavA77o7kcBnweuM7NB6YpVRET2ls4ziOOBUnf/0N2rgKeBsxrUGQXMCIdfrZ3u7lXuXhmW56U5ThERiZDOHe9g4OOk8bKwLNlC4JxweDzQ3cwKAczsIDN7N1zGXe6+tuELmNnlZjbXzOaWl5e3+BsQEenMMn1kfi0wxswWAGOANUAcwN0/DpuehgPfNbP+DWd290fcvcTdS/r27duacYuIdHjpTBBrgIOSxovDsjruvtbdz3H3o4EbwrKtDesAi4AT0xiriIg0kM4EMQcYYWaHmFkucAEwKbmCmRWZWW0M1wOPheXFZtY1HO4NfBlYlsZYRUSkgbQlCHevAa4EpgJLgWfcfbGZ3WJmZ4bVxgLLzOwDoD9wW1g+EvinmS0EXgPucff30hWriIjsLTudC3f3ycDkBmU3JQ0/BzwXMd80YHQ6YxMRkX3L9EVqERFpo9J6BiH7r7q6mtdff50VK1awfv16evTokemQRKSTUYJog+bMmcNtt93G5s2b68rWr1/PNddcwy9+8Qv69OmTwehEpLNQE1Mbs2TJEn563XVs3ryZqq5FbCv+IjsGHE0iK48FCxbw4x//mMrKyqYXJCJygHQG0cb84Q9/oKa6mop+o9k8bByYAbCt+Ev0f+9JVqxYwYwZMzjttNMyHKkciBUrVvDCCy+wdOlSYrEYRx55JOPHj2fw4IadDYhkjhJEC7r//vspLS3d7/lrampYtGgRbllsPXhsXXIASOTms734CxSumMIDDzzAyy+/3AIRB4YPH87VV1/dYsuTfXvmmWd48MEH65UtW7aM559/nhtuuIGTTz45Q5GJ1KcE0YbU1NQE//N6kMjputf0qvwB9epJ+/P222/XJYezBm/k1AFbqEkYk9YUMv2T3tx6660MHTqUYcOGZThSESWIFnWgR+Hbt2/nzDPPJLtyG7GqChK5BfWm51UE/RUeeeSR3H333Qf0WpIZf/7znwG4ZOh6Lv3Mhrryo3t/St7SBJPXFfLss89y3XV7PT5FpNUpQbQhPXr04IQTTuD111+n98rpbBpxOsSyAMjavY0eZW8CMG7cuEyG2WEdaBNhY5YvXw7AVVddxcKFCzGc8w/auFe9bxy0kcnrCpk2bRpr1+7VeXEkNQ9KOilBtDGXXnopc+fOhU3vk7djDbv6jCBWs4tum5djiRoOP/xwxowZk+kwpRm6dg2aC90dgCxzumXH96rXIydoOkwkEq0XnMg+KEG0McOHD+fee+/l9ttvp6ysjO7r59dNO/HEE7nuuuvIztbHlg7pPhJ3dy666CLKysp4Y2MPvtJ3e73pMz/pBcDo0aO577770hqLSCq0p2mDjjjiCJ588knmz5/PihUryMnJ4fjjj6e4uDjTockBMDPOPvtsHnzwQe5dVkx+1kcc07uCBDC7vCePfhjchHD22WdnNlCRkBJEGxWLxSgpKaGkpCTToUgLOvvss3nrrbeYN28e//7OMIpyq6lx2FqdA8App5yiJkRpM5QgRFpRbm4ud955J0899RSTJk1iY9idyoABAzj33HM577zziMXUwYG0DUoQIq0sLy+PSy+9lO985zusX7+eWCzGgAEDlBjaoQ0bNvDss88yffp0tm3bRr9+/TjttNM455xz6N69e6bDO2BKECIZkp2dretK7dgHH3zAhAkT2L59z80Ga9eu5fe//z1TpgQ9HhQVFWUwwgOnQxYRkWaqqanh5z//Odu3b6d6UDXbz9zOlu9uYce4HdT0qWHNmjXccccdmQ7zgClBiIg001tvvcW6deuId49TcWoF8b5xyIaawTVUjKvAs505c+awevXqTId6QNTEJCLtXrp/Bd/wNzJr1qwBoGp4FWTVn8e7OtUHVZO7Mpef/exnFBYWNrr8tv5LeCUIEZFG1P4Kvtms6SrtgRKEiLR7rX0UPmvWLG688UZyV+Sy+8jd9c4ibLeR81Hwu5bbbruNoUOHtmpsLUnXIEREmumEE05gwIABZG3PouDvBWRtyoIEZK/LpmBKAVZjHHvsse06OYDOIEREmi07O5tbbrmFCRMmUFFWQU5ZTr3pgwYN6hBdtusMQkRkPxx22GE8+uijnHvuufTs2ROAfv36cckll/Db3/6W/v37ZzjCA2e1XRC3dyUlJT537txMhyEinZS7Y9b+rk6b2Tx3j+z0TWcQIiItoD0mh6YoQYiISCQlCBERiaQEISIikTrMRWozKwfad8cn+1YE7P2ke2kv9Pm1Xx39szvY3ftGTegwCaKjM7O5jd1pIG2fPr/2qzN/dmpiEhGRSEoQIiISSQmi/Xgk0wHIAdHn13512s9O1yBERCSSziBERCSSEoSIiERSgmghZlYRUXaFmX2nFV77UjN7z8zeNbNFZnaWmX3XzCY2qFdkZuVmlmdmOWZ2p5ktN7P5ZvaWmZ2W7libYmYDzOxpM1thZvPMbLKZHbqP+hXh/0Fm9lw4fImZPXiAcfzIzLo1c56xZvZS0vhpZjbXzJaY2QIz+1VYfrOZXXsg8TV43TeThu82s8Xh//3a/qLiNrMxZvZWg3rZZrbBzAY1Y9l7fU/2I74SM7t/H9OHmtm3Uq0f1lmV9B16zcwOPtA4W0pr7Uciubv+WuAPqMjAaxowBFgB9AzLCoBDgB4EP+7pllT/CuCxcPhO4I9AXjjeH/hGhtehAW8BVySVHQmc2Jz1DlwCPJjCa8X2MX0VUNTM+McCL4XDR4Sfy2HheBbwr+HwzcC1aVqH24Cs/Zw3u7G4CQ4mPyb4UVVt/XHAjGa+Rtq/J8mfQzPmqfu8gV8Cv2uBOPa5jbWHv4wH0FH+GtlR1e0IgJnAXcDbwAe1O73wC3g3MAd4F/hBWF4ATAfmA+8BZ4XlQ4FlwBPAYmAM8E7UTgH4C/DNpPGZwClAN2AT0CPT661BvCcBsyLKI9dF8noP18uicPgS4H/D97sc+EUj6+5g4GFgbjj+y7De1UBV+FqvhmWnEiSv+cCzQEFYPg54Pyy/nz0J4gng0kbeZ/J2cVn42S8MP69uYfn5wKKwfFZYdni4/bwTbisjGqyDSUA8nP7NBq8zDJgCzANmsycBPA78N/BP4N4m4v4V8NOk8ceBy1rge3IU8I/wPa6DF0MAAAlsSURBVL0A9A7LjwvL3iH4jtR+vmOT1nPt9v8OsADoHi5rW1j27w3qFwB/CD/bd4Fzw/JV7EkQ44DJ4XDf8HOZE/59Kal8WrjdPErQi0MR0dvYT9jz/a7dxvKBv4Wf7yLC7ynBgduSsO49EdtLY+tqJhH7lwP+TmZ6p9BR/hrZ8JM/2JnAr8LhfwH+Hg5fDtwYDucR7KwOITia6xGWFwGlBEckQ4EE8IVwWhYwFfgo3PDPSHr984AXwuFBwNqw/mhgQabXWcT6uhr4dUR55LpIXu/snSDWAYVA1/ALWNJw3YV1+yStx5nA6HB8FXt2GEXALCA/HP8pcBPQheCoekT42TzDnh3RfODIRt5n8nZRmFR+K3BVOPweMDgc7hX+fwC4KBzOBbo23PYaDCe/znT2JJTPEx75E+zkXyI8wGgi7pLa7YZgW/2kdv0d4PfkXWBMOHwL8JtweBHwxXD4TqITxF/Zs9MuCLeVuukR9e+qXX44XruDTf68fwNcHg7/D/DlcHgIsDQcfhC4PhweBzh7EkTy9/NUgttkjeAs7CXgK8C5JJ2lAD0Jttdl7Nm2az/35M+xsXU1k4j9y4H+6ZGjrev58P88gg0Jgg1otJmdF473JNjhlAG3m9lXCDa4wQTNQACr3f0fAO4eN7NxBEdbJwO/NrNj3f1mgiOU/zKzHsA3gL+E9dP4FtPCiF4X6/cxzzR33wRgZs8DXwZeJGndhb5hZpcT7FgGAqMIvoTJvhCWvxGuu1yCs4nDgJXuvjx8nScJEn5zHGFmtwK9CHZwU8PyN4DHzewZ9mw3bwE3mFkx8Hzt6zbFzAqAE4Bnkz77vKQqz7p7vKnluPtcMysws88CI4F/uvvmVGLYR2w9CXaEr4VFfwzj7AV0d/fa6x7/A5wesYg3gHvN7CmCdVLWxPb9NeCCpPe0JWnaq2bWB6gAfp5Uf1TSMnuE6/PLwPhwGVPMLHk5ydvYqeHfgnC8gOD7PRv4lZndRZC8ZptZNrAb+H14LeulpGU2uq6SqkTtXw6IEkTrqgz/x9mz7o3gqHFqckUzu4TgNPZYd682s1UER6wAnybX9eCw4W3gbTObRnAmcbO77zKzKQQb8gXAhHCWUmCImfVw9+0t+P4O1GKCs56GLqLxddEYb2S8bt2Z2SHAtcBx7r7FzB5vZLlGkHAurFdodtQ+Xn8xcCxBE8K+PA6c7e4Lw898LIC7X2Fmnwe+DswLk/7/mNk/w7LJZvYDd5/RxPIhOHLd6u6NxZu8PTUV90SCbWlkOJxR7n6nmf2N4Kj5DTP7PwewuK8CW4GnCK5DTCBYd19w993JFZtIQsnr04A73P23DSuZ2TFh3Lea2XR3v8XMjic40DsPuJKg2TVVUfuXA6K7mDJvKvCvZpYDYGaHmlk+wZnEJ+EO8asEbZl7Ce/eOSap6Cjq92o7kWBD709wBIq77wR+D9xnZrnhcvqa2fkt+9aabQaQFx7RA2Bmownee5ProoFTzKyPmXUFziY40myoB8GXeZuZ9QeS7+LaQdCeDUGb75fMbHgYU354Z9X7wFAzGxbWS04gdwM/q70Dy8xiZnZFRAzdgXXh539R0vse5u7/dPebgHLgIDP7DPChu99PcI1ldArrgfAgYGXt52uBIxup3lTcE4FvE+y4/jeV128itm3AFjM7MSy6GHjN3bcCO8IkCUlH/cnC9fSeu99F0M5/GPU/u4amAf+WNH/vBvHUAD8CvhOeTbwCXJVUvzbJvkFwVo6ZnQrUW06SqcCl4VkHZjbYzPqFd37tdPcnCdb5MWGdnu4+meDaSb3PqLF11cjrtgidQbScbmZWljR+b4rzPUpwOjjfgsOScoId2lPAX83sPYLrEu83Mn8OcE+4we0O50/+Qk8juGD2+/BMo9aNBG3eS8xsN8GO8qYUY04Ld3czGw/8xsx+SvB+VhG0wd6fwrpI9jbBxcVi4MmweWRog9dbaGYLwuV9TP0k8ggwxczWuvtXw6P7iWZW2zRzo7t/ECazv5nZToJmg+7hst81sx+F83QjOIOp12QQ+jnBBeLy8H/tju1uM6u9tjGd4Ij+p8DFZlZN0Lx2ewrrodZFwMNmdiPBNvM0EWcJTcXt7kvN7FNgnrt/2nD+FER9T74L/Hf4eh8C3wun/V/gd2aWINgRbotY3o/Cg4YEwdnPy+Fw3MwWEpyhLUiqfyvwkJktIjjS/iV7mmZq3+M6C24R/zeC62IPmdm7BPvLWQTfr18SrKOLCQ681hMkpoIGy3rFzEYCb4VnHRUECXY4wWecAKoJ7hTrDvyvmXUh+NwnsLfG1lVaqKsNEWmTzKzA3Wt/53IdMNDdr8lwWACEBwpxd68xsy8CD++jCa/d0hmEiLRVXzez6wn2U6sJ7k5rK4YAz5hZjOCW6MsyHE9a6AxCREQi6SK1iIhEUoIQEZFIShAiIhJJCUJERCIpQYjsBwu6hy460DoibZkShIiIRFKCkE7DggfJvG9mj5vZB2b2lJl9zczesODBSceH3XO8aMGDY/4RdvWBmRWa2SsWPIznUYJfutYu99tm9raZvWNmvzWzrBRjWWpmvwuX+UrYLQhmdpmZzTGzhWb2l/BXs4RxPxzG9aEFDyh6LFzO40nLPtWCB0DNN7Nna7t5EGkuJQjpbIYTPNfgsPDvWwQ9c14L/IygC4UF7j46HH8inO8XwOvufjhBP/xDAMJuFL5J0OX0UQTdN9T1qdSEEcBD4TK3EnQBDUGvpMe5+5HAUoIuJ2r1Br5I0FfPJODXBM+J+JyZHRU2ad0IfM3djyHomiSqywaRJumX1NLZrHT39wDMbDEwPewD6j2CPrEOJtxRu/uM8MyhB0Ef/ueE5X+zPd07n0zQ++mcsK+drgTPSUg1lnfC4eQumhvrAhzgr0nxbmjwXoYS9D0V1TW5SLMpQUhnU5k0nEgaTxB8H6qbuTwD/uju1x9gLHGC5AKNdAHeYJ7k2GvHs8Pl7NU1ucj+UBOTSH2zCZuIzGwssDHsLnsWQXMUZnYae7p3ng6cZ2b9wml97MAfeB/ZBXiKGuuaXKTZdAYhUt/NwGNh9847CbpXhj3dOy8G3iR4xCvuviTsQvuVsOO2aoJuolc3XHAzNNYFeJPcvTyqa3KC5xSLNIs66xMRkUhqYhIRkUhqYhJJIzMrJLhO0dDJ7r6pteMRaQ41MYmISCQ1MYmISCQlCBERiaQEISIikZQgREQk0v8HtV60vEGtH+UAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.boxplot(x='model_name', y='accuracy', data=cv_df)\n", + "sns.stripplot(x='model_name', y='accuracy', data=cv_df, \n", + " size=8, jitter=True, edgecolor=\"gray\", linewidth=2)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "model_name\n", + "CalibratedClassifierCV 0.957848\n", + "LinearSVC 0.957032\n", + "LogisticRegression 0.956464\n", + "Name: accuracy, dtype: float64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_df.groupby('model_name').accuracy.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "model = CalibratedClassifierCV(LinearSVC())\n", + "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df_train.index, test_size=0.33, random_state=0)\n", + "model.fit(X_train, y_train)\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAawAAAGpCAYAAADRBQIfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3dd5gUVdbH8e9hhjCgZESSiIKgsIgIigRXRQmK2VXUFVSU1UWMa87oGtZ3Ma2ui4jCrgEwuypIUjGQkyRlFJUgypIky8yc94+uGRucGQbomu6ifx+eeqi6XVX3dj/Tc+aeunXL3B0REZFUVybZDRARESkJBSwREYkEBSwREYkEBSwREYkEBSwREYmEzGQ3oChbF36k4Yu7oeoRvZLdhEjalpuT7CZEUtmMlP0VkvI2b/7OEnWubf/7JmG/L8vWPChh7Uo09bBERCQS9OeRiEjU5eUmuwWlQj0sERGJBPWwRESizvOS3YJSoYAlIhJ1eekRsJQSFBGRSFAPS0Qk4lwpQRERiQSlBEVERFKHelgiIlGnlKCIiESCbhwWERFJHephiYhEnVKCIiISCRolKCIikjrUwxIRiTjdOCwiItGglKCIiEjqUA9LRCTqlBIUEZFI0I3DIiIiqUM9LBGRqFNKUEREIkGjBEVERFKHelgiIlGnlKCIiESCUoIiIiKpQz0sEZGIc0+P+7AUsEREoi5NrmEpJSgiIpGgHpaISNSlyaALBSwRkahTSlBERCIhLzdxSwmYWVUze9XMFprZAjM7xsyqm9kYM1sU/F8t2NfM7AkzyzazOWbWOu48vYP9F5lZ753Vq4AlIiK76nFglLs3Aw4HFgC3AOPcvQkwLtgG6A40CZa+wD8BzKw6cDdwNHAUcHd+kCuKApaISNR5XuKWnTCzKsCxwHMA7v6Lu68FTgeGBrsNBc4I1k8HhnnMJKCqmdUBugJj3H21u68BxgDdiqtbAUtEJOry8hK2mFlfM5sWt/TdobZGwErgeTObaWaDzawSUNvdfwj2WQHUDtbrAUvijl8alBVVXiQNuhARkQLuPggYVMwumUBroL+7Tzazx/k1/Zd/DjczT3Tb1MMSEYm6UkwJEusJLXX3ycH2q8QC2I9Bqo/g/5+C15cBDeKOrx+UFVVeJAUsEZGoS2BKcGfcfQWwxMyaBkWdgfnA20D+SL/ewFvB+ttAr2C0YDtgXZA6HA10MbNqwWCLLkFZkZQSFBGRXdUfeNHMygHfAJcQ6wCNMLM+wHfAucG+7wEnA9nApmBf3H21md0HTA32G+Duq4urVAFLRCTqSnmmC3efBbQp5KXOhezrQL8izjMEGFLSehWwREQiLl1ma9c1LBERiQT1sAI/b9jEPf8YRvb3yzAzBvTvzdjPZ/LR1NmUzcykwf61GHD1xVTepyIAX327lAFP/4eNmzZjZYyX/+92cnJyufi2vxWc88f/reGU49px82XnJettlapnnnmE7t1PYOXKVbRp02W716655nIeeugO6tdvxapVawDo1KkdjzxyF2XLlmXVqtV06ZIen1NJlS9fng/Hv0a58uXJzMzg9dff5d4Bf092s1JGYT9vd911Az16nEReXh4rV66ib98b+OGHn+jR4yTuuusG8vLyyMnJ5aab7uWzz6Yl+R0kUJpMfmux9GLq2brwo1Jt2O2PPU/rwxpzdpdObNuWw+atvzB30WKOatmMzIwMHh36GgDX9T6bnNxczrvufh647lKaNmrA2p83sG+limRkbN9hPe/6+7mxz7m0aX5Iqb2Pqkf0KrW6dtShw1Fs3LiJwYMHbhew6tevw9NPP0zTpgfTvn0PVq1aQ5UqlZkw4XVOP70XS5Ysp1atGqxcuSppbd+Wm5O0uotTqVJFNm7cRGZmJh9/+AbXXX83k6fMSHazCpTNSN7fvIX9vO277z6sX78BgD//+WKaNWvC1VffXvA5ArRo0Yz//OcpWrX6zeWWUrV583eWsHNNGJyw35dZx1+WsHYlmlKCwPqNm5g+7yvOOqkjAGXLZlJ5n4q0P6I5mRkZALQ85CB+/F+sZ/D5zPkccmB9mjaK3UJQtfI+vwlW3y77kdVr13PkYU1K8Z0k16efTmH16rW/Kf/b3+7i9tsfJP6Po/POO5233hrFkiXLAZIarFJZ/i/ZsmUzySxbllT9AzMZCvt5yw9WABUrViz4vPI/R4j9EaCPMZpK9c8jMzsEuNHdLy/Nendm2Y+rqF5lX+584gW+WryUQw9uyM2Xn0fFCuUL9nlj3Kd06xgbFPPt8h8xgyvufozVP6+nW6e2XHrW9lNgjZo4la6d2mCWsn+slIoePU5i+fIVfPHFgu3KmzRpRGZmWUaPfoV99tmHp54awksvvZ6kVqauMmXKMGXyKBoffCD/fOYFpkydmewmpbx77rmRCy88i3Xr1tOtW8+C8tNO68qAATdRq1ZNzjrrkiS2MARpkhIMpYdlZi3N7AMzm2tm95tZHTN7DRhP7Aazoo4rmMNq8Ih3wmhaoXJzc1nw9fec2+33jHjsTrIqlGPIa6MKXh804l0yy5ThlN8fHeyfx4z52Tx4Qx+GPnQT4yfNYtLs7X8hj5o4lZM7HVVq7yEVZWVV4Kab+jFgwMDfvJaZmUnr1i0488xLOO20i7j11qtp3LhRElqZ2vLy8mjTtgsNG7WhbZsjaN686c4PSnP33PMITZocwyuvvMkVV/z6xIq33x5Nq1adOffcy7nrrhuS2MIQlO5MF0kTVkrwWeAl4GxikyTOAr4GGrv7o0Ud5O6D3L2Nu7e57NxTQ2rab9WuWY3aNavRsulBAJzU/kgWfP0dAG+N+4yPp33Bgzf0Kegt1a5RlSObH0K1yvuSVb48nY5swYKvvy8435eLl5Cbl8thjRuW2ntIRQcd1JCGDRswZcr7LFz4CfXq1eHzz9+ldu1aLFv2A2PGfMymTZtZtWoNn3wyhZYtD012k1PWunU/8+FHn9K1y3HJbkpkDB/+Jmec0f035Z9+OoVGjQ6gRo1in2QhKSisgFXe3V9w9y/d/XFgo7vf5O5bQqpvj9SsVoXaNauxeOkKACbPWcBBDeryyYy5PP/6aJ64vR9Z5X9ND3Zo3ZxF3y1l89at5OTmMm3uVxx8QJ2C19//eCrd0rx3BTBv3pc0bHgkzZp1pFmzjixb9gPHHHMKP/64knfeGUP79m3JyMggK6sCbdu2YuHC7GQ3OaXUrFmdKlUqA1ChQgVO7HwsX375dZJbldoOPvjAgvUePbrw1Vexz+ugg37947FVqxaUL1+uYLTqXqEUp2ZKprCuYVUwsyOA/As4W+O33T11hjkFbr38fG4d+BzbcnKov39N7rv6Ys6/4QF+2ZbDn+6OdQpbHnIQd/75j1TepxK9Tj+JC254AMzodGQLjm3TsuBcoz+dxtN39U/WW0maoUOfoFOnY6hZsxrZ2ZO4775HGTp0eKH7fvllNmPGfMTUqaPJy8vjhRdeYf78r0q5xamtTp3aDHnuMTIyylCmTBleffUd3n1vbLKblTIK+3nr1u14mjQ5iLy8PL7/fhlXX30bAGee2Z0LLjibbdu2sWXLVi66qNCJF6IrxVN5iRLKsHYz+xAo6sTu7ifs7BylPax9b5HMYe1RlqrD2lNdMoe1R11Ch7WP/kfihrV3vSplR4qF8tPm7seFcV4RESlEiqfyEiWsUYI3xa3/YYfXHgijThGRtJUm17DCGnTRM2791h1e64aIiMguCisBbUWsF7YtIiJ7Ik0GXYQVsLyI9cK2RURkT6R4Ki9RwgpYh5vZz8R6U1nBOsF2hZDqFBGRvVhYowQzwjiviIgUQinB3WdmFYArgMbAHGCIu+tGFxGRMKRJSjCsUYJDgTbAF8DJgJ46JyIieySsa1iHufvvAMzsOWBKSPWIiIhSgntkW/6Ku+ek+zOhRERClSYpwbBHCcL2IwWN2FyClUOqV0RE9lIaJSgiEnXqYYmISCSE8NSNVBTWKEEREZGEUg9LRCTqlBIUEZFISJOApZSgiIhEgnpYIiJRpxuHRUQkEpQSFBERSR3qYYmIRF2a3IelgCUiEnVKCYqIiKQO9bBERKIuTXpYClgiIlGXJsPalRIUEZFIUA9LRCTiPE+jBEVEJArS5BqWUoIiIhIJ6mGJiERdmgy6UMASEYm6NLmGpZSgiIhEgnpYIiJRlyaDLhSwRESiTgFLREQiIU1ma9c1LBERiQT1sEREoi5NUoLqYYmIRF2eJ24pATP71sy+MLNZZjYtKKtuZmPMbFHwf7Wg3MzsCTPLNrM5ZtY67jy9g/0XmVnvndWrgCUiIrvjeHdv5e5tgu1bgHHu3gQYF2wDdAeaBEtf4J8QC3DA3cDRwFHA3flBrigKWCIiUed5iVt23+nA0GB9KHBGXPkwj5kEVDWzOkBXYIy7r3b3NcAYoFtxFShgiYhEXQJTgmbW18ymxS19C6nRgQ/MbHrc67Xd/YdgfQVQO1ivByyJO3ZpUFZUeZFSdtBFlVYXJbsJkbSwWdNkNyGSGs+fn+wmRJKnyXDqdOLug4BBO9mto7svM7P9gDFmtnCHc7iZJfyHQz0sEZGI87y8hC0lqs99WfD/T8AbxK5B/Rik+gj+/ynYfRnQIO7w+kFZUeVFUsASEYm6UhwlaGaVzGzf/HWgCzAXeBvIH+nXG3grWH8b6BWMFmwHrAtSh6OBLmZWLRhs0SUoK1LKpgRFRCQl1QbeMDOIxZCX3H2UmU0FRphZH+A74Nxg//eAk4FsYBNwCYC7rzaz+4CpwX4D3H11cRUrYImIRF0pPg/L3b8BDi+kfBXQuZByB/oVca4hwJCS1q2AJSISdXoeloiISOpQD0tEJOrSZC5BBSwRkahTSlBERCR1qIclIhJ1pThKMJkUsEREok4pQRERkdShHpaISMSVdA7AqFPAEhGJOqUERUREUod6WCIiUZcmPSwFLBGRqEuTYe1KCYqISCSohyUiEnVKCYqISBR4mgQspQRFRCQS1MMSEYm6NOlhKWCJiERdmsx0oZSgiIhEgnpYIiJRp5SgiIhEQpoELKUERUQkEtTDEhGJOPf06GEpYImIRJ1SgiIiIqlDPSwRkahLkx6WApaISMRpLkEREZEUoh6WiEjUpUkPSwFLRCTq0mMqQaUERUQkGtTDEhGJuHQZdKGAJSISdWkSsJQSFBGRSFAPS0Qk6tJk0IUClohIxKXLNSylBEVEJBLUwxIRibo0SQmqh1WIf/3rEb7/fgbTp48pKHvggduYPXs8U6eOZvjwQVSpUhmAzMxMBg8eyLRpHzBr1jhuvLFfsppd6qxcWer850nqDn+Guq89S9UrewGw/5CBsbLhz1D/g1fY79F7tjuuXPNDaDhtFBVP7FRQVvupBzhg4hvs98R9pfkWUk79+nUZ+8FI5syewOxZ4+l/VR8AHn7wDuZ+8REzpo/h1ZGDC37+0tmufE8BWrRoxocfvsGMGWOZNu0Dypcvn4xmh8LzPGFLKlPAKsS//z2S007rtV3Z+PETad36JNq27cqiRYsLAtPZZ59CuXLlaNOmC8cccwqXXXYBDRvWT0azS53/so0Vl9/I8vOuYPl5V5DVvg3lf3coKy69vqBs65z5bBz3ya8HlSlDtWsuY/Ok6duda93Qkay8/eFSfgepJycnhxtvupeWhx9Ph46ncuWVF3PooU0YO+5jDm91Aq2PPIlFi77hlpuvSnZTk25XvqcZGRk8//zj9O9/G61bn0iXLueybdu2ZDRb9oACViE++WQKa9as3a5s7NiJ5ObmAjBlygzq198fiD3ps1KlimRkZJCVVYFfftnGzz+vL/U2J4tv3gKAZWZCZuZ2Tz61ShWpcFQrNk34rKCs8vmns2ncJ+St3v7z3TJlJr5pU+k0OoWtWPETM2fNBWDDho0sXLiIenX3Z8zYjwt+/iZNnkG9enWS2cyUsCvf0xNPPJa5cxfwxRcLAFi9ei15eXtRHi0vgUsKCyVgmVl9M+sYt329md0VLI3DqLM09e59HqNHfwjA66+/x8aNm/j222ksWjSJxx4bxJo165LbwNJUpgx1hz9Dg/Ej2TJpBr/MXVjwUsXj27Nl8kx8YywQZexXg4rHd2T9iHeS1dpIadiwPq0Ob8HkKTO3K7/k4p6MGj0hSa2KjvjvaZMmB+EO77zzbz7//F2uv/6K5DYuwTwvcUsqC6uH9QhQNW77T8BGwIF7izrIzPqa2TQzm5abuyGkpu2Zm2++ipycHF5++Q0A2rZtRV5eLo0ataVZsw5cc83lNGp0QJJbWYry8lh+3hUs7Xo+5Vo0pezBBxa8VKnb8Wwc9esv1uo3/pk1jw8GT+08eSqoVKkiI4Y/y/V/uZv163/9Ltx6y9Xk5OTw0kuvJ7F1qW/H72lmZgbt27fh4ouv5oQTzua007py/PEdktzKBEqTHlZYowSbuvt/47Y3ufvfAcxsYlEHufsgYBBAhQoHpNxvtYsuOofu3TvTvfv5BWXnnXc6H3zwETk5OaxcuYrPP59G69YtWbz4+yS2tPTlrd/IlqmzyerQhm1ff0uZqpUp36IZK6+/p2Cfcoc1odbDtwFQpmoVsjq2hdzc7VKGEhvIM3L4s7z88hu8+eb7BeW9LjqXU04+kZO6npvE1qW+wr6ny5b9wCefTGHVqjUAjB49gVatWjBhwqfJaqbshrB6WBV22O4ct14zpDpDddJJv+f666/knHP6sDm4bgOwZMlyjjuuPQAVK2Zx1FGt+fLL7GQ1s1SVqVaFMvtWAsDKlyOrXWu2LV4CQKUTj2XzxEn4L79e2F52Si+WnnwRS0++iE1jJ7LqgScVrArx7KC/s2BhNo89PqigrGuX4/jLX67kjLMu3u7nT7ZX1Pd0zJiPad68KVlZFcjIyKBTp3YsWLAoiS1NrHRJCYbVw1pvZoe4+1cA7r4awMyaASk/ImHYsCfp1OkYatasRnb2ZO6/fyA33tiP8uXL8e67LwIwZcpM+ve/jWeeGcqgQX9nxoyxmBnDho1gbtx1nL1ZRs3q1LzvJqxMGShjbPzgYzZPnAxApW7HsW7IKyU+1/5DBlL2wAZYxSzqj36J/90zkC2fTwur6SmrQ/u2XPTHc5jzxXymTf0AgDvvfIhHBw6gfPnyjHo/9plOnjyDflfdksymJt2ufE/Xrl3HE08M5tNP/4u7M2rUBEaNGp/kd5BAKR5oEsU8hOsJZtYNeAL4KzAjKD4SuA24xt3fL+rYfKmYEoyChc2aJrsJkdR4/vxkNyGSMstkJLsJkbVly/eWqHP9r+vvE/b7subojxLWrkQLJSXo7qOAs4ilAl8IluOBs0oSrEREpOSSkRI0swwzm2lm/w22G5nZZDPLNrPhZlYuKC8fbGcHrx8Yd45bg/IvzazrzuoM7T4sd5/r7r3c/chg6e3uc80sjYbQiYiEL0nXsK4BFsRtPww86u6NgTVAn6C8D7AmKH802A8zOwzoCTQHugFPm1mxXfbQApaZHWNm55jZfsF2SzN7CdCwHBGRCDOz+sApwOBg24ATgFeDXYYCZwTrpwfbBK93DvY/HXjF3be6+2IgGziquHrDunH4EWAIcDbwrpndD3wATAaahFGniEi6SmQPK/5+2GDpW0iVjwE38etwjxrAWnfPCbaXAvWC9XrAEoDg9XXB/gXlhRxTqLBGCZ4CHOHuW8ysWtCoFu7+bUj1iYikL0/cOIn4+2ELY2Y9gJ/cfbqZHZewiksgrIC1xd23ALj7GjNbpGAlIrJX6ACcZmYnE7vntjLwOFDVzDKDXlR9YFmw/zKgAbDUzDKBKsCquPJ88ccUKqxrWAeZ2dv5C9AoWH8n2BYRkQQpzUEX7n6ru9d39wOJDZoY7+4XAhOAc4LdegNvBetvB9sEr4/32P1UbwM9g1GEjYhdLppSXN1h9bBOD/7PChrxAbELaptDqk9EJG15XkrcOnUz8EowZmEm8FxQ/hzwbzPLBlYTC3K4+zwzGwHMB3KAfu6eW1wFYQWsz4jdNHwpkD+pXgNi92PdFlKdIiJSitz9Q+DDYP0bChnlF1we+kMRx/+VWKwokbBSgn8DqgGN3L21u7cGDiaWu3wkpDpFRNKS5hLcMz2AQzxu3id3/9nMrgQWAteGVK+ISNrxBI4STGVh9bA8PljFFeYCmiNQRER2WVgBa76Z9dqx0Mz+SKyHJSIiCaKU4J7pB7xuZpcC04OyNsRGDZ4ZUp0iImkpRUYJhi6UgOXuy4CjzewEYhMbArzn7uPCqE9ERPZ+YfWwAHD38cBe9JQ0EZHUE8JjDVNSqAFLRETCly4pwdAeLyIiIpJI6mGJiERcuvSwFLBERCIuXa5hKSUoIiKRoB6WiEjEKSUoIiKRoLkERUREUoh6WCIiEZfqcwAmigKWiEjE5SklKCIikjrUwxIRibh0GXShgCUiEnHpMqxdKUEREYmEIntYZvYkxTzO3t2vDqVFIiKyS9JlaqbiUoLTSq0VIiKy29IlJVhkwHL3oaXZEBERkeLsdNCFmdUCbgYOAyrkl7v7CSG2S0RESkj3Yf3qRWAB0Ai4F/gWmBpim0REZBe4W8KWVFaSgFXD3Z8Dtrn7R+5+KaDelYiIlKqS3Ie1Lfj/BzM7BVgOVA+vSSIisis0SvBX95tZFeAG4EmgMnBdqK0SEZESS5drWDsNWO7+32B1HXB8uM0REREpXElGCT5PITcQB9eyREQkyVJ9sESilCQl+N+49QrAmcSuY4mISArQNayAu78Wv21mLwOfhNYiERGRQuzObO1NgP0S3ZAd5eTlhl3FXqnx/PnJbkIkbV4+MdlNiKSsup2S3QRBgy4KmNl6tr+GtYLYzBciIpICdA0r4O77lkZDREREirPTmS7MbFxJykREJDny3BK2pLLinodVAagI1DSzakD+O6kM1CuFtomISAmkySDBYlOCfwKuBeoC0/k1YP0M/CPkdomISAmles8oUYp7HtbjwONm1t/dnyzFNomIiPxGSWZrzzOzqvkbZlbNzP4cYptERGQX6PEiv7rc3dfmb7j7GuDy8JokIiK7Ii+BSyorScDKMLOCsGtmGUC58JokIiLyWyWZ6WIUMNzM/hVs/wl4P7wmiYjIrnBSO5WXKCUJWDcDfYErgu05wP6htUhERHZJXpqMa99pStDd84DJwLfAUcAJwIJwmyUiIrK94m4cPgQ4P1j+BwwHcHc9xFFEJIXkKSXIQmAi0MPdswHM7LpSaZWIiJRYulzDKi4leBbwAzDBzJ41s86QJp+KiIgUyswqmNkUM5ttZvPM7N6gvJGZTTazbDMbbmblgvLywXZ28PqBcee6NSj/0sy67qzuIgOWu7/p7j2BZsAEYtM07Wdm/zSzLnv2lkVEJFFK+T6srcAJ7n440AroZmbtgIeBR929MbAG6BPs3wdYE5Q/GuyHmR0G9ASaA92Ap4PbpopUkkEXG939JXc/FagPzETPwxIRSRmOJWzZaV0xG4LNssHixAbkvRqUDwXOCNZPD7YJXu8c3Nt7OvCKu29198VANrGBfUUqyY3D8Q1d4+6D3L3zrhwnIiJ7DzPLMLNZwE/AGOBrYK275wS7LOXXp3rUA5YABK+vA2rElxdyTKF2KWCJiEjqSWRK0Mz6mtm0uKXvjvW5e667tyKWdTuK2KWj0JXkxmEREUlhiZwD0N0HAYNKuO9aM5sAHANUNbPMoBdVH1gW7LYMaAAsNbNMoAqwKq48X/wxhVIPS0RESszMauU/wcPMsoCTiE0mMQE4J9itN/BWsP52sE3w+nh396C8ZzCKsBHQBJhSXN3qYYmIRFwp34dVBxgajOgrA4xw9/+a2XzgFTO7n9jgvOeC/Z8D/m1m2cBqYiMDcfd5ZjYCmA/kAP3cPbe4ihWwREQiLq8U45W7zwGOKKT8GwoZ5efuW4A/FHGuvwJ/LWndSgmKiEgkqIclIhJxmktQREQiIU2eLqKUoIiIRIN6WCIiEZfI+7BSmQKWiEjE5Vl6XMNSSlBERCJBPSwRkYhLl0EXClgiIhGXLtewlBIUEZFIUA9LRCTiSnNqpmRSwBIRibh0melCKUEREYkE9bBERCJOowRFRCQS0uUallKCIiISCephiYhEXLrch6WAJSIScelyDUspQRERiQT1sEREIk6DLgSA+vXrMvaDkcyZPYHZs8bT/6o+ADz84B3M/eIjZkwfw6sjB1OlSuUktzS1FPW5nX12D2bPGs8vW5ZwZOuWSW5lcv28fgPX3X4/p55/Oade0JdZcxcA8OLItzj1/Ms5/cI/8fenngPgsykzOPfS/px50ZWce2l/Jk+f9ZvzXXXTPZzxxytK9T2kqv5X9WHWzHHMnjWeq/tfluzmhC4vgUsqUw9rJ3JycrjxpnuZOWsu++xTiSmTRzF23MeMHfcxt93xILm5uTz4wG3ccvNV3HrbA8lubsoo6nObN28hfzj3cv751EPJbmLSPfTYM3Q4ug2P/vUOtm3bxuYtW5kyfTYTPpnEa0Ofoly5cqxasxaAalUr84+H72G/WjVY9M23/Om6Oxj/1n8KzjXmw0+pWDErWW8lpTRv3pQ+fS7gmPan8Msv23jvvy/y7ntj+frrb5PdNNlD6mHtxIoVPzFz1lwANmzYyMKFi6hXd3/GjP2Y3NxcACZNnkG9enWS2cyUU9TntnBhNl999XWSW5d86zdsZPrsuZx9alcAypYtS+V992H4m+/S54/nUq5cOQBqVKsKwKGHNGa/WjUAaNyoIVu2buWXX34BYNOmzQwb/jp/6t0zCe8k9TRr1oQpU2ayefMWcnNz+XjiJM48o3uymxWqdOlhhRKwzKyWmR1WSPlhZlYrjDpLQ8OG9Wl1eAsmT5m5XfklF/dk1OgJSWpV6ivqc0tny5avoFrVKtzx14Gcc3E/7nrwMTZt3sK33y9j+uy5nH/5tVzc70a+WPDlb44d8+EnHNa0cUFQe/LZYfTueRYVKlQo7beRkubNW0jHjkdTvXo1srIq0L3bCdSvXzfZzQqVW+KWVBZWD+tJoGYh5TWAx4s6yMz6mtk0M5uWl7cxpKbtnkqVKjJi+LNc/5e7Wb9+Q0H5rbdcTU5ODi+99HoSW5e6ivrc0l1Obi4LvsrmvDNP4dUXniIrqwLP/XsEuUZsdJgAABgxSURBVLm5/Pzzel4a9Cg39LuMv9z5IO6/DlrO/uY7Bj49hLtu7A/Awq++ZsmyHzjx9x2S9VZSzsKF2TzyyFO8/95LvPffF5k1ex65uaned5CSCCtgNXb3j3csdPeJQJFX2t19kLu3cfc2ZcpUCqlpuy4zM5ORw5/l5Zff4M033y8o73XRuZxy8olc1OuqJLYudRX1uQnsv19NateqScvmzQDoclxH5n+VTe39anLi7ztgZvzusKaYGWvWrgNgxU8ruea2+3jgzr9wQNBjmDVvAfMWLqLL2b3pdeUNfLtkGRdfdVPS3leqeP6FVzi6XXeO73w2a9euY9Gib5LdpFClS0owrEEX+xbzWtmQ6gzNs4P+zoKF2Tz2+KCCsq5djuMvf7mSEzqfzebNW5LYutRV2OcmMTVrVGf//Wqx+LulNGpYn0nTZ3HwgQfQoF4dpsyYzVFHHs633y9lW04O1apW4ef1G/jzjXdz7RWX0Lpl84Lz9DyzBz3P7AHAsh9+pN+Nd/PCP/6WrLeVMmrVqsHKlato0KAuZ5zRnQ4dT012k0KV6oEmUcIKWNlmdrK7vxdfaGbdgUj9qdOhfVsu+uM5zPliPtOmfgDAnXc+xKMDB1C+fHlGvf8KAJMnz6DfVbcks6kppajPrVz5cjz+6P3UqlWdt98axuzZ8zi5x4VJbm1y3Hbdldx879/YlrONBnXrcN9t11ExqwJ3PPAoZ/zxCsqWzeSBO27AzHj5tXdYsnQ5zzz/Es88/xIAgx77a8GgDNneyOHPUr1GNbZty+Hqq29n3bqfk90kSQCLz48n7KRmTYB3gc+A6UFxG+AYoIe7f7Wzc2SWq5cus41ICti8fGKymxBJWXU7JbsJkZXzy7KEDXF4ssEfE/b7sv+S/6Ts0ItQeljuvsjMfgdcALQIij8C/uTuyp+JiCRQusx0EdqNw+6+1cw+BFYGRfMVrEREZHeFErDMrDIwGDgSmAUY0MrMpgN93F0JZRGRBNGgiz3zBDAf6OnueQBmZsCdwD+AXiHVKyKSdhSw9kwHd784vsBjozsGmNmikOoUEZG9WDImv02Ty4MiIqUjXYZUhzXTxWdmdleQBixgZncCn4dUp4hIWsqzxC2pLKweVn/gOWI3EOc/uKcVMBPoE1KdIiJpSdew9kAwCvAPZnYwkD9r+3x3/9rMrgUeC6NeERHZe4V6DcvdvwZ2fPjR9ShgiYgkTLpcw9KgCxGRiMtLk5CVjCcOp8cnKyIiCRXWTBfrKTwwGZAVRp0iIulKgy72gLsX9zwsERFJoHRJWyUjJSgiIrLLkjHoQkREEkgpQRERiYRUn6EiUZQSFBGRSFAPS0Qk4tLlPiwFLBGRiEuPcKWUoIiI7AIza2BmE8xsvpnNM7NrgvLqZjbGzBYF/1cLys3MnjCzbDObY2at487VO9h/kZn13lndClgiIhGXl8ClBHKAG9z9MKAd0M/MDgNuAca5exNgXLAN0B1oEix9gX9CLMABdwNHA0cBd+cHuaIoYImIRFwenrBlZ9z9B3efEayvBxYA9YDTgaHBbkOBM4L104FhHjMJqGpmdYCuwBh3X+3ua4AxQLfi6lbAEhGRAmbW18ymxS19i9n3QOAIYDJQ291/CF5aAdQO1usBS+IOWxqUFVVeJA26EBGJuEQOunD3QcCgne1nZvsArwHXuvvP8Q+Yd3c3s4SPBVEPS0Qk4kr5GhZmVpZYsHrR3V8Pin8MUn0E//8UlC8DGsQdXj8oK6q8SApYIiJSYhbrSj0HLHD3gXEvvQ3kj/TrDbwVV94rGC3YDlgXpA5HA13MrFow2KJLUFYkpQRFRCKulG8c7gBcBHxhZrOCstuAh4ARZtYH+A44N3jtPeBkIBvYBFwC4O6rzew+YGqw3wB3X11cxQpYIiIRV5rhyt0/oegnx3cuZH8H+hVxriHAkJLWrZSgiIhEgnpYIiIRp8eLiIhIJHiazCaolKCIiESCelgiIhGnlKCIiERCujwPSylBERGJBPWwREQiLj36VwpYIiKRp5SgiIhIClEPS0Qk4jRKUEREIkE3DouIiKSQlO1hZZbJSHYTIin+qZ9Scll1OyW7CZG0cd7IZDdBUEpQREQiQilBERGRFKIelohIxCklKCIikZDnSgmKiIikDPWwREQiLj36VwpYIiKRp7kERUREUoh6WCIiEZcu92EpYImIRFy6DGtXSlBERCJBPSwRkYhLl0EXClgiIhGXLtewlBIUEZFIUA9LRCTi0mXQhQKWiEjEueYSFBERSR3qYYmIRJxGCYqISCToGpaIiESChrWLiIikEPWwREQiTtewREQkEjSsXUREJIWohyUiEnEaJSgiIpGgUYIiIiIpRD0sEZGI0yhBERGJBI0SFBERSSHqYYmIRJxSgiIiEgkaJSgiIpJC1MMSEYm4PA26EBGRKPAELjtjZkPM7CczmxtXVt3MxpjZouD/akG5mdkTZpZtZnPMrHXcMb2D/ReZWe+SvE8FLBER2RUvAN12KLsFGOfuTYBxwTZAd6BJsPQF/gmxAAfcDRwNHAXcnR/kiqOAJSIScXl4wpadcfePgdU7FJ8ODA3WhwJnxJUP85hJQFUzqwN0Bca4+2p3XwOM4bdB8Dd0DUtEJOJSYFh7bXf/IVhfAdQO1usBS+L2WxqUFVVeLPWwRESkgJn1NbNpcUvfXTneY9NuhBJB1cMSEYm4RE7N5O6DgEG7eNiPZlbH3X8IUn4/BeXLgAZx+9UPypYBx+1Q/uHOKlEPS0Qk4krzGlYR3gbyR/r1Bt6KK+8VjBZsB6wLUoejgS5mVi0YbNElKCuWelgiIlJiZvYysd5RTTNbSmy030PACDPrA3wHnBvs/h5wMpANbAIuAXD31WZ2HzA12G+Au+84kOM3FLBERCKuNKdmcvfzi3ipcyH7OtCviPMMAYbsSt0KWIX4178eoXv3zqxcuYojjzwJgLPOOoU77riOZs0a07HjacyYMQeAnj3P4Lrr/lRw7O9+dyjt2p3MnDnzk9L2ZHrmmUfo3v0EVq5cRZs2XbZ77ZprLuehh+6gfv1WrFq1hp49z+D666/AzNiwYSNXX307X3yxIEktT13XXH05l156Pu7O3LkL6XPZ9WzdujXZzUqqnzds4p4nnif7+6UYxoBrLmXitDlMmDyTMmZUr1qZ+67tw341qjFh0gz+8Z83KGNGRkYGN11+Pq2bHwLAwCEjmDhtNnl5zjFHNOfmvhdgZkl+d7snXR4vYqn6RitUOCBpDevY8Sg2bNjEc889WhCwmjZtTF5eHk899SC33PLXgoAVr3nzpowcOZjDDutU2k0ukMwvXIcOR7Fx4yYGDx64XcCqX78OTz/9ME2bHkz79j1YtWoN7dodycKFi1i79me6dDmOO+64lmOPPaOYs4drW25O0uouSt26+/PRhDf43eHHs2XLFl5+6Rnef388w/49ItlNK7Bx3shSr/P2gc/SuvkhnN3192zblsPmrb9QpoyxT8UsAF58ewzffL+cO6/qzabNW8iqUB4z46vFS/jLw0/z9jMPMmvBIgYOGcHzD90KQO+bHuCa3ufQtmWzUnsf5Zu0T9iXtU2dTgn7fTnth4kpG7U16KIQn3wyhTVr1m5X9uWX2Sxa9E2xx5133umMHPl2mE1LaZ9+OoXVq9f+pvxvf7uL229/cLu/AidNms7atT8DMGXKDOrVq1Nq7YySzMxMsrIqkJGRQcWsLH74YUWym5RU6zduYvq8rziry7EAlC2bSeV9KhYEK4DNW7ZC8IdbxawKBX/Ebd6yFSO2bhhbf9nGtpwcftm2jZzcXGpUq1zK7yZxUmDQRalQSjCBzjnnVM45p0+ym5FSevQ4ieXLVxSb7rv44p6MHv1h6TUqIpYvX8HAR59h8ddT2Lx5C2PGfsSYsR8nu1lJtezH/1G98r7c+dhzfLV4CYc2bsjNfS+kYoXyPDHsNd4Z/yn7VKzIcw/eVHDMuM+m8/iwV1m9dj1P3X0tAIcf2pi2LZvRude1uEPPHp05qEHdZL2tPZaqmbJEC62HZWYZZlYzbrtccENakb+54m9Yy83dEFbTQtG2bSs2bdrM/PlfJbspKSMrqwI33dSPAQMGFrnPscceQ+/e53HHHQ+WYsuioWrVKpx2alcaH9KOBg1bU6lSRS644KxkNyupcnNzWfD1d5x78vGMeOJessqXZ8jIdwG4utfZjHlhIKcc146X/zuu4JjO7Y/k7Wce5LE7+vOP/7wBwPfLf2Txkh8Y88JAxg4dyJTZC5g+V9/dVBdKwDKznsTmmppjZh+ZWRfgG2ITIV5Y1HHuPsjd27h7m4yMfcJoWmj+8IfTGDHirZ3vmEYOOqghDRs2YMqU91m48BPq1avD55+/S+3atQBo0aIZ//znw/zhD5cVmkpMd507d2Lxt9/zv/+tJicnhzfefJ9j2rVJdrOSqnbN6tSuWY2WTQ8G4KQObVnw9Xfb7XPKcccw9tPpvzm2TYumLF2xkjXr1jPu8xm0bHowFbMqUDGrAh3b/I7ZC7NL5T2EIV1SgmH1sO4AjnT3usB1wDvAle5+prvPCKnOpDEzzj67ByNHvpPspqSUefO+pGHDI2nWrCPNmnVk2bIfOOaYU/jxx5U0aFCXV175F336XEd29uJkNzUlLfl+GUcf3ZqsrAoAnHB8RxYuXJTkViVXzWpVqF2zOouXxqatmzx7PgcdUJfvlv16bW/C5Jk0qh+7Jvr98h8L0mXzs79l27ZtVK28D3VqVWfa3C/Jyc1lW04O0774MtopwQT+S2VhXcP6xd2zAdx9hpktcvfI/DYfNuxJOnU6hpo1q5GdPZn77x/I6tVrGThwALVqVeeNN55nzpz5nHrqRQB06nQ0S5cuZ/Hi75Pc8uQaOvSJuM9tEvfd9yhDhw4vdN9bb72G6tWr8dhj9wGQk5NLx46nlmZzU96UqTN5/fV3mTplNDk5OcyaNY9nB7+Y7GYl3a1X/JFb/28Q23JyqL9/Le67tg93P/E83y5dQZkyRp1aNbizX2zShbGfTeOd8Z+RmZFB+XLl+NvNV2JmnNShLVPmLODsfndiZnRo3YLjjm6V5HcmOxPKsPbg7uf4CxfXx2+7e9EXNQLJHNYeZVG9jyTZUnFYexQkY1j73iKRw9pb1G6XsN+Xc3+clLK/RMLqYT0L7FvMtoiIJEiqp/ISJZSA5e73hnFeERFJX2GNEhwRt/7wDq99EEadIiLpKs89YUsqC2uUYJO49ZN2eK1WSHWKiKSldBklGFbAKu5dp/YnIiIiKSmsQRcVzewIYgExK1i3YMkq9kgREdklqZ7KS5SwAtYKfh3GHr+evy0iIgmS6qm8RAlrlOBxYZxXRETSVygBy8yKnaHT3V8Po14RkXSklOCeOXWH9fhpmRxQwBIRSRClBPeAu1+Sv25mM+O3RUREdkdpPMAxPUK/iEiSuOcluwmlQk8cFhGJuFR/jlWihDXo4h1+7VkdZGZvx7/u7qeFUa+IiOy9wuph/V/c+t9DqkNERIAwHhOVisIKWJe4+8UhnVtEROKkS0owrLkEW4Z0XhERSVNhzyVY6JMr3X1GSPWKiKQdpQT3TD1i164KC1gOnBBSvSIiaUczXeyZbHdXUBIRkYQJ9T4sM6sANA42s919S5j1iYikI03NtGduNrOHgT7Ad8RSgw3M7HngdnffFlK9IiJpJ12uYYU1SvBkoAbQyN2PdPfWwMFAVba/R0tERPZQHp6wJZWFFbBOAS539/X5Be7+M3AlsWAmIiKyS8JKCboX0kd191wzS+0QLiISMUoJ7pn5ZtZrx0Iz+yOwMKQ6RUTSUp57wpZUFlYPqx/wupldCkwPytoAWcCZIdUpIiJ7sbAe4LgMONrMTgCaB8Xvufu4MOoTEUln6ZISDPU+LHcfD4wPsw4RkXSX6qP7EiWsa1giIiIJpScOi4hEnFKCIiISCak+ui9RlBIUEZFIUA9LRCTiNPmtiIhEglKCIiIiKUQ9LBGRiNMoQRERiYR0uYallKCIiESCelgiIhGnlKCIiERCugQspQRFRCQS1MMSEYm49OhfgaVLVzKRzKyvuw9KdjuiRp/b7tNnt3v0ue1dlBLcPX2T3YCI0ue2+/TZ7R59bnsRBSwREYkEBSwREYkEBazdo5z47tHntvv02e0efW57EQ26EBGRSFAPS0REIkEBS0REIkEBqxBmlmtms8xsrpmNNLOKO5TnL7cE5R+a2ZdmNtvMpppZq+S+g9JnZvXN7C0zW2RmX5vZ42ZWzsyOM7N1wee10Mz+b4fjupnZlOC1WWY23MwOSNb7SAYz21BIWRUzG2Zm2cHnOczMqsS9foiZvRd83jPMbISZ1S7dloeviJ+rrnHfwQ3Bd29W8BkdZ2b/3eEcL5jZOcH6h3H7zzKzV4Pye8xsWVA238zOT8b7leIpYBVus7u3cvcWwC/AFTuU5y8PxR1zobsfDjwNPFLaDU4mMzPgdeBNd28CHALsA/w12GWiu7cCjgB6mFmH4LgWwJNAb3dvFuzzInBgKb+FVPQc8I27N3b3g4HFwGAAM6sAvAv8092buHtrYj93tZLW2hAU83N1Yv53EJhG7LvXyt17lfDUF8Z9h8+JK380OOfpwL/MrGwC344kgKZm2rmJQMtd2P9z4MaQ2pKqTgC2uPvzAO6ea2bXEfslOyF/J3ffbGazgHpB0c3AA+6+IG6ft0uv2anJzBoDRwLnxRUPALLN7GDg98Dn7v5O/ovu/mGpNrJ0FPlzZWZ3u/umMCp190VmtgmoBvwURh2ye9TDKoaZZQLdgS+CoqwdUoLnFXJYN+DNUmtkamgOTI8vcPefge+BxvllZlYNaAJ8HHfcjFJqY5QcBsxy99z8gmB9FrHPrAU7fN57qRL9XBWiU/z3FDhth9dfjHv9N9kQM2sNLHJ3BasUox5W4bKCH3SI9bCeC9Y3BymDwrxoZuWIpSzS7hrWTnQys9nEgtVj7r5ixx3MrAYwDqgIDHL3/9txH5ESmujuPfI3zOyFHV6/0N2nFXLcdWZ2CbHU46khtk92k3pYhYu/VtXf3X8pwTEXAgcBQ4ldl0kn84mlsAqYWWXgACCb2C+Qw4n9xdwnblDKPKA1gLuvCv4YGEQs6Kez+UArMyv4fgbrrYLX5rHD572X2tnPVaI96u7NgbOB54JrhZJCFLASyGN3Yd8JtDOzZsluTykaB1Q0s14AZpYB/B14ASi4zuDui4GHiF27AvgbcLuZHRp3roql0eBU5u7ZwEzgjrjiO4AZwWsvAe3N7JT8F83s2GAQy96kyJ+rsK5fQcF11GlA77DqkN2jgLVrdryG9dCOO7j7ZmJfqrQZeBEE6jOBP5jZIuArYAtwWyG7PwMca2YHuvsXwDXAsGCo8afAocR+IaeTima2NG65HugDHBIM5f6aWJqqDxT8jPUA+gfDvecDfwZWJusNhGEXf652Rfw1rLFF7DMAuD6+lyvJp6mZREQkEvTXg4iIRIICloiIRIICloiIRIICloiIRIICloiIRIIClkSKFTGT/m6eK34W78Fmdlgx+x5nZu13o45vzazm7rZRRH6lgCVRU9RM+kDB/I+7zN0vc/f5xexyHLDLAUtEEkcBS6JsItA46P1MNLO3gflmlmFmj1js2WRzzOxPEHtchZn9I7hJeSywX/6JgucktQnWu1nsGVOzzWycmR1ILDBeF/TuOplZLTN7LahjatwjU2qY2QdmNs/MBgNWuh+JyN5Lk99KJMXNpD8qKGoNtHD3xWbWF1jn7m3NrDzwqZl9QOx5XE2JzYZem9hcdUN2OG8t4Fng2OBc1d19tZk9A2zIn5TXzF4iNvfcJxZ74ORoYrN03A184u4DgqmT+oT6QYikEQUsiZrCZtJvD0wJ5ioE6AK0zL8+BVQhNlP8scDLwaM6lpvZ+ELO3w74OP9c7r66iHacCBwWe8YgAJXNbJ+gjrOCY981szW7+T5FZAcKWBI1v3nESxA0NsYXAf3dffQO+52cwHaUAdq5+5ZC2iIiIdA1LNkbjQautOAR52Z2iJlVIvbgyPOCa1x1gOMLOXYSscl5GwXHVg/K1wP7xu33AdA/fyPukSkfAxcEZd2JPbVWRBJAAUv2RoOJXZ+aYWZzgX8Ryya8ASwKXhsGfL7jge6+EugLvB48dHJ48NI7wJn5gy6Aq4E2waCO+fw6WvFeYgFvHrHU4PchvUeRtKPZ2kVEJBLUwxIRkUhQwBIRkUhQwBIRkUhQwBIRkUhQwBIRkUhQwBIRkUhQwBIRkUj4f0YHu5Kb5YZGAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "conf_mat = confusion_matrix(y_test, y_pred)\n", + "fig, ax = plt.subplots(figsize=(7,7))\n", + "sns.heatmap(conf_mat, annot=True, fmt='d',\n", + " xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)\n", + "plt.ylabel('Actual')\n", + "plt.xlabel('Predicted')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'ORG' predicted as 'PER' : 128 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
25532ORGThe Bluegrass Stallions are a basketball team ...
20641ORGThe Whig Party was a political party active i...
21111ORGAlexander Mair \"Sandy\" Courage, Jr. (December ...
30511ORGMysti Mayhem (born Misty Dawn Naholnik on May ...
24901ORGThe Alliance of the Orders of Saint John of Je...
20954ORGThe Red Bull Drifting World Championship is a ...
21410ORGThe Cambridge University Light Entertainment S...
20203ORGIn cricket, a googly (or wrong 'un) is a type ...
20036ORGThe Arts and Crafts movement was an internatio...
22173ORGHideo Yoshizawa (吉沢 秀雄 Yoshizawa Hideo) is a v...
25480ORGWilliam \"Bill\" McGlaughlin (born October 3, 19...
21667ORGThe Department of Science and Technology was a...
27092ORGEartha, born Eartha Moore in Los Angeles, Cali...
31854ORGRoyal Montserrat Police Service is the Royal P...
28804ORGJustice Not Crisis is a direct action pressure...
21288ORGBikram Keith (born December 29, 1982) is a Was...
29889ORGLos de Ramón are a Chilean folkloric group of ...
29863ORGThe London Sketch Club was founded on 1 April ...
21113ORGAlexander Smith (October 14, 1818 – November 5...
20603ORGThe Post Office Department (1792–1971) was the...
22646ORGMiguel Caballero is a bulletproof clothing com...
27007ORGDrew Danburry is a singer-songwriter who has t...
21056ORGAdolph Alfred Taubman (January 31, 1924 – Apri...
22310ORGJason Jones (born June 1, 1971) is a video gam...
31676ORGThe Bangladesh Army (Bengali: বাংলাদেশ সেনা বা...
23609ORGThe Trío Matamoros played boleros and son. The...
23953ORGDaniel R. aka Báalam is a Psytrance promoter/p...
20465ORGA state legislature in the United States is th...
26683ORGDJ Paulette (born Paulette Constable) is a Man...
22983ORGThe Pets Evacuation and Transportation Standar...
.........
20468ORGThe Stimson Doctrine is a policy of the United...
24951ORGThe American Combat Association is a small mix...
26281ORGChristopher Howard Wolf (born September 21, 19...
20261ORGKhlysts or Khlysty (Russian: Хлысты) was an un...
29729ORGLehman Brothers Treasury Co. B.V. (\"LBT\") is a...
24472ORGThe two hundred lei banknote is one of the cir...
21869ORGThe Federation of Nepalese Journalists (Nepali...
23028ORGProcess thinking, also known as '\"the process\"...
22387ORGKavka's toxin puzzle is a thought experiment a...
21592ORGCrescent Toys was a British toy manufacturing ...
22334ORGJohn Leask Lumley (4 November 1930 in Detroit,...
27840ORGGeorgian Post is the postal service in Georgia.
21274ORGThe Bible Presbyterian Church is an American P...
20939ORGPeter Walker (died 1879) was the owner and mas...
26110ORGThe Center for a Just Society is a conservativ...
20009ORGAbolitionism is a movement to end slavery, whe...
21610ORGEBK is a drum and bass producer and DJ from Lo...
24835ORGAl Ramsay Shield is an annual international me...
27280ORGEternal Descent is a British virtual band crea...
31470ORGThe Prairie Wolf Pack is a Canadian rugby unio...
20691ORGAnti H-Block was the political label used in 1...
26775ORGDavid Fray (born 24 May 1981) is a French clas...
21627ORGDavid Nicholl (14 June 1871 – 11 March 1918) w...
20545ORGTommy James and the Shondells are an American ...
21770ORGAn ell (from Proto-Germanic *alinō, cognate wi...
21105ORGAlbert Mortimer Jenkin (14 September 1872 – 3 ...
21337ORGBrandes Investment Partners is an investment a...
20919ORGNoahidism (/ˈnoʊə.haɪd.ɪsm/) or Noachidism (/ˈ...
31225ORGThe Northern Territory Parliament consists of ...
28125ORGHarrington Talents is a film production compan...
\n", + "

128 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "25532 ORG The Bluegrass Stallions are a basketball team ...\n", + "20641 ORG The Whig Party was a political party active i...\n", + "21111 ORG Alexander Mair \"Sandy\" Courage, Jr. (December ...\n", + "30511 ORG Mysti Mayhem (born Misty Dawn Naholnik on May ...\n", + "24901 ORG The Alliance of the Orders of Saint John of Je...\n", + "20954 ORG The Red Bull Drifting World Championship is a ...\n", + "21410 ORG The Cambridge University Light Entertainment S...\n", + "20203 ORG In cricket, a googly (or wrong 'un) is a type ...\n", + "20036 ORG The Arts and Crafts movement was an internatio...\n", + "22173 ORG Hideo Yoshizawa (吉沢 秀雄 Yoshizawa Hideo) is a v...\n", + "25480 ORG William \"Bill\" McGlaughlin (born October 3, 19...\n", + "21667 ORG The Department of Science and Technology was a...\n", + "27092 ORG Eartha, born Eartha Moore in Los Angeles, Cali...\n", + "31854 ORG Royal Montserrat Police Service is the Royal P...\n", + "28804 ORG Justice Not Crisis is a direct action pressure...\n", + "21288 ORG Bikram Keith (born December 29, 1982) is a Was...\n", + "29889 ORG Los de Ramón are a Chilean folkloric group of ...\n", + "29863 ORG The London Sketch Club was founded on 1 April ...\n", + "21113 ORG Alexander Smith (October 14, 1818 – November 5...\n", + "20603 ORG The Post Office Department (1792–1971) was the...\n", + "22646 ORG Miguel Caballero is a bulletproof clothing com...\n", + "27007 ORG Drew Danburry is a singer-songwriter who has t...\n", + "21056 ORG Adolph Alfred Taubman (January 31, 1924 – Apri...\n", + "22310 ORG Jason Jones (born June 1, 1971) is a video gam...\n", + "31676 ORG The Bangladesh Army (Bengali: বাংলাদেশ সেনা বা...\n", + "23609 ORG The Trío Matamoros played boleros and son. The...\n", + "23953 ORG Daniel R. aka Báalam is a Psytrance promoter/p...\n", + "20465 ORG A state legislature in the United States is th...\n", + "26683 ORG DJ Paulette (born Paulette Constable) is a Man...\n", + "22983 ORG The Pets Evacuation and Transportation Standar...\n", + "... ... ...\n", + "20468 ORG The Stimson Doctrine is a policy of the United...\n", + "24951 ORG The American Combat Association is a small mix...\n", + "26281 ORG Christopher Howard Wolf (born September 21, 19...\n", + "20261 ORG Khlysts or Khlysty (Russian: Хлысты) was an un...\n", + "29729 ORG Lehman Brothers Treasury Co. B.V. (\"LBT\") is a...\n", + "24472 ORG The two hundred lei banknote is one of the cir...\n", + "21869 ORG The Federation of Nepalese Journalists (Nepali...\n", + "23028 ORG Process thinking, also known as '\"the process\"...\n", + "22387 ORG Kavka's toxin puzzle is a thought experiment a...\n", + "21592 ORG Crescent Toys was a British toy manufacturing ...\n", + "22334 ORG John Leask Lumley (4 November 1930 in Detroit,...\n", + "27840 ORG Georgian Post is the postal service in Georgia.\n", + "21274 ORG The Bible Presbyterian Church is an American P...\n", + "20939 ORG Peter Walker (died 1879) was the owner and mas...\n", + "26110 ORG The Center for a Just Society is a conservativ...\n", + "20009 ORG Abolitionism is a movement to end slavery, whe...\n", + "21610 ORG EBK is a drum and bass producer and DJ from Lo...\n", + "24835 ORG Al Ramsay Shield is an annual international me...\n", + "27280 ORG Eternal Descent is a British virtual band crea...\n", + "31470 ORG The Prairie Wolf Pack is a Canadian rugby unio...\n", + "20691 ORG Anti H-Block was the political label used in 1...\n", + "26775 ORG David Fray (born 24 May 1981) is a French clas...\n", + "21627 ORG David Nicholl (14 June 1871 – 11 March 1918) w...\n", + "20545 ORG Tommy James and the Shondells are an American ...\n", + "21770 ORG An ell (from Proto-Germanic *alinō, cognate wi...\n", + "21105 ORG Albert Mortimer Jenkin (14 September 1872 – 3 ...\n", + "21337 ORG Brandes Investment Partners is an investment a...\n", + "20919 ORG Noahidism (/ˈnoʊə.haɪd.ɪsm/) or Noachidism (/ˈ...\n", + "31225 ORG The Northern Territory Parliament consists of ...\n", + "28125 ORG Harrington Talents is a film production compan...\n", + "\n", + "[128 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'LOC' predicted as 'PER' : 22 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
34848LOCThe 2010–11 season was Swansea City's 82nd sea...
37664LOCKuehnle's Hotel was an Atlantic City, New Jers...
36620LOCAccording to Imperial Gazetteer of India, Kala...
35451LOCIrará is a city in Bahia, Brazil. Irará is the...
33044LOCAnthony Robert Gizzo (August 4, 1902 – April 1...
37825LOCThe 1997 South American Under-17 Football Cham...
37859LOCYigal Ozeri (born 1958) is an Israeli artist b...
37855LOCMarvin Crenshaw (born February 3, 1952) is a f...
37827LOCThe 2003 South American Under-17 Football Cham...
37830LOCLocated on Chicago's South Side, the Velvet Lo...
37256LOCJerome Vered (born March 13, 1958) is a Studio...
33019LOCWilusa (Hittite: URUWi-lu-ša) <English pronunc...
50536LOCSignature Records was a jazz record company la...
37843LOCThe 1999–00 season was Swansea City A.F.C.'s 8...
37861LOCHenry B. Lembeck (April 8, 1826 – July 25, 190...
35202LOCDunedin Railway Station in Dunedin on New Zeal...
32474LOCDay Deborah Rica Lipford, now known as Dr. Day...
37668LOCNguroje is a town found on the mambilla platea...
33024LOCClarence Horatius \"Big\" Miller (October 18, 19...
32537LOCThe Musée de l’Œuvre Notre-Dame (or Frauenhaus...
34877LOCRummidge is a fictional city used by David Lod...
37826LOCThe 2001 South American Under-17 Football Cham...
\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "34848 LOC The 2010–11 season was Swansea City's 82nd sea...\n", + "37664 LOC Kuehnle's Hotel was an Atlantic City, New Jers...\n", + "36620 LOC According to Imperial Gazetteer of India, Kala...\n", + "35451 LOC Irará is a city in Bahia, Brazil. Irará is the...\n", + "33044 LOC Anthony Robert Gizzo (August 4, 1902 – April 1...\n", + "37825 LOC The 1997 South American Under-17 Football Cham...\n", + "37859 LOC Yigal Ozeri (born 1958) is an Israeli artist b...\n", + "37855 LOC Marvin Crenshaw (born February 3, 1952) is a f...\n", + "37827 LOC The 2003 South American Under-17 Football Cham...\n", + "37830 LOC Located on Chicago's South Side, the Velvet Lo...\n", + "37256 LOC Jerome Vered (born March 13, 1958) is a Studio...\n", + "33019 LOC Wilusa (Hittite: URUWi-lu-ša) \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
55097OTHERThe Social Contract: A Personal Inquiry into t...
58270OTHERThe Forger is a novel by Paul Watkins about a ...
58557OTHERThe Book of the Law of the Lord is a book acce...
65404OTHER(248835) 2006 SX368 /əˈkɪroʊ.iː/, also known a...
53914OTHERDara Moskowitz Grumdahl is a food and wine wri...
55413OTHERImpressionism is a 2009 play by Michael Jacobs...
57205OTHERThe Save Jersey Blog is a conservative politic...
56390OTHERIyyun: The Jerusalem Philosophical Quarterly (...
53608OTHER\"An Arundel Tomb\" is a poem by Philip Larkin, ...
54948OTHERThe 'Nam was a war comic book series detailing...
71718OTHERThe play Henceforward... is the first comedy i...
52899OTHER\"Sinners in the Hands of an Angry God\" is a se...
64971OTHERThe P Funk Mothership, otherwise known as The ...
55908OTHER\"Civil War II\" is a comic book crossover story...
63264OTHERJupiter Laughs is A. J. Cronin's 1940 play in ...
57933OTHERL'Escargot (1963–1984) was a racehorse notable...
52339OTHERAn Inspector Calls is a play written by Englis...
63601OTHEROld Wicked Songs is a two character play writt...
54065OTHERThe Renard was a cutter launched in 1812 and a...
54717OTHERQuiet American (April 29, 1986 – October 14 20...
55349OTHERYu-Gi-Oh! Zexal (遊☆戯☆王ZEXAL(ゼアル) Yū☆Gi☆Ō Zearu...
55001OTHERThe Diary of a Nobody is an English comic nove...
63600OTHEROld Grand-Dad is a brand of bourbon whiskey di...
53021OTHERThe Merchant of Venice is a play by William Sh...
65106OTHERIn Crown of Shadows, Damien and Tarrant return...
52594OTHERGoldbach's conjecture is one of the oldest and...
72203OTHERHellblazer (also known as John Constantine, He...
71278OTHERArarad (Armenian: Արարատ Օրաթերթ) is a daily L...
67564OTHERThe Johnson Helm House, also known as the Helm...
61952OTHERThe Warren Buffett Way, a book by author Rober...
.........
67185OTHERIn medicine, Garrod's tetrad is a term named f...
55796OTHERBengoshi no Kuzu (Japanese: 弁護士のくず, lit. Scum ...
59223OTHERHMS Amazon was a frigate of the Royal Navy. Sh...
61478OTHERA Stutenkerl belongs to the Saint Nicholas tra...
54090OTHERGemini is one of the constellations of the zod...
56841OTHERPermafrost is the farthest north literary jour...
60900OTHERIn Dutch architecture, the Traditionalist Scho...
55329OTHERIn science fiction stories or Superhero comics...
70003OTHERThe Rodiad is a pornographic poem on the subje...
53434OTHERTemptation is a Faustian play written by Czech...
54480OTHERMichael (popularly known as Great Michael) was...
63181OTHERTime: 1919
70503OTHERWitch of the Wave was a long-lived extreme cli...
71970OTHERThe Minnesota Law Review is a law review publi...
52445OTHERThe Clash of Civilizations (COC) is a hypothes...
52382OTHERBeyond the Fringe was a British comedy stage r...
54058OTHERFrankel (foaled 11 February 2008) is a British...
52590OTHERGladius (English pronunciation: /ɡleɪdiəs/; La...
57935OTHERLa Caricature was a satirical weekly published...
52294OTHER2000 AD is a weekly British science fiction-or...
52890OTHERShit is a word that is usually considered vulg...
70900OTHERMagosphaera planula was a spherical multiflage...
71872OTHERThe Little Artists are John Cake and Darren Ne...
68131OTHERMopsy was a comic strip created in 1939 by Gla...
55354OTHERZippy Chippy (born April 20, 1991) is a thorou...
57845OTHERHenry Bailey was a sternwheel steamboat that o...
52744OTHERMutiny on the Bounty is the title of the 1932 ...
52557OTHERFnord is a word that was coined in 1965 by Ker...
55680OTHERAmerican Exorcist: Critical Essays on William ...
54044OTHERFitzpatrick's War is a work of post-apocalypti...
\n", + "

117 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " category abstract\n", + "55097 OTHER The Social Contract: A Personal Inquiry into t...\n", + "58270 OTHER The Forger is a novel by Paul Watkins about a ...\n", + "58557 OTHER The Book of the Law of the Lord is a book acce...\n", + "65404 OTHER (248835) 2006 SX368 /əˈkɪroʊ.iː/, also known a...\n", + "53914 OTHER Dara Moskowitz Grumdahl is a food and wine wri...\n", + "55413 OTHER Impressionism is a 2009 play by Michael Jacobs...\n", + "57205 OTHER The Save Jersey Blog is a conservative politic...\n", + "56390 OTHER Iyyun: The Jerusalem Philosophical Quarterly (...\n", + "53608 OTHER \"An Arundel Tomb\" is a poem by Philip Larkin, ...\n", + "54948 OTHER The 'Nam was a war comic book series detailing...\n", + "71718 OTHER The play Henceforward... is the first comedy i...\n", + "52899 OTHER \"Sinners in the Hands of an Angry God\" is a se...\n", + "64971 OTHER The P Funk Mothership, otherwise known as The ...\n", + "55908 OTHER \"Civil War II\" is a comic book crossover story...\n", + "63264 OTHER Jupiter Laughs is A. J. Cronin's 1940 play in ...\n", + "57933 OTHER L'Escargot (1963–1984) was a racehorse notable...\n", + "52339 OTHER An Inspector Calls is a play written by Englis...\n", + "63601 OTHER Old Wicked Songs is a two character play writt...\n", + "54065 OTHER The Renard was a cutter launched in 1812 and a...\n", + "54717 OTHER Quiet American (April 29, 1986 – October 14 20...\n", + "55349 OTHER Yu-Gi-Oh! Zexal (遊☆戯☆王ZEXAL(ゼアル) Yū☆Gi☆Ō Zearu...\n", + "55001 OTHER The Diary of a Nobody is an English comic nove...\n", + "63600 OTHER Old Grand-Dad is a brand of bourbon whiskey di...\n", + "53021 OTHER The Merchant of Venice is a play by William Sh...\n", + "65106 OTHER In Crown of Shadows, Damien and Tarrant return...\n", + "52594 OTHER Goldbach's conjecture is one of the oldest and...\n", + "72203 OTHER Hellblazer (also known as John Constantine, He...\n", + "71278 OTHER Ararad (Armenian: Արարատ Օրաթերթ) is a daily L...\n", + "67564 OTHER The Johnson Helm House, also known as the Helm...\n", + "61952 OTHER The Warren Buffett Way, a book by author Rober...\n", + "... ... ...\n", + "67185 OTHER In medicine, Garrod's tetrad is a term named f...\n", + "55796 OTHER Bengoshi no Kuzu (Japanese: 弁護士のくず, lit. Scum ...\n", + "59223 OTHER HMS Amazon was a frigate of the Royal Navy. Sh...\n", + "61478 OTHER A Stutenkerl belongs to the Saint Nicholas tra...\n", + "54090 OTHER Gemini is one of the constellations of the zod...\n", + "56841 OTHER Permafrost is the farthest north literary jour...\n", + "60900 OTHER In Dutch architecture, the Traditionalist Scho...\n", + "55329 OTHER In science fiction stories or Superhero comics...\n", + "70003 OTHER The Rodiad is a pornographic poem on the subje...\n", + "53434 OTHER Temptation is a Faustian play written by Czech...\n", + "54480 OTHER Michael (popularly known as Great Michael) was...\n", + "63181 OTHER Time: 1919\n", + "70503 OTHER Witch of the Wave was a long-lived extreme cli...\n", + "71970 OTHER The Minnesota Law Review is a law review publi...\n", + "52445 OTHER The Clash of Civilizations (COC) is a hypothes...\n", + "52382 OTHER Beyond the Fringe was a British comedy stage r...\n", + "54058 OTHER Frankel (foaled 11 February 2008) is a British...\n", + "52590 OTHER Gladius (English pronunciation: /ɡleɪdiəs/; La...\n", + "57935 OTHER La Caricature was a satirical weekly published...\n", + "52294 OTHER 2000 AD is a weekly British science fiction-or...\n", + "52890 OTHER Shit is a word that is usually considered vulg...\n", + "70900 OTHER Magosphaera planula was a spherical multiflage...\n", + "71872 OTHER The Little Artists are John Cake and Darren Ne...\n", + "68131 OTHER Mopsy was a comic strip created in 1939 by Gla...\n", + "55354 OTHER Zippy Chippy (born April 20, 1991) is a thorou...\n", + "57845 OTHER Henry Bailey was a sternwheel steamboat that o...\n", + "52744 OTHER Mutiny on the Bounty is the title of the 1932 ...\n", + "52557 OTHER Fnord is a word that was coined in 1965 by Ker...\n", + "55680 OTHER American Exorcist: Critical Essays on William ...\n", + "54044 OTHER Fitzpatrick's War is a work of post-apocalypti...\n", + "\n", + "[117 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'PER' predicted as 'ORG' : 146 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
16564PERMert Yücel is an electronic music producer and...
13711PERJohn Horrocks (27 March 1768, in Edgworth, Lan...
2612PERChua Beng Huat (simplified Chinese: 蔡明发; tradi...
2871PEREdward Michael Scheidt is a retired Chairman o...
1599PERA Sikh (/siːk, sɪk/; Punjabi: ਸਿੱਖ sikkh [sɪkk...
12985PER(For other people named James Ballantyne, see ...
17428PERNicolas Perrot (c.1644–1717), a French explore...
18825PERReginald Cheyne was the Chamberlain of Scotlan...
17771PEROwsley Brown Frazier (May 7, 1935 – August 16,...
6347PERBeBe & CeCe Winans are an American gospel musi...
822PERJames Clayton \"Jim\" Dobson, Jr. (born April 21...
14973PERKevin Lynch is an American software developer....
1673PERA stored energy printer is a computer printer ...
1438PERRegina Prot(h)mann (also Brotmann) (1552 – 18 ...
18677PERRajiv Mishra is a well known media professiona...
14999PERMr. Justice Khawaja Muhammad Sharif (Urdu: خو...
12105PERJack King is an American rock drummer. He was ...
16384PERMasque is an American metal band.
18436PERPhoebe Kreutz is a singer-songwriter, primaril...
3721PERA liaison officer is a person who liaises betw...
3283PER\"Jack\" Young (1895 in Tyne and Wear – 1952) wa...
6384PERBen Davis is a cellist from the United Kingdom...
754PERHis Majesty's Armed Forces (HMAF) is the milit...
18569PERPuddletag is an audio tag editor, i.e. a metad...
8214PERDarren Storsley is the founder of Top Teen of ...
7776PERThe Commission for Energy Regulation-An Coimis...
8013PERDan Levitan is co-founder and partner for vent...
10377PERGeorge Marshall was Principal of Trevelyan Col...
3094PERGirish Sant was a noted energy analyst held in...
15420PERThe Rat Pack was the nickname given to a group...
.........
17167PERMuna AbuSulayman, (Arabic: منى أبو سليمان‎‎; b...
14964PERKevin R. Henke is an American geochemist and f...
1713PERA tax (from the Latin taxo) is a financial cha...
4732PERWilliam Weir (20 September 1865 – 8 July 1950)...
6968PERBryce Hospital, opened in 1861 in Tuscaloosa, ...
6669PERThe Blue Planet Aquarium is a marine and fresh...
8935PEReXo Platform is an open source, standard-based...
9123PERDr. Edward Vernon Pegge (5 June 1864 − 21 Marc...
17043PERModern Times is Al Stewart's sixth studio albu...
13194PERJohan Larsson is a European, former musician a...
16569PERMichael \"Mad Dog\" Mavridoglou (born 1978) is a...
18060PERPaul Reid is a New Zealand actor who played Ma...
18559PERA private member's bill in a parliamentary sys...
5389PERAlfred Hubert Donat Agache (1875 – 1959), some...
1783PERThomas E. Lovejoy, \"the Godfather of Biodivers...
10060PERGama is a German maker of toys, usually cars a...
8412PERDavid Knights (born David John Knights, 28 Jun...
16057PERMarian Chace (31 October 1896 – 19 July 1970) ...
9656PERForced Entry was an American thrash metal band...
10123PERGavan McDonell is an Australian civil engineer...
18562PERPromise Theory, in the context of information ...
5500PERAlu is a Los Angeles-based eclectic chanteuse ...
4355PERSamuel Moore (1742 – 1822) id notable as a lea...
14933PERKenya Airports Authority (KAA) is the owner an...
4748PERThe Chinese term, 柔功門, can be roughly translat...
9895PERFred D. Miller, Jr. is an emeritus professor o...
7597PERChristopher Buhler Berendt is an executive who...
542PERFiestas Patrias (English: Patriotic Holidays) ...
10909PERGustav Stickley (March 9, 1858 – April 21, 194...
2194PERAaron J Smith is a House music DJ/remixer from...
\n", + "

146 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "16564 PER Mert Yücel is an electronic music producer and...\n", + "13711 PER John Horrocks (27 March 1768, in Edgworth, Lan...\n", + "2612 PER Chua Beng Huat (simplified Chinese: 蔡明发; tradi...\n", + "2871 PER Edward Michael Scheidt is a retired Chairman o...\n", + "1599 PER A Sikh (/siːk, sɪk/; Punjabi: ਸਿੱਖ sikkh [sɪkk...\n", + "12985 PER (For other people named James Ballantyne, see ...\n", + "17428 PER Nicolas Perrot (c.1644–1717), a French explore...\n", + "18825 PER Reginald Cheyne was the Chamberlain of Scotlan...\n", + "17771 PER Owsley Brown Frazier (May 7, 1935 – August 16,...\n", + "6347 PER BeBe & CeCe Winans are an American gospel musi...\n", + "822 PER James Clayton \"Jim\" Dobson, Jr. (born April 21...\n", + "14973 PER Kevin Lynch is an American software developer....\n", + "1673 PER A stored energy printer is a computer printer ...\n", + "1438 PER Regina Prot(h)mann (also Brotmann) (1552 – 18 ...\n", + "18677 PER Rajiv Mishra is a well known media professiona...\n", + "14999 PER Mr. Justice Khawaja Muhammad Sharif (Urdu: خو...\n", + "12105 PER Jack King is an American rock drummer. He was ...\n", + "16384 PER Masque is an American metal band.\n", + "18436 PER Phoebe Kreutz is a singer-songwriter, primaril...\n", + "3721 PER A liaison officer is a person who liaises betw...\n", + "3283 PER \"Jack\" Young (1895 in Tyne and Wear – 1952) wa...\n", + "6384 PER Ben Davis is a cellist from the United Kingdom...\n", + "754 PER His Majesty's Armed Forces (HMAF) is the milit...\n", + "18569 PER Puddletag is an audio tag editor, i.e. a metad...\n", + "8214 PER Darren Storsley is the founder of Top Teen of ...\n", + "7776 PER The Commission for Energy Regulation-An Coimis...\n", + "8013 PER Dan Levitan is co-founder and partner for vent...\n", + "10377 PER George Marshall was Principal of Trevelyan Col...\n", + "3094 PER Girish Sant was a noted energy analyst held in...\n", + "15420 PER The Rat Pack was the nickname given to a group...\n", + "... ... ...\n", + "17167 PER Muna AbuSulayman, (Arabic: منى أبو سليمان‎‎; b...\n", + "14964 PER Kevin R. Henke is an American geochemist and f...\n", + "1713 PER A tax (from the Latin taxo) is a financial cha...\n", + "4732 PER William Weir (20 September 1865 – 8 July 1950)...\n", + "6968 PER Bryce Hospital, opened in 1861 in Tuscaloosa, ...\n", + "6669 PER The Blue Planet Aquarium is a marine and fresh...\n", + "8935 PER eXo Platform is an open source, standard-based...\n", + "9123 PER Dr. Edward Vernon Pegge (5 June 1864 − 21 Marc...\n", + "17043 PER Modern Times is Al Stewart's sixth studio albu...\n", + "13194 PER Johan Larsson is a European, former musician a...\n", + "16569 PER Michael \"Mad Dog\" Mavridoglou (born 1978) is a...\n", + "18060 PER Paul Reid is a New Zealand actor who played Ma...\n", + "18559 PER A private member's bill in a parliamentary sys...\n", + "5389 PER Alfred Hubert Donat Agache (1875 – 1959), some...\n", + "1783 PER Thomas E. Lovejoy, \"the Godfather of Biodivers...\n", + "10060 PER Gama is a German maker of toys, usually cars a...\n", + "8412 PER David Knights (born David John Knights, 28 Jun...\n", + "16057 PER Marian Chace (31 October 1896 – 19 July 1970) ...\n", + "9656 PER Forced Entry was an American thrash metal band...\n", + "10123 PER Gavan McDonell is an Australian civil engineer...\n", + "18562 PER Promise Theory, in the context of information ...\n", + "5500 PER Alu is a Los Angeles-based eclectic chanteuse ...\n", + "4355 PER Samuel Moore (1742 – 1822) id notable as a lea...\n", + "14933 PER Kenya Airports Authority (KAA) is the owner an...\n", + "4748 PER The Chinese term, 柔功門, can be roughly translat...\n", + "9895 PER Fred D. Miller, Jr. is an emeritus professor o...\n", + "7597 PER Christopher Buhler Berendt is an executive who...\n", + "542 PER Fiestas Patrias (English: Patriotic Holidays) ...\n", + "10909 PER Gustav Stickley (March 9, 1858 – April 21, 194...\n", + "2194 PER Aaron J Smith is a House music DJ/remixer from...\n", + "\n", + "[146 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'LOC' predicted as 'ORG' : 21 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
37834LOCThe 1913–14 season was the 22nd in the history...
35384LOCThe Louisville and Nashville Depot, located at...
47255LOCLeganés (Spanish pronunciation: [le.ɣa.ˈnes]) ...
35280LOCParanavaí is a city in southern Brazil that wa...
37732LOCFareed Town (Urdu: فرید ٹاون‎) is a new town ...
34342LOCKaraparamba is a suburb included in Kozhikode ...
33284LOCChamoli Gopeshwar is a township in garhwal hil...
41671LOCKidder is a city in Caldwell County, Missouri,...
35333LOCYenangyaung (Burmese: ရေနံချောင်း; literally \"...
32541LOCOpenCrowd is a New York City based privately h...
35018LOCEusebius and Jerome put the territory of Eleut...
35660LOCEden and John's East River String Band are a N...
37841LOCThe 1933–34 season was the 37th in the history...
33154LOCAshokenagar Kalyangarh (Bengali: অশোকনগর কল্যা...
46939LOCIksan (Korean: 익산) is a city and major railway...
35470LOCCopenhagen is a green city well endowed with o...
32641LOCJhang Sadar (name variants: Jhang Sadar and Jh...
33528LOCKHTB (101.9 FM) is a Salt Lake City, Utah-base...
34044LOCSystem Recordings is a New York City based ele...
49382LOCKapfenberg [ˈkapfənbɛʁk], with around 22,000 i...
32662LOCNebotičnik (pronounced [nɛbɔtiːtʃniːk]; Englis...
\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "37834 LOC The 1913–14 season was the 22nd in the history...\n", + "35384 LOC The Louisville and Nashville Depot, located at...\n", + "47255 LOC Leganés (Spanish pronunciation: [le.ɣa.ˈnes]) ...\n", + "35280 LOC Paranavaí is a city in southern Brazil that wa...\n", + "37732 LOC Fareed Town (Urdu: فرید ٹاون‎) is a new town ...\n", + "34342 LOC Karaparamba is a suburb included in Kozhikode ...\n", + "33284 LOC Chamoli Gopeshwar is a township in garhwal hil...\n", + "41671 LOC Kidder is a city in Caldwell County, Missouri,...\n", + "35333 LOC Yenangyaung (Burmese: ရေနံချောင်း; literally \"...\n", + "32541 LOC OpenCrowd is a New York City based privately h...\n", + "35018 LOC Eusebius and Jerome put the territory of Eleut...\n", + "35660 LOC Eden and John's East River String Band are a N...\n", + "37841 LOC The 1933–34 season was the 37th in the history...\n", + "33154 LOC Ashokenagar Kalyangarh (Bengali: অশোকনগর কল্যা...\n", + "46939 LOC Iksan (Korean: 익산) is a city and major railway...\n", + "35470 LOC Copenhagen is a green city well endowed with o...\n", + "32641 LOC Jhang Sadar (name variants: Jhang Sadar and Jh...\n", + "33528 LOC KHTB (101.9 FM) is a Salt Lake City, Utah-base...\n", + "34044 LOC System Recordings is a New York City based ele...\n", + "49382 LOC Kapfenberg [ˈkapfənbɛʁk], with around 22,000 i...\n", + "32662 LOC Nebotičnik (pronounced [nɛbɔtiːtʃniːk]; Englis..." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'OTHER' predicted as 'ORG' : 142 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
70869OTHERTrirated Cable is a high temperature, flame re...
70574OTHERArbeitslager (German pronunciation: [ˈʔaʁbaɪts...
59366OTHERNon-well-founded set theories are variants of ...
56919OTHERRasna is a soft drink concentrate brand owned ...
70578OTHERCrime is a prominent issue in South Africa. Th...
55974OTHERCurrent Issues in Comparative Education is an ...
54122OTHER(\"Google Doodles\" redirects here. For a list o...
55468OTHERRyKrisp is a brand of rye crisp bread introduc...
70570OTHERŞcoala (English: School) was a magazine from C...
56949OTHERRoger Burrows (born 19 July 1945, Evesham, Wor...
64279OTHERInspired by...The Bible Experience is an audio...
54255OTHERThe LBM file extension is an image file in Int...
54350OTHERThe Large Observatory for X-ray Timing (LOFT) ...
52877OTHERScotch whisky, often simply called Scotch, is ...
55870OTHERCarcrash International was a post-punk band as...
56674OTHERThe Mesa Arizona Easter Pageant Jesus the Chri...
54938OTHER(This article is about the motor yacht Tatoosh...
68323OTHEROld Charter is a brand of bourbon whiskey dist...
71386OTHERThe Cadillac Tower is a 40-story, 133.4 m (438...
72253OTHERSaudi Advanced Industries Company is an indust...
52520OTHERIn telecommunications emphasis is the intentio...
52791OTHERIn Universal Personal Telecommunications (UPT)...
53729OTHERBlueprint is an architecture and design magazi...
58989OTHERRemote surgery (also known as telesurgery) is ...
71313OTHERBad Subjects (more formally Bad Subjects: Poli...
53407OTHERThe use of a royal motto (Danish: valgsprog) i...
56097OTHEREstrella Galicia is a brand of pale lager beer...
57645OTHERChimes of Freedom: The Politics of Bob Dylan's...
55778OTHERBagpiper is a brand of Indian whisky, manufact...
54647OTHERPastel is the name given to different typical ...
.........
70733OTHERThis is an almanac of seasons played by Toront...
56051OTHERThe EEA Grants and Norway Grants are the finan...
59371OTHERA residential college is an organisational pat...
53075OTHERTotal Access Communication System (TACS) and E...
70637OTHERNegative-bias temperature instability (NBTI) i...
67079OTHERF/V Maverick is a crabbing vessel that operate...
53125OTHERUniversal access to education is the ability o...
59206OTHERCAPTRUST Tower at North Hills is a 17-story 26...
70416OTHERThe University of Maryland Arboretum and Botan...
57913OTHERThe kibibit is a multiple of the bit, a unit o...
70934OTHERViolence against women in Cambodia is a seriou...
52528OTHERIn telecommunication, equivalent pulse code mo...
52719OTHERIn telecommunications network management, a me...
56961OTHERThe Sapsan (Russian: Сапсан, lit. 'Peregrine F...
52457OTHERIn telecommunication, communications survivabi...
56731OTHERN-SAT-110, also known as JCSAT-7, JCSAT-110, S...
57374OTHEREnterprise Plaza (also known as 1100 Louisiana...
52518OTHEREmergency medicine, formerly known in some cou...
54742OTHERRegret is a negative conscious and emotional r...
64986OTHERA wildland fire tender is a specialized vehicl...
53158OTHERVoyeurism is the sexual interest in or practic...
52770OTHERObstetrics and Gynecology (often abbreviated t...
60183OTHERThe cryptomonads-haptophytes assemblage is a p...
57477OTHERThe Crazy Nastyass Honey Badger is a YouTube v...
66070OTHERBlanton's is a brand of bourbon whiskey produc...
71938OTHERAlluvium (from the Latin, alluvius, from allue...
56004OTHERThe Dental Procedure Education System (DPES), ...
71051OTHERJersey Mike's Subs is a submarine sandwich, or...
57338OTHERXt3.com is a Catholic social online network an...
56340OTHERIndrapura is the second-installment of an epic...
\n", + "

142 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "70869 OTHER Trirated Cable is a high temperature, flame re...\n", + "70574 OTHER Arbeitslager (German pronunciation: [ˈʔaʁbaɪts...\n", + "59366 OTHER Non-well-founded set theories are variants of ...\n", + "56919 OTHER Rasna is a soft drink concentrate brand owned ...\n", + "70578 OTHER Crime is a prominent issue in South Africa. Th...\n", + "55974 OTHER Current Issues in Comparative Education is an ...\n", + "54122 OTHER (\"Google Doodles\" redirects here. For a list o...\n", + "55468 OTHER RyKrisp is a brand of rye crisp bread introduc...\n", + "70570 OTHER Şcoala (English: School) was a magazine from C...\n", + "56949 OTHER Roger Burrows (born 19 July 1945, Evesham, Wor...\n", + "64279 OTHER Inspired by...The Bible Experience is an audio...\n", + "54255 OTHER The LBM file extension is an image file in Int...\n", + "54350 OTHER The Large Observatory for X-ray Timing (LOFT) ...\n", + "52877 OTHER Scotch whisky, often simply called Scotch, is ...\n", + "55870 OTHER Carcrash International was a post-punk band as...\n", + "56674 OTHER The Mesa Arizona Easter Pageant Jesus the Chri...\n", + "54938 OTHER (This article is about the motor yacht Tatoosh...\n", + "68323 OTHER Old Charter is a brand of bourbon whiskey dist...\n", + "71386 OTHER The Cadillac Tower is a 40-story, 133.4 m (438...\n", + "72253 OTHER Saudi Advanced Industries Company is an indust...\n", + "52520 OTHER In telecommunications emphasis is the intentio...\n", + "52791 OTHER In Universal Personal Telecommunications (UPT)...\n", + "53729 OTHER Blueprint is an architecture and design magazi...\n", + "58989 OTHER Remote surgery (also known as telesurgery) is ...\n", + "71313 OTHER Bad Subjects (more formally Bad Subjects: Poli...\n", + "53407 OTHER The use of a royal motto (Danish: valgsprog) i...\n", + "56097 OTHER Estrella Galicia is a brand of pale lager beer...\n", + "57645 OTHER Chimes of Freedom: The Politics of Bob Dylan's...\n", + "55778 OTHER Bagpiper is a brand of Indian whisky, manufact...\n", + "54647 OTHER Pastel is the name given to different typical ...\n", + "... ... ...\n", + "70733 OTHER This is an almanac of seasons played by Toront...\n", + "56051 OTHER The EEA Grants and Norway Grants are the finan...\n", + "59371 OTHER A residential college is an organisational pat...\n", + "53075 OTHER Total Access Communication System (TACS) and E...\n", + "70637 OTHER Negative-bias temperature instability (NBTI) i...\n", + "67079 OTHER F/V Maverick is a crabbing vessel that operate...\n", + "53125 OTHER Universal access to education is the ability o...\n", + "59206 OTHER CAPTRUST Tower at North Hills is a 17-story 26...\n", + "70416 OTHER The University of Maryland Arboretum and Botan...\n", + "57913 OTHER The kibibit is a multiple of the bit, a unit o...\n", + "70934 OTHER Violence against women in Cambodia is a seriou...\n", + "52528 OTHER In telecommunication, equivalent pulse code mo...\n", + "52719 OTHER In telecommunications network management, a me...\n", + "56961 OTHER The Sapsan (Russian: Сапсан, lit. 'Peregrine F...\n", + "52457 OTHER In telecommunication, communications survivabi...\n", + "56731 OTHER N-SAT-110, also known as JCSAT-7, JCSAT-110, S...\n", + "57374 OTHER Enterprise Plaza (also known as 1100 Louisiana...\n", + "52518 OTHER Emergency medicine, formerly known in some cou...\n", + "54742 OTHER Regret is a negative conscious and emotional r...\n", + "64986 OTHER A wildland fire tender is a specialized vehicl...\n", + "53158 OTHER Voyeurism is the sexual interest in or practic...\n", + "52770 OTHER Obstetrics and Gynecology (often abbreviated t...\n", + "60183 OTHER The cryptomonads-haptophytes assemblage is a p...\n", + "57477 OTHER The Crazy Nastyass Honey Badger is a YouTube v...\n", + "66070 OTHER Blanton's is a brand of bourbon whiskey produc...\n", + "71938 OTHER Alluvium (from the Latin, alluvius, from allue...\n", + "56004 OTHER The Dental Procedure Education System (DPES), ...\n", + "71051 OTHER Jersey Mike's Subs is a submarine sandwich, or...\n", + "57338 OTHER Xt3.com is a Catholic social online network an...\n", + "56340 OTHER Indrapura is the second-installment of an epic...\n", + "\n", + "[142 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'ORG' predicted as 'LOC' : 22 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
31313ORGPetromidia Refinery is the largest Romanian o...
22629ORGThe Miaoli County Government (Chinese: 苗栗縣政府; ...
22501ORGThe Malacca Little India is a Little India in ...
22210ORGThe Ibans are a branch of the Dayak peoples of...
23232ORGSelters is a German brand of natural mineral w...
22834ORGNorth Western Railway (NWR) was a railway comp...
21976ORGGath y Chaves (commonly given by the Argentine...
31570ORGLocated on Algonquin Island in the Toronto Isl...
22500ORGLittle India in the city of Ipoh Malaysia, is ...
24869ORGThe Comunidade Intermunicipal do Alentejo Lito...
21782ORGEnoggera Road is one of Brisbane's main roads ...
26373ORGCoast Guard Air Station Humboldt Bay is a Unit...
25293ORGThe Comunidade Intermunicipal do Baixo Alentej...
21031ORGVeps National Volost (Russian: Ве́псская нацио...
28390ORGILAUD (or I.L.A. & U.D. International Laborato...
26025ORGCapital City Academy (commonly abbreviated to ...
22286ORGIsfjorden is a branch of the Romsdalsfjorden i...
26241ORGChina Jiliang University (中国计量大学) is a univers...
29391ORGKalasalingam University (Tamil: கலசலிங்கம் பல்...
31563ORGQingdao Metro (Chinese: 青岛地铁) is a metro syste...
29756ORGLi County or Lixian is an administrative divis...
24401ORGSokol plant (Russian: Авиастроительный завод «...
\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "31313 ORG Petromidia Refinery is the largest Romanian o...\n", + "22629 ORG The Miaoli County Government (Chinese: 苗栗縣政府; ...\n", + "22501 ORG The Malacca Little India is a Little India in ...\n", + "22210 ORG The Ibans are a branch of the Dayak peoples of...\n", + "23232 ORG Selters is a German brand of natural mineral w...\n", + "22834 ORG North Western Railway (NWR) was a railway comp...\n", + "21976 ORG Gath y Chaves (commonly given by the Argentine...\n", + "31570 ORG Located on Algonquin Island in the Toronto Isl...\n", + "22500 ORG Little India in the city of Ipoh Malaysia, is ...\n", + "24869 ORG The Comunidade Intermunicipal do Alentejo Lito...\n", + "21782 ORG Enoggera Road is one of Brisbane's main roads ...\n", + "26373 ORG Coast Guard Air Station Humboldt Bay is a Unit...\n", + "25293 ORG The Comunidade Intermunicipal do Baixo Alentej...\n", + "21031 ORG Veps National Volost (Russian: Ве́псская нацио...\n", + "28390 ORG ILAUD (or I.L.A. & U.D. International Laborato...\n", + "26025 ORG Capital City Academy (commonly abbreviated to ...\n", + "22286 ORG Isfjorden is a branch of the Romsdalsfjorden i...\n", + "26241 ORG China Jiliang University (中国计量大学) is a univers...\n", + "29391 ORG Kalasalingam University (Tamil: கலசலிங்கம் பல்...\n", + "31563 ORG Qingdao Metro (Chinese: 青岛地铁) is a metro syste...\n", + "29756 ORG Li County or Lixian is an administrative divis...\n", + "24401 ORG Sokol plant (Russian: Авиастроительный завод «..." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'PER' predicted as 'OTHER' : 133 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
8166PERDarley Racing is a thoroughbred horse racing o...
6371PERBehrens (1994 – September 14, 2014) was an Ame...
13462PERJohn \"Johnny\" Cutts (c. 1829–1872) was the joc...
17834PERThe Paraplane GE-2 Golden Eagle is an American...
1884PERUlrike Marie Meinhof (7 October 1934 – 9 May 1...
7891PERCunning folk, also known as folk healers or (m...
12829PERJean Lemaître was a mechanical engineer from t...
10821PERThe Great Blizzard of 1978, also known as the ...
1322PERIn spiritual terminology, piety is a virtue th...
17819PERThe Paladin Golden Eagle is an American powere...
11442PERHenry Poole & Co is a bespoke tailor located a...
15799PERMagnificent Obsession is a 1954 Universal-Inte...
5951PERAnusim (Hebrew: אֲנוּסִים, pronounced [anuˈsim...
7110PERCarolus Adrianus Johannes \"Karel\" Kreutz (born...
4655PERVáclav Melzer (26 August 1878 – 1 May 1968) wa...
3634PERKatharine Murray Lyell (1817–1915) was a Briti...
54PERCarolyn Loretta Wasilewski (June 12, 1940 - No...
177PERAnton Herman Gerard \"Anthony\" Fokker (6 April ...
11627PERA high-functioning alcoholic (HFA) is a person...
2937PERThe Flammarion engraving is a wood engraving b...
1801PERThor Heyerdahl (Norwegian pronunciation: [tuːr...
11215PERA hatamoto (旗本, \"under the banners\") was a sam...
10943PERGwyneth Rees (born 10 May 1968) is a British a...
7212PERCharaka (Sanskrit चरक) was one of the principa...
1198PERUrsula Southeil (c. 1488–1561) (also variously...
11988PERIsabelle de Montolieu (1751–1832) was a Swiss ...
9638PERFlash Gordon M.D. (also spelled flash gordon m...
1594PERSidereal time /saɪˈdɪəriəl/ is a time-keeping ...
14941PERKessie Govender (1942–2002) was a pioneering v...
11611PERHideo Ogata (尾形英夫 Ogata Hideo, (ca.1934 - 25 J...
.........
4891PERAdeline Foo is a Singaporean short story write...
1414PERA puffball is a member of any of several group...
11322PERHenricus Martellus Germanus is the latinized n...
18690PERRaju Narisetti is a Senior Vice-President, Str...
7466PERThe chestnut woodpecker (Celeus elegans) is a ...
8051PERDaniel Thomas Gillespie is a physicist who is ...
550PERFlax (also known as common flax or linseed), L...
9236PEREllen Fein is an author best known for co-writ...
17711PERIn the study of religion, orthopraxy is correc...
11006PERHannes Lindemann (28 December 1922 - 17 April ...
9760PERFranco Maria Ricci (born December 2, 1937 in P...
2654PERCoronet is an American typeface designed in 19...
655PERGladys May Aylward (24 February 1902 – 3 Janua...
1921PERVaruna (/ˈvɜːrʊnə, ˈvɑːrə-/; Sanskrit: Varuṇa ...
6763PERBovine herpesvirus 4 is a member of the Herpes...
15090PERIn mathematics, the Korteweg–de Vries equation...
11829PERIdentical is Ellen Hopkins' fifth novel. The b...
2787PERIn the Dungeons & Dragons fantasy role-playing...
16160PERMark Jacobs is a former foreign service office...
6337PERA bashi-bazouk or bashibazouk (Turkish başıboz...
591PERIn telecommunication, frequency-change signali...
10962PERHSwMS Spica (T121) is a former Swedish Navy to...
9525PERThe experimental autoimmune encephalomyelitis,...
1695PERA symbol is a sign that represents, stands for...
14433PERJoseph Louis Rosefield (18 Dec 1882 - 8 Nov 19...
19696PERRosamund Lupton (born 1964), is a British auth...
4495PERSven Regener, born 1 January 1961 in Bremen, i...
1504PERRoger Joseph Zelazny (May 13, 1937 – June 14, ...
5893PERAntonio Manetti (6 July 1423 – 26 May 1497) wa...
1411PERProsper Menière (18 June 1799 – 7 February 186...
\n", + "

133 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "8166 PER Darley Racing is a thoroughbred horse racing o...\n", + "6371 PER Behrens (1994 – September 14, 2014) was an Ame...\n", + "13462 PER John \"Johnny\" Cutts (c. 1829–1872) was the joc...\n", + "17834 PER The Paraplane GE-2 Golden Eagle is an American...\n", + "1884 PER Ulrike Marie Meinhof (7 October 1934 – 9 May 1...\n", + "7891 PER Cunning folk, also known as folk healers or (m...\n", + "12829 PER Jean Lemaître was a mechanical engineer from t...\n", + "10821 PER The Great Blizzard of 1978, also known as the ...\n", + "1322 PER In spiritual terminology, piety is a virtue th...\n", + "17819 PER The Paladin Golden Eagle is an American powere...\n", + "11442 PER Henry Poole & Co is a bespoke tailor located a...\n", + "15799 PER Magnificent Obsession is a 1954 Universal-Inte...\n", + "5951 PER Anusim (Hebrew: אֲנוּסִים, pronounced [anuˈsim...\n", + "7110 PER Carolus Adrianus Johannes \"Karel\" Kreutz (born...\n", + "4655 PER Václav Melzer (26 August 1878 – 1 May 1968) wa...\n", + "3634 PER Katharine Murray Lyell (1817–1915) was a Briti...\n", + "54 PER Carolyn Loretta Wasilewski (June 12, 1940 - No...\n", + "177 PER Anton Herman Gerard \"Anthony\" Fokker (6 April ...\n", + "11627 PER A high-functioning alcoholic (HFA) is a person...\n", + "2937 PER The Flammarion engraving is a wood engraving b...\n", + "1801 PER Thor Heyerdahl (Norwegian pronunciation: [tuːr...\n", + "11215 PER A hatamoto (旗本, \"under the banners\") was a sam...\n", + "10943 PER Gwyneth Rees (born 10 May 1968) is a British a...\n", + "7212 PER Charaka (Sanskrit चरक) was one of the principa...\n", + "1198 PER Ursula Southeil (c. 1488–1561) (also variously...\n", + "11988 PER Isabelle de Montolieu (1751–1832) was a Swiss ...\n", + "9638 PER Flash Gordon M.D. (also spelled flash gordon m...\n", + "1594 PER Sidereal time /saɪˈdɪəriəl/ is a time-keeping ...\n", + "14941 PER Kessie Govender (1942–2002) was a pioneering v...\n", + "11611 PER Hideo Ogata (尾形英夫 Ogata Hideo, (ca.1934 - 25 J...\n", + "... ... ...\n", + "4891 PER Adeline Foo is a Singaporean short story write...\n", + "1414 PER A puffball is a member of any of several group...\n", + "11322 PER Henricus Martellus Germanus is the latinized n...\n", + "18690 PER Raju Narisetti is a Senior Vice-President, Str...\n", + "7466 PER The chestnut woodpecker (Celeus elegans) is a ...\n", + "8051 PER Daniel Thomas Gillespie is a physicist who is ...\n", + "550 PER Flax (also known as common flax or linseed), L...\n", + "9236 PER Ellen Fein is an author best known for co-writ...\n", + "17711 PER In the study of religion, orthopraxy is correc...\n", + "11006 PER Hannes Lindemann (28 December 1922 - 17 April ...\n", + "9760 PER Franco Maria Ricci (born December 2, 1937 in P...\n", + "2654 PER Coronet is an American typeface designed in 19...\n", + "655 PER Gladys May Aylward (24 February 1902 – 3 Janua...\n", + "1921 PER Varuna (/ˈvɜːrʊnə, ˈvɑːrə-/; Sanskrit: Varuṇa ...\n", + "6763 PER Bovine herpesvirus 4 is a member of the Herpes...\n", + "15090 PER In mathematics, the Korteweg–de Vries equation...\n", + "11829 PER Identical is Ellen Hopkins' fifth novel. The b...\n", + "2787 PER In the Dungeons & Dragons fantasy role-playing...\n", + "16160 PER Mark Jacobs is a former foreign service office...\n", + "6337 PER A bashi-bazouk or bashibazouk (Turkish başıboz...\n", + "591 PER In telecommunication, frequency-change signali...\n", + "10962 PER HSwMS Spica (T121) is a former Swedish Navy to...\n", + "9525 PER The experimental autoimmune encephalomyelitis,...\n", + "1695 PER A symbol is a sign that represents, stands for...\n", + "14433 PER Joseph Louis Rosefield (18 Dec 1882 - 8 Nov 19...\n", + "19696 PER Rosamund Lupton (born 1964), is a British auth...\n", + "4495 PER Sven Regener, born 1 January 1961 in Bremen, i...\n", + "1504 PER Roger Joseph Zelazny (May 13, 1937 – June 14, ...\n", + "5893 PER Antonio Manetti (6 July 1423 – 26 May 1497) wa...\n", + "1411 PER Prosper Menière (18 June 1799 – 7 February 186...\n", + "\n", + "[133 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "'ORG' predicted as 'OTHER' : 126 examples.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categoryabstract
20457ORGThe standard enthalpy of formation or standard...
20262ORGThe KEP (Kinetic Energy Penetrator) also desig...
30262ORGMindaNews is an online newspaper based in Mind...
21687ORGIn agriculture, disease management is the prac...
20442ORGSnake oil, originally a fraudulent liniment wi...
22701ORGNEMA, also known as Tenth and Market, is a 754...
31516ORGProtein Sciences Corporation is a biotech comp...
21778ORGThe Ends of the Earth is a figurative expressi...
23568ORGThomas Hill (Rotherham) Limited was a company ...
21141ORGAmplitude and phase-shift keying or asymmetric...
32056ORGSamKochAvto, originally Samarkand Automobile F...
29890ORGLotec is a German sports car manufacturer. The...
28396ORGITV News West Country is a regional news servi...
20093ORGA closed-loop transfer function in control the...
20139ORGA diaper (also called a nappy outside North Am...
20175ORGExtreme value theory or extreme value analysis...
20892ORGAnnette Marie Sarah Kellerman (6 July 1886 – 6...
21021ORGTrue north (geodetic north) is the direction a...
20616ORGVeterinary medicine is the branch of medicine ...
24786ORGAfterall is a nonprofit contemporary art resea...
26300ORGCitiBusiness Singapore is a unit of Citibank S...
21581ORGThe Courier Car Co. was an automobile manufact...
31583ORGThe Regulatory Affairs Journal Pharma (RAJ Pha...
25617ORGBright AC was a 24-hour music format produced ...
23907ORGThe American Roentgen Ray Society (ARRS) is a ...
29936ORGLärabar is a brand of energy bar produced by G...
20116ORGCumulonimbus, from the Latin cumulus (\"heap\") ...
26875ORGDesires for the Future (French: Désirs d'aveni...
20166ORGEspresso (/ɛˈsprɛsoʊ/, Italian: [esˈprɛsso]) i...
22880ORGAn off cutter is a type of delivery in the gam...
.........
20300ORGMise-en-scène (French pronunciation: ​[mizɑ̃sɛ...
20213ORGThe haptophytes, classified either as the Prym...
20050ORGIn the language of measurement, quantities are...
25027ORGAntenna Sicilia is a regional Italian televisi...
23936ORGBlack Joe Lewis is an American blues, funk and...
25163ORGAstyplaz is an Athens-based 4 piece electronic...
20287ORGMarsala is a wine, dry or sweet, produced in t...
28782ORGJonatan Söderström, also known by his internet...
20007ORG7 Up is a brand of lemon-lime flavored, non-ca...
20878ORGMagnetic shape-memory alloys (MSMAs), or ferro...
22463ORGA leg cutter is a type of delivery in the spor...
21957ORGThe Fulmar Gas Line is a natural gas pipeline,...
20110ORGThe Counter-Earth is a hypothetical body of th...
27348ORGThe Excellence Research Centers are national n...
21543ORGThe Common Ground between Islam and Buddhism p...
30114ORGMaster Point Press is a Canadian book publishi...
31045ORGOmid (Persian: امید‎‎, meaning \"Hope\") was Ira...
22530ORGLusterware or Lustreware (respectively the US ...
20312ORGIn computing, NaN, standing for not a number, ...
21983ORGGeminiJets is a die-cast model airplane manufa...
31087ORGOracle CRM is customer relationship management...
20672ORGThe Zimbabwe Defence Forces (ZDF) are composed...
24559ORG10 Cane was a premium Trinidadian light rum br...
20272ORGLidar (also called LIDAR, LiDAR, and LADAR) is...
21955ORGFully Buffered DIMM (or FB-DIMM) is a memory t...
21696ORGDogfights is a military aviation themed TV ser...
21956ORGA fully differential amplifier (FDA) is a DC-c...
28640ORGThe Irish Steam Preservation Society was forme...
24928ORGAfter working on the business process team at ...
20826ORGThe Hochdorf Chieftain's Grave is a richly-fur...
\n", + "

126 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category abstract\n", + "20457 ORG The standard enthalpy of formation or standard...\n", + "20262 ORG The KEP (Kinetic Energy Penetrator) also desig...\n", + "30262 ORG MindaNews is an online newspaper based in Mind...\n", + "21687 ORG In agriculture, disease management is the prac...\n", + "20442 ORG Snake oil, originally a fraudulent liniment wi...\n", + "22701 ORG NEMA, also known as Tenth and Market, is a 754...\n", + "31516 ORG Protein Sciences Corporation is a biotech comp...\n", + "21778 ORG The Ends of the Earth is a figurative expressi...\n", + "23568 ORG Thomas Hill (Rotherham) Limited was a company ...\n", + "21141 ORG Amplitude and phase-shift keying or asymmetric...\n", + "32056 ORG SamKochAvto, originally Samarkand Automobile F...\n", + "29890 ORG Lotec is a German sports car manufacturer. The...\n", + "28396 ORG ITV News West Country is a regional news servi...\n", + "20093 ORG A closed-loop transfer function in control the...\n", + "20139 ORG A diaper (also called a nappy outside North Am...\n", + "20175 ORG Extreme value theory or extreme value analysis...\n", + "20892 ORG Annette Marie Sarah Kellerman (6 July 1886 – 6...\n", + "21021 ORG True north (geodetic north) is the direction a...\n", + "20616 ORG Veterinary medicine is the branch of medicine ...\n", + "24786 ORG Afterall is a nonprofit contemporary art resea...\n", + "26300 ORG CitiBusiness Singapore is a unit of Citibank S...\n", + "21581 ORG The Courier Car Co. was an automobile manufact...\n", + "31583 ORG The Regulatory Affairs Journal Pharma (RAJ Pha...\n", + "25617 ORG Bright AC was a 24-hour music format produced ...\n", + "23907 ORG The American Roentgen Ray Society (ARRS) is a ...\n", + "29936 ORG Lärabar is a brand of energy bar produced by G...\n", + "20116 ORG Cumulonimbus, from the Latin cumulus (\"heap\") ...\n", + "26875 ORG Desires for the Future (French: Désirs d'aveni...\n", + "20166 ORG Espresso (/ɛˈsprɛsoʊ/, Italian: [esˈprɛsso]) i...\n", + "22880 ORG An off cutter is a type of delivery in the gam...\n", + "... ... ...\n", + "20300 ORG Mise-en-scène (French pronunciation: ​[mizɑ̃sɛ...\n", + "20213 ORG The haptophytes, classified either as the Prym...\n", + "20050 ORG In the language of measurement, quantities are...\n", + "25027 ORG Antenna Sicilia is a regional Italian televisi...\n", + "23936 ORG Black Joe Lewis is an American blues, funk and...\n", + "25163 ORG Astyplaz is an Athens-based 4 piece electronic...\n", + "20287 ORG Marsala is a wine, dry or sweet, produced in t...\n", + "28782 ORG Jonatan Söderström, also known by his internet...\n", + "20007 ORG 7 Up is a brand of lemon-lime flavored, non-ca...\n", + "20878 ORG Magnetic shape-memory alloys (MSMAs), or ferro...\n", + "22463 ORG A leg cutter is a type of delivery in the spor...\n", + "21957 ORG The Fulmar Gas Line is a natural gas pipeline,...\n", + "20110 ORG The Counter-Earth is a hypothetical body of th...\n", + "27348 ORG The Excellence Research Centers are national n...\n", + "21543 ORG The Common Ground between Islam and Buddhism p...\n", + "30114 ORG Master Point Press is a Canadian book publishi...\n", + "31045 ORG Omid (Persian: امید‎‎, meaning \"Hope\") was Ira...\n", + "22530 ORG Lusterware or Lustreware (respectively the US ...\n", + "20312 ORG In computing, NaN, standing for not a number, ...\n", + "21983 ORG GeminiJets is a die-cast model airplane manufa...\n", + "31087 ORG Oracle CRM is customer relationship management...\n", + "20672 ORG The Zimbabwe Defence Forces (ZDF) are composed...\n", + "24559 ORG 10 Cane was a premium Trinidadian light rum br...\n", + "20272 ORG Lidar (also called LIDAR, LiDAR, and LADAR) is...\n", + "21955 ORG Fully Buffered DIMM (or FB-DIMM) is a memory t...\n", + "21696 ORG Dogfights is a military aviation themed TV ser...\n", + "21956 ORG A fully differential amplifier (FDA) is a DC-c...\n", + "28640 ORG The Irish Steam Preservation Society was forme...\n", + "24928 ORG After working on the business process team at ...\n", + "20826 ORG The Hochdorf Chieftain's Grave is a richly-fur...\n", + "\n", + "[126 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "for predicted in category_id_df.category_id:\n", + " for actual in category_id_df.category_id:\n", + " if predicted != actual and conf_mat[actual, predicted] >= 10:\n", + " print(\"'{}' predicted as '{}' : {} examples.\".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))\n", + " display(df_train.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['category', 'abstract']])\n", + " print('')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'CalibratedClassifierCV' object has no attribute 'coef_'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mN\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcategory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcategory_id\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcategory_to_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mindices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margsort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoef_\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcategory_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mfeature_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtfidf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_feature_names\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindices\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0munigrams\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mreversed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeature_names\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mN\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'CalibratedClassifierCV' object has no attribute 'coef_'" + ] + } + ], + "source": [ + "model.fit(features, labels)\n", + "N = 2\n", + "for category, category_id in sorted(category_to_id.items()):\n", + " indices = np.argsort(model.coef_[category_id])\n", + " feature_names = np.array(tfidf.get_feature_names())[indices]\n", + " unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]\n", + " bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]\n", + " print(\"# '{}':\".format(category))\n", + " print(\" . Top unigrams:\\n . {}\".format('\\n . '.join(unigrams)))\n", + " print(\" . Top bigrams:\\n . {}\".format('\\n . '.join(bigrams)))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " PER 0.96 0.96 0.96 6549\n", + " ORG 0.92 0.93 0.93 4017\n", + " LOC 1.00 0.99 0.99 6676\n", + " OTHER 0.96 0.96 0.96 6605\n", + "\n", + " accuracy 0.96 23847\n", + " macro avg 0.96 0.96 0.96 23847\n", + "weighted avg 0.96 0.96 0.96 23847\n", + "\n" + ] + } + ], + "source": [ + "print(metrics.classification_report(y_test, y_pred, target_names=df_train['category'].unique()))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1.72478931e-04 9.99279567e-01 2.94780159e-05 5.18476007e-04]]\n" + ] + } + ], + "source": [ + "print(model.predict_proba(tfidf.transform([\"band high scholll.\"])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## One-vs-all strategy" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0\n", + "0.999\n", + "0.0\n", + "0.001\n" + ] + }, + { + "data": { + "text/plain": [ + "[None, None, None, None]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[print(round(p, 3)) for p in [1.72478931e-04, 9.99279567e-01, 2.94780159e-05, 5.18476007e-04]]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "subset = df_train[df_train.category=='PER']" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "d = {}\n", + "for text in subset.abstract.values:\n", + " tokens = word_tokenize(text)\n", + " tokens_without_sw = [word for word in tokens if not word in stopwords.words()]\n", + " for token in tokens_without_sw:\n", + " d[token.lower()] = d.get(token.lower(), 0) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "173498" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#173498\n", + "len(d) " + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the 127104\n", + "of 78089\n", + "in 75648\n", + "and 68837\n", + "a 49616\n", + "he 41962\n", + "was 41190\n", + "to 32486\n", + "his 24551\n", + "for 23258\n", + "as 20973\n", + "is 20930\n", + "at 15142\n", + "an 13836\n", + "on 13683\n", + "with 12793\n", + "from 12377\n", + "by 9889\n", + "– 7758\n", + "who 7241\n", + "(born 6944\n", + "also 6500\n", + "she 6181\n", + "that 5937\n", + "first 5491\n", + "has 5291\n", + "american 5271\n", + "which 4724\n", + "after 4722\n", + "her 4585\n", + "new 4530\n", + "played 4155\n", + "known 4144\n", + "born 4008\n", + "one 3969\n", + "had 3581\n", + "during 3520\n", + "member 3321\n", + "university 3286\n", + "became 3030\n", + "but 2975\n", + "served 2905\n", + "john 2883\n", + "two 2770\n", + "former 2726\n", + "national 2700\n", + "world 2683\n", + "been 2614\n", + "were 2607\n", + "may 2591\n", + "football 2589\n", + "or 2531\n", + "won 2485\n", + "january 2409\n", + "most 2402\n", + "it 2390\n", + "where 2380\n", + "united 2370\n", + "march 2355\n", + "later 2355\n", + "best 2308\n", + "died 2305\n", + "be 2278\n", + "september 2269\n", + "league 2246\n", + "until 2237\n", + "june 2234\n", + "december 2216\n", + "august 2201\n", + "british 2181\n", + "july 2170\n", + "when 2163\n", + "this 2162\n", + "made 2158\n", + "april 2153\n", + "film 2134\n", + "october 2133\n", + "november 2128\n", + "english 2092\n", + "him 2029\n", + "de 1999\n", + "years 1983\n", + "february 1975\n", + "professional 1915\n", + "not 1908\n", + "other 1900\n", + "before 1874\n", + "work 1873\n", + "have 1808\n", + "career 1778\n", + "including 1755\n", + "their 1695\n", + "president 1665\n", + "three 1658\n", + "state 1647\n", + "then 1646\n", + "into 1646\n", + "against 1610\n", + "time 1609\n", + "are 1593\n", + "many 1592\n" + ] + } + ], + "source": [ + "i = 0\n", + "for k in sorted(d, key=d.get, reverse=True):\n", + " print(k, d[k])\n", + " i += 1\n", + " if i > 100:\n", + " break" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "horus", + "language": "python", + "name": "horus" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/training/notebooks/horus_v1/03-horus-training-ner-crf.ipynb b/training/notebooks/horus_v1/03-horus-training-ner-crf.ipynb similarity index 100% rename from src/training/notebooks/horus_v1/03-horus-training-ner-crf.ipynb rename to training/notebooks/horus_v1/03-horus-training-ner-crf.ipynb diff --git a/src/training/notebooks/horus_v1/__init__.py b/training/notebooks/horus_v1/__init__.py similarity index 100% rename from src/training/notebooks/horus_v1/__init__.py rename to training/notebooks/horus_v1/__init__.py diff --git a/src/training/notebooks/horus_v2/__init__.py b/training/notebooks/horus_v2/__init__.py similarity index 100% rename from src/training/notebooks/horus_v2/__init__.py rename to training/notebooks/horus_v2/__init__.py diff --git a/src/training/scripts/__init__.py b/training/scripts/__init__.py similarity index 100% rename from src/training/scripts/__init__.py rename to training/scripts/__init__.py