JZ-LIANG
diff --git a/‎.DS_Store
-14 KB b/‎.DS_Store
-14 KB
diff --git a/‎How To Use.ipynb
Lines changed: 387 additions & 0 deletions b/‎How To Use.ipynb
Lines changed: 387 additions & 0 deletions
@@ -0,0 +1,387 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### P.S.\n",
+    "* here we not focus on pursuing a higher F1 score, but give a quick example for how to set the model, so we set all the model hyper-parameters to a quite simple level to make the run faster.\n",
+    "* you need to modify the config.py to create a more robust model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "import tensorflow as tf\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import libs\n",
+    "from model import Model\n",
+    "tf.reset_default_graph()\n",
+    "from utils import get_idx, get_inputs\n",
+    "from config import Config"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# GloVe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2019-03-26 17:14:45,310 config object Initialized\n",
+      "Building vocab...\n",
+      "vocabulary for this corpus: 12447 tokens, 85 chars, 8 labels\n",
+      "vocabulary construction time:  7.563478005002253\n"
+     ]
+    }
+   ],
+   "source": [
+    "# setting the embedding file path\n",
+    "from config_examples.config_glove import Config\n",
+    "config = Config('glove')\n",
+    "glove_file_path = 'data/glove/glove.6B.100d.txt'\n",
+    "# where to save the predictions, model, index files\n",
+    "save_path = 'test/glove_test/'\n",
+    "config.init_glove(glove_file_path, save_path)\n",
+    "\n",
+    "# parse the corpus and generate the input data\n",
+    "token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
+    "train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
+    "eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
+    "test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2019-03-26 17:14:56,611 Initializing tf session\n",
+      "2019-03-26 17:14:56,848 Epoch 1 out of 5\n",
+      "2019-03-26 17:15:12,391 Epoch 1 's F1 =31.139110311133965, epoch_runing_time =15.541498899459839 .\n",
+      "2019-03-26 17:15:12,393 - new best F1, save new model.\n",
+      "2019-03-26 17:15:12,782 Epoch 2 out of 5\n",
+      "2019-03-26 17:15:26,976 Epoch 2 's F1 =62.235889296696755, epoch_runing_time =14.19306468963623 .\n",
+      "2019-03-26 17:15:26,978 - new best F1, save new model.\n",
+      "2019-03-26 17:15:27,249 Epoch 3 out of 5\n",
+      "2019-03-26 17:15:40,412 Epoch 3 's F1 =71.09647058823529, epoch_runing_time =13.162132740020752 .\n",
+      "2019-03-26 17:15:40,414 - new best F1, save new model.\n",
+      "2019-03-26 17:15:40,674 Epoch 4 out of 5\n",
+      "2019-03-26 17:15:54,906 Epoch 4 's F1 =75.21691973969631, epoch_runing_time =14.231114149093628 .\n",
+      "2019-03-26 17:15:54,908 - new best F1, save new model.\n",
+      "2019-03-26 17:15:55,156 Epoch 5 out of 5\n",
+      "2019-03-26 17:16:09,319 Epoch 5 's F1 =77.59048970901348, epoch_runing_time =14.161910057067871 .\n",
+      "2019-03-26 17:16:09,321 - new best F1, save new model.\n",
+      "2019-03-26 17:16:14,425 processed 51363 tokens with 5942 phrases; found: 5330 phrases; correct: 4373.\n",
+      "accuracy:  95.51%; precision:  82.05%; recall:  73.59%; FB1:  77.59\n",
+      "              LOC: precision:  82.74%; recall:  84.54%; FB1:  83.63  1877\n",
+      "             MISC: precision:  75.70%; recall:  49.67%; FB1:  59.99  605\n",
+      "              ORG: precision:  75.76%; recall:  53.84%; FB1:  62.95  953\n",
+      "              PER: precision:  86.54%; recall:  89.03%; FB1:  87.77  1895\n",
+      "\n",
+      "2019-03-26 17:16:18,860 processed 46436 tokens with 5648 phrases; found: 5110 phrases; correct: 3898.\n",
+      "accuracy:  94.37%; precision:  76.28%; recall:  69.02%; FB1:  72.47\n",
+      "              LOC: precision:  73.71%; recall:  76.80%; FB1:  75.22  1738\n",
+      "             MISC: precision:  67.84%; recall:  46.58%; FB1:  55.24  482\n",
+      "              ORG: precision:  72.91%; recall:  52.98%; FB1:  61.37  1207\n",
+      "              PER: precision:  83.78%; recall:  87.20%; FB1:  85.45  1683\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# initial the same NER model \n",
+    "ner_model = Model(config)\n",
+    "ner_model.build_graph()\n",
+    "ner_model.initialize_session()\n",
+    "\n",
+    "# training and test\n",
+    "ner_model.train(train_x,train_y,eval_x,eval_y)\n",
+    "ner_model.test(eval_x,eval_y, 'eval')\n",
+    "ner_model.test(test_x,test_y, 'test')\n",
+    "ner_model.close()\n",
+    "tf.reset_default_graph()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# w2v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# setting the embedding file path\n",
+    "from config_examples.config_w2v import Config\n",
+    "from gensim.models import KeyedVectors\n",
+    "config = Config('w2v')\n",
+    "path =\"data/GoogleNews-vectors-negative300.bin\"\n",
+    "w2v = KeyedVectors.load_word2vec_format(path, binary=True)\n",
+    "config.init_w2v(w2v)\n",
+    "\n",
+    "# parse the corpus and generate the input data\n",
+    "token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
+    "train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
+    "eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
+    "test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)\n",
+    "\n",
+    "# initial the same NER model \n",
+    "ner_model = Model(config)\n",
+    "ner_model.build_graph()\n",
+    "ner_model.initialize_session()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# training and test\n",
+    "ner_model.train(train_x,train_y,eval_x,eval_y)\n",
+    "ner_model.test(eval_x,eval_y, 'eval')\n",
+    "ner_model.test(test_x,test_y, 'test')\n",
+    "ner_model.close()\n",
+    "tf.reset_default_graph()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fasttext"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# setting the embedding file path\n",
+    "from config_examples.config_fasttext import Config\n",
+    "config = Config('fasttext')\n",
+    "command ='../fastText/fasttext'\n",
+    "bin_file ='../fastText/data/cc.en.300.bin'\n",
+    "config.init_fasttext(command, bin_file)\n",
+    "\n",
+    "# parse the corpus and generate the input data\n",
+    "token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
+    "train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
+    "eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
+    "test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)\n",
+    "\n",
+    "# initial the same NER model \n",
+    "ner_model = Model(config)\n",
+    "ner_model.build_graph()\n",
+    "ner_model.initialize_session()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# training and test\n",
+    "ner_model.train(train_x,train_y,eval_x,eval_y)\n",
+    "ner_model.test(eval_x,eval_y, 'eval')\n",
+    "ner_model.test(test_x,test_y, 'test')\n",
+    "ner_model.close()\n",
+    "tf.reset_default_graph()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Contextual Embedding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## flair + glove"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from config import Config\n",
+    "from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, FlairEmbeddings\n",
+    "from config_examples.config_contextual import Config\n",
+    "from utils import load_cropus, get_cropus_len, get_inputs_contextual\n",
+    "config = Config('flair_glove')\n",
+    "\n",
+    "# create a StackedEmbedding object that combines the embedding you want\n",
+    "stacked_embeddings = StackedEmbeddings([\n",
+    "                                        WordEmbeddings('glove'), \n",
+    "                                        FlairEmbeddings('news-forward-fast'), \n",
+    "                                        FlairEmbeddings('news-backward-fast'),\n",
+    "                                       ])\n",
+    "\n",
+    "# load the corpus into flair libs\n",
+    "token2idx1, char2idx, label2idx = get_idx(config)\n",
+    "train, dev, test = load_cropus(config)\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "# setting the [the number of token in corpus, the dimension of the stacked embedding]\n",
+    "# this two number should be computed by your own cropus and the embedding combination your choose\n",
+    "# for CONLL dataset, the cropus_len = 301418, flair-news-forward-fast + glove.100d = 2148\n",
+    "datasets = [config.path_train, config.path_eval, config.path_test]\n",
+    "cropus_len = get_cropus_len(datasets)\n",
+    "lookup_table = np.zeros([cropus_len, 1124])\n",
+    "token2idx = []\n",
+    "\n",
+    "\n",
+    "train_x, train_y, offset = get_inputs_contextual(train,stacked_embeddings, 0, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "eval_x, eval_y, offset1 = get_inputs_contextual(dev,stacked_embeddings, offset, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "test_x, test_y, offset2 = get_inputs_contextual(test,stacked_embeddings, offset1, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "\n",
+    "# update the lookup_table and token2idx according to the dataset since they will be contextual dependent\n",
+    "config.init_contextual(lookup_table, token2idx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# initial the same NER model \n",
+    "ner_model = Model(config)\n",
+    "ner_model.build_graph()\n",
+    "ner_model.initialize_session()\n",
+    "\n",
+    "# training and test\n",
+    "ner_model.train(train_x,train_y,eval_x,eval_y)\n",
+    "ner_model.test(eval_x,eval_y,'eval')\n",
+    "ner_model.test(test_x,test_y, 'test')\n",
+    "ner_model.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## elmo + w2v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from config import Config\n",
+    "from config_examples.config_contextual import Config\n",
+    "from utils import load_cropus, get_cropus_len, get_inputs_contextual\n",
+    "from flair.embeddings import ELMoEmbeddings,StackedEmbeddings,WordEmbeddings\n",
+    "elmo_embedding = ELMoEmbeddings()\n",
+    "w2v_embedding = WordEmbeddings('/home/semantic/Liang_NER/data/word_embedding/word2vec/w2v.gensim')\n",
+    "config = Config('elmo_w2v')\n",
+    "\n",
+    "# load the corpus into flair libs\n",
+    "token2idx1, char2idx, label2idx = get_idx(config)\n",
+    "train, dev, test = load_cropus(config)\n",
+    "\n",
+    "# create a StackedEmbedding object that combines the embedding you want\n",
+    "stacked_embeddings = StackedEmbeddings(embeddings=[w2v_embedding,elmo_embedding])\n",
+    "datasets = [config.path_train, config.path_eval, config.path_test]\n",
+    "cropus_len = get_cropus_len(datasets)\n",
+    "lookup_table = np.zeros([cropus_len, 1124])\n",
+    "token2idx = []\n",
+    "\n",
+    "\n",
+    "train_x, train_y, offset = get_inputs_contextual(train,stacked_embeddings, 0, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "eval_x, eval_y, offset1 = get_inputs_contextual(dev,stacked_embeddings, offset, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "test_x, test_y, offset2 = get_inputs_contextual(test,stacked_embeddings, offset1, \n",
+    "                                            lookup_table,token2idx, char2idx, label2idx,)\n",
+    "\n",
+    "# update the lookup_table and token2idx according to the dataset since they will be contextual dependent\n",
+    "config.init_contextual(lookup_table, token2idx)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}