Skip to content

Commit ad4b6f5

Browse files
committed
clean repo
1 parent d8d64a8 commit ad4b6f5

File tree

7 files changed

+1834
-0
lines changed

7 files changed

+1834
-0
lines changed

.DS_Store

-14 KB
Binary file not shown.

How To Use.ipynb

Lines changed: 387 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,387 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"### P.S.\n",
8+
"* here we do not focus on pursuing a higher F1 score, but give a quick example of how to set up the model, so we set all the model hyper-parameters to quite a simple level to make the run faster.\n",
9+
"* you need to modify config.py to create a more robust model."
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 1,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import warnings\n",
19+
"warnings.filterwarnings('ignore')\n",
20+
"import tensorflow as tf\n",
21+
"import numpy as np"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 6,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"# import libs\n",
31+
"from model import Model\n",
32+
"tf.reset_default_graph()\n",
33+
"from utils import get_idx, get_inputs\n",
34+
"from config import Config"
35+
]
36+
},
37+
{
38+
"cell_type": "markdown",
39+
"metadata": {},
40+
"source": [
41+
"# GloVe"
42+
]
43+
},
44+
{
45+
"cell_type": "code",
46+
"execution_count": 7,
47+
"metadata": {},
48+
"outputs": [
49+
{
50+
"name": "stdout",
51+
"output_type": "stream",
52+
"text": [
53+
"2019-03-26 17:14:45,310 config object Initialized\n",
54+
"Building vocab...\n",
55+
"vocabulary for this corpus: 12447 tokens, 85 chars, 8 labels\n",
56+
"vocabulary construction time: 7.563478005002253\n"
57+
]
58+
}
59+
],
60+
"source": [
61+
"# setting the embedding file path\n",
62+
"from config_examples.config_glove import Config\n",
63+
"config = Config('glove')\n",
64+
"glove_file_path = 'data/glove/glove.6B.100d.txt'\n",
65+
"# where to save the predictions, model, index files\n",
66+
"save_path = 'test/glove_test/'\n",
67+
"config.init_glove(glove_file_path, save_path)\n",
68+
"\n",
69+
"# parse the corpus and generate the input data\n",
70+
"token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
71+
"train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
72+
"eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
73+
"test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": 8,
79+
"metadata": {
80+
"scrolled": false
81+
},
82+
"outputs": [
83+
{
84+
"name": "stdout",
85+
"output_type": "stream",
86+
"text": [
87+
"2019-03-26 17:14:56,611 Initializing tf session\n",
88+
"2019-03-26 17:14:56,848 Epoch 1 out of 5\n",
89+
"2019-03-26 17:15:12,391 Epoch 1 's F1 =31.139110311133965, epoch_runing_time =15.541498899459839 .\n",
90+
"2019-03-26 17:15:12,393 - new best F1, save new model.\n",
91+
"2019-03-26 17:15:12,782 Epoch 2 out of 5\n",
92+
"2019-03-26 17:15:26,976 Epoch 2 's F1 =62.235889296696755, epoch_runing_time =14.19306468963623 .\n",
93+
"2019-03-26 17:15:26,978 - new best F1, save new model.\n",
94+
"2019-03-26 17:15:27,249 Epoch 3 out of 5\n",
95+
"2019-03-26 17:15:40,412 Epoch 3 's F1 =71.09647058823529, epoch_runing_time =13.162132740020752 .\n",
96+
"2019-03-26 17:15:40,414 - new best F1, save new model.\n",
97+
"2019-03-26 17:15:40,674 Epoch 4 out of 5\n",
98+
"2019-03-26 17:15:54,906 Epoch 4 's F1 =75.21691973969631, epoch_runing_time =14.231114149093628 .\n",
99+
"2019-03-26 17:15:54,908 - new best F1, save new model.\n",
100+
"2019-03-26 17:15:55,156 Epoch 5 out of 5\n",
101+
"2019-03-26 17:16:09,319 Epoch 5 's F1 =77.59048970901348, epoch_runing_time =14.161910057067871 .\n",
102+
"2019-03-26 17:16:09,321 - new best F1, save new model.\n",
103+
"2019-03-26 17:16:14,425 processed 51363 tokens with 5942 phrases; found: 5330 phrases; correct: 4373.\n",
104+
"accuracy: 95.51%; precision: 82.05%; recall: 73.59%; FB1: 77.59\n",
105+
" LOC: precision: 82.74%; recall: 84.54%; FB1: 83.63 1877\n",
106+
" MISC: precision: 75.70%; recall: 49.67%; FB1: 59.99 605\n",
107+
" ORG: precision: 75.76%; recall: 53.84%; FB1: 62.95 953\n",
108+
" PER: precision: 86.54%; recall: 89.03%; FB1: 87.77 1895\n",
109+
"\n",
110+
"2019-03-26 17:16:18,860 processed 46436 tokens with 5648 phrases; found: 5110 phrases; correct: 3898.\n",
111+
"accuracy: 94.37%; precision: 76.28%; recall: 69.02%; FB1: 72.47\n",
112+
" LOC: precision: 73.71%; recall: 76.80%; FB1: 75.22 1738\n",
113+
" MISC: precision: 67.84%; recall: 46.58%; FB1: 55.24 482\n",
114+
" ORG: precision: 72.91%; recall: 52.98%; FB1: 61.37 1207\n",
115+
" PER: precision: 83.78%; recall: 87.20%; FB1: 85.45 1683\n",
116+
"\n"
117+
]
118+
}
119+
],
120+
"source": [
121+
"# initialize the NER model\n",
122+
"ner_model = Model(config)\n",
123+
"ner_model.build_graph()\n",
124+
"ner_model.initialize_session()\n",
125+
"\n",
126+
"# training and test\n",
127+
"ner_model.train(train_x,train_y,eval_x,eval_y)\n",
128+
"ner_model.test(eval_x,eval_y, 'eval')\n",
129+
"ner_model.test(test_x,test_y, 'test')\n",
130+
"ner_model.close()\n",
131+
"tf.reset_default_graph()"
132+
]
133+
},
134+
{
135+
"cell_type": "markdown",
136+
"metadata": {},
137+
"source": [
138+
"# w2v"
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 15,
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
147+
"# setting the embedding file path\n",
148+
"from config_examples.config_w2v import Config\n",
149+
"from gensim.models import KeyedVectors\n",
150+
"config = Config('w2v')\n",
151+
"path =\"data/GoogleNews-vectors-negative300.bin\"\n",
152+
"w2v = KeyedVectors.load_word2vec_format(path, binary=True)\n",
153+
"config.init_w2v(w2v)\n",
154+
"\n",
155+
"# parse the corpus and generate the input data\n",
156+
"token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
157+
"train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
158+
"eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
159+
"test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)\n",
160+
"\n",
161+
"# initialize the NER model\n",
162+
"ner_model = Model(config)\n",
163+
"ner_model.build_graph()\n",
164+
"ner_model.initialize_session()"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 14,
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"# training and test\n",
174+
"ner_model.train(train_x,train_y,eval_x,eval_y)\n",
175+
"ner_model.test(eval_x,eval_y, 'eval')\n",
176+
"ner_model.test(test_x,test_y, 'test')\n",
177+
"ner_model.close()\n",
178+
"tf.reset_default_graph()"
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": null,
184+
"metadata": {},
185+
"outputs": [],
186+
"source": []
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"metadata": {},
191+
"source": [
192+
"# Fasttext"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": 12,
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"# setting the embedding file path\n",
202+
"from config_examples.config_fasttext import Config\n",
203+
"config = Config('fasttext')\n",
204+
"command ='../fastText/fasttext'\n",
205+
"bin_file ='../fastText/data/cc.en.300.bin'\n",
206+
"config.init_fasttext(command, bin_file)\n",
207+
"\n",
208+
"# parse the corpus and generate the input data\n",
209+
"token2idx, char2idx, label2idx, lookup_table = get_idx(config)\n",
210+
"train_x, train_y = get_inputs('train', token2idx, char2idx, label2idx, config)\n",
211+
"eval_x, eval_y = get_inputs('eval', token2idx, char2idx, label2idx, config)\n",
212+
"test_x, test_y = get_inputs('test', token2idx, char2idx, label2idx, config)\n",
213+
"\n",
214+
"# initialize the NER model\n",
215+
"ner_model = Model(config)\n",
216+
"ner_model.build_graph()\n",
217+
"ner_model.initialize_session()"
218+
]
219+
},
220+
{
221+
"cell_type": "code",
222+
"execution_count": 13,
223+
"metadata": {},
224+
"outputs": [],
225+
"source": [
226+
"# training and test\n",
227+
"ner_model.train(train_x,train_y,eval_x,eval_y)\n",
228+
"ner_model.test(eval_x,eval_y, 'eval')\n",
229+
"ner_model.test(test_x,test_y, 'test')\n",
230+
"ner_model.close()\n",
231+
"tf.reset_default_graph()"
232+
]
233+
},
234+
{
235+
"cell_type": "code",
236+
"execution_count": null,
237+
"metadata": {},
238+
"outputs": [],
239+
"source": []
240+
},
241+
{
242+
"cell_type": "markdown",
243+
"metadata": {},
244+
"source": [
245+
"# Contextual Embedding"
246+
]
247+
},
248+
{
249+
"cell_type": "markdown",
250+
"metadata": {},
251+
"source": [
252+
"## flair + glove"
253+
]
254+
},
255+
{
256+
"cell_type": "code",
257+
"execution_count": 16,
258+
"metadata": {},
259+
"outputs": [],
260+
"source": [
261+
"# from config import Config\n",
262+
"from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharLMEmbeddings, FlairEmbeddings\n",
263+
"from config_examples.config_contextual import Config\n",
264+
"from utils import load_cropus, get_cropus_len, get_inputs_contextual\n",
265+
"config = Config('flair_glove')\n",
266+
"\n",
267+
"# create a StackedEmbedding object that combines the embedding you want\n",
268+
"stacked_embeddings = StackedEmbeddings([\n",
269+
" WordEmbeddings('glove'), \n",
270+
" FlairEmbeddings('news-forward-fast'), \n",
271+
" FlairEmbeddings('news-backward-fast'),\n",
272+
" ])\n",
273+
"\n",
274+
"# load the corpus into flair libs\n",
275+
"token2idx1, char2idx, label2idx = get_idx(config)\n",
276+
"train, dev, test = load_cropus(config)\n",
277+
"\n",
278+
"\n",
279+
"\n",
280+
"\n",
281+
"# set [the number of tokens in the corpus, the dimension of the stacked embedding]\n",
282+
"# these two numbers should be computed from your own corpus and the embedding combination you choose\n",
283+
"# for the CoNLL dataset, cropus_len = 301418; flair-news-forward-fast + glove.100d = 2148\n",
284+
"datasets = [config.path_train, config.path_eval, config.path_test]\n",
285+
"cropus_len = get_cropus_len(datasets)\n",
286+
"lookup_table = np.zeros([cropus_len, 1124])\n",
287+
"token2idx = []\n",
288+
"\n",
289+
"\n",
290+
"train_x, train_y, offset = get_inputs_contextual(train,stacked_embeddings, 0, \n",
291+
" lookup_table,token2idx, char2idx, label2idx,)\n",
292+
"eval_x, eval_y, offset1 = get_inputs_contextual(dev,stacked_embeddings, offset, \n",
293+
" lookup_table,token2idx, char2idx, label2idx,)\n",
294+
"test_x, test_y, offset2 = get_inputs_contextual(test,stacked_embeddings, offset1, \n",
295+
" lookup_table,token2idx, char2idx, label2idx,)\n",
296+
"\n",
297+
"# update the lookup_table and token2idx according to the dataset, since they will be contextually dependent\n",
298+
"config.init_contextual(lookup_table, token2idx)"
299+
]
300+
},
301+
{
302+
"cell_type": "code",
303+
"execution_count": 17,
304+
"metadata": {
305+
"scrolled": false
306+
},
307+
"outputs": [],
308+
"source": [
309+
"# initialize the NER model\n",
310+
"ner_model = Model(config)\n",
311+
"ner_model.build_graph()\n",
312+
"ner_model.initialize_session()\n",
313+
"\n",
314+
"# training and test\n",
315+
"ner_model.train(train_x,train_y,eval_x,eval_y)\n",
316+
"ner_model.test(eval_x,eval_y,'eval')\n",
317+
"ner_model.test(test_x,test_y, 'test')\n",
318+
"ner_model.close()"
319+
]
320+
},
321+
{
322+
"cell_type": "markdown",
323+
"metadata": {},
324+
"source": [
325+
"## elmo + w2v"
326+
]
327+
},
328+
{
329+
"cell_type": "code",
330+
"execution_count": null,
331+
"metadata": {},
332+
"outputs": [],
333+
"source": [
334+
"# from config import Config\n",
335+
"from config_examples.config_contextual import Config\n",
336+
"from utils import load_cropus, get_cropus_len, get_inputs_contextual\n",
337+
"from flair.embeddings import ELMoEmbeddings,StackedEmbeddings,WordEmbeddings\n",
338+
"elmo_embedding = ELMoEmbeddings()\n",
339+
"w2v_embedding = WordEmbeddings('/home/semantic/Liang_NER/data/word_embedding/word2vec/w2v.gensim')\n",
340+
"config = Config('elmo_w2v')\n",
341+
"\n",
342+
"# load the corpus into flair libs\n",
343+
"token2idx1, char2idx, label2idx = get_idx(config)\n",
344+
"train, dev, test = load_cropus(config)\n",
345+
"\n",
346+
"# create a StackedEmbedding object that combines the embedding you want\n",
347+
"stacked_embeddings = StackedEmbeddings(embeddings=[w2v_embedding,elmo_embedding])\n",
348+
"datasets = [config.path_train, config.path_eval, config.path_test]\n",
349+
"cropus_len = get_cropus_len(datasets)\n",
350+
"lookup_table = np.zeros([cropus_len, 1124])\n",
351+
"token2idx = []\n",
352+
"\n",
353+
"\n",
354+
"train_x, train_y, offset = get_inputs_contextual(train,stacked_embeddings, 0, \n",
355+
" lookup_table,token2idx, char2idx, label2idx,)\n",
356+
"eval_x, eval_y, offset1 = get_inputs_contextual(dev,stacked_embeddings, offset, \n",
357+
" lookup_table,token2idx, char2idx, label2idx,)\n",
358+
"test_x, test_y, offset2 = get_inputs_contextual(test,stacked_embeddings, offset1, \n",
359+
" lookup_table,token2idx, char2idx, label2idx,)\n",
360+
"\n",
361+
"# update the lookup_table and token2idx according to the dataset, since they will be contextually dependent\n",
362+
"config.init_contextual(lookup_table, token2idx)"
363+
]
364+
}
365+
],
366+
"metadata": {
367+
"kernelspec": {
368+
"display_name": "Python 3",
369+
"language": "python",
370+
"name": "python3"
371+
},
372+
"language_info": {
373+
"codemirror_mode": {
374+
"name": "ipython",
375+
"version": 3
376+
},
377+
"file_extension": ".py",
378+
"mimetype": "text/x-python",
379+
"name": "python",
380+
"nbconvert_exporter": "python",
381+
"pygments_lexer": "ipython3",
382+
"version": "3.6.6"
383+
}
384+
},
385+
"nbformat": 4,
386+
"nbformat_minor": 2
387+
}

0 commit comments

Comments
 (0)