# This file was developed as part of the project reported in the paper below.
# We request that users cite our paper in any publication that is generated
# as a result of the use of our source code or our dataset.
#
# Pedro H. Luz de Araujo, Teófilo E. de Campos, Renato R. R. de Oliveira, Matheus Stauffer, Samuel Couto and Paulo Bermejo.
# LeNER-Br: a Dataset for Named Entity Recognition in Brazilian Legal Text.
# International Conference on the Computational Processing of Portuguese (PROPOR),
# September 24-26, Canela, Brazil, 2018.
#
# @InProceedings{luz_etal_propor2018,
# author = {Pedro H. {Luz de Araujo} and Te\'{o}filo E. {de Campos} and
# Renato R. R. {de Oliveira} and Matheus Stauffer and
# Samuel Couto and Paulo Bermejo},
# title = {LeNER-Br: a Dataset for Named Entity Recognition in Brazilian Legal Text},
# booktitle = {International Conference on the Computational Processing of Portuguese
# ({PROPOR})},
# year = {2018},
# month = {September 24-26},
# address = {Canela, RS, Brazil},
# note = {Available from \url{https://cic.unb.br/~teodecampos/LeNER-Br/}}
# }
from model.config import Config
from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
    export_trimmed_glove_vectors, get_processing_word


def main():
    """Procedure to build data.

    You MUST RUN this procedure. It iterates over the dataset splits and
    extracts the vocabularies in terms of words, tags, and characters
    (the word and tag vocabularies are built from the train and dev
    splits; the character vocabulary from train). Having built the
    vocabularies, it writes them to files; writing a vocabulary to a
    file assigns an id (its line number) to each entry. It then extracts
    the relevant GloVe vectors and stores them in a numpy array such
    that the i-th entry corresponds to the i-th word in the vocabulary.

    The Config instance, which holds attributes such as file paths and
    hyper-parameters, is created inside this procedure, so it takes no
    arguments.
    """
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)
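    # Note on the line above: in the upstream sequence_tagging codebase,
    # get_processing_word(lowercase=True) returns a callable that
    # lowercases each token and maps digit-only tokens to NUM; this fork
    # is assumed to keep that behavior.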

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
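    # CoNLLDataset is a lazy iterator: each pass over it is assumed to
    # yield one (words, tags) pair per sentence read from the
    # CoNLL-formatted file, with processing_word applied to every token.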

    # Build Word and Tag vocab (only the train and dev splits are used
    # here; the test generator above is not included)
    # vocab_words, vocab_tags = get_vocabs([train, dev, test, test2])
    vocab_words, vocab_tags = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
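    # The final word vocabulary is the intersection of corpus words and
    # GloVe words, plus the two special tokens, so every kept word has a
    # pretrained vector (UNK and NUM are "$UNK$" and "$NUM$" upstream;
    # assumed unchanged in this fork).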

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
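    # Each vocabulary file holds one entry per line, and an entry's id
    # is its 0-based line index. Hypothetical words-file excerpt:
    #     $UNK$      (id 0)
    #     $NUM$      (id 1)
    #     processo   (id 2)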

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
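    # This keeps only the dim_word-dimensional vectors of in-vocabulary
    # words, with row i holding the vector of word id i; upstream the
    # result is saved as a compressed .npz with an "embeddings" key
    # (assumed unchanged here; see the loading sketch below main()).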

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.filename_chars)
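

# Minimal sketch (not part of the original script) of how the trimmed
# vectors written by main() are typically reloaded elsewhere, assuming
# the upstream .npz layout with an "embeddings" key.
def _load_trimmed_vectors(filename):
    """Return the trimmed embedding matrix, where row i is the vector of
    the word with id i in the words vocabulary file."""
    import numpy as np  # local import so the sketch stays self-contained
    return np.load(filename)["embeddings"]
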

if __name__ == "__main__":
    main()
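
# Typical invocation (the word-embedding file referenced by
# config.filename_glove must already be downloaded):
#     python build_data.py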