Skip to content

Commit 9548b52

Browse files
committed
Fix vocab size and set letters only
Fix vocab size and set letters only
1 parent 05ce745 commit 9548b52

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

onmt/inputters/inputter.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ def _build_field_vocab(field, counter, **kwargs):
277277

278278
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
279279
src_vocab_path, src_vocab_size, src_words_min_frequency,
280-
tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
280+
tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency,
281+
only_letters_vocab):
281282
"""
282283
Args:
283284
train_dataset_files: a list of train dataset pt file.
@@ -292,6 +293,7 @@ def build_vocab(train_dataset_files, fields, data_type, share_vocab,
292293
tgt_vocab_size(int): size of the target vocabulary.
293294
tgt_words_min_frequency(int): the minimum frequency needed to
294295
include a target word in the vocabulary.
296+
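only_letters_vocab(int): when set to 1, the 'src' and 'tgt' tokens of
each example are filtered before counting so that only tokens made up
entirely of letters (including accented letters), apostrophes, hyphens
and accent marks are kept; if no token of an example matches, its
tokens are counted unfiltered.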
295297
296298
Returns:
297299
Dict of Fields
@@ -328,13 +330,24 @@ def build_vocab(train_dataset_files, fields, data_type, share_vocab,
328330
for ex in dataset.examples:
329331
for k in fields:
330332
val = getattr(ex, k, None)
333+
if only_letters_vocab == 1 and k in ('tgt','src'):
334+
wordsREGEX = '^[a-zäàáãëèéïìíöòóõüùúßçñ\'\-\´\`]+$'
335+
pesq = re.compile(wordsREGEX, re.IGNORECASE)
336+
lst_val = list(val)
337+
lst_new = list()
338+
for valor in lst_val:
339+
g = pesq.match(valor)
340+
if g:
341+
x = g.group()
342+
lst_new.append(x)
343+
if len(lst_new) > 0: val = tuple(lst_new)
331344
if not fields[k].sequential:
332345
continue
333346
elif k == 'src' and src_vocab:
334347
continue
335348
elif k == 'tgt' and tgt_vocab:
336349
continue
337-
counter[k].update(val)
350+
if len(val) > 0: counter[k].update(val)
338351

339352
# Drop the none-using from memory but keep the last
340353
if (index < len(train_dataset_files) - 1):

onmt/opts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,8 @@ def preprocess_opts(parser):
224224
help="Create dynamic dictionaries")
225225
group.add('--share_vocab', '-share_vocab', action='store_true',
226226
help="Share source and target vocabulary")
227+
group.add('--only_letters_vocab', '-only_letters_vocab', type=int, default=0,
228+
help="Only accept words with letters")
227229

228230
# Truncation options, for text corpus
229231
group = parser.add_argument_group('Pruning')

0 commit comments

Comments
 (0)