@@ -277,7 +277,8 @@ def _build_field_vocab(field, counter, **kwargs):
277
277
278
278
def build_vocab (train_dataset_files , fields , data_type , share_vocab ,
279
279
src_vocab_path , src_vocab_size , src_words_min_frequency ,
280
- tgt_vocab_path , tgt_vocab_size , tgt_words_min_frequency ):
280
+ tgt_vocab_path , tgt_vocab_size , tgt_words_min_frequency ,
281
+ only_letters_vocab ):
281
282
"""
282
283
Args:
283
284
train_dataset_files: a list of train dataset pt file.
@@ -292,6 +293,7 @@ def build_vocab(train_dataset_files, fields, data_type, share_vocab,
292
293
tgt_vocab_size(int): size of the target vocabulary.
293
294
tgt_words_min_frequency(int): the minimum frequency needed to
294
295
include a target word in the vocabulary.
296
+ only_letters_vocab(int): if nonzero, keep only tokens that fully match
+ a letters-only pattern (ASCII and accented letters, plus apostrophe,
+ hyphen, and a few accent marks) when building the vocabulary.
295
297
296
298
Returns:
297
299
Dict of Fields
@@ -328,13 +330,24 @@ def build_vocab(train_dataset_files, fields, data_type, share_vocab,
328
330
for ex in dataset .examples :
329
331
for k in fields :
330
332
val = getattr (ex , k , None )
333
+ if only_letters_vocab == 1 and k in ('tgt' ,'src' ):
334
+ wordsREGEX = '^[a-zäàáãëèéïìíöòóõüùúßçñ\' \-\´\`]+$'
335
+ pesq = re .compile (wordsREGEX , re .IGNORECASE )
336
+ lst_val = list (val )
337
+ lst_new = list ()
338
+ for valor in lst_val :
339
+ g = pesq .match (valor )
340
+ if g :
341
+ x = g .group ()
342
+ lst_new .append (x )
343
+ if len (lst_new ) > 0 : val = tuple (lst_new )
331
344
if not fields [k ].sequential :
332
345
continue
333
346
elif k == 'src' and src_vocab :
334
347
continue
335
348
elif k == 'tgt' and tgt_vocab :
336
349
continue
337
- counter [k ].update (val )
350
+ if len ( val ) > 0 : counter [k ].update (val )
338
351
339
352
# Drop the none-using from memory but keep the last
340
353
if (index < len (train_dataset_files ) - 1 ):
0 commit comments