diff --git a/README.md b/README.md
index ddd522b52..ef167b45e 100644
--- a/README.md
+++ b/README.md
@@ -65,3 +65,71 @@ If you use OpenSeq2Seq, please cite [this paper](https://arxiv.org/abs/1805.1038
     primaryClass={cs.CL}
 }
 ```
+
+## Install Decoders
+
+### Install boost, automake, and bison
+```
+sudo apt-get install libboost-all-dev -y
+sudo apt-get install automake -y
+sudo apt-get install bison -y
+```
+
+### Install SWIG
+```
+git clone https://github.com/swig/swig.git
+cd swig
+./autogen.sh
+./configure
+make
+sudo make install
+```
+#### Test the installation
+```
+$ swig
+```
+If you encounter:
+```
+$ swig: error while loading shared libraries: libpcre.so.1: cannot open shared object file: No such file or directory
+```
+### Install PCRE
+```
+cd /usr/local/src
+sudo curl --remote-name ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-8.42.tar.gz
+
+tar -xzvf pcre-8.42.tar.gz
+cd pcre-8.42
+sudo ./configure --prefix=/usr/local/mac-dev-env/pcre-8.42
+sudo make
+sudo make install
+sudo ln -s mac-dev-env/pcre-8.42 /usr/local/pcre
+echo 'export PATH=/usr/local/pcre/bin:$PATH' >> ~/.bash_profile
+source ~/.bash_profile
+cd .libs
+sudo mv -v libpcre.so.* /usr/lib/
+```
+If the above doesn't work, use the newer PCRE release instead:
+
+```
+sudo curl --remote-name https://ftp.pcre.org/pub/pcre/pcre-8.43.tar.bz2
+tar xjf pcre-8.43.tar.bz2
+cd pcre-8.43/
+sudo ./configure --prefix=/usr/local/mac-dev-env/pcre-8.43
+sudo make
+sudo make install
+sudo ln -s mac-dev-env/pcre-8.43 /usr/local/pcre
+echo 'export PATH=/usr/local/pcre/bin:$PATH' >> ~/.bash_profile
+source ~/.bash_profile
+cd .libs
+sudo mv -v libpcre.so.* /usr/lib/
+```
+
+If the symlink already exists, either delete it or create one with a different name.
+### Final Output
+
+```
+$ swig
+Must specify an input file. Use -help for available options.
+```
+
+### Thank You
diff --git a/docs/sources/source/getting-started/asr.rst b/docs/sources/source/getting-started/asr.rst
index 848085bb6..f0ecf4624 100644
--- a/docs/sources/source/getting-started/asr.rst
+++ b/docs/sources/source/getting-started/asr.rst
@@ -26,6 +26,7 @@ dataset size will be around 224GB (including archives and original compressed au
 Now, everything should be setup to train the model::
 
     python run.py --config_file=example_configs/speech2text/ds2_librispeech_larc_config.py --mode=train_eval
+    python run.py --config_file=example_configs/speech2text/ds2_librispeech_larc_config.py --mode=train_eval --infer_dataset=example_configs/datasets/infer.csv
 
 If you want to run evaluation/inference with the trained model, replace ``--mode=train_eval``
 with ``--mode=eval`` or ``--mode=infer``.
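To confirm the decoder prerequisites from the README changes above before building, a quick sanity check can be scripted. This is only a sketch: the `check_swig_install` helper is illustrative and not part of the repository, and it assumes a Linux system where PCRE installs `libpcre.so.1` (the library named in the error message above).

```python
import ctypes
import shutil
import subprocess

def check_swig_install():
  # Hypothetical helper: verify swig is on PATH and that libpcre resolves.
  if shutil.which("swig") is None:
    raise RuntimeError("swig not found on PATH; re-run `sudo make install`")
  try:
    ctypes.CDLL("libpcre.so.1")
  except OSError:
    raise RuntimeError("libpcre.so.1 not found; install PCRE as shown above")
  # Running `swig` with no arguments should print the message shown under
  # "Final Output" and exit with a non-zero status.
  proc = subprocess.run(["swig"], capture_output=True, text=True)
  print((proc.stderr or proc.stdout).strip())

if __name__ == "__main__":
  check_swig_install()
```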
diff --git a/open_seq2seq/utils/utils.py b/open_seq2seq/utils/utils.py
index 69dd0f45e..112a9f525 100644
--- a/open_seq2seq/utils/utils.py
+++ b/open_seq2seq/utils/utils.py
@@ -506,7 +506,13 @@ def get_base_config(args):
                       help='whether to log output, git info, cmd args, etc.')
   parser.add_argument('--use_xla_jit', dest='use_xla_jit', action='store_true',
                       help='whether to use XLA_JIT to compile and run the model.')
+  parser.add_argument('--infer_dataset', dest='infer_dataset',
+                      help='comma-separated CSV file(s) overriding the inference dataset.')
+  parser.add_argument('--train_dataset', dest='train_dataset',
+                      help='comma-separated CSV file(s) overriding the training dataset.')
   args, unknown = parser.parse_known_args(args)
+  infer_params = args.infer_dataset
+  train_params = args.train_dataset
 
   if args.mode not in [
       'train',
@@ -519,7 +525,10 @@ def get_base_config(args):
         "['train', 'eval', 'train_eval', 'infer', "
         "'interactive_infer']")
   config_module = runpy.run_path(args.config_file, init_globals={'tf': tf})
-
+  if infer_params:
+    config_module['infer_params']['data_layer_params']['dataset_files'] = infer_params.split(',')
+  if train_params:
+    config_module['train_params']['data_layer_params']['dataset_files'] = train_params.split(',')
   base_config = config_module.get('base_params', None)
   if base_config is None:
     raise ValueError('base_config dictionary has to be '
@@ -541,7 +550,6 @@ def get_base_config(args):
     parser_unk.add_argument('--' + pm, default=value, type=ast.literal_eval)
   config_update = parser_unk.parse_args(unknown)
   nested_update(base_config, nest_dict(vars(config_update)))
-
   return args, base_config, base_model, config_module
 
 def get_calibration_config(arguments):
diff --git a/scripts/decode.py b/scripts/decode.py
index 2d5c3babc..4cf56ab21 100644
--- a/scripts/decode.py
+++ b/scripts/decode.py
@@ -4,7 +4,7 @@
 '''
 
 import argparse
-
+import time
 import pickle
 
 import numpy as np
@@ -89,7 +89,6 @@
 
 num_cpus = multiprocessing.cpu_count()
 
-
 def levenshtein(a, b):
   """Calculates the Levenshtein distance between a and b.
   The code was taken from: http://hetland.org/coding/python/levenshtein.py
@@ -170,6 +169,8 @@ def softmax(x):
 
 
 def evaluate_wer(logits, labels, vocab, decoder):
+  eval_start = time.time()
+  print("evaluation started at", eval_start)
   total_dist = 0.0
   total_count = 0.0
   wer_per_sample = np.empty(shape=len(labels))
@@ -187,31 +188,52 @@ def evaluate_wer(logits, labels, vocab, decoder):
     wer_per_sample[idx] = dist / len(label.split())
   print('# empty preds: {}'.format(empty_preds))
   wer = total_dist / total_count
+  eval_end = time.time()
+  print("evaluation took %s seconds" % (eval_end - eval_start))
   return wer, wer_per_sample
 
+def divide_chunks(l, n):
+  # yield successive n-sized chunks of l
+  for i in range(0, len(l), n):
+    yield l[i:i + n]
+data_load_start = time.time()
 data = load_dump(args.logits)
 labels = load_labels(args.labels)
 logits = get_logits(data, labels)
 vocab = load_vocab(args.vocab)
 vocab[-1] = '_'
-
+data_load_end = time.time()
+print("Data loading took %s seconds" % (data_load_end - data_load_start))
 probs_batch = []
 for line in labels:
   audio_filename = line[0]
   probs_batch.append(softmax(logits[audio_filename]))
+batch_prob_end = time.time()
+print("Building probability batches took %s seconds" % (batch_prob_end - data_load_end))
 
 if args.mode == 'eval':
+  eval_start = time.time()
  wer, _ = evaluate_wer(logits, labels, vocab, greedy_decoder)
   print('Greedy WER = {:.4f}'.format(wer))
 
   best_result = {'wer': 1e6, 'alpha': 0.0, 'beta': 0.0, 'beams': None}
   for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
     for beta in np.arange(args.beta, args.beta_max, args.beta_step):
       scorer = Scorer(alpha, beta, model_path=args.lm, vocabulary=vocab[:-1])
-      res = ctc_beam_search_decoder_batch(probs_batch, vocab[:-1],
-                                          beam_size=args.beam_width,
-                                          num_processes=num_cpus,
-                                          ext_scoring_func=scorer)
+      print("scorer initialized")
+      probs_batch_list = list(divide_chunks(probs_batch, 500))
+      res = []
+      for probs_batch in probs_batch_list:
+        f = time.time()
+        result = ctc_beam_search_decoder_batch(probs_batch, vocab[:-1],
+                                               beam_size=args.beam_width,
+                                               num_processes=num_cpus,
+                                               ext_scoring_func=scorer)
+        e = time.time()
+        for j in result:
+          res.append(j)
+        print("batch of %d files took %s seconds" % (len(probs_batch), e - f))
+
       total_dist = 0.0
       total_count = 0.0
       for idx, line in enumerate(labels):
@@ -230,7 +252,8 @@ def evaluate_wer(logits, labels, vocab, decoder):
       print('alpha={:.2f}, beta={:.2f}: WER={:.4f}'.format(alpha, beta, wer))
   print('BEST: alpha={:.2f}, beta={:.2f}, WER={:.4f}'.format(
       best_result['alpha'], best_result['beta'], best_result['wer']))
-
+  eval_end = time.time()
+  print("evaluation took %s seconds" % (eval_end - eval_start))
   if args.dump_all_beams_to:
     with open(args.dump_all_beams_to, 'w') as f:
       for beam in best_result['beams']:
@@ -239,19 +262,48 @@ def evaluate_wer(logits, labels, vocab, decoder):
           f.write('{} 0.0 0.0 {}\n'.format(pred[0], pred[1]))
         f.write('E=>>>>>>>>\n')
 
+elif args.mode == 'greedy':
+  print("Greedy Mode")
+  greedy_preds = np.empty(shape=(len(labels), 2), dtype=object)
+  for idx, line in enumerate(labels):
+    filename = line[0]
+    greedy_preds[idx, 0] = filename
+    greedy_preds[idx, 1] = greedy_decoder(logits[filename], vocab)
+
+  np.savetxt(args.infer_output_file, greedy_preds, fmt='%s', delimiter=',',
+             header='wav_filename,greedy')
+
+
 elif args.mode == 'infer':
-  scorer = Scorer(args.alpha, args.beta, model_path=args.lm, vocabulary=vocab[:-1])
-  res = ctc_beam_search_decoder_batch(probs_batch, vocab[:-1],
-                                      beam_size=args.beam_width,
-                                      num_processes=num_cpus,
-                                      ext_scoring_func=scorer)
-  infer_preds = np.empty(shape=(len(labels), 2),
-                         dtype=object)
-  for idx, line in enumerate(labels):
-    filename = line[0]
-    score, text = [v for v in zip(*res[idx])]
-    infer_preds[idx, 0] = filename
-    infer_preds[idx, 1] = text[0]
+  print("Inference Mode")
+  infer_start = time.time()
+  scorer = Scorer(args.alpha, args.beta, model_path=args.lm, vocabulary=vocab[:-1])
+
+  probs_batch_list = list(divide_chunks(probs_batch, 500))
+  res = []
+  for probs_batch in probs_batch_list:
+    f = time.time()
+    result = ctc_beam_search_decoder_batch(probs_batch, vocab[:-1],
+                                           beam_size=args.beam_width,
+                                           num_processes=num_cpus,
+                                           ext_scoring_func=scorer)
+    e = time.time()
+
+    for j in result:
+      res.append(j)
+
+    print("batch of %d files took %s seconds" % (len(probs_batch), e - f))
+
+  infer_preds = np.empty(shape=(len(labels), 3), dtype=object)
+  for idx, line in enumerate(labels):
+    filename = line[0]
+    score, text = [v for v in zip(*res[idx])]
+    infer_preds[idx, 0] = filename
+    infer_preds[idx, 1] = text[0]
+    # also record the greedy transcript for comparison
+    infer_preds[idx, 2] = greedy_decoder(logits[filename], vocab)
-  np.savetxt(args.infer_output_file, infer_preds, fmt='%s', delimiter=',',
-             header='wav_filename,transcript')
+  infer_end = time.time()
+  print("Inference took %s seconds" % (infer_end - infer_start))
+  np.savetxt(args.infer_output_file, infer_preds, fmt='%s', delimiter=',',
+             header='wav_filename,lm,greedy')
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..c085e178e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,23 @@
+import setuptools
+import numpy as np
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="open_seq2seq",
+    version="0.0.1",
+    author="voicezen",
+    author_email="all@voicezen.ai",
+    description="Python repo for components and analysis",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/voicezen/jivaka",
+    packages=setuptools.find_packages(),
+    include_dirs=[np.get_include()],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+)
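Two notes on the changes above, with small self-contained sketches; both are illustrative Python written for this explanation, not code from the repository.

First, the `--infer_dataset` / `--train_dataset` flags added to `get_base_config` simply rewrite `dataset_files` inside the config module loaded by `runpy.run_path`. A toy dictionary stands in for a real example config here, and the CSV names are made up:

```python
# Toy stand-in for a loaded OpenSeq2Seq config module: only the keys touched
# by the override are shown, and the file names are illustrative.
config_module = {
  'train_params': {'data_layer_params': {'dataset_files': ['train.csv']}},
  'infer_params': {'data_layer_params': {'dataset_files': ['test.csv']}},
}

infer_params = "part1.csv,part2.csv"  # e.g. --infer_dataset=part1.csv,part2.csv
train_params = None                   # --train_dataset was not passed

# Same override logic as in utils.py above.
if infer_params:
  config_module['infer_params']['data_layer_params']['dataset_files'] = infer_params.split(',')
if train_params:
  config_module['train_params']['data_layer_params']['dataset_files'] = train_params.split(',')

print(config_module['infer_params']['data_layer_params']['dataset_files'])
# -> ['part1.csv', 'part2.csv']
```

Note that a config file without an `infer_params` (or `train_params`) section would raise a `KeyError` on this override, so the flags only make sense with configs that define those sections.

Second, `decode.py` now feeds `ctc_beam_search_decoder_batch` in chunks of 500 utterances instead of one large batch, which bounds the work handed to the decoder at a time. Because each chunk's results are appended in order, the flat `res` list stays aligned with `labels`. A toy sketch of that invariant, with the decoder call faked so the snippet runs stand-alone:

```python
import time

def divide_chunks(l, n):
  # yield successive n-sized chunks of l (same helper as in the patch)
  for i in range(0, len(l), n):
    yield l[i:i + n]

def fake_decoder_batch(batch):
  # stand-in for ctc_beam_search_decoder_batch: one result per input, in order
  return ["decoded:%s" % item for item in batch]

probs_batch = list(range(1200))  # pretend there is one entry per utterance
res = []
for chunk in divide_chunks(probs_batch, 500):
  start = time.time()
  res.extend(fake_decoder_batch(chunk))  # chunk sizes: 500, 500, 200
  print("chunk of %d took %.4f seconds" % (len(chunk), time.time() - start))

# res[idx] still corresponds to labels[idx] in the real script
assert len(res) == len(probs_batch)
assert res[0] == "decoded:0" and res[-1] == "decoded:1199"
```

Finally, since the inference output is written with `np.savetxt(..., header='wav_filename,lm,greedy')`, numpy prepends its default `# ` comment prefix to that header line, so downstream readers should skip or strip it. The new `setup.py` can be installed in the usual setuptools way, for example `pip install -e .` from the repository root.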