Skip to content

Commit

Permalink
Merge pull request #9 from Vernacular-ai/word-level-features
Browse files Browse the repository at this point in the history
[WIP] Word level features
  • Loading branch information
lepisma authored Dec 31, 2019
2 parents 010da74 + 4b53095 commit 278a832
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 99 deletions.
10 changes: 9 additions & 1 deletion protos/kaldi_serve.proto
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ message RecognitionConfig {
bool punctuation = 5;
repeated SpeechContext speech_contexts = 6;
int32 audio_channel_count = 7;
bool enable_word_time_offsets = 8;
// RecognitionMetadata metadata = 9;
string model = 10;
bool raw = 11;
int32 data_bytes = 12;
bool word_level = 13;
}

// Either `content` or `uri` must be supplied.
Expand All @@ -64,6 +64,14 @@ message SpeechRecognitionAlternative {
float confidence = 2;
float am_score = 3;
float lm_score = 4;
repeated Word words = 5;
}

// A single recognized word within an alternative, with its timing and
// confidence (returned when `word_level` is set in RecognitionConfig).
message Word {
// Start offset of the word within the audio (presumably seconds — confirm with server implementation).
float start_time = 1;
// End offset of the word within the audio (same unit as start_time).
float end_time = 2;
// The recognized word text.
string word = 3;
// Decoder confidence for this word (NOTE(review): likely 0.0–1.0 — verify against server).
float confidence = 4;
}

message SpeechContext {
Expand Down
56 changes: 28 additions & 28 deletions python/kaldi_serve/kaldi_serve_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "kaldi_serve"
version = "0.1.0"
version = "0.2.0"
description = "Python bindings for kaldi streaming ASR"
authors = []

Expand Down
31 changes: 22 additions & 9 deletions python/scripts/example_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
Script for testing out ASR server.
Usage:
example_client.py mic [--n-secs=<n-secs>] [--model=<model>] [--lang=<lang>] [--raw] [--pcm]
example_client.py <file>... [--model=<model>] [--lang=<lang>] [--raw] [--pcm]
example_client.py mic [--n-secs=<n-secs>] [--model=<model>] [--lang=<lang>] [--raw] [--pcm] [--word-level]
example_client.py <file>... [--model=<model>] [--lang=<lang>] [--raw] [--pcm] [--word-level]
Options:
--n-secs=<n-secs> Number of seconds to record; ideally there should be a VAD here. [default: 3]
--model=<model> Name of the model to hit [default: general]
--lang=<lang> Language code of the model [default: hi]
--raw Flag that specifies whether to stream raw audio bytes to server.
--pcm Flag for sending raw pcm bytes
--word-level Whether to get word level features from server.
"""

import random
Expand All @@ -19,8 +20,8 @@
from pprint import pprint
from typing import List

from pydub import AudioSegment
from docopt import docopt
from pydub import AudioSegment

from kaldi_serve import KaldiServeClient, RecognitionAudio, RecognitionConfig
from kaldi_serve.utils import (chunks_from_file, chunks_from_mic,
Expand All @@ -39,14 +40,23 @@ def parse_response(response):
"transcript": alt.transcript,
"confidence": alt.confidence,
"am_score": alt.am_score,
"lm_score": alt.lm_score
"lm_score": alt.lm_score,
"words": [
{
"start_time": word.start_time,
"end_time": word.end_time,
"word": word.word,
"confidence": word.confidence
}
for word in alt.words
]
}
for alt in res.alternatives
])
return output


def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw: bool=False):
def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw: bool=False, word_level: bool=False):
"""
Transcribe the given audio chunks
"""
Expand All @@ -64,6 +74,7 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
max_alternatives=10,
model=model,
raw=True,
word_level=word_level,
data_bytes=chunk_len
)
audio_params = [(config(len(chunk)), RecognitionAudio(content=chunk)) for chunk in audio_chunks]
Expand All @@ -77,6 +88,7 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
language_code=language_code,
max_alternatives=10,
model=model,
word_level=word_level
)
response = client.streaming_recognize(config, audio, uuid="")
except Exception as e:
Expand All @@ -86,14 +98,14 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
pprint(parse_response(response))


def decode_files(client, audio_paths: List[str], model: str, language_code: str, raw: bool=False, pcm: bool=False):
def decode_files(client, audio_paths: List[str], model: str, language_code: str, raw: bool=False, pcm: bool=False, word_level: bool=False):
"""
Decode files using threaded requests
"""
chunked_audios = [chunks_from_file(x, chunk_size=random.randint(1, 3), raw=raw, pcm=pcm) for x in audio_paths]

threads = [
threading.Thread(target=transcribe_chunks, args=(client, chunks, model, language_code, raw))
threading.Thread(target=transcribe_chunks, args=(client, chunks, model, language_code, raw, word_level))
for chunks in chunked_audios
]

Expand All @@ -111,8 +123,9 @@ def decode_files(client, audio_paths: List[str], model: str, language_code: str,
language_code = args["--lang"]
raw = args['--raw']
pcm = args['--pcm']
word_level = args["--word-level"]

if args["mic"]:
transcribe_chunks(client, chunks_from_mic(int(args["--n-secs"]), SR, 1), model, language_code, raw)
transcribe_chunks(client, chunks_from_mic(int(args["--n-secs"]), SR, 1), model, language_code, raw, word_level)
else:
decode_files(client, args["<file>"], model, language_code, raw, pcm)
decode_files(client, args["<file>"], model, language_code, raw, pcm, word_level)
1 change: 1 addition & 0 deletions resources/model-spec.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ frame_subsampling_factor = 3 # 3
# │   ├── final.ie
# │   ├── final.mat
# │   └── global_cmvn.stats
# ├── word_boundary.int (optional; needed only for word level confidence and timing information)
# └── words.txt

# The files above have the default kaldi chain model interpretation (with
Expand Down
Loading

0 comments on commit 278a832

Please sign in to comment.