Skip to content

Commit

Permalink
Merge pull request #9 from Vernacular-ai/word-level-features
Browse files Browse the repository at this point in the history
[WIP] Word level features
  • Loading branch information
lepisma authored Dec 31, 2019
2 parents 010da74 + 4b53095 commit 278a832
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 99 deletions.
10 changes: 9 additions & 1 deletion protos/kaldi_serve.proto
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ message RecognitionConfig {
bool punctuation = 5;
repeated SpeechContext speech_contexts = 6;
int32 audio_channel_count = 7;
bool enable_word_time_offsets = 8;
// RecognitionMetadata metadata = 9;
string model = 10;
bool raw = 11;
int32 data_bytes = 12;
bool word_level = 13;
}

// Either `content` or `uri` must be supplied.
Expand All @@ -64,6 +64,14 @@ message SpeechRecognitionAlternative {
float confidence = 2;
float am_score = 3;
float lm_score = 4;
repeated Word words = 5;
}

// A single recognized word within an alternative, with its timing and
// confidence (returned when `word_level` is set in RecognitionConfig).
message Word {
// Start offset of the word within the audio (presumably seconds — confirm with server implementation).
float start_time = 1;
// End offset of the word within the audio (same unit as start_time).
float end_time = 2;
// The recognized word text.
string word = 3;
// Decoder confidence for this word (NOTE(review): likely 0.0–1.0 — verify against server).
float confidence = 4;
}

message SpeechContext {
Expand Down
56 changes: 28 additions & 28 deletions python/kaldi_serve/kaldi_serve_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "kaldi_serve"
version = "0.1.0"
version = "0.2.0"
description = "Python bindings for kaldi streaming ASR"
authors = []

Expand Down
31 changes: 22 additions & 9 deletions python/scripts/example_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
Script for testing out ASR server.
Usage:
example_client.py mic [--n-secs=<n-secs>] [--model=<model>] [--lang=<lang>] [--raw] [--pcm]
example_client.py <file>... [--model=<model>] [--lang=<lang>] [--raw] [--pcm]
example_client.py mic [--n-secs=<n-secs>] [--model=<model>] [--lang=<lang>] [--raw] [--pcm] [--word-level]
example_client.py <file>... [--model=<model>] [--lang=<lang>] [--raw] [--pcm] [--word-level]
Options:
--n-secs=<n-secs> Number of seconds to record; ideally there should be a VAD here. [default: 3]
--model=<model> Name of the model to hit [default: general]
--lang=<lang> Language code of the model [default: hi]
--raw Flag that specifies whether to stream raw audio bytes to server.
--pcm Flag for sending raw pcm bytes
--word-level Whether to get word level features from server.
"""

import random
Expand All @@ -19,8 +20,8 @@
from pprint import pprint
from typing import List

from pydub import AudioSegment
from docopt import docopt
from pydub import AudioSegment

from kaldi_serve import KaldiServeClient, RecognitionAudio, RecognitionConfig
from kaldi_serve.utils import (chunks_from_file, chunks_from_mic,
Expand All @@ -39,14 +40,23 @@ def parse_response(response):
"transcript": alt.transcript,
"confidence": alt.confidence,
"am_score": alt.am_score,
"lm_score": alt.lm_score
"lm_score": alt.lm_score,
"words": [
{
"start_time": word.start_time,
"end_time": word.end_time,
"word": word.word,
"confidence": word.confidence
}
for word in alt.words
]
}
for alt in res.alternatives
])
return output


def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw: bool=False):
def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw: bool=False, word_level: bool=False):
"""
Transcribe the given audio chunks
"""
Expand All @@ -64,6 +74,7 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
max_alternatives=10,
model=model,
raw=True,
word_level=word_level,
data_bytes=chunk_len
)
audio_params = [(config(len(chunk)), RecognitionAudio(content=chunk)) for chunk in audio_chunks]
Expand All @@ -77,6 +88,7 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
language_code=language_code,
max_alternatives=10,
model=model,
word_level=word_level
)
response = client.streaming_recognize(config, audio, uuid="")
except Exception as e:
Expand All @@ -86,14 +98,14 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
pprint(parse_response(response))


def decode_files(client, audio_paths: List[str], model: str, language_code: str, raw: bool=False, pcm: bool=False):
def decode_files(client, audio_paths: List[str], model: str, language_code: str, raw: bool=False, pcm: bool=False, word_level: bool=False):
"""
Decode files using threaded requests
"""
chunked_audios = [chunks_from_file(x, chunk_size=random.randint(1, 3), raw=raw, pcm=pcm) for x in audio_paths]

threads = [
threading.Thread(target=transcribe_chunks, args=(client, chunks, model, language_code, raw))
threading.Thread(target=transcribe_chunks, args=(client, chunks, model, language_code, raw, word_level))
for chunks in chunked_audios
]

Expand All @@ -111,8 +123,9 @@ def decode_files(client, audio_paths: List[str], model: str, language_code: str,
language_code = args["--lang"]
raw = args['--raw']
pcm = args['--pcm']
word_level = args["--word-level"]

if args["mic"]:
transcribe_chunks(client, chunks_from_mic(int(args["--n-secs"]), SR, 1), model, language_code, raw)
transcribe_chunks(client, chunks_from_mic(int(args["--n-secs"]), SR, 1), model, language_code, raw, word_level)
else:
decode_files(client, args["<file>"], model, language_code, raw, pcm)
decode_files(client, args["<file>"], model, language_code, raw, pcm, word_level)
1 change: 1 addition & 0 deletions resources/model-spec.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ frame_subsampling_factor = 3 # 3
# │   ├── final.ie
# │   ├── final.mat
# │   └── global_cmvn.stats
# ├── word_boundary.int (optional; needed only for word level confidence and timing information)
# └── words.txt

# The files above have the default kaldi chain model interpretation (with
Expand Down
Loading

0 comments on commit 278a832

Please sign in to comment.