misc

vivekuppal · Jun 17, 2024 · d022c40 · d022c40
1 parent bdcf85c
commit d022c40
Show file tree

Hide file tree

Showing 5 changed files with 48 additions and 16 deletions.
diff --git a/app/transcribe/app_utils.py b/app/transcribe/app_utils.py
@@ -11,7 +11,7 @@
 sys.path.append('../..')
 import interactions  # noqa: E402 pylint: disable=C0413
 from sdk import transcriber_models as tm  # noqa: E402 pylint: disable=C0413
-from tsutils import utilities
+from tsutils import utilities, language
 
 
 def create_responder(provider_name: str, config, convo, save_to_file: bool,
@@ -134,7 +134,8 @@ def create_transcriber(
 
     if name.lower() == 'deepgram':
         stt_model_config: dict = {
-            'api_key': config['Deepgram']['api_key']
+            'api_key': config['Deepgram']['api_key'],
+            'audio_lang': get_language_code(config['OpenAI']['audio_lang'])
         }
         model = model_factory.get_stt_model_instance(
             stt_model=tm.STTEnum.DEEPGRAM_API,
@@ -149,6 +150,7 @@ def create_transcriber(
     elif name.lower() == 'whisper.cpp':
         stt_model_config: dict = {
             'local_transcripton_model_file': 'ggml-' + config['WhisperCpp']['local_transcripton_model_file'],
+            'audio_lang': get_language_code(config['OpenAI']['audio_lang'])
         }
         model = model_factory.get_stt_model_instance(
             stt_model=tm.STTEnum.WHISPER_CPP,
@@ -163,6 +165,7 @@ def create_transcriber(
         stt_model_config: dict = {
             'api_key': config['OpenAI']['api_key'],
             'local_transcripton_model_file': config['OpenAI']['local_transcripton_model_file'],
+            'audio_lang': get_language_code(config['OpenAI']['audio_lang'])
         }
         model = model_factory.get_stt_model_instance(
             stt_model=tm.STTEnum.WHISPER_LOCAL,
@@ -176,7 +179,8 @@ def create_transcriber(
     elif name.lower() == 'whisper' and api:
         stt_model_config: dict = {
             'api_key': config['OpenAI']['api_key'],
-            'timeout': config['OpenAI']['response_request_timeout_seconds']
+            'timeout': config['OpenAI']['response_request_timeout_seconds'],
+            'audio_lang': get_language_code(config['OpenAI']['audio_lang'])
         }
         model = model_factory.get_stt_model_instance(
             stt_model=tm.STTEnum.WHISPER_API,
@@ -192,6 +196,17 @@ def create_transcriber(
     global_vars.set_transcriber(t)
 
 
+def get_language_code(lang: str) -> str:
+    """Get the language code from the configuration.
+    """
+    lang_lower = lang.lower()
+    try:
+        return next(key for key, value in language.LANGUAGES_DICT.items() if value == lang_lower)
+    except StopIteration:
+        # Return default lang if nothing else is found
+        return 'en'
+
+
 def shutdown(global_vars: TranscriptionGlobals):
     """Activities to be performed right before application shutdown.
     """

diff --git a/app/transcribe/audio_player.py b/app/transcribe/audio_player.py
@@ -72,7 +72,11 @@ def play_audio_loop(self, config: dict):
     def _get_language_code(self, lang: str) -> str:
         """Get the language code from the configuration.
         """
-        return next(key for key, value in LANGUAGES_DICT.items() if value == lang)
+        try:
+            return next(key for key, value in LANGUAGES_DICT.items() if value == lang)
+        except StopIteration:
+            # Return default lang if nothing else is found
+            return 'en'
 
     def _get_speech_text(self) -> str:
         """Get the speech text from the conversation.

diff --git a/app/transcribe/audio_transcriber.py b/app/transcribe/audio_transcriber.py
@@ -19,7 +19,7 @@
 sys.path.append('../..')
 import custom_speech_recognition as sr  # noqa: E402 pylint: disable=C0413
 from tsutils import app_logging as al  # noqa: E402 pylint: disable=C0413
-from tsutils import duration  # noqa: E402 pylint: disable=C0413
+from tsutils import duration, utilities  # noqa: E402 pylint: disable=C0413
 from sdk.transcriber_models import WhisperCPPSTTModel
 
 
@@ -228,9 +228,10 @@ def convert_wav_to_16khz_format(self, file_path: str) -> str:
             file_descritor, mod_file_path = tempfile.mkstemp(suffix=".wav")
             os.close(file_descritor)
             # print(f'Convert file {file_path} to 16khz file {mod_file_path}')
+            log_file = f"{utilities.get_data_path(app_name='Transcribe')}/logs/ffmpeg.txt"
             subprocess.call(["ffmpeg", '-i', file_path, '-ar', '16000', '-ac',  # nosec
                              '1', '-c:a', 'pcm_s16le', '-y', mod_file_path],
-                            stdout=open(file='logs/ffmpeg.txt', mode='a', encoding='utf-8'),
+                            stdout=open(file=log_file, mode='a', encoding='utf-8'),
                             stderr=subprocess.STDOUT)
             return mod_file_path
         except Exception as ex:

diff --git a/sdk/transcriber_models.py b/sdk/transcriber_models.py
@@ -80,7 +80,7 @@ class WhisperSTTModel(STTModelInterface):
     """
     def __init__(self, stt_model_config: dict):
         self.model = stt_model_config['local_transcripton_model_file']
-        self.lang = 'en'
+        self.lang = stt_model_config['audio_lang']
         model_filename = MODELS_DIR + self.model + ".pt"
         self.model_name = self.model + ".pt"
         self.model_filename = os.path.join(MODELS_DIR, model_filename)
@@ -206,7 +206,7 @@ def __init__(self, stt_model_config: dict):
         # A better solution is to create a base class for APIWhisperSTTModel,
         # WhisperSTTModel and create set_lang method there and remove it from
         # this class
-        self.lang = 'en'
+        self.lang = stt_model_config['audio_lang']
 
     def set_lang(self, lang: str):
         """Set STT Language"""
@@ -249,7 +249,8 @@ def get_sentences(self, wav_file_path) -> dict:
         """
         try:
             with open(wav_file_path, "rb") as audio_file:
-                result = self.stt_client.audio.transcriptions.create(model='whisper-1', file=audio_file)
+                result = self.stt_client.audio.transcriptions.create(model='whisper-1', file=audio_file,
+                                                                     language=self.lang)
         except Exception as exception:
             print('Exception in transcribing audio using whisper API.')
             print(exception)
@@ -264,7 +265,7 @@ class WhisperCPPSTTModel(STTModelInterface):
     This model works best when used with GPU
     """
     def __init__(self, stt_model_config: dict):
-        self.lang = 'en-US'
+        self.lang = stt_model_config['audio_lang']
         model = stt_model_config['local_transcripton_model_file']
         self.model_filename = MODELS_DIR + model + ".bin"
         self.model = model
@@ -287,15 +288,18 @@ def get_transcription(self, wav_file_path: str):
         """
         mod_file_path = wav_file_path
         try:
+            log_file = f"{utilities.get_data_path(app_name='Transcribe')}/logs/whisper.cpp.txt"
             # main.exe <filename> -oj
             if os.path.isfile("../../bin/main.exe"):
-                subprocess.call(["../../bin/main.exe", mod_file_path, '-oj', '-m', self.model_filename],
-                                stdout=open(file='logs/whisper.cpp.txt', mode='a', encoding='utf-8'),
+                subprocess.call(["../../bin/main.exe", mod_file_path, '-oj', '-m',
+                                 self.model_filename, '-l', self.lang],
+                                stdout=open(file=log_file, mode='a', encoding='utf-8'),
                                 stderr=subprocess.STDOUT)
             else:
                 # This path is used in case of binary.
-                subprocess.call(["./bin/main.exe", mod_file_path, '-oj', '-m', self.model_filename],
-                                stdout=open(file='logs/whisper.cpp.txt', mode='a', encoding='utf-8'),
+                subprocess.call(["./bin/main.exe", mod_file_path, '-oj', '-m', self.model_filename,
+                                 '-l', self.lang],
+                                stdout=open(file=log_file, mode='a', encoding='utf-8'),
                                 stderr=subprocess.STDOUT)
         except Exception as ex:
             print(f'ERROR: converting wav file {wav_file_path} to text using whisper.cpp.')
@@ -331,7 +335,15 @@ def process_response(self, response) -> str:
     def get_sentences(self, wav_file_path: str):
         """Not Implemented
         """
-        raise Exception('Method not implemnted')  # pylint: disable=W0719
+        transcript = ''
+        response = self.get_transcription(wav_file_path=wav_file_path)
+        for segment in response["transcription"]:
+            if segment["text"].strip() == '[BLANK_AUDIO]':
+                continue
+            transcript += segment["text"]
+        return transcript
+
+        # raise Exception('Method not implemnted')  # pylint: disable=W0719
 
 
 class DeepgramSTTModel(STTModelInterface):

diff --git a/tsutils/language.py b/tsutils/language.py
@@ -88,7 +88,7 @@
     'tt': 'tatar',
     'te': 'telugu',
     'th': 'thai',
-    'bo': 'tibetan', 
+    'bo': 'tibetan',
     'tr': 'turkish',
     'tk': 'turkmen',
     'uk': 'ukrainian',