vivekuppal · vivekuppal · Mar 29, 2024 · Mar 29, 2024
diff --git a/app/transcribe/args.py b/app/transcribe/args.py
@@ -20,7 +20,7 @@ def create_args() -> argparse.Namespace:
                           \nThis option requires an API KEY and will consume Open AI credits.')
     cmd_args.add_argument('-e', '--experimental', action='store_true',
                           help='Experimental command line argument. Behavior is undefined.')
-    cmd_args.add_argument('-stt', '--speech_to_text', action='store', default='whisper',
+    cmd_args.add_argument('-stt', '--speech_to_text', action='store', default=None,
                           choices=['whisper', 'whisper.cpp', 'deepgram'],
                           help='Specify the Speech to text Engine.'
                           '\nLocal STT models tend to perform best for response times.'
@@ -164,6 +164,9 @@ def update_args_config(args: argparse.Namespace, config: dict):
     if args.speaker_device_index is not None:
         config['General']['speaker_device_index'] = int(args.speaker_device_index)
 
+    if args.speech_to_text is not None:
+        config['General']['stt'] = args.speech_to_text
+
 
 def update_audio_devices(global_vars: TranscriptionGlobals, config: dict):
     """Handle all application configuration using the command line args"""

diff --git a/app/transcribe/audio_transcriber.py b/app/transcribe/audio_transcriber.py
@@ -134,7 +134,7 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):
             except Exception as exception:
                 print(exception)
             finally:
-                # print(f'transcribe_audio_queue: filesize: {os.path.getsize(path)}')
+                # print(f'transcribe_audio_queue: file: {path} filesize: {os.path.getsize(path)}')
                 os.unlink(path)
 
             if text != '' and text.lower() != 'you':

diff --git a/app/transcribe/gpt_responder.py b/app/transcribe/gpt_responder.py
@@ -313,7 +313,8 @@ def __init__(self,
         base_url = self.config['OpenAI']['base_url']
         self.llm_client = openai.OpenAI(api_key=api_key, base_url=base_url)
         self.model = self.config['OpenAI']['ai_model']
-        print(f'[INFO] Using OpenAI for inference. Model: {self.model}')
+        stt = self.config['General']['stt']
+        print(f'[INFO] Using {stt} for inference. Model: {self.model}')
         super().__init__(config=self.config,
                          convo=convo,
                          save_to_file=save_to_file,

diff --git a/app/transcribe/main.py b/app/transcribe/main.py
@@ -19,7 +19,6 @@ def main():
     args = create_args()
 
     config = configuration.Config().data
-
     au.start_ffmpeg()
 
     # Initiate global variables
@@ -29,7 +28,7 @@ def main():
 
     update_args_config(args, config)
     global_vars.initiate_audio_devices(config)
-    au.create_transcriber(name=args.speech_to_text,
+    au.create_transcriber(name=config['General']['stt'],
                           config=config,
                           api=bool(config['General']['use_api']),
                           global_vars=global_vars)

diff --git a/app/transcribe/requirements.txt b/app/transcribe/requirements.txt
@@ -17,7 +17,4 @@ deepgram-sdk==3.1.0
 #--extra-index-url https://download.pytorch.org/whl/cu118
 --extra-index-url https://download.pytorch.org/whl/cu121
 torch
-transformers>=4.36.0 # not directly required, pinned by Snyk to avoid a vulnerability
 bandit==1.7.8
-setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability
-wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability
diff --git a/examples/deepgram/stt.py b/examples/deepgram/stt.py
@@ -28,7 +28,8 @@ def main():
         smart_format=True,
         utterances=True,
         punctuate=True,
-        diarize=True)
+        diarize=True,
+        detect_language=True)
 
     response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
     print(response.to_json(indent=4))

diff --git a/sdk/transcriber_models.py b/sdk/transcriber_models.py
@@ -283,6 +283,9 @@ def __init__(self, stt_model_config: dict):
         # Check for api_key
         if stt_model_config["api_key"] is None:
             raise Exception("Attempt to create Deepgram STT Model without an api key.")  # pylint: disable=W0719
+
+        # This attribute exists primarily to adhere to the interface.
+        # Deepgram detects the spoken language automatically.
         self.lang = 'en-US'
 
         print('[INFO] Using Deepgram API for transcription.')
@@ -308,7 +311,8 @@ def get_transcription(self, wav_file_path: str):
                 smart_format=True,
                 utterances=True,
                 punctuate=True,
-                paragraphs=True)
+                paragraphs=True,
+                detect_language=True)
 
             response = self.audio_model.listen.prerecorded.v("1").transcribe_file(payload, options)
             # This is not necessary and just a debugging aid