Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add language support to Deepgram #190

Merged
merged 1 commit into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion app/transcribe/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def create_args() -> argparse.Namespace:
\nThis option requires an API KEY and will consume Open AI credits.')
cmd_args.add_argument('-e', '--experimental', action='store_true',
help='Experimental command line argument. Behavior is undefined.')
cmd_args.add_argument('-stt', '--speech_to_text', action='store', default='whisper',
cmd_args.add_argument('-stt', '--speech_to_text', action='store', default=None,
choices=['whisper', 'whisper.cpp', 'deepgram'],
help='Specify the Speech to text Engine.'
'\nLocal STT models tend to perform best for response times.'
Expand Down Expand Up @@ -164,6 +164,9 @@ def update_args_config(args: argparse.Namespace, config: dict):
if args.speaker_device_index is not None:
config['General']['speaker_device_index'] = int(args.speaker_device_index)

if args.speech_to_text is not None:
config['General']['stt'] = args.speech_to_text


def update_audio_devices(global_vars: TranscriptionGlobals, config: dict):
"""Handle all application configuration using the command line args"""
Expand Down
2 changes: 1 addition & 1 deletion app/transcribe/audio_transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):
except Exception as exception:
print(exception)
finally:
# print(f'transcribe_audio_queue: filesize: {os.path.getsize(path)}')
# print(f'transcribe_audio_queue: file: {path} filesize: {os.path.getsize(path)}')
os.unlink(path)

if text != '' and text.lower() != 'you':
Expand Down
3 changes: 2 additions & 1 deletion app/transcribe/gpt_responder.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,8 @@ def __init__(self,
base_url = self.config['OpenAI']['base_url']
self.llm_client = openai.OpenAI(api_key=api_key, base_url=base_url)
self.model = self.config['OpenAI']['ai_model']
print(f'[INFO] Using OpenAI for inference. Model: {self.model}')
stt = self.config['General']['stt']
print(f'[INFO] Using {stt} for inference. Model: {self.model}')
super().__init__(config=self.config,
convo=convo,
save_to_file=save_to_file,
Expand Down
3 changes: 1 addition & 2 deletions app/transcribe/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def main():
args = create_args()

config = configuration.Config().data

au.start_ffmpeg()

# Initiate global variables
Expand All @@ -29,7 +28,7 @@ def main():

update_args_config(args, config)
global_vars.initiate_audio_devices(config)
au.create_transcriber(name=args.speech_to_text,
au.create_transcriber(name=config['General']['stt'],
config=config,
api=bool(config['General']['use_api']),
global_vars=global_vars)
Expand Down
3 changes: 0 additions & 3 deletions app/transcribe/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,4 @@ deepgram-sdk==3.1.0
#--extra-index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://download.pytorch.org/whl/cu121
torch
transformers>=4.36.0 # not directly required, pinned by Snyk to avoid a vulnerability
bandit==1.7.8
setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability
wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability
3 changes: 2 additions & 1 deletion examples/deepgram/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def main():
smart_format=True,
utterances=True,
punctuate=True,
diarize=True)
diarize=True,
detect_language=True)

response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
print(response.to_json(indent=4))
Expand Down
6 changes: 5 additions & 1 deletion sdk/transcriber_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,9 @@ def __init__(self, stt_model_config: dict):
# Check for api_key
if stt_model_config["api_key"] is None:
raise Exception("Attempt to create Deepgram STT Model without an api key.") # pylint: disable=W0719

# This parameter exists primarily to adhere to the interface.
# Deepgram does auto language detection.
self.lang = 'en-US'

print('[INFO] Using Deepgram API for transcription.')
Expand All @@ -308,7 +311,8 @@ def get_transcription(self, wav_file_path: str):
smart_format=True,
utterances=True,
punctuate=True,
paragraphs=True)
paragraphs=True,
detect_language=True)

response = self.audio_model.listen.prerecorded.v("1").transcribe_file(payload, options)
# This is not necessary and just a debugging aid
Expand Down