From 9e78ec70c99e51332bd64cf6c7dbe5e89d9edaf8 Mon Sep 17 00:00:00 2001 From: Matthew Kennedy Date: Wed, 3 Apr 2024 16:25:40 -0400 Subject: [PATCH] check for any segment with speech prob (#121) --- openduck-py/openduck_py/routers/ml.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/openduck-py/openduck_py/routers/ml.py b/openduck-py/openduck_py/routers/ml.py index 54eb70b..5639b19 100644 --- a/openduck-py/openduck_py/routers/ml.py +++ b/openduck-py/openduck_py/routers/ml.py @@ -1,10 +1,11 @@ +from pprint import pprint +import io + +import numpy as np from fastapi import APIRouter, UploadFile, File, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel -import io - from whisper import load_model -import numpy as np from nemo_text_processing.text_normalization.normalize import Normalizer from openduck_py.voices.styletts2 import styletts2_inference @@ -44,13 +45,17 @@ async def transcribe_audio( audio_bytes = await audio.read() audio_data = np.frombuffer(audio_bytes, dtype=np.float32) response = whisper_model.transcribe(audio_data) + pprint(response) if len(response["segments"]) == 0: return {"text": ""} - no_speech_prob = response["segments"][0]["no_speech_prob"] - print("No speech prob:", no_speech_prob) - transcription = response["text"] - if no_speech_prob > NO_SPEECH_PROB_THRESHOLD: - transcription = "" + + transcription = " ".join( + [ + segment["text"] + for segment in response["segments"] + if segment["no_speech_prob"] <= NO_SPEECH_PROB_THRESHOLD + ] + ) return {"text": transcription} except Exception as e: raise HTTPException(status_code=500, detail=str(e))