Revert "Revert "only append audio if speech has started""

This reverts commit 992ca8f.
uberduck-ai · Apr 4, 2024 · f8851ed
1 parent 30ad985
commit f8851ed
Showing 1 changed file with 24 additions and 3 deletions.
diff --git a/openduck-py/openduck_py/response_agent.py b/openduck-py/openduck_py/response_agent.py
@@ -243,6 +243,7 @@ def __init__(
         self.response_task: Optional[asyncio.Task] = None
         self.interrupt_event = asyncio.Event()
         self.system_prompt = system_prompt
+        self.speech_has_started = False
 
         if context is None:
             context = {}
@@ -325,10 +326,30 @@ async def receive_audio(self, message: bytes):
             vad_result = self.vad(audio_16k_chunk)
             if vad_result:
                 async with SessionAsync() as db:
-                    if "end" in vad_result:
-                        print("end of speech detected.")
+                    transcription = ""
+                    if "start" in vad_result or "end" in vad_result:
                         self.time_of_last_activity = time()
-                        await log_event(db, self.session_id, "detected_end_of_speech")
+                        if "start" in vad_result:
+                            self.speech_has_started = True
+                            print("Detected start of speech", flush=True)
+                            # await log_event(
+                            #     db,
+                            #     self.session_id,
+                            #     "detected_start_of_speech",
+                            #     audio=audio_data,
+                            # )
+                        else:
+                            self.speech_has_started = False
+                            print("Detected end of speech", flush=True)
+                            # await log_event(
+                            #     db,
+                            #     self.session_id,
+                            #     "detected_end_of_speech",
+                            #     audio=audio_data,
+                            # )
+
+                        if self.speech_has_started:
+                            self.audio_data.append(audio_16k_np)
 
                         audio_data = np.concatenate(self.audio_data)
                         transcription = await _transcribe(audio_data)