From 6061f19897d7e425c00d224a21244fe6719d5ffc Mon Sep 17 00:00:00 2001
From: Alberto Cetoli
Date: Fri, 12 Jul 2024 16:20:52 +0100
Subject: [PATCH] listening to what is being said while speaking

---
 wafl/answerer/answerer_implementation.py   |  6 +++++-
 wafl/connectors/clients/llm_chat_client.py |  6 +++++-
 wafl/speaker/fairseq_speaker.py            | 24 +++++++++++++++++++---
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/wafl/answerer/answerer_implementation.py b/wafl/answerer/answerer_implementation.py
index e83c5306..3c8b36e8 100644
--- a/wafl/answerer/answerer_implementation.py
+++ b/wafl/answerer/answerer_implementation.py
@@ -121,7 +121,11 @@ def get_text_from_facts_and_thresholds(
         if item[0].text not in memory:
             text = item[0].text
             if item[0].metadata:
-                text = f"Metadata for the following text: {str(item[0].metadata)}" + "\n" + text
+                text = (
+                    f"Metadata for the following text: {str(item[0].metadata)}"
+                    + "\n"
+                    + text
+                )
             text_list.append(text)
     return text_list
 
diff --git a/wafl/connectors/clients/llm_chat_client.py b/wafl/connectors/clients/llm_chat_client.py
index ef0da698..c144105c 100644
--- a/wafl/connectors/clients/llm_chat_client.py
+++ b/wafl/connectors/clients/llm_chat_client.py
@@ -27,4 +27,8 @@ async def _get_answer_prompt(
         )
 
     def _get_system_prompt(self, text, rules_text):
-        return self.prompt.replace("{facts}", text.strip()).replace("{rules}", rules_text.strip()).strip()
+        return (
+            self.prompt.replace("{facts}", text.strip())
+            .replace("{rules}", rules_text.strip())
+            .strip()
+        )
diff --git a/wafl/speaker/fairseq_speaker.py b/wafl/speaker/fairseq_speaker.py
index 176429d3..b62c8799 100644
--- a/wafl/speaker/fairseq_speaker.py
+++ b/wafl/speaker/fairseq_speaker.py
@@ -1,4 +1,6 @@
 import asyncio
+
+import numpy as np
 import pyaudio
 
 from wafl.connectors.factories.speaker_connector_factory import SpeakerConnectorFactory
@@ -10,8 +12,13 @@ class FairSeqSpeaker(BaseSpeaker):
     def __init__(self, config):
         self._connector = SpeakerConnectorFactory.get_connector(config)
         self._p = pyaudio.PyAudio()
+        self._input_chunk_size = 1024
+        self._output_chunk_size = 4096
+        self._volume_threshold = (
+            config.get_value("listener_model")["listener_volume_threshold"] / 5e3
+        )
 
-    async def speak(self, text): #### This is the function that is called in the VoiceInterface class
+    async def speak(self, text):
         text = convert_numbers_to_words(text)
         prediction = await self._connector.predict(text)
         wav = prediction["wav"]
@@ -20,10 +27,21 @@ async def speak(self, text): #### This is the function that is called in the Vo
             format=pyaudio.paFloat32,
             channels=1,
             rate=rate,
+            input=True,
             output=True,
         )
-        await asyncio.sleep(0.2)
-        stream.write(wav)
+        stream.start_stream()
+        await asyncio.sleep(0.1)
+        for i in range(0, len(wav), self._output_chunk_size):
+            inp = stream.read(self._input_chunk_size)
+            if _rms(inp) > self._volume_threshold:
+                break
+            stream.write(wav[i : i + self._output_chunk_size])
         stream.stop_stream()
         stream.close()
         await asyncio.sleep(0.1)
+
+
+def _rms(frame):
+    data = np.frombuffer(frame, dtype=np.float32)
+    return np.std(data) / len(data)