From 4b23febfdc0dbba66b983f1a5c2b5051256e4dc4 Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 01/29] checkpoint between win, mac --- AudioRecorder.py | 87 +++++++++++++++++++++++++++++++++++++-------- AudioTranscriber.py | 14 +++++--- main.py | 2 +- requirements.txt | 6 ++-- 4 files changed, 86 insertions(+), 23 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 911c8ae..0152419 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,13 +1,22 @@ from datetime import datetime from abc import abstractmethod -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio +import platform import app_logging as al RECORD_TIMEOUT = 3 ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False +MBP_MIC_NAME = "MacBook Pro Microphone" +PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" +HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME +# macOS specific, see README.md#macos for the details on how to configure the BlackHole device +BLACKHOLE_MIC_NAME = "BlackHole 2ch" + root_logger = al.get_logger() @@ -29,6 +38,12 @@ 13: 'Windows Vista Audio stack architecture' } +# This needs to be formatted better +# Attempt to get more info from it like, device_type Mic vs speaker +def print_detailed_audio_info_2(): + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Audio device with name "{name}" found at index {index}') + def print_detailed_audio_info(print_func=print): """ @@ -136,19 +151,44 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = 
wasapi_info["defaultInputDevice"] default_mic = py_audio.get_device_info_by_index(self.device_index) - self.device_info = default_mic + self.device_info = default_mic + + source = sr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) + self.source = source + py_audio.terminate() + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + + # this assumes that mic has lower index number for combinded headsets (like Plantronics) + if name == HUMAN_MIC_NAME: + self.device_index = index + + default_mic = py_audio.get_device_info_by_index(self.device_index) + + self.device_info = default_mic + + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + + print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]), - channels=default_mic["maxInputChannels"] - ) - self.source = source super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This line is commented because in case of non default microphone it can occasionally take @@ -182,7 +222,12 @@ class SpeakerRecorder(BaseRecorder): """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - with pyaudio.PyAudio() as p: + + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] default_speakers = p.get_device_info_by_index(self.device_index) @@ -194,14 +239,26 @@ def __init__(self): break else: print("[ERROR] No loopback device found.") + 
p.terminate() + source = sr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + if name == BLACKHOLE_MIC_NAME: + self.device_index = index + + p = pyaudio.PyAudio() + default_speakers = p.get_device_info_by_index(self.device_index) + + print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) self.device_info = default_speakers - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 70def0c..186a506 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -7,8 +7,10 @@ import pprint import wave import tempfile -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio import conversation import constants @@ -25,11 +27,12 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. 
self.transcribe = True + # channels commented for mac self.audio_sources = { "You": { "sample_rate": mic_source.SAMPLE_RATE, "sample_width": mic_source.SAMPLE_WIDTH, - "channels": mic_source.channels, + # "channels": mic_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -38,7 +41,7 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver "Speaker": { "sample_rate": speaker_source.SAMPLE_RATE, "sample_width": speaker_source.SAMPLE_WIDTH, - "channels": speaker_source.channels, + # "channels": speaker_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -99,7 +102,8 @@ def process_speaker_data(self, data, temp_file_name): if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + # commented for mac, get from pyaudio itself + # wf.setnchannels(self.audio_sources["Speaker"]["channels"]) p = pyaudio.PyAudio() wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) diff --git a/main.py b/main.py index eb37657..10ed221 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info() + ar.print_detailed_audio_info_2() return # Initiate global variables diff --git a/requirements.txt b/requirements.txt index ba4d3f8..652b528 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,8 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -PyAudioWPatch==0.2.12.5 +# PyAudioWPatch==0.2.12.5 +PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch @@ -14,4 +15,5 @@ soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession -playsound==1.2.2 \ No newline at end of file +playsound==1.2.2 
+SpeechRecognition==3.10.0 From 2b7ac981992225eb91135b5d72cb90afae0fe5cd Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 02/29] Checkpoint on mac --- AudioRecorder.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 0152419..d7ea6ad 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -11,7 +11,8 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Pro Microphone" +MBP_MIC_NAME = "MacBook Air Microphone" +MBP_SPEAKER_NAME = "MacBook Air Speakers" PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -129,12 +130,14 @@ def disable(self): """ self.enabled = False - def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, device_name, msg): + def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment for {device_name}.") + print(f"[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -172,13 +175,14 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): - # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == HUMAN_MIC_NAME: + if name == MBP_MIC_NAME: self.device_index = index - - default_mic = py_audio.get_device_info_by_index(self.device_index) + + py_audio = pyaudio.PyAudio() + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic @@ -186,6 +190,7 @@ def __init__(self): device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) + py_audio.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) @@ -249,11 +254,16 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) default_speakers = p.get_device_info_by_index(self.device_index) + p.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) @@ -261,8 +271,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") 
print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") + # self.adjust_for_noise("Default Speaker", + # "Please play sound from Default Speaker...") + self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -292,8 +303,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Speaker", - f"Please play sound from selected Speakers {self.get_name()}...") + # self.adjust_for_noise("Speaker", + self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") if __name__ == "__main__": From 76d5d5f38f97e42d1e4fa204d4c304c2e513e4ed Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 03/29] Add support for Mac. --- AudioRecorder.py | 98 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index d7ea6ad..db0ef25 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -130,11 +130,9 @@ def disable(self): """ self.enabled = False - # def adjust_for_noise(self, device_name, msg): - def adjust_for_noise(self, msg): + def adjust_for_noise(self, device_name, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) - print(f"[INFO] Adjusting for ambient noise... " + msg) + print(f"[INFO] Adjusting for ambient noise from {device_name}. 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) print(f"[INFO] Completed ambient noise adjustment.") @@ -156,6 +154,7 @@ def __init__(self): root_logger.info(MicRecorder.__name__) os_name = platform.system() self.device_index = None + default_mic = None if os_name == 'Windows': py_audio = pyaudio.PyAudio() @@ -174,49 +173,73 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': + audio = sr.Microphone.get_pyaudio().PyAudio() + # Prints a list of all devices + # for i in range(audio.get_device_count()): + # device_info = audio.get_device_info_by_index(i) + # print(f'Name: {device_info.get("name")}, InputChannels: {device_info.get("maxInputChannels")} OutputChannels: {device_info.get("maxOutputChannels")}') + # + # Prints device info to see all fields inside the device info object + # print(device_info) + audio.terminate() + for index, name in enumerate(sr.Microphone.list_microphone_names()): - print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print(f'Microphone with name "{name}" found for device_index={index})') - # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == MBP_MIC_NAME: + if name == HUMAN_MIC_NAME: self.device_index = index py_audio = pyaudio.PyAudio() - default_mic = py_audio.get_device_info_by_index(self.device_index) + if self.device_index is not None: + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic source = sr.Microphone( - device_index=self.device_index, + device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) py_audio.terminate() - print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) + print(f'[DEBUG] "{MBP_MIC_NAME}" microphone index is: {self.device_index}') super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This 
line is commented because in case of non default microphone it can occasionally take # several minutes to execute, thus delaying the start of the application. - # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") + self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") def get_name(self): - return f'#{self.device_index} - {self.device_info["name"]}' + if self.device_info is not None: + return f'#{self.device_index} - {self.device_info["name"]}' + return None def set_device(self, index: int): """Set active device based on index. """ root_logger.info(MicRecorder.set_device.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + if os_name == 'Windows': + with pyaudio.PyAudio() as py_audio: + self.device_index = index + mic = py_audio.get_device_info_by_index(self.device_index) + + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index - mic = py_audio.get_device_info_by_index(self.device_index) + mic = p.get_device_info_by_index(self.device_index) + p.terminate() + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]) + ) self.device_info = mic - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) self.source = source print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...") @@ -254,7 +277,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = 
pyaudio.PyAudio() @@ -271,9 +294,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Default Speaker", - # "Please play sound from Default Speaker...") - self.adjust_for_noise("Please play sound from Default Speaker...") + self.adjust_for_noise("Default Speaker", + "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -282,17 +305,26 @@ def set_device(self, index: int): """Set active device based on index. """ root_logger.info(SpeakerRecorder.set_device.__name__) - with pyaudio.PyAudio() as p: + os_name = platform.system() + + if os_name == 'Windows': + with pyaudio.PyAudio() as p: + self.device_index = index + speakers = p.get_device_info_by_index(self.device_index) + + if not speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if speakers["name"] in loopback["name"]: + speakers = loopback + break + else: + print("[ERROR] No loopback device found.") + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) - - if not speakers["isLoopbackDevice"]: - for loopback in p.get_loopback_device_info_generator(): - if speakers["name"] in loopback["name"]: - speakers = loopback - break - else: - print("[ERROR] No loopback device found.") + p.terminate() self.device_info = speakers @@ -303,8 +335,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Speaker", - self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") + self.adjust_for_noise("Speaker", + f"Please play sound from selected Speakers {self.get_name()}...") if 
__name__ == "__main__": From 3e4c90be66042f5f5e499501ac06787dd1f16f16 Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 04/29] checkpoint between win, mac --- AudioRecorder.py | 87 +++++++++++++++++++++++++++++++++++++-------- AudioTranscriber.py | 14 +++++--- main.py | 2 +- requirements.txt | 6 ++-- 4 files changed, 86 insertions(+), 23 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 911c8ae..0152419 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,13 +1,22 @@ from datetime import datetime from abc import abstractmethod -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio +import platform import app_logging as al RECORD_TIMEOUT = 3 ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False +MBP_MIC_NAME = "MacBook Pro Microphone" +PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" +HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME +# macOS specific, see README.md#macos for the details on how to configure the BlackHole device +BLACKHOLE_MIC_NAME = "BlackHole 2ch" + root_logger = al.get_logger() @@ -29,6 +38,12 @@ 13: 'Windows Vista Audio stack architecture' } +# This needs to be formatted better +# Attempt to get more info from it like, device_type Mic vs speaker +def print_detailed_audio_info_2(): + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Audio device with name "{name}" found at index {index}') + def print_detailed_audio_info(print_func=print): """ @@ -136,19 +151,44 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) 
self.device_index = wasapi_info["defaultInputDevice"] default_mic = py_audio.get_device_info_by_index(self.device_index) - self.device_info = default_mic + self.device_info = default_mic + + source = sr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) + self.source = source + py_audio.terminate() + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + + # this assumes that mic has lower index number for combinded headsets (like Plantronics) + if name == HUMAN_MIC_NAME: + self.device_index = index + + default_mic = py_audio.get_device_info_by_index(self.device_index) + + self.device_info = default_mic + + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + + print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]), - channels=default_mic["maxInputChannels"] - ) - self.source = source super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This line is commented because in case of non default microphone it can occasionally take @@ -182,7 +222,12 @@ class SpeakerRecorder(BaseRecorder): """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - with pyaudio.PyAudio() as p: + + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] default_speakers = p.get_device_info_by_index(self.device_index) @@ -194,14 +239,26 @@ def __init__(self): break else: print("[ERROR] No loopback 
device found.") + p.terminate() + source = sr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + if name == BLACKHOLE_MIC_NAME: + self.device_index = index + + p = pyaudio.PyAudio() + default_speakers = p.get_device_info_by_index(self.device_index) + + print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) self.device_info = default_speakers - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 70def0c..186a506 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -7,8 +7,10 @@ import pprint import wave import tempfile -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio import conversation import constants @@ -25,11 +27,12 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. 
self.transcribe = True + # channels commented for mac self.audio_sources = { "You": { "sample_rate": mic_source.SAMPLE_RATE, "sample_width": mic_source.SAMPLE_WIDTH, - "channels": mic_source.channels, + # "channels": mic_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -38,7 +41,7 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver "Speaker": { "sample_rate": speaker_source.SAMPLE_RATE, "sample_width": speaker_source.SAMPLE_WIDTH, - "channels": speaker_source.channels, + # "channels": speaker_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -99,7 +102,8 @@ def process_speaker_data(self, data, temp_file_name): if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + # commented for mac, get from pyaudio itself + # wf.setnchannels(self.audio_sources["Speaker"]["channels"]) p = pyaudio.PyAudio() wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) diff --git a/main.py b/main.py index 19617be..cf9cf96 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info() + ar.print_detailed_audio_info_2() return # Initiate global variables diff --git a/requirements.txt b/requirements.txt index ba4d3f8..652b528 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,8 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -PyAudioWPatch==0.2.12.5 +# PyAudioWPatch==0.2.12.5 +PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch @@ -14,4 +15,5 @@ soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession -playsound==1.2.2 \ No newline at end of file +playsound==1.2.2 
+SpeechRecognition==3.10.0 From a834db6e384c0eba36611fe0537886c074ba5ec9 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 05/29] Checkpoint on mac --- AudioRecorder.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 0152419..d7ea6ad 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -11,7 +11,8 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Pro Microphone" +MBP_MIC_NAME = "MacBook Air Microphone" +MBP_SPEAKER_NAME = "MacBook Air Speakers" PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -129,12 +130,14 @@ def disable(self): """ self.enabled = False - def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, device_name, msg): + def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment for {device_name}.") + print(f"[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -172,13 +175,14 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): - # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == HUMAN_MIC_NAME: + if name == MBP_MIC_NAME: self.device_index = index - - default_mic = py_audio.get_device_info_by_index(self.device_index) + + py_audio = pyaudio.PyAudio() + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic @@ -186,6 +190,7 @@ def __init__(self): device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) + py_audio.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) @@ -249,11 +254,16 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) default_speakers = p.get_device_info_by_index(self.device_index) + p.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) @@ -261,8 +271,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") 
print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") + # self.adjust_for_noise("Default Speaker", + # "Please play sound from Default Speaker...") + self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -292,8 +303,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Speaker", - f"Please play sound from selected Speakers {self.get_name()}...") + # self.adjust_for_noise("Speaker", + self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") if __name__ == "__main__": From 470f1bb8c74db460da48f47e769b02273651476e Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 06/29] Add support for Mac. --- AudioRecorder.py | 98 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index d7ea6ad..db0ef25 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -130,11 +130,9 @@ def disable(self): """ self.enabled = False - # def adjust_for_noise(self, device_name, msg): - def adjust_for_noise(self, msg): + def adjust_for_noise(self, device_name, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) - print(f"[INFO] Adjusting for ambient noise... " + msg) + print(f"[INFO] Adjusting for ambient noise from {device_name}. 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) print(f"[INFO] Completed ambient noise adjustment.") @@ -156,6 +154,7 @@ def __init__(self): root_logger.info(MicRecorder.__name__) os_name = platform.system() self.device_index = None + default_mic = None if os_name == 'Windows': py_audio = pyaudio.PyAudio() @@ -174,49 +173,73 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': + audio = sr.Microphone.get_pyaudio().PyAudio() + # Prints a list of all devices + # for i in range(audio.get_device_count()): + # device_info = audio.get_device_info_by_index(i) + # print(f'Name: {device_info.get("name")}, InputChannels: {device_info.get("maxInputChannels")} OutputChannels: {device_info.get("maxOutputChannels")}') + # + # Prints device info to see all fields inside the device info object + # print(device_info) + audio.terminate() + for index, name in enumerate(sr.Microphone.list_microphone_names()): - print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print(f'Microphone with name "{name}" found for device_index={index})') - # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == MBP_MIC_NAME: + if name == HUMAN_MIC_NAME: self.device_index = index py_audio = pyaudio.PyAudio() - default_mic = py_audio.get_device_info_by_index(self.device_index) + if self.device_index is not None: + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic source = sr.Microphone( - device_index=self.device_index, + device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) py_audio.terminate() - print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) + print(f'[DEBUG] "{MBP_MIC_NAME}" microphone index is: {self.device_index}') super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This 
line is commented because in case of non default microphone it can occasionally take # several minutes to execute, thus delaying the start of the application. - # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") + self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") def get_name(self): - return f'#{self.device_index} - {self.device_info["name"]}' + if self.device_info is not None: + return f'#{self.device_index} - {self.device_info["name"]}' + return None def set_device(self, index: int): """Set active device based on index. """ root_logger.info(MicRecorder.set_device.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + if os_name == 'Windows': + with pyaudio.PyAudio() as py_audio: + self.device_index = index + mic = py_audio.get_device_info_by_index(self.device_index) + + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index - mic = py_audio.get_device_info_by_index(self.device_index) + mic = p.get_device_info_by_index(self.device_index) + p.terminate() + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]) + ) self.device_info = mic - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) self.source = source print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...") @@ -254,7 +277,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = 
pyaudio.PyAudio() @@ -271,9 +294,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Default Speaker", - # "Please play sound from Default Speaker...") - self.adjust_for_noise("Please play sound from Default Speaker...") + self.adjust_for_noise("Default Speaker", + "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -282,17 +305,26 @@ def set_device(self, index: int): """Set active device based on index. """ root_logger.info(SpeakerRecorder.set_device.__name__) - with pyaudio.PyAudio() as p: + os_name = platform.system() + + if os_name == 'Windows': + with pyaudio.PyAudio() as p: + self.device_index = index + speakers = p.get_device_info_by_index(self.device_index) + + if not speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if speakers["name"] in loopback["name"]: + speakers = loopback + break + else: + print("[ERROR] No loopback device found.") + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) - - if not speakers["isLoopbackDevice"]: - for loopback in p.get_loopback_device_info_generator(): - if speakers["name"] in loopback["name"]: - speakers = loopback - break - else: - print("[ERROR] No loopback device found.") + p.terminate() self.device_info = speakers @@ -303,8 +335,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Speaker", - self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") + self.adjust_for_noise("Speaker", + f"Please play sound from selected Speakers {self.get_name()}...") if 
__name__ == "__main__": From b50f58c732d2741d868d2d79060089163bc27d77 Mon Sep 17 00:00:00 2001 From: Vivek Uppal Date: Fri, 8 Sep 2023 10:08:21 -0400 Subject: [PATCH 07/29] Bring readme up to date with current functionality. Describe content customization. (#60) --- README.md | 31 ++++++++++++++++++++++++++++--- parameters.yaml | 13 ++++++++----- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8a10bac..7f8d2d4 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Please run these commands in a PowerShell window with administrator privileges. pip install -r requirements.txt ``` + It is recommended to create a virtual environment for installing the required packages + 4. (Optional) Replace the Open API key in `parameters.yaml` file in the transcribe directory: Replace the Open API key in `parameters.yaml` file manually. Open in a text editor and alter the line: @@ -73,7 +75,32 @@ Upon initiation, Transcribe will begin transcribing microphone input and speaker The --api flag will use the whisper api for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind, using the Whisper API consumes OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it a worthwhile for your use case. -### 🎬 Running Transcribe +### 🎬 Customizing Transcribe + +By default chatGPT API behaves like a casual friend engaging in light hearted banter. To customize the responses and make it specific to a field see this section in parameters.yaml and the corresponding examples + +``` + system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. 
Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + + + initial_convo: + first: + role: "You" + # content: "I am V, I want to learn about Fantasy Football" + # content: "I am V, I want to learn about Basketball" + content: Hey assistant, how are you doing today, I am in mood of a casual conversation. + second: + role: "assistant" + # content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about Fantasy Football" + content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you. +``` + +Change system_prompt, initial_convo to be specific to the scenario you are interested in. 
+ +### 🎬 Testing Transcribe Code changes Unit Tests @@ -124,8 +151,6 @@ https://drive.google.com/file/d/1Iy32YjDXK7Bga7amOUTA4Gx9VEoibPi-/view?usp=shari While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of: -**Default Mic and Speaker:** Transcribe is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. To use a different mic or speaker, need to set it as your default device in your system settings. - **Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words. **OpenAI Account**: If a paid OpenAI account with a valid Open API Key is not used, the command window displays the following error message repeatedly, though the application behvaior is not impacted in any way. diff --git a/parameters.yaml b/parameters.yaml index 5c0facd..cbd27d0 100644 --- a/parameters.yaml +++ b/parameters.yaml @@ -21,9 +21,9 @@ OpenAI: # The combination of system_prompt, initial_convo is used to create a multi turn prompt message for LLM. # system_prompt_1, systen_prompt_2 are here as samples of other possible prompts. # Only the content of system_prompt parameter will be used - system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." - system_prompt_1: "You are a casual pal, genuinely interested in the conversation at hand. 
Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." - system_prompt_2: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." +# system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." +# system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." # When we anticipate to talk about a specific topic, seed the content with some conversation # If the conversation is generic, replace this text with something like this. 
@@ -35,10 +35,13 @@ OpenAI: first: role: "You" # content: "I am V, I want to learn about Fantasy Football" - content: "I am V, I want to learn about Basketball" + # content: "I am V, I want to learn about Basketball" + content: Hey assistant, how are you doing today, I am in mood of a casual conversation. second: role: "assistant" - content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about Fantasy Football" + content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you. General: log_file: 'Transcribe.log' From 76d349fd571f8d1d487b549150e8a752ef5709ef Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 08/29] checkpoint between win, mac --- AudioRecorder.py | 87 +++++++++++++++++++++++++++++++++++++-------- AudioTranscriber.py | 14 +++++--- main.py | 2 +- requirements.txt | 6 ++-- 4 files changed, 86 insertions(+), 23 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 911c8ae..0152419 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,13 +1,22 @@ from datetime import datetime from abc import abstractmethod -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio +import platform import app_logging as al RECORD_TIMEOUT = 3 ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False +MBP_MIC_NAME = "MacBook Pro Microphone" +PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" +HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME +# macOS specific, see README.md#macos for the details on how to configure the BlackHole device +BLACKHOLE_MIC_NAME = "BlackHole 2ch" + root_logger = al.get_logger() @@ -29,6 +38,12 @@ 13: 'Windows Vista Audio stack 
architecture' } +# This needs to be formatted better +# Attempt to get more info from it like, device_type Mic vs speaker +def print_detailed_audio_info_2(): + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Audio device with name "{name}" found at index {index}') + def print_detailed_audio_info(print_func=print): """ @@ -136,19 +151,44 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultInputDevice"] default_mic = py_audio.get_device_info_by_index(self.device_index) - self.device_info = default_mic + self.device_info = default_mic + + source = sr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) + self.source = source + py_audio.terminate() + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + + # this assumes that mic has lower index number for combinded headsets (like Plantronics) + if name == HUMAN_MIC_NAME: + self.device_index = index + + default_mic = py_audio.get_device_info_by_index(self.device_index) + + self.device_info = default_mic + + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + + print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]), - channels=default_mic["maxInputChannels"] - ) - self.source = source 
super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This line is commented because in case of non default microphone it can occasionally take @@ -182,7 +222,12 @@ class SpeakerRecorder(BaseRecorder): """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - with pyaudio.PyAudio() as p: + + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] default_speakers = p.get_device_info_by_index(self.device_index) @@ -194,14 +239,26 @@ def __init__(self): break else: print("[ERROR] No loopback device found.") + p.terminate() + source = sr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + if name == BLACKHOLE_MIC_NAME: + self.device_index = index + + p = pyaudio.PyAudio() + default_speakers = p.get_device_info_by_index(self.device_index) + + print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) self.device_info = default_speakers - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 70def0c..186a506 
100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -7,8 +7,10 @@ import pprint import wave import tempfile -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio import conversation import constants @@ -25,11 +27,12 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. self.transcribe = True + # channels commented for mac self.audio_sources = { "You": { "sample_rate": mic_source.SAMPLE_RATE, "sample_width": mic_source.SAMPLE_WIDTH, - "channels": mic_source.channels, + # "channels": mic_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -38,7 +41,7 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver "Speaker": { "sample_rate": speaker_source.SAMPLE_RATE, "sample_width": speaker_source.SAMPLE_WIDTH, - "channels": speaker_source.channels, + # "channels": speaker_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -99,7 +102,8 @@ def process_speaker_data(self, data, temp_file_name): if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + # commented for mac, get from pyaudio itself + # wf.setnchannels(self.audio_sources["Speaker"]["channels"]) p = pyaudio.PyAudio() wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) diff --git a/main.py b/main.py index 19617be..cf9cf96 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info() + ar.print_detailed_audio_info_2() return # Initiate global variables diff --git 
a/requirements.txt b/requirements.txt index ba4d3f8..652b528 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,8 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -PyAudioWPatch==0.2.12.5 +# PyAudioWPatch==0.2.12.5 +PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch @@ -14,4 +15,5 @@ soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession -playsound==1.2.2 \ No newline at end of file +playsound==1.2.2 +SpeechRecognition==3.10.0 From c2000a0e4bbf37bbcfee5778f465e36d9c8a8b88 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 09/29] Checkpoint on mac --- AudioRecorder.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 0152419..d7ea6ad 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -11,7 +11,8 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Pro Microphone" +MBP_MIC_NAME = "MacBook Air Microphone" +MBP_SPEAKER_NAME = "MacBook Air Speakers" PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -129,12 +130,14 @@ def disable(self): """ self.enabled = False - def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, device_name, msg): + def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment for {device_name}.") + print(f"[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -172,13 +175,14 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): - # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == HUMAN_MIC_NAME: + if name == MBP_MIC_NAME: self.device_index = index - - default_mic = py_audio.get_device_info_by_index(self.device_index) + + py_audio = pyaudio.PyAudio() + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic @@ -186,6 +190,7 @@ def __init__(self): device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) + py_audio.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) @@ -249,11 +254,16 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) default_speakers = p.get_device_info_by_index(self.device_index) + p.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) @@ -261,8 +271,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") 
print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") + # self.adjust_for_noise("Default Speaker", + # "Please play sound from Default Speaker...") + self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -292,8 +303,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Speaker", - f"Please play sound from selected Speakers {self.get_name()}...") + # self.adjust_for_noise("Speaker", + self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") if __name__ == "__main__": From 30fafd32ba3ce775701db8a72841a79b9038c15d Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 10/29] Add support for Mac. --- AudioRecorder.py | 98 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index d7ea6ad..db0ef25 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -130,11 +130,9 @@ def disable(self): """ self.enabled = False - # def adjust_for_noise(self, device_name, msg): - def adjust_for_noise(self, msg): + def adjust_for_noise(self, device_name, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) - print(f"[INFO] Adjusting for ambient noise... " + msg) + print(f"[INFO] Adjusting for ambient noise from {device_name}. 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) print(f"[INFO] Completed ambient noise adjustment.") @@ -156,6 +154,7 @@ def __init__(self): root_logger.info(MicRecorder.__name__) os_name = platform.system() self.device_index = None + default_mic = None if os_name == 'Windows': py_audio = pyaudio.PyAudio() @@ -174,49 +173,73 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': + audio = sr.Microphone.get_pyaudio().PyAudio() + # Prints a list of all devices + # for i in range(audio.get_device_count()): + # device_info = audio.get_device_info_by_index(i) + # print(f'Name: {device_info.get("name")}, InputChannels: {device_info.get("maxInputChannels")} OutputChannels: {device_info.get("maxOutputChannels")}') + # + # Prints device info to see all fields inside the device info object + # print(device_info) + audio.terminate() + for index, name in enumerate(sr.Microphone.list_microphone_names()): - print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print(f'Microphone with name "{name}" found for device_index={index})') - # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == MBP_MIC_NAME: + if name == HUMAN_MIC_NAME: self.device_index = index py_audio = pyaudio.PyAudio() - default_mic = py_audio.get_device_info_by_index(self.device_index) + if self.device_index is not None: + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic source = sr.Microphone( - device_index=self.device_index, + device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) py_audio.terminate() - print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) + print(f'[DEBUG] "{MBP_MIC_NAME}" microphone index is: {self.device_index}') super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This 
line is commented because in case of non default microphone it can occasionally take # several minutes to execute, thus delaying the start of the application. - # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") + self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") def get_name(self): - return f'#{self.device_index} - {self.device_info["name"]}' + if self.device_info is not None: + return f'#{self.device_index} - {self.device_info["name"]}' + return None def set_device(self, index: int): """Set active device based on index. """ root_logger.info(MicRecorder.set_device.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + if os_name == 'Windows': + with pyaudio.PyAudio() as py_audio: + self.device_index = index + mic = py_audio.get_device_info_by_index(self.device_index) + + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index - mic = py_audio.get_device_info_by_index(self.device_index) + mic = p.get_device_info_by_index(self.device_index) + p.terminate() + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]) + ) self.device_info = mic - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) self.source = source print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...") @@ -254,7 +277,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = 
pyaudio.PyAudio() @@ -271,9 +294,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Default Speaker", - # "Please play sound from Default Speaker...") - self.adjust_for_noise("Please play sound from Default Speaker...") + self.adjust_for_noise("Default Speaker", + "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -282,17 +305,26 @@ def set_device(self, index: int): """Set active device based on index. """ root_logger.info(SpeakerRecorder.set_device.__name__) - with pyaudio.PyAudio() as p: + os_name = platform.system() + + if os_name == 'Windows': + with pyaudio.PyAudio() as p: + self.device_index = index + speakers = p.get_device_info_by_index(self.device_index) + + if not speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if speakers["name"] in loopback["name"]: + speakers = loopback + break + else: + print("[ERROR] No loopback device found.") + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) - - if not speakers["isLoopbackDevice"]: - for loopback in p.get_loopback_device_info_generator(): - if speakers["name"] in loopback["name"]: - speakers = loopback - break - else: - print("[ERROR] No loopback device found.") + p.terminate() self.device_info = speakers @@ -303,8 +335,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Speaker", - self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") + self.adjust_for_noise("Speaker", + f"Please play sound from selected Speakers {self.get_name()}...") if 
__name__ == "__main__": From 17bf60fa8063537e6169904e9bc261b53e912c01 Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 11/29] checkpoint between win, mac --- AudioRecorder.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index db0ef25..2a3f775 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -11,8 +11,9 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Air Microphone" -MBP_SPEAKER_NAME = "MacBook Air Speakers" +MBP_SPEAKER_NAME = "MacBook Pro Speakers" +MBP_MIC_NAME = "MacBook Pro Microphone" + PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -173,7 +174,7 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': - audio = sr.Microphone.get_pyaudio().PyAudio() + # audio = sr.Microphone.get_pyaudio().PyAudio() # Prints a list of all devices # for i in range(audio.get_device_count()): # device_info = audio.get_device_info_by_index(i) @@ -181,7 +182,7 @@ def __init__(self): # # Prints device info to see all fields inside the device info object # print(device_info) - audio.terminate() + # audio.terminate() for index, name in enumerate(sr.Microphone.list_microphone_names()): print(f'Microphone with name "{name}" found for device_index={index})') @@ -273,13 +274,12 @@ def __init__(self): sample_rate=int(default_speakers["defaultSampleRate"]), chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), channels=default_speakers["maxInputChannels"]) - elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) if name == BLACKHOLE_MIC_NAME: self.device_index = index - + p = pyaudio.PyAudio() source = sr.Microphone( 
device_index=self.device_index, @@ -295,8 +295,7 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") - # self.adjust_for_noise("Please play sound from Default Speaker...") + "Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' From 18c5f7728896c25b408800cb56aa877a8e689eb8 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 12/29] Checkpoint on mac --- AudioRecorder.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 2a3f775..a4ba79d 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -132,11 +132,13 @@ def disable(self): self.enabled = False def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment.") + print("[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -189,7 +191,7 @@ def __init__(self): if name == HUMAN_MIC_NAME: self.device_index = index - + py_audio = pyaudio.PyAudio() if self.device_index is not None: default_mic = py_audio.get_device_info_by_index(self.device_index) @@ -229,7 +231,7 @@ def set_device(self, index: int): sample_rate=int(mic["defaultSampleRate"]), channels=mic["maxInputChannels"] ) - + elif os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index @@ -277,7 +279,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() @@ -296,6 +298,7 @@ def __init__(self): print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' From 09195c67f85e08ccdea0e10514d238cfd6903740 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 13/29] Add support for Mac. 
--- AudioRecorder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index a4ba79d..60fd0c3 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -40,6 +40,7 @@ 13: 'Windows Vista Audio stack architecture' } + # This needs to be formatted better # Attempt to get more info from it like, device_type Mic vs speaker def print_detailed_audio_info_2(): @@ -231,7 +232,6 @@ def set_device(self, index: int): sample_rate=int(mic["defaultSampleRate"]), channels=mic["maxInputChannels"] ) - elif os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index @@ -279,7 +279,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = pyaudio.PyAudio() From 27d6b818c4febe5ceeb1058a02602e9718ffaa5c Mon Sep 17 00:00:00 2001 From: Vivek Uppal Date: Fri, 8 Sep 2023 10:51:14 -0400 Subject: [PATCH 14/29] Continuous mode broke after updates to the UI. 
(#64) --- ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui.py b/ui.py index 425d0f5..07b1bef 100644 --- a/ui.py +++ b/ui.py @@ -44,7 +44,7 @@ def freeze_unfreeze(self): root_logger.info(ui_callbacks.freeze_unfreeze.__name__) self.global_vars.freeze_state[0] = not self.global_vars.freeze_state[0] # Invert the state self.global_vars.freeze_button.configure( - value="Suggest Responses Continuously" if self.global_vars.freeze_state[0] else "Do Not Suggest Responses Continuously" + text="Suggest Responses Continuously" if self.global_vars.freeze_state[0] else "Do Not Suggest Responses Continuously" ) # to enable/disable speaker/microphone when args are given or button is pressed From 8ee7e6aeff862f4a8e0e5132820504050f349747 Mon Sep 17 00:00:00 2001 From: Vivek Uppal Date: Fri, 8 Sep 2023 11:52:01 -0400 Subject: [PATCH 15/29] Clear transcript data from UI (#65) --- AudioTranscriber.py | 2 ++ conversation.py | 5 ++++- duration.py | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 70def0c..39a7350 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -165,3 +165,5 @@ def clear_transcript_data(self): self.audio_sources["You"]["new_phrase"] = True self.audio_sources["Speaker"]["new_phrase"] = True + + self.conversation.clear_conversation_data() diff --git a/conversation.py b/conversation.py index a014b11..4a6b6c5 100644 --- a/conversation.py +++ b/conversation.py @@ -15,6 +15,9 @@ def __init__(self): constants.PERSONA_YOU: [], constants.PERSONA_SPEAKER: [], constants.PERSONA_ASSISTANT: []} + self.initialize_conversation() + + def initialize_conversation(self): config = configuration.Config().get_data() prompt = config["OpenAI"]["system_prompt"] self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt, @@ -35,7 +38,7 @@ def clear_conversation_data(self): self.transcript_data[constants.PERSONA_SPEAKER].clear() 
self.transcript_data[constants.PERSONA_SYSTEM].clear() self.transcript_data[constants.PERSONA_ASSISTANT].clear() - self.last_update = datetime.datetime.now() + self.initialize_conversation() def update_conversation(self, persona: str, text: str, time_spoken, pop: bool = False): """Update conversation with new data diff --git a/duration.py b/duration.py index 690ef5f..9bca3dc 100644 --- a/duration.py +++ b/duration.py @@ -16,10 +16,10 @@ class Duration: Duration(dd:hh:ss:ms) of Test Operation 0:00:02.000826 """ - def __init__(self, operation_name: str = 'undefined'): + def __init__(self, name: str = 'undefined'): self.start: datetime.datetime = None self.end: datetime.datetime = None - self.operation_name = operation_name + self.operation_name = name def __enter__(self): """Records the start time of an operation From a1b86e3572aa8bad454426d737dea4555229e616 Mon Sep 17 00:00:00 2001 From: Vivek Uppal Date: Fri, 8 Sep 2023 16:45:53 -0400 Subject: [PATCH 16/29] Faster transcription from user perspective (#66) Increase transcription frequency to improve transcription response time from user perspective. Logging improvements. 
--- AudioRecorder.py | 3 ++- AudioTranscriber.py | 18 ++++++++++++------ GPTResponder.py | 18 ++++++++++-------- ui.py | 9 ++++++++- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 911c8ae..6045263 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -4,7 +4,8 @@ import pyaudiowpatch as pyaudio import app_logging as al -RECORD_TIMEOUT = 3 +# Attempt transcription of the sound file after every RECORD_TIMEOUT seconds +RECORD_TIMEOUT = 1 ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 39a7350..7dea680 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -3,21 +3,24 @@ from heapq import merge import threading import io -from datetime import timedelta -import pprint +import datetime +# import pprint import wave import tempfile import custom_speech_recognition as sr import pyaudiowpatch as pyaudio import conversation import constants +import app_logging as al PHRASE_TIMEOUT = 3.05 +root_logger = al.get_logger() class AudioTranscriber: def __init__(self, mic_source, speaker_source, model, convo: conversation.Conversation): + root_logger.info(AudioTranscriber.__name__) # Transcript_data should be replaced with the conversation object. # We do not need to store transcription in 2 different places. 
self.transcript_data = {"You": [], "Speaker": []} @@ -60,8 +63,8 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue): text = '' try: - fd, path = tempfile.mkstemp(suffix=".wav") - os.close(fd) + file_descritor, path = tempfile.mkstemp(suffix=".wav") + os.close(file_descritor) source_info["process_data_func"](source_info["last_sample"], path) if self.transcribe: text = self.audio_model.get_transcription(path) @@ -75,10 +78,11 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue): self.transcript_changed_event.set() def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken): + root_logger.info(AudioTranscriber.update_last_sample_and_phrase_status.__name__) if not self.transcribe: return source_info = self.audio_sources[who_spoke] - if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT): + if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > datetime.timedelta(seconds=PHRASE_TIMEOUT): source_info["last_sample"] = bytes() source_info["new_phrase"] = True else: @@ -88,6 +92,7 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken): source_info["last_spoken"] = time_spoken def process_mic_data(self, data, temp_file_name): + root_logger.info(AudioTranscriber.process_mic_data.__name__) if not self.transcribe: return audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) @@ -96,6 +101,7 @@ def process_mic_data(self, data, temp_file_name): file_handle.write(wav_data.read()) def process_speaker_data(self, data, temp_file_name): + root_logger.info(AudioTranscriber.process_speaker_data.__name__) if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: @@ -138,7 +144,7 @@ def get_transcript(self, length: int = 0): self.transcript_data["You"], self.transcript_data["Speaker"], key=lambda x: x[1], reverse=False)) combined_transcript = combined_transcript[-length:] - 
current_return_val = "".join([t[0] for t in combined_transcript]) + # current_return_val = "".join([t[0] for t in combined_transcript]) sources = [ constants.PERSONA_YOU, constants.PERSONA_SPEAKER diff --git a/GPTResponder.py b/GPTResponder.py index 58b6da8..b353bce 100644 --- a/GPTResponder.py +++ b/GPTResponder.py @@ -32,17 +32,18 @@ def generate_response_from_transcript_no_check(self, transcript) -> str: Updates the conversation object with the response from LLM. """ try: - prompt_api_message = prompts.create_single_turn_prompt_message(transcript) + root_logger.info(GPTResponder.generate_response_from_transcript_no_check.__name__) + # prompt_api_message = prompts.create_single_turn_prompt_message(transcript) multiturn_prompt_content = self.conversation.get_merged_conversation( length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM) multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content) # pprint.pprint(f'Prompt api message: {prompt_api_message}') # print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}') - usual_response = openai.ChatCompletion.create( - model=self.model, - messages=prompt_api_message, - temperature=0.0 - ) + # usual_response = openai.ChatCompletion.create( + # model=self.model, + # messages=prompt_api_message, + # temperature=0.0 + # ) # Multi turn response is only effective when continuous mode is off. # In continuous mode, there are far too many responses from LLM, # they confuse the LLM if that many responses are replayed back to LLM. 
@@ -101,13 +102,14 @@ def process_response(self, input_str: str) -> str: def generate_response_from_transcript(self, transcript): """Ping OpenAI LLM model to get response from the Assistant """ - + root_logger.info(GPTResponder.generate_response_from_transcript.__name__) if self.gl_vars.freeze_state[0]: return '' return self.generate_response_from_transcript_no_check(transcript) def update_conversation(self, response, persona): + root_logger.info(GPTResponder.update_conversation.__name__) if response != '': self.response = response self.conversation.update_conversation(persona=persona, @@ -129,7 +131,7 @@ def respond_to_transcriber(self, transcriber): if not self.gl_vars.freeze_state[0]: transcript_string = transcriber.get_transcript( length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM) - response = self.generate_response_from_transcript(transcript_string) + self.generate_response_from_transcript(transcript_string) end_time = time.time() # Measure end time execution_time = end_time - start_time # Calculate time to execute the function diff --git a/ui.py b/ui.py index 07b1bef..d210a24 100644 --- a/ui.py +++ b/ui.py @@ -14,6 +14,7 @@ root_logger = al.get_logger() UI_FONT_SIZE = 20 last_transcript_ui_update_time: datetime.datetime = datetime.datetime.now() +global_vars_module: GlobalVars.TranscriptionGlobals = None class ui_callbacks: @@ -49,10 +50,12 @@ def freeze_unfreeze(self): # to enable/disable speaker/microphone when args are given or button is pressed def enable_disable_speaker(self, editmenu): + """Toggles the state of speaker""" self.global_vars.speaker_audio_recorder.enabled = not self.global_vars.speaker_audio_recorder.enabled editmenu.entryconfigure(2, label="Disable Speaker" if self.global_vars.speaker_audio_recorder.enabled else "Enable Speaker") def enable_disable_microphone(self, editmenu): + """Toggles the state of microphone""" self.global_vars.user_audio_recorder.enabled = not self.global_vars.user_audio_recorder.enabled editmenu.entryconfigure(3, 
label="Disable Microphone" if self.global_vars.user_audio_recorder.enabled else "Enable Microphone") @@ -108,8 +111,12 @@ def update_transcript_ui(transcriber: AudioTranscriber, textbox: ctk.CTkTextbox) """ global last_transcript_ui_update_time + global global_vars_module - if last_transcript_ui_update_time < GlobalVars.TranscriptionGlobals().convo.last_update: + if global_vars_module is None: + global_vars_module = GlobalVars.TranscriptionGlobals() + + if last_transcript_ui_update_time < global_vars_module.convo.last_update: transcript_string = transcriber.get_transcript() write_in_textbox(textbox, transcript_string) textbox.see("end") From 4f1de938f7272a12362a5bb1ac3c414257473be4 Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 17/29] checkpoint between win, mac --- AudioRecorder.py | 87 +++++++++++++++++++++++++++++++++++++-------- AudioTranscriber.py | 14 +++++--- main.py | 2 +- requirements.txt | 6 ++-- 4 files changed, 86 insertions(+), 23 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 6045263..310dfa5 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,7 +1,10 @@ from datetime import datetime from abc import abstractmethod -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio +import platform import app_logging as al # Attempt transcription of the sound file after every RECORD_TIMEOUT seconds @@ -9,6 +12,12 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False +MBP_MIC_NAME = "MacBook Pro Microphone" +PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" +HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME +# macOS specific, see README.md#macos for the details on how to configure the BlackHole device +BLACKHOLE_MIC_NAME = "BlackHole 2ch" + root_logger = al.get_logger() @@ -30,6 +39,12 @@ 13: 'Windows Vista Audio stack architecture' } +# This 
needs to be formatted better +# Attempt to get more info from it like, device_type Mic vs speaker +def print_detailed_audio_info_2(): + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Audio device with name "{name}" found at index {index}') + def print_detailed_audio_info(print_func=print): """ @@ -137,19 +152,44 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultInputDevice"] default_mic = py_audio.get_device_info_by_index(self.device_index) - self.device_info = default_mic + self.device_info = default_mic + + source = sr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) + self.source = source + py_audio.terminate() + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + + # this assumes that mic has lower index number for combinded headsets (like Plantronics) + if name == HUMAN_MIC_NAME: + self.device_index = index + + default_mic = py_audio.get_device_info_by_index(self.device_index) + + self.device_info = default_mic + + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + + print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]), - channels=default_mic["maxInputChannels"] - ) - self.source = source super().__init__(source=source, source_name="You") 
print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This line is commented because in case of non default microphone it can occasionally take @@ -183,7 +223,12 @@ class SpeakerRecorder(BaseRecorder): """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - with pyaudio.PyAudio() as p: + + os_name = platform.system() + self.device_index = None + + if os_name == 'Windows': + p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] default_speakers = p.get_device_info_by_index(self.device_index) @@ -195,14 +240,26 @@ def __init__(self): break else: print("[ERROR] No loopback device found.") + p.terminate() + source = sr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + + elif os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + if name == BLACKHOLE_MIC_NAME: + self.device_index = index + + p = pyaudio.PyAudio() + default_speakers = p.get_device_info_by_index(self.device_index) + + print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) self.device_info = default_speakers - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 7dea680..f31ea28 100644 --- a/AudioTranscriber.py +++ 
b/AudioTranscriber.py @@ -7,8 +7,10 @@ # import pprint import wave import tempfile -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +# import custom_speech_recognition as sr +import speech_recognition as sr +# import pyaudiowpatch as pyaudio +import pyaudio import conversation import constants import app_logging as al @@ -28,11 +30,12 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. self.transcribe = True + # channels commented for mac self.audio_sources = { "You": { "sample_rate": mic_source.SAMPLE_RATE, "sample_width": mic_source.SAMPLE_WIDTH, - "channels": mic_source.channels, + # "channels": mic_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -41,7 +44,7 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver "Speaker": { "sample_rate": speaker_source.SAMPLE_RATE, "sample_width": speaker_source.SAMPLE_WIDTH, - "channels": speaker_source.channels, + # "channels": speaker_source.channels, "last_sample": bytes(), "last_spoken": None, "new_phrase": True, @@ -105,7 +108,8 @@ def process_speaker_data(self, data, temp_file_name): if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + # commented for mac, get from pyaudio itself + # wf.setnchannels(self.audio_sources["Speaker"]["channels"]) p = pyaudio.PyAudio() wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) diff --git a/main.py b/main.py index 19617be..cf9cf96 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info() + ar.print_detailed_audio_info_2() return # Initiate global variables diff --git a/requirements.txt 
b/requirements.txt index ba4d3f8..652b528 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,8 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -PyAudioWPatch==0.2.12.5 +# PyAudioWPatch==0.2.12.5 +PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch @@ -14,4 +15,5 @@ soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession -playsound==1.2.2 \ No newline at end of file +playsound==1.2.2 +SpeechRecognition==3.10.0 From 4cf7ab6d6b90ef5e38ba5f5bce10edfec09d2de8 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 18/29] Checkpoint on mac --- AudioRecorder.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 310dfa5..1424a27 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -12,7 +12,8 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Pro Microphone" +MBP_MIC_NAME = "MacBook Air Microphone" +MBP_SPEAKER_NAME = "MacBook Air Speakers" PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -130,12 +131,14 @@ def disable(self): """ self.enabled = False - def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, device_name, msg): + def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment for {device_name}.") + print(f"[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -173,13 +176,14 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): - # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == HUMAN_MIC_NAME: + if name == MBP_MIC_NAME: self.device_index = index - - default_mic = py_audio.get_device_info_by_index(self.device_index) + + py_audio = pyaudio.PyAudio() + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic @@ -187,6 +191,7 @@ def __init__(self): device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) + py_audio.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) @@ -250,11 +255,16 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) default_speakers = p.get_device_info_by_index(self.device_index) + p.terminate() print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) @@ -262,8 +272,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") 
print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") + # self.adjust_for_noise("Default Speaker", + # "Please play sound from Default Speaker...") + self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -293,8 +304,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Speaker", - f"Please play sound from selected Speakers {self.get_name()}...") + # self.adjust_for_noise("Speaker", + self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") if __name__ == "__main__": From 7534af39f109dd71f2f7e06dd1b549b598d11497 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 19/29] Add support for Mac. --- AudioRecorder.py | 98 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 1424a27..191690c 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -131,11 +131,9 @@ def disable(self): """ self.enabled = False - # def adjust_for_noise(self, device_name, msg): - def adjust_for_noise(self, msg): + def adjust_for_noise(self, device_name, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) - # print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) - print(f"[INFO] Adjusting for ambient noise... " + msg) + print(f"[INFO] Adjusting for ambient noise from {device_name}. 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) print(f"[INFO] Completed ambient noise adjustment.") @@ -157,6 +155,7 @@ def __init__(self): root_logger.info(MicRecorder.__name__) os_name = platform.system() self.device_index = None + default_mic = None if os_name == 'Windows': py_audio = pyaudio.PyAudio() @@ -175,49 +174,73 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': + audio = sr.Microphone.get_pyaudio().PyAudio() + # Prints a list of all devices + # for i in range(audio.get_device_count()): + # device_info = audio.get_device_info_by_index(i) + # print(f'Name: {device_info.get("name")}, InputChannels: {device_info.get("maxInputChannels")} OutputChannels: {device_info.get("maxOutputChannels")}') + # + # Prints device info to see all fields inside the device info object + # print(device_info) + audio.terminate() + for index, name in enumerate(sr.Microphone.list_microphone_names()): - print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + print(f'Microphone with name "{name}" found for device_index={index})') - # this assumes that mic has lower index number for combinded headsets (like Plantronics) - if name == MBP_MIC_NAME: + if name == HUMAN_MIC_NAME: self.device_index = index py_audio = pyaudio.PyAudio() - default_mic = py_audio.get_device_info_by_index(self.device_index) + if self.device_index is not None: + default_mic = py_audio.get_device_info_by_index(self.device_index) self.device_info = default_mic source = sr.Microphone( - device_index=self.device_index, + device_index=self.device_index, chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) ) py_audio.terminate() - print("[DEBUG] \"{}\" microphone index is: {}".format(HUMAN_MIC_NAME, self.device_index)) + print(f'[DEBUG] "{MBP_MIC_NAME}" microphone index is: {self.device_index}') super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This 
line is commented because in case of non default microphone it can occasionally take # several minutes to execute, thus delaying the start of the application. - # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") + self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") def get_name(self): - return f'#{self.device_index} - {self.device_info["name"]}' + if self.device_info is not None: + return f'#{self.device_index} - {self.device_info["name"]}' + return None def set_device(self, index: int): """Set active device based on index. """ root_logger.info(MicRecorder.set_device.__name__) - with pyaudio.PyAudio() as py_audio: + os_name = platform.system() + if os_name == 'Windows': + with pyaudio.PyAudio() as py_audio: + self.device_index = index + mic = py_audio.get_device_info_by_index(self.device_index) + + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index - mic = py_audio.get_device_info_by_index(self.device_index) + mic = p.get_device_info_by_index(self.device_index) + p.terminate() + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]) + ) self.device_info = mic - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) self.source = source print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...") @@ -255,7 +278,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = 
pyaudio.PyAudio() @@ -272,9 +295,9 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Default Speaker", - # "Please play sound from Default Speaker...") - self.adjust_for_noise("Please play sound from Default Speaker...") + self.adjust_for_noise("Default Speaker", + "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -283,17 +306,26 @@ def set_device(self, index: int): """Set active device based on index. """ root_logger.info(SpeakerRecorder.set_device.__name__) - with pyaudio.PyAudio() as p: + os_name = platform.system() + + if os_name == 'Windows': + with pyaudio.PyAudio() as p: + self.device_index = index + speakers = p.get_device_info_by_index(self.device_index) + + if not speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if speakers["name"] in loopback["name"]: + speakers = loopback + break + else: + print("[ERROR] No loopback device found.") + + elif os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) - - if not speakers["isLoopbackDevice"]: - for loopback in p.get_loopback_device_info_generator(): - if speakers["name"] in loopback["name"]: - speakers = loopback - break - else: - print("[ERROR] No loopback device found.") + p.terminate() self.device_info = speakers @@ -304,8 +336,8 @@ def set_device(self, index: int): channels=speakers["maxInputChannels"]) self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - # self.adjust_for_noise("Speaker", - self.adjust_for_noise(f"Please play sound from selected Speakers {self.get_name()}...") + self.adjust_for_noise("Speaker", + f"Please play sound from selected Speakers {self.get_name()}...") if 
__name__ == "__main__": From b2e8e0d3c780453c72d386171b9748aa6ed0314d Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 20/29] checkpoint between win, mac --- AudioRecorder.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 191690c..1c3ef80 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -12,8 +12,9 @@ ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False -MBP_MIC_NAME = "MacBook Air Microphone" -MBP_SPEAKER_NAME = "MacBook Air Speakers" +MBP_SPEAKER_NAME = "MacBook Pro Speakers" +MBP_MIC_NAME = "MacBook Pro Microphone" + PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME # macOS specific, see README.md#macos for the details on how to configure the BlackHole device @@ -174,7 +175,7 @@ def __init__(self): py_audio.terminate() elif os_name == 'Darwin': - audio = sr.Microphone.get_pyaudio().PyAudio() + # audio = sr.Microphone.get_pyaudio().PyAudio() # Prints a list of all devices # for i in range(audio.get_device_count()): # device_info = audio.get_device_info_by_index(i) @@ -182,7 +183,7 @@ def __init__(self): # # Prints device info to see all fields inside the device info object # print(device_info) - audio.terminate() + # audio.terminate() for index, name in enumerate(sr.Microphone.list_microphone_names()): print(f'Microphone with name "{name}" found for device_index={index})') @@ -274,13 +275,12 @@ def __init__(self): sample_rate=int(default_speakers["defaultSampleRate"]), chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), channels=default_speakers["maxInputChannels"]) - elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) if name == BLACKHOLE_MIC_NAME: self.device_index = index - + p = pyaudio.PyAudio() source = sr.Microphone( 
device_index=self.device_index, @@ -296,8 +296,7 @@ def __init__(self): super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", - "Please play sound from Default Speaker...") - # self.adjust_for_noise("Please play sound from Default Speaker...") + "Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' From 096d6a981dac7cb47c5d192ecdfb0114e0d8f2c6 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 21/29] Checkpoint on mac --- AudioRecorder.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 1c3ef80..7f68f4a 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -133,11 +133,13 @@ def disable(self): self.enabled = False def adjust_for_noise(self, device_name, msg): + # def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment.") + print("[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -190,7 +192,7 @@ def __init__(self): if name == HUMAN_MIC_NAME: self.device_index = index - + py_audio = pyaudio.PyAudio() if self.device_index is not None: default_mic = py_audio.get_device_info_by_index(self.device_index) @@ -230,7 +232,7 @@ def set_device(self, index: int): sample_rate=int(mic["defaultSampleRate"]), channels=mic["maxInputChannels"] ) - + elif os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index @@ -278,7 +280,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() @@ -297,6 +299,7 @@ def __init__(self): print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' From ae088c4749d0c23687923658f3d993348d28fa02 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 22/29] Add support for Mac. 
--- AudioRecorder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 7f68f4a..311a50f 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -41,6 +41,7 @@ 13: 'Windows Vista Audio stack architecture' } + # This needs to be formatted better # Attempt to get more info from it like, device_type Mic vs speaker def print_detailed_audio_info_2(): @@ -232,7 +233,6 @@ def set_device(self, index: int): sample_rate=int(mic["defaultSampleRate"]), channels=mic["maxInputChannels"] ) - elif os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index @@ -280,7 +280,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = pyaudio.PyAudio() From 49bb0d5cc44d8bba2101dd39f405c333fa838abb Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 23/29] checkpoint between win, mac --- AudioRecorder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/AudioRecorder.py b/AudioRecorder.py index 311a50f..34fc6d4 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -41,6 +41,12 @@ 13: 'Windows Vista Audio stack architecture' } +# This needs to be formatted better +# Attempt to get more info from it like, device_type Mic vs speaker +def print_detailed_audio_info_2(): + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Audio device with name "{name}" found at index {index}') + # This needs to be formatted better # Attempt to get more info from it like, device_type Mic vs speaker From 6c447451004af754de8c1b60031624538123ef32 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 24/29] Checkpoint on mac --- AudioRecorder.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 34fc6d4..efe6299 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -286,7 +286,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() From ac2e541822352c9ebd4638cdb38a3bc79e39a357 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 25/29] Add support for Mac. --- AudioRecorder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index efe6299..34fc6d4 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -286,7 +286,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = pyaudio.PyAudio() From c824d926b1be913c86e73595c09783ab1437ae13 Mon Sep 17 00:00:00 2001 From: vivek Date: Thu, 7 Sep 2023 16:54:12 -0400 Subject: [PATCH 26/29] checkpoint between win, mac --- AudioRecorder.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 34fc6d4..a11a67b 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -48,13 +48,6 @@ def print_detailed_audio_info_2(): print(f'Audio device with name "{name}" found at index {index}') -# This needs to be formatted better -# Attempt to get more info from it like, device_type Mic vs speaker -def print_detailed_audio_info_2(): - for index, name in enumerate(sr.Microphone.list_microphone_names()): - print(f'Audio device with name "{name}" found at index {index}') - - def 
print_detailed_audio_info(print_func=print): """ Print information about Host APIs and devices, From 2fc32c9fe35b8ef284dd463fdb75a599eef7cf65 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 17:52:57 -0400 Subject: [PATCH 27/29] Checkpoint on mac --- AudioRecorder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index a11a67b..3e37194 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -279,7 +279,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == BLACKHOLE_MIC_NAME: + if name == MBP_SPEAKER_NAME: self.device_index = index p = pyaudio.PyAudio() From d3c9247019774c8f204b57fb80072730f0230791 Mon Sep 17 00:00:00 2001 From: Abhinav Uppal Date: Thu, 7 Sep 2023 20:46:21 -0400 Subject: [PATCH 28/29] Add support for Mac. --- AudioRecorder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index 3e37194..a11a67b 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -279,7 +279,7 @@ def __init__(self): elif os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) - if name == MBP_SPEAKER_NAME: + if name == BLACKHOLE_MIC_NAME: self.device_index = index p = pyaudio.PyAudio() From 454f4250d09640e9d18f4cca69f7905d6840b867 Mon Sep 17 00:00:00 2001 From: vivek Date: Mon, 11 Sep 2023 12:57:29 -0400 Subject: [PATCH 29/29] win compatibility for mac changes. 
--- AudioRecorder.py | 218 ++++++++++++++------------ AudioTranscriber.py | 91 +++++++---- README.md | 23 ++- custom_speech_recognition/__main__.py | 12 +- main.py | 2 +- requirements.txt | 3 +- 6 files changed, 214 insertions(+), 135 deletions(-) diff --git a/AudioRecorder.py b/AudioRecorder.py index f369e23..f10a392 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,12 +1,19 @@ from datetime import datetime from abc import abstractmethod -# import custom_speech_recognition as sr +import platform import speech_recognition as sr -# import pyaudiowpatch as pyaudio +import custom_speech_recognition as csr import pyaudio -import platform import app_logging as al +os_name = platform.system() +if os_name == 'Windows': + import custom_speech_recognition as csr + import pyaudiowpatch as pyaudio +if os_name == 'Darwin': + import speech_recognition as sr + import pyaudio + # Attempt transcription of the sound file after every RECORD_TIMEOUT seconds RECORD_TIMEOUT = 1 ENERGY_THRESHOLD = 1000 @@ -41,71 +48,17 @@ 13: 'Windows Vista Audio stack architecture' } -# This needs to be formatted better -# Attempt to get more info from it like, device_type Mic vs speaker -def print_detailed_audio_info_2(): - for index, name in enumerate(sr.Microphone.list_microphone_names()): - print(f'Audio device with name "{name}" found at index {index}') - - -def print_detailed_audio_info(print_func=print): - """ - Print information about Host APIs and devices, - using `print_func`. 
- - :param print_func: Print function(or wrapper) - :type print_func: function - :rtype: None - """ - print_func("\n|", "~ Audio Drivers on this machine ~".center(20), "|\n") - header = f" ^ #{'INDEX'.center(7)}#{'DRIVER TYPE'.center(13)}#{'DEVICE COUNT'.center(15)}#{'NAME'.center(5)}" - print_func(header) - print_func("-"*len(header)) - py_audio = pyaudio.PyAudio() - for host_api in py_audio.get_host_api_info_generator(): - print_func( - ( - f" » " - f"{('['+str(host_api['index'])+']').center(8)}|" - f"{str(host_api['type']).center(13)}|" - f"{str(host_api['deviceCount']).center(15)}|" - f" {host_api['name']}" - ) - ) - - print_func("\n\n\n|", "~ Audio Devices on this machine ~".center(20), "|\n") - header = f" ^ #{'INDEX'.center(7)}# HOST API INDEX #{'LOOPBACK'.center(10)}#{'NAME'.center(5)}" - print_func(header) - print_func("-"*len(header)) - for device in py_audio.get_device_info_generator(): - print_func( - ( - f" » " - f"{('['+str(device['index'])+']').center(8)}" - f"{str(device['hostApi']).center(16)}" - f" {str(device['isLoopbackDevice']).center(10)}" - f" {device['name']}" - ) - ) - - # Below statements are useful to view all available fields in the - # driver and device list - # Do not remove these statements from here - # print('Windows Audio Drivers') - # for host_api_info_gen in py_audio.get_host_api_info_generator(): - # print(host_api_info_gen) - - # print('Windows Audio Devices') - # for device_info_gen in py_audio.get_device_info_generator(): - # print(device_info_gen) - class BaseRecorder: """Base class for Speaker, Microphone classes """ def __init__(self, source, source_name): root_logger.info(BaseRecorder.__name__) - self.recorder = sr.Recognizer() + self.os_name = platform.system() + if self.os_name == 'Windows': + self.recorder = csr.Recognizer() + elif self.os_name == 'Darwin': + self.recorder = sr.Recognizer() self.recorder.energy_threshold = ENERGY_THRESHOLD self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD # Determines 
if this device is being used for transcription @@ -122,6 +75,74 @@ def get_name(self): """Get the name of this device """ + @staticmethod + def print_detailed_audio_info(print_func=print): + """ + Print information about Host APIs and devices, + using `print_func`. + + :param print_func: Print function(or wrapper) + :type print_func: function + :rtype: None + """ + os_name = platform.system() + if os_name == 'Darwin': + print_func("\n|", "~ Audio devices on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}#{'NAME'.center(5)}" + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print_func( + ( + f" » " + f"{('['+str(index)+']').center(8)}|" + f" {name}" + ) + ) + # print_func(f'Audio device with name "{name}" found at index {index}') + + elif os_name == 'Windows': + print_func("\n|", "~ Audio Drivers on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}#{'DRIVER TYPE'.center(13)}#{'DEVICE COUNT'.center(15)}#{'NAME'.center(5)}" + print_func(header) + print_func("-"*len(header)) + py_audio = pyaudio.PyAudio() + for host_api in py_audio.get_host_api_info_generator(): + print_func( + ( + f" » " + f"{('['+str(host_api['index'])+']').center(8)}|" + f"{str(host_api['type']).center(13)}|" + f"{str(host_api['deviceCount']).center(15)}|" + f" {host_api['name']}" + ) + ) + + print_func("\n\n\n|", "~ Audio Devices on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}# DRIVER INDEX #{'LOOPBACK'.center(10)}#{'NAME'.center(5)}" + print_func(header) + print_func("-"*len(header)) + for device in py_audio.get_device_info_generator(): + print_func( + ( + f" » " + f"{('['+str(device['index'])+']').center(8)}" + f"{str(device['hostApi']).center(14)}" + f" {str(device['isLoopbackDevice']).center(10)}" + f" {device['name']}" + ) + ) + + # Below statements are useful to view all available fields in the + # driver and device list + # Do not remove these statements from here + # print('Windows Audio 
Drivers') + # for host_api_info_gen in py_audio.get_host_api_info_generator(): + # print(host_api_info_gen) + + # print('Windows Audio Devices') + # for device_info_gen in py_audio.get_device_info_generator(): + # print(device_info_gen) + py_audio.terminate() + def enable(self): """Enable transcription from this device """ @@ -133,7 +154,6 @@ def disable(self): self.enabled = False def adjust_for_noise(self, device_name, msg): - # def adjust_for_noise(self, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) # print(f"[INFO] Adjusting for ambient noise... " + msg) @@ -156,11 +176,11 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - os_name = platform.system() + self.os_name = platform.system() self.device_index = None default_mic = None - if os_name == 'Windows': + if self.os_name == 'Windows': py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) @@ -169,14 +189,14 @@ def __init__(self): self.device_info = default_mic - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]) - # channels=default_mic["maxInputChannels"] - ) + source = csr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) self.source = source py_audio.terminate() - elif os_name == 'Darwin': + elif self.os_name == 'Darwin': # audio = sr.Microphone.get_pyaudio().PyAudio() # Prints a list of all devices # for i in range(audio.get_device_count()): @@ -221,17 +241,16 @@ def set_device(self, index: int): """Set active device based on index. 
""" root_logger.info(MicRecorder.set_device.__name__) - os_name = platform.system() - if os_name == 'Windows': + if self.os_name == 'Windows': with pyaudio.PyAudio() as py_audio: self.device_index = index mic = py_audio.get_device_info_by_index(self.device_index) - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) - elif os_name == 'Darwin': + source = csr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + elif self.os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index mic = p.get_device_info_by_index(self.device_index) @@ -248,15 +267,15 @@ def set_device(self, index: int): class SpeakerRecorder(BaseRecorder): - """Encapsultes the Speaer device audio input + """Encapsultes the Speaker device audio input """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - os_name = platform.system() + self.os_name = platform.system() self.device_index = None - if os_name == 'Windows': + if self.os_name == 'Windows': p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] @@ -269,13 +288,12 @@ def __init__(self): break else: print("[ERROR] No loopback device found.") - p.terminate() - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) - elif os_name == 'Darwin': + source = csr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + elif self.os_name == 'Darwin': for index, name in enumerate(sr.Microphone.list_microphone_names()): # print("Microphone with name 
\"{1}\" found for `Microphone(device_index={0})`".format(index, name)) if name == BLACKHOLE_MIC_NAME: @@ -305,9 +323,8 @@ def set_device(self, index: int): """Set active device based on index. """ root_logger.info(SpeakerRecorder.set_device.__name__) - os_name = platform.system() - if os_name == 'Windows': + if self.os_name == 'Windows': with pyaudio.PyAudio() as p: self.device_index = index speakers = p.get_device_info_by_index(self.device_index) @@ -320,7 +337,7 @@ def set_device(self, index: int): else: print("[ERROR] No loopback device found.") - elif os_name == 'Darwin': + elif self.os_name == 'Darwin': p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) @@ -328,19 +345,26 @@ def set_device(self, index: int): self.device_info = speakers - source = sr.Microphone(speaker=True, - device_index=speakers["index"], - sample_rate=int(speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=speakers["maxInputChannels"]) + if self.os_name == 'Windows': + source = csr.Microphone(speaker=True, + device_index=speakers["index"], + sample_rate=int(speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=speakers["maxInputChannels"]) + elif self.os_name == 'Darwin': + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') - self.adjust_for_noise("Speaker", + self.adjust_for_noise("Speaker", f"Please play sound from selected Speakers {self.get_name()}...") if __name__ == "__main__": - print_detailed_audio_info() + BaseRecorder.print_detailed_audio_info() # Below statements are useful to view all available fields in the # default Input Device. 
# Do not delete these lines diff --git a/AudioTranscriber.py b/AudioTranscriber.py index f31ea28..7b3c2a1 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -2,19 +2,24 @@ import queue from heapq import merge import threading +import platform import io import datetime # import pprint import wave import tempfile -# import custom_speech_recognition as sr -import speech_recognition as sr -# import pyaudiowpatch as pyaudio -import pyaudio import conversation import constants import app_logging as al +os_name = platform.system() +if os_name == 'Windows': + import custom_speech_recognition as csr + import pyaudiowpatch as pyaudio +if os_name == 'Darwin': + import speech_recognition as sr + import pyaudio + PHRASE_TIMEOUT = 3.05 root_logger = al.get_logger() @@ -30,27 +35,51 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. self.transcribe = True - # channels commented for mac - self.audio_sources = { - "You": { - "sample_rate": mic_source.SAMPLE_RATE, - "sample_width": mic_source.SAMPLE_WIDTH, - # "channels": mic_source.channels, - "last_sample": bytes(), - "last_spoken": None, - "new_phrase": True, - "process_data_func": self.process_mic_data - }, - "Speaker": { - "sample_rate": speaker_source.SAMPLE_RATE, - "sample_width": speaker_source.SAMPLE_WIDTH, - # "channels": speaker_source.channels, - "last_sample": bytes(), - "last_spoken": None, - "new_phrase": True, - "process_data_func": self.process_speaker_data + self.os_name = platform.system() + + if self.os_name == 'Windows': + self.audio_sources = { + "You": { + "sample_rate": mic_source.SAMPLE_RATE, + "sample_width": mic_source.SAMPLE_WIDTH, + "channels": mic_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_mic_data + }, + "Speaker": { + "sample_rate": speaker_source.SAMPLE_RATE, + 
"sample_width": speaker_source.SAMPLE_WIDTH, + "channels": speaker_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_speaker_data + } + } + elif self.os_name == 'Darwin': + self.audio_sources = { + "You": { + "sample_rate": mic_source.SAMPLE_RATE, + "sample_width": mic_source.SAMPLE_WIDTH, + # "channels": mic_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_mic_data + }, + "Speaker": { + "sample_rate": speaker_source.SAMPLE_RATE, + "sample_width": speaker_source.SAMPLE_WIDTH, + # "channels": speaker_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_speaker_data + } } - } + self.conversation = convo def transcribe_audio_queue(self, audio_queue: queue.Queue): @@ -98,7 +127,10 @@ def process_mic_data(self, data, temp_file_name): root_logger.info(AudioTranscriber.process_mic_data.__name__) if not self.transcribe: return - audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) + if self.os_name == 'Windows': + audio_data = csr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) + elif self.os_name == 'Darwin': + audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) wav_data = io.BytesIO(audio_data.get_wav_data()) with open(temp_file_name, 'w+b') as file_handle: file_handle.write(wav_data.read()) @@ -108,9 +140,12 @@ def process_speaker_data(self, data, temp_file_name): if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - # commented for mac, get from pyaudio itself - # wf.setnchannels(self.audio_sources["Speaker"]["channels"]) - p = pyaudio.PyAudio() + if self.os_name == 'Windows': + wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + p = 
pyaudio.PyAudio() + if self.os_name == 'Darwin': + p = pyaudio.PyAudio() + wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) wf.writeframes(data) diff --git a/README.md b/README.md index 7f8d2d4..0c616b3 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,12 @@ Follow below steps to run transcribe on your local machine. ### 📋 Prerequisites +#### Common - Python >=3.8.0 - (Optional) An OpenAI API key that can access OpenAI API (set up a paid account OpenAI account) -- Windows OS (Not tested on others) - FFmpeg +#### Windows If FFmpeg is not installed in your system, follow the steps below to install it. First, install Chocolatey, a package manager for Windows. Open PowerShell as Administrator and run the following command: @@ -26,6 +27,23 @@ choco install ffmpeg ``` Please run these commands in a PowerShell window with administrator privileges. For any issues during the installation, visit the official [Chocolatey](https://chocolatey.org/) and [FFmpeg](https://ffmpeg.org/) websites for troubleshooting. +#### macOS + +**Dependencies and tools** + +- XCode Command Line Tools +- `brew install portaudio python-tk ffmpeg blackhole-2ch` + +**BlackHole configuration** + +Setup "Multi-Ouput Device" and set it as default sound output device for your macOS. Guidelines are available [here](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device) + +**Configuring device names** + +Speakers audio on macOS will be recorded from the virtual "BlackHole 2ch" microphone. Your and BlackHole microphone device names could be adjusted via `HUMAN_MIC_NAME` and `BLACKHOLE_MIC_NAME` vars in the [AudioRecorder.py](./AudioRecorder.py). + +Run `python main.py -l` to get speaker and microphone devices list and their indices. + ### 🔧 Code Installation 1. 
Clone transcribe repository: @@ -149,7 +167,7 @@ https://drive.google.com/file/d/1Iy32YjDXK7Bga7amOUTA4Gx9VEoibPi-/view?usp=shari ### ⚡️ Limitations ⚡️ -While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of: +While Transcribe provides real-time transcription and optional response suggestions, there are few known limitations to its functionality: **Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words. @@ -167,6 +185,7 @@ Incorrect API key provided: API_KEY. You can find your API key at https://platfo This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. ## ➕ Enhancements from base repository ➕ +- macOS support - Speech Mode - Read out responses from ChatGPT as Audio - Do not need Open AI key, paid Open AI account to use the complete functionality - Allow users selective disabling of mic, speaker audio input diff --git a/custom_speech_recognition/__main__.py b/custom_speech_recognition/__main__.py index 68f5652..dcfd0e4 100644 --- a/custom_speech_recognition/__main__.py +++ b/custom_speech_recognition/__main__.py @@ -5,20 +5,22 @@ try: print("A moment of silence, please...") - with m as source: r.adjust_for_ambient_noise(source) - print("Set minimum energy threshold to {}".format(r.energy_threshold)) + with m as source: + r.adjust_for_ambient_noise(source) + print(f'Set minimum energy threshold to {r.energy_threshold}') while True: print("Say something!") - with m as source: audio = r.listen(source) + with m as source: + audio = r.listen(source) print("Got it! 
Now to recognize it...") try: # recognize speech using Google Speech Recognition value = r.recognize_google(audio) - print("You said {}".format(value)) + print(f'You said {value}') except sr.UnknownValueError: print("Oops! Didn't catch that") except sr.RequestError as e: - print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e)) + print(f"Uh oh! Couldn't request results from Google Speech Recognition service; {e}") except KeyboardInterrupt: pass diff --git a/main.py b/main.py index cf9cf96..e3e59d5 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info_2() + ar.BaseRecorder.print_detailed_audio_info() return # Initiate global variables diff --git a/requirements.txt b/requirements.txt index 652b528..71ebdb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,17 +3,16 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -# PyAudioWPatch==0.2.12.5 PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch pyperclip PyYAML -numpy soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession playsound==1.2.2 SpeechRecognition==3.10.0 +PyAudioWPatch==0.2.12.5; platform_system == "Windows"