diff --git a/AudioRecorder.py b/AudioRecorder.py index 911c8ae..f10a392 100644 --- a/AudioRecorder.py +++ b/AudioRecorder.py @@ -1,13 +1,32 @@ from datetime import datetime from abc import abstractmethod -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio +import platform +import speech_recognition as sr +import custom_speech_recognition as csr +import pyaudio import app_logging as al -RECORD_TIMEOUT = 3 +os_name = platform.system() +if os_name == 'Windows': + import custom_speech_recognition as csr + import pyaudiowpatch as pyaudio +if os_name == 'Darwin': + import speech_recognition as sr + import pyaudio + +# Attempt transcription of the sound file after every RECORD_TIMEOUT seconds +RECORD_TIMEOUT = 1 ENERGY_THRESHOLD = 1000 DYNAMIC_ENERGY_THRESHOLD = False +MBP_SPEAKER_NAME = "MacBook Pro Speakers" +MBP_MIC_NAME = "MacBook Pro Microphone" + +PLANTRONICS_3220_MIC_NAME = "Plantronics Blackwire 3220 Series" +HUMAN_MIC_NAME = PLANTRONICS_3220_MIC_NAME +# macOS specific, see README.md#macos for the details on how to configure the BlackHole device +BLACKHOLE_MIC_NAME = "BlackHole 2ch" + root_logger = al.get_logger() @@ -30,64 +49,16 @@ } -def print_detailed_audio_info(print_func=print): - """ - Print information about Host APIs and devices, - using `print_func`. - - :param print_func: Print function(or wrapper) - :type print_func: function - :rtype: None - """ - print_func("\n|", "~ Audio Drivers on this machine ~".center(20), "|\n") - header = f" ^ #{'INDEX'.center(7)}#{'DRIVER TYPE'.center(13)}#{'DEVICE COUNT'.center(15)}#{'NAME'.center(5)}" - print_func(header) - print_func("-"*len(header)) - py_audio = pyaudio.PyAudio() - for host_api in py_audio.get_host_api_info_generator(): - print_func( - ( - f" » " - f"{('['+str(host_api['index'])+']').center(8)}|" - f"{str(host_api['type']).center(13)}|" - f"{str(host_api['deviceCount']).center(15)}|" - f" {host_api['name']}" - ) - ) - - print_func("\n\n\n|", "~ Audio Devices on this machine ~".center(20), "|\n") - header = f" ^ #{'INDEX'.center(7)}# HOST API INDEX #{'LOOPBACK'.center(10)}#{'NAME'.center(5)}" - print_func(header) - print_func("-"*len(header)) - for device in py_audio.get_device_info_generator(): - print_func( - ( - f" » " - f"{('['+str(device['index'])+']').center(8)}" - f"{str(device['hostApi']).center(16)}" - f" {str(device['isLoopbackDevice']).center(10)}" - f" {device['name']}" - ) - ) - - # Below statements are useful to view all available fields in the - # driver and device list - # Do not remove these statements from here - # print('Windows Audio Drivers') - # for host_api_info_gen in py_audio.get_host_api_info_generator(): - # print(host_api_info_gen) - - # print('Windows Audio Devices') - # for device_info_gen in py_audio.get_device_info_generator(): - # print(device_info_gen) - - class BaseRecorder: """Base class for Speaker, Microphone classes """ def __init__(self, source, source_name): root_logger.info(BaseRecorder.__name__) - self.recorder = sr.Recognizer() + self.os_name = platform.system() + if self.os_name == 'Windows': + self.recorder = csr.Recognizer() + elif self.os_name == 'Darwin': + self.recorder = sr.Recognizer() self.recorder.energy_threshold = ENERGY_THRESHOLD self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD # Determines if this device is being used for transcription @@ -104,6 +75,74 @@ def get_name(self): """Get the name of this device """ + @staticmethod + def print_detailed_audio_info(print_func=print): + """ + Print information about Host APIs and devices, + using 
`print_func`. + + :param print_func: Print function(or wrapper) + :type print_func: function + :rtype: None + """ + os_name = platform.system() + if os_name == 'Darwin': + print_func("\n|", "~ Audio devices on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}#{'NAME'.center(5)}" + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print_func( + ( + f" » " + f"{('['+str(index)+']').center(8)}|" + f" {name}" + ) + ) + # print_func(f'Audio device with name "{name}" found at index {index}') + + elif os_name == 'Windows': + print_func("\n|", "~ Audio Drivers on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}#{'DRIVER TYPE'.center(13)}#{'DEVICE COUNT'.center(15)}#{'NAME'.center(5)}" + print_func(header) + print_func("-"*len(header)) + py_audio = pyaudio.PyAudio() + for host_api in py_audio.get_host_api_info_generator(): + print_func( + ( + f" » " + f"{('['+str(host_api['index'])+']').center(8)}|" + f"{str(host_api['type']).center(13)}|" + f"{str(host_api['deviceCount']).center(15)}|" + f" {host_api['name']}" + ) + ) + + print_func("\n\n\n|", "~ Audio Devices on this machine ~".center(20), "|\n") + header = f" ^ #{'INDEX'.center(7)}# DRIVER INDEX #{'LOOPBACK'.center(10)}#{'NAME'.center(5)}" + print_func(header) + print_func("-"*len(header)) + for device in py_audio.get_device_info_generator(): + print_func( + ( + f" » " + f"{('['+str(device['index'])+']').center(8)}" + f"{str(device['hostApi']).center(14)}" + f" {str(device['isLoopbackDevice']).center(10)}" + f" {device['name']}" + ) + ) + + # Below statements are useful to view all available fields in the + # driver and device list + # Do not remove these statements from here + # print('Windows Audio Drivers') + # for host_api_info_gen in py_audio.get_host_api_info_generator(): + # print(host_api_info_gen) + + # print('Windows Audio Devices') + # for device_info_gen in py_audio.get_device_info_generator(): + # print(device_info_gen) + py_audio.terminate() + def enable(self): """Enable transcription from this device """ @@ -117,9 +156,10 @@ def disable(self): def adjust_for_noise(self, device_name, msg): root_logger.info(BaseRecorder.adjust_for_noise.__name__) print(f"[INFO] Adjusting for ambient noise from {device_name}. " + msg) + # print(f"[INFO] Adjusting for ambient noise... 
" + msg) with self.source: self.recorder.adjust_for_ambient_noise(self.source) - print(f"[INFO] Completed ambient noise adjustment for {device_name}.") + print("[INFO] Completed ambient noise adjustment.") def record_into_queue(self, audio_queue): def record_callback(_, audio: sr.AudioData) -> None: @@ -136,53 +176,107 @@ class MicRecorder(BaseRecorder): """ def __init__(self): root_logger.info(MicRecorder.__name__) - with pyaudio.PyAudio() as py_audio: + self.os_name = platform.system() + self.device_index = None + default_mic = None + + if self.os_name == 'Windows': + py_audio = pyaudio.PyAudio() # WASAPI is windows specific wasapi_info = py_audio.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultInputDevice"] default_mic = py_audio.get_device_info_by_index(self.device_index) - self.device_info = default_mic + self.device_info = default_mic + + source = csr.Microphone(device_index=default_mic["index"], + sample_rate=int(default_mic["defaultSampleRate"]) + # channels=default_mic["maxInputChannels"] + ) + self.source = source + py_audio.terminate() + + elif self.os_name == 'Darwin': + # audio = sr.Microphone.get_pyaudio().PyAudio() + # Prints a list of all devices + # for i in range(audio.get_device_count()): + # device_info = audio.get_device_info_by_index(i) + # print(f'Name: {device_info.get("name")}, InputChannels: {device_info.get("maxInputChannels")} OutputChannels: {device_info.get("maxOutputChannels")}') + # + # Prints device info to see all fields inside the device info object + # print(device_info) + # audio.terminate() + + for index, name in enumerate(sr.Microphone.list_microphone_names()): + print(f'Microphone with name "{name}" found for device_index={index})') + + if name == HUMAN_MIC_NAME: + self.device_index = index + py_audio = pyaudio.PyAudio() + if self.device_index is not None: + default_mic = py_audio.get_device_info_by_index(self.device_index) + + self.device_info = default_mic + + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + py_audio.terminate() + + print(f'[DEBUG] "{MBP_MIC_NAME}" microphone index is: {self.device_index}') - source = sr.Microphone(device_index=default_mic["index"], - sample_rate=int(default_mic["defaultSampleRate"]), - channels=default_mic["maxInputChannels"] - ) - self.source = source super().__init__(source=source, source_name="You") print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') # This line is commented because in case of non default microphone it can occasionally take # several minutes to execute, thus delaying the start of the application. - # self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") + self.adjust_for_noise("Default Mic", "Please make some noise from the Default Mic...") def get_name(self): - return f'#{self.device_index} - {self.device_info["name"]}' + if self.device_info is not None: + return f'#{self.device_index} - {self.device_info["name"]}' + return None def set_device(self, index: int): """Set active device based on index. 
""" root_logger.info(MicRecorder.set_device.__name__) - with pyaudio.PyAudio() as py_audio: + if self.os_name == 'Windows': + with pyaudio.PyAudio() as py_audio: + self.device_index = index + mic = py_audio.get_device_info_by_index(self.device_index) + + source = csr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]), + channels=mic["maxInputChannels"] + ) + elif self.os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index - mic = py_audio.get_device_info_by_index(self.device_index) + mic = p.get_device_info_by_index(self.device_index) + p.terminate() + source = sr.Microphone(device_index=mic["index"], + sample_rate=int(mic["defaultSampleRate"]) + ) self.device_info = mic - source = sr.Microphone(device_index=mic["index"], - sample_rate=int(mic["defaultSampleRate"]), - channels=mic["maxInputChannels"] - ) self.source = source print(f'[INFO] Listening to sound from Microphone: {self.get_name()} ') self.adjust_for_noise("Mic", "Please make some noise from the chosen Mic...") class SpeakerRecorder(BaseRecorder): - """Encapsultes the Speaer device audio input + """Encapsultes the Speaker device audio input """ def __init__(self): root_logger.info(SpeakerRecorder.__name__) - with pyaudio.PyAudio() as p: + + self.os_name = platform.system() + self.device_index = None + + if self.os_name == 'Windows': + p = pyaudio.PyAudio() wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) self.device_index = wasapi_info["defaultOutputDevice"] default_speakers = p.get_device_info_by_index(self.device_index) @@ -194,18 +288,33 @@ def __init__(self): break else: print("[ERROR] No loopback device found.") + source = csr.Microphone(speaker=True, + device_index=default_speakers["index"], + sample_rate=int(default_speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=default_speakers["maxInputChannels"]) + elif self.os_name == 'Darwin': + for index, name in enumerate(sr.Microphone.list_microphone_names()): + # print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name)) + if name == BLACKHOLE_MIC_NAME: + self.device_index = index + p = pyaudio.PyAudio() + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + default_speakers = p.get_device_info_by_index(self.device_index) + p.terminate() + + print("[DEBUG] \"{}\" microphone index is: {}".format(BLACKHOLE_MIC_NAME, self.device_index)) self.device_info = default_speakers - source = sr.Microphone(speaker=True, - device_index=default_speakers["index"], - sample_rate=int(default_speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=default_speakers["maxInputChannels"]) super().__init__(source=source, source_name="Speaker") print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Default Speaker", "Please play sound from Default Speaker...") + # self.adjust_for_noise("Please play sound from Default Speaker...") def get_name(self): return f'#{self.device_index} - {self.device_info["name"]}' @@ -214,25 +323,40 @@ def set_device(self, index: int): """Set active device based on index. 
""" root_logger.info(SpeakerRecorder.set_device.__name__) - with pyaudio.PyAudio() as p: + + if self.os_name == 'Windows': + with pyaudio.PyAudio() as p: + self.device_index = index + speakers = p.get_device_info_by_index(self.device_index) + + if not speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if speakers["name"] in loopback["name"]: + speakers = loopback + break + else: + print("[ERROR] No loopback device found.") + + elif self.os_name == 'Darwin': + p = pyaudio.PyAudio() self.device_index = index speakers = p.get_device_info_by_index(self.device_index) - - if not speakers["isLoopbackDevice"]: - for loopback in p.get_loopback_device_info_generator(): - if speakers["name"] in loopback["name"]: - speakers = loopback - break - else: - print("[ERROR] No loopback device found.") + p.terminate() self.device_info = speakers - source = sr.Microphone(speaker=True, - device_index=speakers["index"], - sample_rate=int(speakers["defaultSampleRate"]), - chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), - channels=speakers["maxInputChannels"]) + if self.os_name == 'Windows': + source = csr.Microphone(speaker=True, + device_index=speakers["index"], + sample_rate=int(speakers["defaultSampleRate"]), + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16), + channels=speakers["maxInputChannels"]) + elif self.os_name == 'Darwin': + source = sr.Microphone( + device_index=self.device_index, + chunk_size=pyaudio.get_sample_size(pyaudio.paInt16) + ) + self.source = source print(f'[INFO] Listening to sound from Speaker: {self.get_name()} ') self.adjust_for_noise("Speaker", @@ -240,7 +364,7 @@ def set_device(self, index: int): if __name__ == "__main__": - print_detailed_audio_info() + BaseRecorder.print_detailed_audio_info() # Below statements are useful to view all available fields in the # default Input Device. # Do not delete these lines diff --git a/AudioTranscriber.py b/AudioTranscriber.py index 70def0c..7b3c2a1 100644 --- a/AudioTranscriber.py +++ b/AudioTranscriber.py @@ -2,22 +2,32 @@ import queue from heapq import merge import threading +import platform import io -from datetime import timedelta -import pprint +import datetime +# import pprint import wave import tempfile -import custom_speech_recognition as sr -import pyaudiowpatch as pyaudio import conversation import constants +import app_logging as al + +os_name = platform.system() +if os_name == 'Windows': + import custom_speech_recognition as csr + import pyaudiowpatch as pyaudio +if os_name == 'Darwin': + import speech_recognition as sr + import pyaudio PHRASE_TIMEOUT = 3.05 +root_logger = al.get_logger() class AudioTranscriber: def __init__(self, mic_source, speaker_source, model, convo: conversation.Conversation): + root_logger.info(AudioTranscriber.__name__) # Transcript_data should be replaced with the conversation object. # We do not need to store transcription in 2 different places. self.transcript_data = {"You": [], "Speaker": []} @@ -25,26 +35,51 @@ def __init__(self, mic_source, speaker_source, model, convo: conversation.Conver self.audio_model = model # Determines if transcription is enabled for the application. By default it is enabled. 
self.transcribe = True - self.audio_sources = { - "You": { - "sample_rate": mic_source.SAMPLE_RATE, - "sample_width": mic_source.SAMPLE_WIDTH, - "channels": mic_source.channels, - "last_sample": bytes(), - "last_spoken": None, - "new_phrase": True, - "process_data_func": self.process_mic_data - }, - "Speaker": { - "sample_rate": speaker_source.SAMPLE_RATE, - "sample_width": speaker_source.SAMPLE_WIDTH, - "channels": speaker_source.channels, - "last_sample": bytes(), - "last_spoken": None, - "new_phrase": True, - "process_data_func": self.process_speaker_data + self.os_name = platform.system() + + if self.os_name == 'Windows': + self.audio_sources = { + "You": { + "sample_rate": mic_source.SAMPLE_RATE, + "sample_width": mic_source.SAMPLE_WIDTH, + "channels": mic_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_mic_data + }, + "Speaker": { + "sample_rate": speaker_source.SAMPLE_RATE, + "sample_width": speaker_source.SAMPLE_WIDTH, + "channels": speaker_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_speaker_data + } + } + elif self.os_name == 'Darwin': + self.audio_sources = { + "You": { + "sample_rate": mic_source.SAMPLE_RATE, + "sample_width": mic_source.SAMPLE_WIDTH, + # "channels": mic_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_mic_data + }, + "Speaker": { + "sample_rate": speaker_source.SAMPLE_RATE, + "sample_width": speaker_source.SAMPLE_WIDTH, + # "channels": speaker_source.channels, + "last_sample": bytes(), + "last_spoken": None, + "new_phrase": True, + "process_data_func": self.process_speaker_data + } } - } + self.conversation = convo def transcribe_audio_queue(self, audio_queue: queue.Queue): @@ -60,8 +95,8 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue): text = '' try: - fd, path = tempfile.mkstemp(suffix=".wav") - os.close(fd) + file_descritor, path = tempfile.mkstemp(suffix=".wav") + os.close(file_descritor) source_info["process_data_func"](source_info["last_sample"], path) if self.transcribe: text = self.audio_model.get_transcription(path) @@ -75,10 +110,11 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue): self.transcript_changed_event.set() def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken): + root_logger.info(AudioTranscriber.update_last_sample_and_phrase_status.__name__) if not self.transcribe: return source_info = self.audio_sources[who_spoke] - if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT): + if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > datetime.timedelta(seconds=PHRASE_TIMEOUT): source_info["last_sample"] = bytes() source_info["new_phrase"] = True else: @@ -88,19 +124,28 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken): source_info["last_spoken"] = time_spoken def process_mic_data(self, data, temp_file_name): + root_logger.info(AudioTranscriber.process_mic_data.__name__) if not self.transcribe: return - audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) + if self.os_name == 'Windows': + audio_data = csr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) + elif self.os_name == 'Darwin': + audio_data = sr.AudioData(data, 
self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"]) wav_data = io.BytesIO(audio_data.get_wav_data()) with open(temp_file_name, 'w+b') as file_handle: file_handle.write(wav_data.read()) def process_speaker_data(self, data, temp_file_name): + root_logger.info(AudioTranscriber.process_speaker_data.__name__) if not self.transcribe: return with wave.open(temp_file_name, 'wb') as wf: - wf.setnchannels(self.audio_sources["Speaker"]["channels"]) - p = pyaudio.PyAudio() + if self.os_name == 'Windows': + wf.setnchannels(self.audio_sources["Speaker"]["channels"]) + p = pyaudio.PyAudio() + if self.os_name == 'Darwin': + p = pyaudio.PyAudio() + wf.setsampwidth(p.get_sample_size(pyaudio.paInt16)) wf.setframerate(self.audio_sources["Speaker"]["sample_rate"]) wf.writeframes(data) @@ -138,7 +183,7 @@ def get_transcript(self, length: int = 0): self.transcript_data["You"], self.transcript_data["Speaker"], key=lambda x: x[1], reverse=False)) combined_transcript = combined_transcript[-length:] - current_return_val = "".join([t[0] for t in combined_transcript]) + # current_return_val = "".join([t[0] for t in combined_transcript]) sources = [ constants.PERSONA_YOU, constants.PERSONA_SPEAKER @@ -165,3 +210,5 @@ def clear_transcript_data(self): self.audio_sources["You"]["new_phrase"] = True self.audio_sources["Speaker"]["new_phrase"] = True + + self.conversation.clear_conversation_data() diff --git a/GPTResponder.py b/GPTResponder.py index 58b6da8..b353bce 100644 --- a/GPTResponder.py +++ b/GPTResponder.py @@ -32,17 +32,18 @@ def generate_response_from_transcript_no_check(self, transcript) -> str: Updates the conversation object with the response from LLM. """ try: - prompt_api_message = prompts.create_single_turn_prompt_message(transcript) + root_logger.info(GPTResponder.generate_response_from_transcript_no_check.__name__) + # prompt_api_message = prompts.create_single_turn_prompt_message(transcript) multiturn_prompt_content = self.conversation.get_merged_conversation( length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM) multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content) # pprint.pprint(f'Prompt api message: {prompt_api_message}') # print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}') - usual_response = openai.ChatCompletion.create( - model=self.model, - messages=prompt_api_message, - temperature=0.0 - ) + # usual_response = openai.ChatCompletion.create( + # model=self.model, + # messages=prompt_api_message, + # temperature=0.0 + # ) # Multi turn response is only effective when continuous mode is off. # In continuous mode, there are far too many responses from LLM, # they confuse the LLM if that many responses are replayed back to LLM. 
@@ -101,13 +102,14 @@ def process_response(self, input_str: str) -> str: def generate_response_from_transcript(self, transcript): """Ping OpenAI LLM model to get response from the Assistant """ - + root_logger.info(GPTResponder.generate_response_from_transcript.__name__) if self.gl_vars.freeze_state[0]: return '' return self.generate_response_from_transcript_no_check(transcript) def update_conversation(self, response, persona): + root_logger.info(GPTResponder.update_conversation.__name__) if response != '': self.response = response self.conversation.update_conversation(persona=persona, @@ -129,7 +131,7 @@ def respond_to_transcriber(self, transcriber): if not self.gl_vars.freeze_state[0]: transcript_string = transcriber.get_transcript( length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM) - response = self.generate_response_from_transcript(transcript_string) + self.generate_response_from_transcript(transcript_string) end_time = time.time() # Measure end time execution_time = end_time - start_time # Calculate time to execute the function diff --git a/README.md b/README.md index 8a10bac..0c616b3 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,12 @@ Follow below steps to run transcribe on your local machine. ### 📋 Prerequisites +#### Common - Python >=3.8.0 - (Optional) An OpenAI API key that can access OpenAI API (set up a paid account OpenAI account) -- Windows OS (Not tested on others) - FFmpeg +#### Windows If FFmpeg is not installed in your system, follow the steps below to install it. First, install Chocolatey, a package manager for Windows. Open PowerShell as Administrator and run the following command: @@ -26,6 +27,23 @@ choco install ffmpeg ``` Please run these commands in a PowerShell window with administrator privileges. For any issues during the installation, visit the official [Chocolatey](https://chocolatey.org/) and [FFmpeg](https://ffmpeg.org/) websites for troubleshooting. +#### macOS + +**Dependencies and tools** + +- Xcode Command Line Tools +- `brew install portaudio python-tk ffmpeg blackhole-2ch` + +**BlackHole configuration** + +Set up a "Multi-Output Device" and set it as the default sound output device for your macOS. Guidelines are available [here](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device). + +**Configuring device names** + +Speaker audio on macOS is recorded from the virtual "BlackHole 2ch" microphone. Your microphone and the BlackHole device names can be adjusted via the `HUMAN_MIC_NAME` and `BLACKHOLE_MIC_NAME` variables in [AudioRecorder.py](./AudioRecorder.py). + +Run `python main.py -l` to list the speaker and microphone devices and their indices. + ### 🔧 Code Installation 1. Clone transcribe repository: @@ -46,6 +64,8 @@ Please run these commands in a PowerShell window with administrator privileges. pip install -r requirements.txt ``` + It is recommended to create a virtual environment before installing the required packages. + 4. (Optional) Replace the Open API key in `parameters.yaml` file in the transcribe directory: Replace the Open API key in `parameters.yaml` file manually. Open in a text editor and alter the line: @@ -73,7 +93,32 @@ Upon initiation, Transcribe will begin transcribing microphone input and speaker The --api flag will use the whisper api for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind, using the Whisper API consumes OpenAI credits than using the local model. 
This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it a worthwhile for your use case. -### 🎬 Running Transcribe +### 🎬 Customizing Transcribe + +By default, the ChatGPT API behaves like a casual friend engaging in light-hearted banter. To customize the responses and make them specific to a field, see this section in parameters.yaml and the corresponding examples: + +``` + system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + + + initial_convo: + first: + role: "You" + # content: "I am V, I want to learn about Fantasy Football" + # content: "I am V, I want to learn about Basketball" + content: Hey assistant, how are you doing today, I am in mood of a casual conversation. + second: + role: "assistant" + # content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about Fantasy Football" + content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you. +``` + +Change system_prompt and initial_convo to be specific to the scenario you are interested in. + +### 🎬 Testing Transcribe Code changes Unit Tests @@ -122,9 +167,7 @@ https://drive.google.com/file/d/1Iy32YjDXK7Bga7amOUTA4Gx9VEoibPi-/view?usp=shari ### ⚡️ Limitations ⚡️ -While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of: - -**Default Mic and Speaker:** Transcribe is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. To use a different mic or speaker, need to set it as your default device in your system settings. +While Transcribe provides real-time transcription and optional response suggestions, there are a few known limitations to its functionality: **Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words. @@ -142,6 +185,7 @@ Incorrect API key provided: API_KEY. 
You can find your API key at https://platfo This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. ## ➕ Enhancements from base repository ➕ +- macOS support - Speech Mode - Read out responses from ChatGPT as Audio - Do not need Open AI key, paid Open AI account to use the complete functionality - Allow users selective disabling of mic, speaker audio input diff --git a/conversation.py b/conversation.py index a014b11..4a6b6c5 100644 --- a/conversation.py +++ b/conversation.py @@ -15,6 +15,9 @@ def __init__(self): constants.PERSONA_YOU: [], constants.PERSONA_SPEAKER: [], constants.PERSONA_ASSISTANT: []} + self.initialize_conversation() + + def initialize_conversation(self): config = configuration.Config().get_data() prompt = config["OpenAI"]["system_prompt"] self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt, @@ -35,7 +38,7 @@ def clear_conversation_data(self): self.transcript_data[constants.PERSONA_SPEAKER].clear() self.transcript_data[constants.PERSONA_SYSTEM].clear() self.transcript_data[constants.PERSONA_ASSISTANT].clear() - self.last_update = datetime.datetime.now() + self.initialize_conversation() def update_conversation(self, persona: str, text: str, time_spoken, pop: bool = False): """Update conversation with new data diff --git a/custom_speech_recognition/__main__.py b/custom_speech_recognition/__main__.py index 68f5652..dcfd0e4 100644 --- a/custom_speech_recognition/__main__.py +++ b/custom_speech_recognition/__main__.py @@ -5,20 +5,22 @@ try: print("A moment of silence, please...") - with m as source: r.adjust_for_ambient_noise(source) - print("Set minimum energy threshold to {}".format(r.energy_threshold)) + with m as source: + r.adjust_for_ambient_noise(source) + print(f'Set minimum energy threshold to {r.energy_threshold}') while True: print("Say something!") - with m as source: audio = r.listen(source) + with m as source: + audio = r.listen(source) print("Got it! Now to recognize it...") try: # recognize speech using Google Speech Recognition value = r.recognize_google(audio) - print("You said {}".format(value)) + print(f'You said {value}') except sr.UnknownValueError: print("Oops! Didn't catch that") except sr.RequestError as e: - print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e)) + print(f"Uh oh! Couldn't request results from Google Speech Recognition service; {e}") except KeyboardInterrupt: pass diff --git a/duration.py b/duration.py index 690ef5f..9bca3dc 100644 --- a/duration.py +++ b/duration.py @@ -16,10 +16,10 @@ class Duration: Duration(dd:hh:ss:ms) of Test Operation 0:00:02.000826 """ - def __init__(self, operation_name: str = 'undefined'): + def __init__(self, name: str = 'undefined'): self.start: datetime.datetime = None self.end: datetime.datetime = None - self.operation_name = operation_name + self.operation_name = name def __enter__(self): """Records the start time of an operation diff --git a/main.py b/main.py index 19617be..e3e59d5 100644 --- a/main.py +++ b/main.py @@ -79,7 +79,7 @@ def main(): if args.list_devices: print('\n\nList all audio drivers and devices on this machine') - ar.print_detailed_audio_info() + ar.BaseRecorder.print_detailed_audio_info() return # Initiate global variables diff --git a/parameters.yaml b/parameters.yaml index 5c0facd..cbd27d0 100644 --- a/parameters.yaml +++ b/parameters.yaml @@ -21,9 +21,9 @@ OpenAI: # The combination of system_prompt, initial_convo is used to create a multi turn prompt message for LLM. 
# system_prompt_1, systen_prompt_2 are here as samples of other possible prompts. # Only the content of system_prompt parameter will be used - system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." - system_prompt_1: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." - system_prompt_2: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." + system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." +# system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." +# system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly." # When we anticipate to talk about a specific topic, seed the content with some conversation # If the conversation is generic, replace this text with something like this. @@ -35,10 +35,13 @@ OpenAI: first: role: "You" # content: "I am V, I want to learn about Fantasy Football" - content: "I am V, I want to learn about Basketball" + # content: "I am V, I want to learn about Basketball" + content: Hey assistant, how are you doing today, I am in mood of a casual conversation. second: role: "assistant" - content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about basketball" + # content: "Hello, V. That's awesome! What do you want to know about Fantasy Football" + content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you. 
General: log_file: 'Transcribe.log' diff --git a/requirements.txt b/requirements.txt index ba4d3f8..71ebdb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,15 +3,16 @@ openai-whisper==20230314 Wave==0.0.2 openai==0.27.6 customtkinter==5.1.3 -PyAudioWPatch==0.2.12.5 +PyAudio==0.2.13 pyinstaller==5.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 torch pyperclip PyYAML -numpy soundfile gtts # Playsound version 1.3 has issues in playing back audio files # in case of continuous play back of files in quick succession -playsound==1.2.2 \ No newline at end of file +playsound==1.2.2 +SpeechRecognition==3.10.0 +PyAudioWPatch==0.2.12.5; platform_system == "Windows" diff --git a/ui.py b/ui.py index 425d0f5..d210a24 100644 --- a/ui.py +++ b/ui.py @@ -14,6 +14,7 @@ root_logger = al.get_logger() UI_FONT_SIZE = 20 last_transcript_ui_update_time: datetime.datetime = datetime.datetime.now() +global_vars_module: GlobalVars.TranscriptionGlobals = None class ui_callbacks: @@ -44,15 +45,17 @@ def freeze_unfreeze(self): root_logger.info(ui_callbacks.freeze_unfreeze.__name__) self.global_vars.freeze_state[0] = not self.global_vars.freeze_state[0] # Invert the state self.global_vars.freeze_button.configure( - value="Suggest Responses Continuously" if self.global_vars.freeze_state[0] else "Do Not Suggest Responses Continuously" + text="Suggest Responses Continuously" if self.global_vars.freeze_state[0] else "Do Not Suggest Responses Continuously" ) # to enable/disable speaker/microphone when args are given or button is pressed def enable_disable_speaker(self, editmenu): + """Toggles the state of speaker""" self.global_vars.speaker_audio_recorder.enabled = not self.global_vars.speaker_audio_recorder.enabled editmenu.entryconfigure(2, label="Disable Speaker" if self.global_vars.speaker_audio_recorder.enabled else "Enable Speaker") def enable_disable_microphone(self, editmenu): + """Toggles the state of microphone""" self.global_vars.user_audio_recorder.enabled = not self.global_vars.user_audio_recorder.enabled editmenu.entryconfigure(3, label="Disable Microphone" if self.global_vars.user_audio_recorder.enabled else "Enable Microphone") @@ -108,8 +111,12 @@ def update_transcript_ui(transcriber: AudioTranscriber, textbox: ctk.CTkTextbox) """ global last_transcript_ui_update_time + global global_vars_module - if last_transcript_ui_update_time < GlobalVars.TranscriptionGlobals().convo.last_update: + if global_vars_module is None: + global_vars_module = GlobalVars.TranscriptionGlobals() + + if last_transcript_ui_update_time < global_vars_module.convo.last_update: transcript_string = transcriber.get_transcript() write_in_textbox(textbox, transcript_string) textbox.see("end")