Add support for Mac #61

Closed
wants to merge 32 commits into from
4b23feb
checkpoint between win, mac
vivekuppal Sep 7, 2023
2b7ac98
Checkpoint on mac
Sep 7, 2023
76d5d5f
Add support for Mac.
Sep 8, 2023
3e4c90b
checkpoint between win, mac
vivekuppal Sep 7, 2023
a834db6
Checkpoint on mac
Sep 7, 2023
470f1bb
Add support for Mac.
Sep 8, 2023
2b80988
Merge branch 'vu-win-mac' of https://github.com/vivekuppal/transcribe…
vivekuppal Sep 8, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
76d349f
checkpoint between win, mac
vivekuppal Sep 7, 2023
c2000a0
Checkpoint on mac
Sep 7, 2023
30fafd3
Add support for Mac.
Sep 8, 2023
17bf60f
checkpoint between win, mac
vivekuppal Sep 7, 2023
18c5f77
Checkpoint on mac
Sep 7, 2023
09195c6
Add support for Mac.
Sep 8, 2023
b50009a
rebase
vivekuppal Sep 8, 2023
27d6b81
Continuous mode broke after updates to the UI. (#64)
vivekuppal Sep 8, 2023
8ee7e6a
Clear transcript data from UI (#65)
vivekuppal Sep 8, 2023
a1b86e3
Faster transcription from user perspective (#66)
vivekuppal Sep 8, 2023
4f1de93
checkpoint between win, mac
vivekuppal Sep 7, 2023
4cf7ab6
Checkpoint on mac
Sep 7, 2023
7534af3
Add support for Mac.
Sep 8, 2023
b2e8e0d
checkpoint between win, mac
vivekuppal Sep 7, 2023
096d6a9
Checkpoint on mac
Sep 7, 2023
ae088c4
Add support for Mac.
Sep 8, 2023
49bb0d5
checkpoint between win, mac
vivekuppal Sep 7, 2023
6c44745
Checkpoint on mac
Sep 7, 2023
ac2e541
Add support for Mac.
Sep 8, 2023
c824d92
checkpoint between win, mac
vivekuppal Sep 7, 2023
2fc32c9
Checkpoint on mac
Sep 7, 2023
d3c9247
Add support for Mac.
Sep 8, 2023
1f8cc47
rebase
vivekuppal Sep 9, 2023
454f425
win compatibility for mac changes.
vivekuppal Sep 11, 2023
312 changes: 218 additions & 94 deletions AudioRecorder.py

Large diffs are not rendered by default.

107 changes: 77 additions & 30 deletions AudioTranscriber.py
@@ -2,49 +2,84 @@
import queue
from heapq import merge
import threading
import platform
import io
from datetime import timedelta
import pprint
import datetime
# import pprint
import wave
import tempfile
import custom_speech_recognition as sr
import pyaudiowpatch as pyaudio
import conversation
import constants
import app_logging as al

os_name = platform.system()
if os_name == 'Windows':
import custom_speech_recognition as csr
import pyaudiowpatch as pyaudio
if os_name == 'Darwin':
import speech_recognition as sr
import pyaudio


PHRASE_TIMEOUT = 3.05
root_logger = al.get_logger()


class AudioTranscriber:
def __init__(self, mic_source, speaker_source, model, convo: conversation.Conversation):
root_logger.info(AudioTranscriber.__name__)
# Transcript_data should be replaced with the conversation object.
# We do not need to store transcription in 2 different places.
self.transcript_data = {"You": [], "Speaker": []}
self.transcript_changed_event = threading.Event()
self.audio_model = model
# Determines if transcription is enabled for the application. By default it is enabled.
self.transcribe = True
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
"channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
"channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
self.os_name = platform.system()

if self.os_name == 'Windows':
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
"channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
"channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
}
}
elif self.os_name == 'Darwin':
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
# "channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
# "channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
}
}
}

self.conversation = convo

def transcribe_audio_queue(self, audio_queue: queue.Queue):
@@ -60,8 +95,8 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):

text = ''
try:
fd, path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
file_descriptor, path = tempfile.mkstemp(suffix=".wav")
os.close(file_descriptor)
source_info["process_data_func"](source_info["last_sample"], path)
if self.transcribe:
text = self.audio_model.get_transcription(path)
@@ -75,10 +110,11 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):
self.transcript_changed_event.set()

def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
root_logger.info(AudioTranscriber.update_last_sample_and_phrase_status.__name__)
if not self.transcribe:
return
source_info = self.audio_sources[who_spoke]
if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > datetime.timedelta(seconds=PHRASE_TIMEOUT):
source_info["last_sample"] = bytes()
source_info["new_phrase"] = True
else:
@@ -88,19 +124,28 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
source_info["last_spoken"] = time_spoken

def process_mic_data(self, data, temp_file_name):
root_logger.info(AudioTranscriber.process_mic_data.__name__)
if not self.transcribe:
return
audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
if self.os_name == 'Windows':
audio_data = csr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
elif self.os_name == 'Darwin':
audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
wav_data = io.BytesIO(audio_data.get_wav_data())
with open(temp_file_name, 'w+b') as file_handle:
file_handle.write(wav_data.read())

def process_speaker_data(self, data, temp_file_name):
root_logger.info(AudioTranscriber.process_speaker_data.__name__)
if not self.transcribe:
return
with wave.open(temp_file_name, 'wb') as wf:
wf.setnchannels(self.audio_sources["Speaker"]["channels"])
p = pyaudio.PyAudio()
if self.os_name == 'Windows':
wf.setnchannels(self.audio_sources["Speaker"]["channels"])
p = pyaudio.PyAudio()
if self.os_name == 'Darwin':
p = pyaudio.PyAudio()

wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(self.audio_sources["Speaker"]["sample_rate"])
wf.writeframes(data)
@@ -138,7 +183,7 @@ def get_transcript(self, length: int = 0):
self.transcript_data["You"], self.transcript_data["Speaker"],
key=lambda x: x[1], reverse=False))
combined_transcript = combined_transcript[-length:]
current_return_val = "".join([t[0] for t in combined_transcript])
# current_return_val = "".join([t[0] for t in combined_transcript])
sources = [
constants.PERSONA_YOU,
constants.PERSONA_SPEAKER
@@ -165,3 +210,5 @@ def clear_transcript_data(self):

self.audio_sources["You"]["new_phrase"] = True
self.audio_sources["Speaker"]["new_phrase"] = True

self.conversation.clear_conversation_data()
18 changes: 10 additions & 8 deletions GPTResponder.py
@@ -32,17 +32,18 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
Updates the conversation object with the response from LLM.
"""
try:
prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
root_logger.info(GPTResponder.generate_response_from_transcript_no_check.__name__)
# prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
multiturn_prompt_content = self.conversation.get_merged_conversation(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content)
# pprint.pprint(f'Prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}')
usual_response = openai.ChatCompletion.create(
model=self.model,
messages=prompt_api_message,
temperature=0.0
)
# usual_response = openai.ChatCompletion.create(
# model=self.model,
# messages=prompt_api_message,
# temperature=0.0
# )
# Multi turn response is only effective when continuous mode is off.
# In continuous mode, there are far too many responses from LLM,
# they confuse the LLM if that many responses are replayed back to LLM.
@@ -101,13 +102,14 @@ def process_response(self, input_str: str) -> str:
def generate_response_from_transcript(self, transcript):
"""Ping OpenAI LLM model to get response from the Assistant
"""

root_logger.info(GPTResponder.generate_response_from_transcript.__name__)
if self.gl_vars.freeze_state[0]:
return ''

return self.generate_response_from_transcript_no_check(transcript)

def update_conversation(self, response, persona):
root_logger.info(GPTResponder.update_conversation.__name__)
if response != '':
self.response = response
self.conversation.update_conversation(persona=persona,
@@ -129,7 +131,7 @@ def respond_to_transcriber(self, transcriber):
if not self.gl_vars.freeze_state[0]:
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)
self.generate_response_from_transcript(transcript_string)

end_time = time.time() # Measure end time
execution_time = end_time - start_time # Calculate time to execute the function
54 changes: 49 additions & 5 deletions README.md
@@ -9,11 +9,12 @@ Follow below steps to run transcribe on your local machine.

### 📋 Prerequisites

#### Common
- Python >=3.8.0
- (Optional) An OpenAI API key that can access the OpenAI API (set up a paid OpenAI account)
- Windows OS (Not tested on others)
- FFmpeg

#### Windows
If FFmpeg is not installed in your system, follow the steps below to install it.

First, install Chocolatey, a package manager for Windows. Open PowerShell as Administrator and run the following command:
@@ -26,6 +27,23 @@
```
Please run these commands in a PowerShell window with administrator privileges. For any issues during the installation, visit the official [Chocolatey](https://chocolatey.org/) and [FFmpeg](https://ffmpeg.org/) websites for troubleshooting.

#### macOS

**Dependencies and tools**

- Xcode Command Line Tools
- `brew install portaudio python-tk ffmpeg blackhole-2ch`

**BlackHole configuration**

Set up a "Multi-Output Device" and set it as the default sound output device on your Mac. Guidelines are available [here](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device).

**Configuring device names**

Speaker audio on macOS is recorded from the virtual "BlackHole 2ch" microphone. Your microphone and the BlackHole device names can be adjusted via the `HUMAN_MIC_NAME` and `BLACKHOLE_MIC_NAME` variables in [AudioRecorder.py](./AudioRecorder.py).

Run `python main.py -l` to list the available speaker and microphone devices and their indices.
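
For reference, a minimal sketch of what those settings might look like near the top of [AudioRecorder.py](./AudioRecorder.py) — the values below are illustrative assumptions, not the repository defaults:

```
# Illustrative values only -- set these to the device names reported by
# `python main.py -l`.
HUMAN_MIC_NAME = "MacBook Pro Microphone"   # your physical microphone
BLACKHOLE_MIC_NAME = "BlackHole 2ch"        # the BlackHole virtual device
```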

### 🔧 Code Installation

1. Clone transcribe repository:
@@ -46,6 +64,8 @@
pip install -r requirements.txt
```

It is recommended to create a virtual environment before installing the required packages.

4. (Optional) Replace the OpenAI API key in the `parameters.yaml` file in the transcribe directory:

Replace the OpenAI API key in the `parameters.yaml` file manually. Open it in a text editor and alter the line:
@@ -73,7 +93,32 @@

The `--api` flag (e.g., `python main.py --api`) will use the Whisper API for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind that using the Whisper API consumes more OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it worthwhile for your use case.

### 🎬 Running Transcribe
### 🎬 Customizing Transcribe

By default, the ChatGPT API behaves like a casual friend engaging in light-hearted banter. To customize the responses and make them specific to a field, see this section in `parameters.yaml` and the corresponding examples:

```
system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."
system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."
system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."


initial_convo:
first:
role: "You"
# content: "I am V, I want to learn about Fantasy Football"
# content: "I am V, I want to learn about Basketball"
content: Hey assistant, how are you doing today, I am in mood of a casual conversation.
second:
role: "assistant"
# content: "Hello, V. That's awesome! What do you want to know about basketball"
# content: "Hello, V. That's awesome! What do you want to know about Fantasy Football"
content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you.
```

Change `system_prompt` and `initial_convo` to be specific to the scenario you are interested in.
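
As a quick way to confirm which prompt will actually be used, the configuration can be read through the same lookup path that `conversation.py` uses; a minimal sketch (assuming it is run from the repository root with `parameters.yaml` present) is:

```
# Sketch: print the system prompt Transcribe will load at startup
# (mirrors the lookup in conversation.py).
import configuration

config = configuration.Config().get_data()
print(config["OpenAI"]["system_prompt"])
```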

### 🎬 Testing Transcribe Code changes

Unit Tests

@@ -122,9 +167,7 @@

### ⚡️ Limitations ⚡️

While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of:

**Default Mic and Speaker:** Transcribe is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. To use a different mic or speaker, need to set it as your default device in your system settings.
While Transcribe provides real-time transcription and optional response suggestions, there are a few known limitations to its functionality:

**Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.

@@ -142,6 +185,7 @@
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## ➕ Enhancements from base repository ➕
- macOS support
- Speech Mode - Read out responses from ChatGPT as Audio
- No OpenAI key or paid OpenAI account is needed to use the complete functionality
- Allow users selective disabling of mic, speaker audio input
5 changes: 4 additions & 1 deletion conversation.py
@@ -15,6 +15,9 @@ def __init__(self):
constants.PERSONA_YOU: [],
constants.PERSONA_SPEAKER: [],
constants.PERSONA_ASSISTANT: []}
self.initialize_conversation()

def initialize_conversation(self):
config = configuration.Config().get_data()
prompt = config["OpenAI"]["system_prompt"]
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
@@ -35,7 +38,7 @@ def clear_conversation_data(self):
self.transcript_data[constants.PERSONA_SPEAKER].clear()
self.transcript_data[constants.PERSONA_SYSTEM].clear()
self.transcript_data[constants.PERSONA_ASSISTANT].clear()
self.last_update = datetime.datetime.now()
self.initialize_conversation()

def update_conversation(self, persona: str, text: str, time_spoken, pop: bool = False):
"""Update conversation with new data
Expand Down
12 changes: 7 additions & 5 deletions custom_speech_recognition/__main__.py
@@ -5,20 +5,22 @@

try:
print("A moment of silence, please...")
with m as source: r.adjust_for_ambient_noise(source)
print("Set minimum energy threshold to {}".format(r.energy_threshold))
with m as source:
r.adjust_for_ambient_noise(source)
print(f'Set minimum energy threshold to {r.energy_threshold}')
while True:
print("Say something!")
with m as source: audio = r.listen(source)
with m as source:
audio = r.listen(source)
print("Got it! Now to recognize it...")
try:
# recognize speech using Google Speech Recognition
value = r.recognize_google(audio)

print("You said {}".format(value))
print(f'You said {value}')
except sr.UnknownValueError:
print("Oops! Didn't catch that")
except sr.RequestError as e:
print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
print(f"Uh oh! Couldn't request results from Google Speech Recognition service; {e}")
except KeyboardInterrupt:
pass
4 changes: 2 additions & 2 deletions duration.py
@@ -16,10 +16,10 @@ class Duration:
Duration(dd:hh:ss:ms) of Test Operation 0:00:02.000826
"""

def __init__(self, operation_name: str = 'undefined'):
def __init__(self, name: str = 'undefined'):
self.start: datetime.datetime = None
self.end: datetime.datetime = None
self.operation_name = operation_name
self.operation_name = name

def __enter__(self):
"""Records the start time of an operation
2 changes: 1 addition & 1 deletion main.py
@@ -79,7 +79,7 @@ def main():

if args.list_devices:
print('\n\nList all audio drivers and devices on this machine')
ar.print_detailed_audio_info()
ar.BaseRecorder.print_detailed_audio_info()
return

# Initiate global variables