Add support for Mac #61

Closed
wants to merge 32 commits into from
4b23feb
checkpoint between win, mac
vivekuppal Sep 7, 2023
2b7ac98
Checkpoint on mac
Sep 7, 2023
76d5d5f
Add support for Mac.
Sep 8, 2023
3e4c90b
checkpoint between win, mac
vivekuppal Sep 7, 2023
a834db6
Checkpoint on mac
Sep 7, 2023
470f1bb
Add support for Mac.
Sep 8, 2023
2b80988
Merge branch 'vu-win-mac' of https://github.com/vivekuppal/transcribe…
vivekuppal Sep 8, 2023
b50f58c
Bring readme up to date with current functionality. Describe content …
vivekuppal Sep 8, 2023
76d349f
checkpoint between win, mac
vivekuppal Sep 7, 2023
c2000a0
Checkpoint on mac
Sep 7, 2023
30fafd3
Add support for Mac.
Sep 8, 2023
17bf60f
checkpoint between win, mac
vivekuppal Sep 7, 2023
18c5f77
Checkpoint on mac
Sep 7, 2023
09195c6
Add support for Mac.
Sep 8, 2023
b50009a
rebase
vivekuppal Sep 8, 2023
27d6b81
Continuous mode broke after updates to the UI. (#64)
vivekuppal Sep 8, 2023
8ee7e6a
Clear transcript data from UI (#65)
vivekuppal Sep 8, 2023
a1b86e3
Faster transcription from user perspective (#66)
vivekuppal Sep 8, 2023
4f1de93
checkpoint between win, mac
vivekuppal Sep 7, 2023
4cf7ab6
Checkpoint on mac
Sep 7, 2023
7534af3
Add support for Mac.
Sep 8, 2023
b2e8e0d
checkpoint between win, mac
vivekuppal Sep 7, 2023
096d6a9
Checkpoint on mac
Sep 7, 2023
ae088c4
Add support for Mac.
Sep 8, 2023
49bb0d5
checkpoint between win, mac
vivekuppal Sep 7, 2023
6c44745
Checkpoint on mac
Sep 7, 2023
ac2e541
Add support for Mac.
Sep 8, 2023
c824d92
checkpoint between win, mac
vivekuppal Sep 7, 2023
2fc32c9
Checkpoint on mac
Sep 7, 2023
d3c9247
Add support for Mac.
Sep 8, 2023
1f8cc47
rebase
vivekuppal Sep 9, 2023
454f425
win compatibility for mac changes.
vivekuppal Sep 11, 2023
312 changes: 218 additions & 94 deletions AudioRecorder.py

Large diffs are not rendered by default.

107 changes: 77 additions & 30 deletions AudioTranscriber.py
@@ -2,49 +2,84 @@
import queue
from heapq import merge
import threading
import platform
import io
from datetime import timedelta
import pprint
import datetime
# import pprint
import wave
import tempfile
import custom_speech_recognition as sr
import pyaudiowpatch as pyaudio
import conversation
import constants
import app_logging as al

os_name = platform.system()
if os_name == 'Windows':
import custom_speech_recognition as csr
import pyaudiowpatch as pyaudio
if os_name == 'Darwin':
import speech_recognition as sr
import pyaudio


PHRASE_TIMEOUT = 3.05
root_logger = al.get_logger()


class AudioTranscriber:
def __init__(self, mic_source, speaker_source, model, convo: conversation.Conversation):
root_logger.info(AudioTranscriber.__name__)
# Transcript_data should be replaced with the conversation object.
# We do not need to store transcription in 2 different places.
self.transcript_data = {"You": [], "Speaker": []}
self.transcript_changed_event = threading.Event()
self.audio_model = model
# Determines if transcription is enabled for the application. By default it is enabled.
self.transcribe = True
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
"channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
"channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
self.os_name = platform.system()

if self.os_name == 'Windows':
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
"channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
"channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
}
}
elif self.os_name == 'Darwin':
self.audio_sources = {
"You": {
"sample_rate": mic_source.SAMPLE_RATE,
"sample_width": mic_source.SAMPLE_WIDTH,
# "channels": mic_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_mic_data
},
"Speaker": {
"sample_rate": speaker_source.SAMPLE_RATE,
"sample_width": speaker_source.SAMPLE_WIDTH,
# "channels": speaker_source.channels,
"last_sample": bytes(),
"last_spoken": None,
"new_phrase": True,
"process_data_func": self.process_speaker_data
}
}
}

self.conversation = convo

def transcribe_audio_queue(self, audio_queue: queue.Queue):
@@ -60,8 +95,8 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):

text = ''
try:
fd, path = tempfile.mkstemp(suffix=".wav")
os.close(fd)
file_descriptor, path = tempfile.mkstemp(suffix=".wav")
os.close(file_descriptor)
source_info["process_data_func"](source_info["last_sample"], path)
if self.transcribe:
text = self.audio_model.get_transcription(path)
@@ -75,10 +110,11 @@ def transcribe_audio_queue(self, audio_queue: queue.Queue):
self.transcript_changed_event.set()

def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
root_logger.info(AudioTranscriber.update_last_sample_and_phrase_status.__name__)
if not self.transcribe:
return
source_info = self.audio_sources[who_spoke]
if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > datetime.timedelta(seconds=PHRASE_TIMEOUT):
source_info["last_sample"] = bytes()
source_info["new_phrase"] = True
else:
@@ -88,19 +124,28 @@ def update_last_sample_and_phrase_status(self, who_spoke, data, time_spoken):
source_info["last_spoken"] = time_spoken

def process_mic_data(self, data, temp_file_name):
root_logger.info(AudioTranscriber.process_mic_data.__name__)
if not self.transcribe:
return
audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
if self.os_name == 'Windows':
audio_data = csr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
elif self.os_name == 'Darwin':
audio_data = sr.AudioData(data, self.audio_sources["You"]["sample_rate"], self.audio_sources["You"]["sample_width"])
wav_data = io.BytesIO(audio_data.get_wav_data())
with open(temp_file_name, 'w+b') as file_handle:
file_handle.write(wav_data.read())

def process_speaker_data(self, data, temp_file_name):
root_logger.info(AudioTranscriber.process_speaker_data.__name__)
if not self.transcribe:
return
with wave.open(temp_file_name, 'wb') as wf:
wf.setnchannels(self.audio_sources["Speaker"]["channels"])
p = pyaudio.PyAudio()
if self.os_name == 'Windows':
wf.setnchannels(self.audio_sources["Speaker"]["channels"])
p = pyaudio.PyAudio()
if self.os_name == 'Darwin':
p = pyaudio.PyAudio()

wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(self.audio_sources["Speaker"]["sample_rate"])
wf.writeframes(data)
@@ -138,7 +183,7 @@ def get_transcript(self, length: int = 0):
self.transcript_data["You"], self.transcript_data["Speaker"],
key=lambda x: x[1], reverse=False))
combined_transcript = combined_transcript[-length:]
current_return_val = "".join([t[0] for t in combined_transcript])
# current_return_val = "".join([t[0] for t in combined_transcript])
sources = [
constants.PERSONA_YOU,
constants.PERSONA_SPEAKER
@@ -165,3 +210,5 @@ def clear_transcript_data(self):

self.audio_sources["You"]["new_phrase"] = True
self.audio_sources["Speaker"]["new_phrase"] = True

self.conversation.clear_conversation_data()
18 changes: 10 additions & 8 deletions GPTResponder.py
@@ -32,17 +32,18 @@ def generate_response_from_transcript_no_check(self, transcript) -> str:
Updates the conversation object with the response from LLM.
"""
try:
prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
root_logger.info(GPTResponder.generate_response_from_transcript_no_check.__name__)
# prompt_api_message = prompts.create_single_turn_prompt_message(transcript)
multiturn_prompt_content = self.conversation.get_merged_conversation(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
multiturn_prompt_api_message = prompts.create_multiturn_prompt(multiturn_prompt_content)
# pprint.pprint(f'Prompt api message: {prompt_api_message}')
# print(f'Multiturn prompt for ChatGPT: {multiturn_prompt_api_message}')
usual_response = openai.ChatCompletion.create(
model=self.model,
messages=prompt_api_message,
temperature=0.0
)
# usual_response = openai.ChatCompletion.create(
# model=self.model,
# messages=prompt_api_message,
# temperature=0.0
# )
# Multi turn response is only effective when continuous mode is off.
# In continuous mode, there are far too many responses from LLM,
# they confuse the LLM if that many responses are replayed back to LLM.
@@ -101,13 +102,14 @@ def process_response(self, input_str: str) -> str:
def generate_response_from_transcript(self, transcript):
"""Ping OpenAI LLM model to get response from the Assistant
"""

root_logger.info(GPTResponder.generate_response_from_transcript.__name__)
if self.gl_vars.freeze_state[0]:
return ''

return self.generate_response_from_transcript_no_check(transcript)

def update_conversation(self, response, persona):
root_logger.info(GPTResponder.update_conversation.__name__)
if response != '':
self.response = response
self.conversation.update_conversation(persona=persona,
@@ -129,7 +131,7 @@ def respond_to_transcriber(self, transcriber):
if not self.gl_vars.freeze_state[0]:
transcript_string = transcriber.get_transcript(
length=constants.MAX_TRANSCRIPTION_PHRASES_FOR_LLM)
response = self.generate_response_from_transcript(transcript_string)
self.generate_response_from_transcript(transcript_string)

end_time = time.time() # Measure end time
execution_time = end_time - start_time # Calculate time to execute the function
54 changes: 49 additions & 5 deletions README.md
@@ -9,11 +9,12 @@ Follow below steps to run transcribe on your local machine.

### 📋 Prerequisites

#### Common
- Python >=3.8.0
- (Optional) An OpenAI API key that can access the OpenAI API (set up a paid OpenAI account)
- Windows OS (Not tested on others)
- FFmpeg

#### Windows
If FFmpeg is not installed in your system, follow the steps below to install it.

First, install Chocolatey, a package manager for Windows. Open PowerShell as Administrator and run the following command:
@@ -26,6 +27,23 @@
```
Please run these commands in a PowerShell window with administrator privileges. For any issues during the installation, visit the official [Chocolatey](https://chocolatey.org/) and [FFmpeg](https://ffmpeg.org/) websites for troubleshooting.

#### macOS

**Dependencies and tools**

- Xcode Command Line Tools
- `brew install portaudio python-tk ffmpeg blackhole-2ch`

**BlackHole configuration**

Set up a "Multi-Output Device" and set it as the default sound output device on your Mac. Guidelines are available [here](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device).

**Configuring device names**

Speaker audio on macOS is recorded from the virtual "BlackHole 2ch" microphone. Your microphone and the BlackHole device names can be adjusted via the `HUMAN_MIC_NAME` and `BLACKHOLE_MIC_NAME` variables in [AudioRecorder.py](./AudioRecorder.py).

Run `python main.py -l` to list the available speaker and microphone devices and their indices.
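
For reference, a minimal sketch of what those settings might look like near the top of [AudioRecorder.py](./AudioRecorder.py) — the values below are illustrative assumptions, not the repository defaults:

```
# Illustrative values only -- set these to the device names reported by
# `python main.py -l`.
HUMAN_MIC_NAME = "MacBook Pro Microphone"   # your physical microphone
BLACKHOLE_MIC_NAME = "BlackHole 2ch"        # the BlackHole virtual device
```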

### 🔧 Code Installation

1. Clone transcribe repository:
@@ -46,6 +64,8 @@
pip install -r requirements.txt
```

It is recommended to create a virtual environment before installing the required packages.

4. (Optional) Replace the OpenAI API key in the `parameters.yaml` file in the transcribe directory:

Replace the OpenAI API key in the `parameters.yaml` file manually. Open it in a text editor and alter the line:
@@ -73,7 +93,32 @@

The `--api` flag (e.g., `python main.py --api`) will use the Whisper API for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind that using the Whisper API consumes more OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it worthwhile for your use case.

### 🎬 Running Transcribe
### 🎬 Customizing Transcribe

By default, the ChatGPT API behaves like a casual friend engaging in light-hearted banter. To customize the responses and make them specific to a field, see this section in `parameters.yaml` and the corresponding examples:

```
system_prompt: "You are a casual pal, genuinely interested in the conversation at hand. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."
system_prompt: "You are an expert at Basketball and helping others learn about basketball. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."
system_prompt: "You are an expert at Fantasy Football and helping others learn about Fantasy football. Please respond, in detail, to the conversation. Confidently give a straightforward response to the speaker, even if you don't understand them. Give your response in square brackets. DO NOT ask to repeat, and DO NOT ask for clarification. Just answer the speaker directly."


initial_convo:
first:
role: "You"
# content: "I am V, I want to learn about Fantasy Football"
# content: "I am V, I want to learn about Basketball"
content: Hey assistant, how are you doing today, I am in mood of a casual conversation.
second:
role: "assistant"
# content: "Hello, V. That's awesome! What do you want to know about basketball"
# content: "Hello, V. That's awesome! What do you want to know about Fantasy Football"
content: Hello, V. You are awesome. I am doing very well and looking forward to some light hearted banter with you.
```

Change `system_prompt` and `initial_convo` to be specific to the scenario you are interested in.
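
As a quick way to confirm which prompt will actually be used, the configuration can be read through the same lookup path that `conversation.py` uses; a minimal sketch (assuming it is run from the repository root with `parameters.yaml` present) is:

```
# Sketch: print the system prompt Transcribe will load at startup
# (mirrors the lookup in conversation.py).
import configuration

config = configuration.Config().get_data()
print(config["OpenAI"]["system_prompt"])
```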

### 🎬 Testing Transcribe Code changes

Unit Tests

@@ -122,9 +167,7 @@

### ⚡️ Limitations ⚡️

While Transcribe provides real-time transcription and optional response suggestions, there are several known limitations to its functionality that you should be aware of:

**Default Mic and Speaker:** Transcribe is currently configured to listen only to the default microphone and speaker set in your system. It will not detect sound from other devices or systems. To use a different mic or speaker, need to set it as your default device in your system settings.
While Transcribe provides real-time transcription and optional response suggestions, there are a few known limitations to its functionality:

**Whisper Model**: If the --api flag is not used, we utilize the 'tiny' version of the Whisper ASR model, due to its low resource consumption and fast response times. However, this model may not be as accurate as the larger models in transcribing certain types of speech, including accents or uncommon words.

@@ -142,6 +185,7 @@
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## ➕ Enhancements from base repository ➕
- macOS support
- Speech Mode - Read out responses from ChatGPT as Audio
- No OpenAI key or paid OpenAI account is needed to use the complete functionality
- Allow users selective disabling of mic, speaker audio input
5 changes: 4 additions & 1 deletion conversation.py
@@ -15,6 +15,9 @@ def __init__(self):
constants.PERSONA_YOU: [],
constants.PERSONA_SPEAKER: [],
constants.PERSONA_ASSISTANT: []}
self.initialize_conversation()

def initialize_conversation(self):
config = configuration.Config().get_data()
prompt = config["OpenAI"]["system_prompt"]
self.update_conversation(persona=constants.PERSONA_SYSTEM, text=prompt,
@@ -35,7 +38,7 @@ def clear_conversation_data(self):
self.transcript_data[constants.PERSONA_SPEAKER].clear()
self.transcript_data[constants.PERSONA_SYSTEM].clear()
self.transcript_data[constants.PERSONA_ASSISTANT].clear()
self.last_update = datetime.datetime.now()
self.initialize_conversation()

def update_conversation(self, persona: str, text: str, time_spoken, pop: bool = False):
"""Update conversation with new data
Expand Down
12 changes: 7 additions & 5 deletions custom_speech_recognition/__main__.py
@@ -5,20 +5,22 @@

try:
print("A moment of silence, please...")
with m as source: r.adjust_for_ambient_noise(source)
print("Set minimum energy threshold to {}".format(r.energy_threshold))
with m as source:
r.adjust_for_ambient_noise(source)
print(f'Set minimum energy threshold to {r.energy_threshold}')
while True:
print("Say something!")
with m as source: audio = r.listen(source)
with m as source:
audio = r.listen(source)
print("Got it! Now to recognize it...")
try:
# recognize speech using Google Speech Recognition
value = r.recognize_google(audio)

print("You said {}".format(value))
print(f'You said {value}')
except sr.UnknownValueError:
print("Oops! Didn't catch that")
except sr.RequestError as e:
print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
print(f"Uh oh! Couldn't request results from Google Speech Recognition service; {e}")
except KeyboardInterrupt:
pass
4 changes: 2 additions & 2 deletions duration.py
@@ -16,10 +16,10 @@ class Duration:
Duration(dd:hh:ss:ms) of Test Operation 0:00:02.000826
"""

def __init__(self, operation_name: str = 'undefined'):
def __init__(self, name: str = 'undefined'):
self.start: datetime.datetime = None
self.end: datetime.datetime = None
self.operation_name = operation_name
self.operation_name = name

def __enter__(self):
"""Records the start time of an operation
2 changes: 1 addition & 1 deletion main.py
@@ -79,7 +79,7 @@ def main():

if args.list_devices:
print('\n\nList all audio drivers and devices on this machine')
ar.print_detailed_audio_info()
ar.BaseRecorder.print_detailed_audio_info()
return

# Initiate global variables