# audio
import os
import tempfile
import time
from typing import List, Optional, Tuple

import numpy as np

try:
    import whisper
    from playsound import playsound
    from gtts import gTTS
    import pyaudio
except Exception as e:
    print(f"Error importing audio dependencies: {e}")


def get_audio_level(audio_data: bytes) -> int:
    """Return the peak amplitude of a chunk of 16-bit PCM audio."""
    return np.max(np.abs(np.frombuffer(audio_data, dtype=np.int16)))


def calibrate_silence(sample_rate: int = 16000, duration: int = 2) -> float:
    """
    Function Description:
    This function calibrates the silence level for audio recording.
    Args:
    None
    Keyword Args:
    sample_rate: The sample rate for audio recording.
    duration: The duration in seconds for calibration.
    Returns:
    The silence threshold level.
    """

    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=sample_rate,
        input=True,
        frames_per_buffer=1024,
    )

    print("Calibrating silence level. Please remain quiet...")
    levels = []
    for _ in range(int(sample_rate * duration / 1024)):
        data = stream.read(1024, exception_on_overflow=False)
        levels.append(get_audio_level(data))

    stream.stop_stream()
    stream.close()
    p.terminate()

    avg_level = np.mean(levels)
    silence_threshold = avg_level * 1.5  # Set threshold 50% above the ambient average
    print(f"Silence threshold set to: {silence_threshold}")
    return silence_threshold


def is_silent(audio_data: bytes, threshold: float) -> bool:
    """
    Function Description:
    This function checks if audio data is silent based on a threshold.
    Args:
    audio_data: The audio data to check.
    threshold: The silence threshold level.
    Keyword Args:
    None
    Returns:
    A boolean indicating whether the audio is silent.
    """

    return get_audio_level(audio_data) < threshold


def record_audio(
    sample_rate: int = 16000,
    max_duration: int = 10,
    silence_threshold: Optional[float] = None,
) -> bytes:
    """
    Function Description:
    This function records audio from the microphone until silence is detected
    or the maximum duration is reached.
    Args:
    None
    Keyword Args:
    sample_rate: The sample rate for audio recording.
    max_duration: The maximum duration in seconds.
    silence_threshold: The silence threshold level.
    Returns:
    The recorded audio data as raw 16-bit mono PCM bytes.
    """

    if silence_threshold is None:
        silence_threshold = calibrate_silence()

    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=sample_rate,
        input=True,
        frames_per_buffer=1024,
    )

    print("Listening... (speak now)")
    frames = []
    silent_chunks = 0
    has_speech = False
    max_silent_chunks = int(sample_rate * 1.0 / 1024)  # 1.0 second of silence
    max_chunks = int(sample_rate * max_duration / 1024)  # Maximum duration in chunks

    start_time = time.time()
    for _ in range(max_chunks):
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(data)

        if is_silent(data, silence_threshold):
            silent_chunks += 1
            if has_speech and silent_chunks > max_silent_chunks:
                break
        else:
            silent_chunks = 0
            has_speech = True

        if len(frames) % 10 == 0:  # Print a dot every ~0.6 s (10 chunks of 64 ms)
            print(".", end="", flush=True)

        if time.time() - start_time > max_duration:
            print("\nMax duration reached.")
            break

    print("\nProcessing...")

    stream.stop_stream()
    stream.close()
    p.terminate()

    return b"".join(frames)
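# The raw PCM bytes returned by record_audio() are not directly consumable by
# whisper, which expects an audio file. Below is a minimal sketch of a bridge
# using only the standard-library wave module; save_wav is a hypothetical
# helper added for illustration, not part of the original module.
def save_wav(audio_data: bytes, file_path: str, sample_rate: int = 16000) -> str:
    """Write raw 16-bit mono PCM bytes to a WAV file and return its path."""
    import wave

    with wave.open(file_path, "wb") as wf:
        wf.setnchannels(1)  # mono, matching record_audio
        wf.setsampwidth(2)  # 2 bytes per sample == pyaudio.paInt16
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)
    return file_path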
def speak_text(text: str) -> None:
    """
    Function Description:
    This function converts text to speech and plays the audio.
    Args:
    text: The text to convert to speech.
    Keyword Args:
    None
    Returns:
    None
    """

    try:
        tts = gTTS(text=text, lang="en")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            temp_path = fp.name
        # Save and play after the handle is closed, so playback also works on
        # platforms that lock open files (e.g. Windows).
        tts.save(temp_path)
        playsound(temp_path)
        os.unlink(temp_path)
    except Exception as e:
        print(f"Text-to-speech error: {e}")
def process_audio(file_path: str, table_name: str) -> Tuple[List, List]:
    """
    Function Description:
    This function transcribes an audio file with Whisper and generates
    embeddings for the transcription in chunks.
    Args:
    file_path : str : The file path.
    table_name : str : The table name (currently unused; kept for interface
    compatibility).
    Keyword Args:
    None
    Returns:
    Tuple[List, List] : The embeddings and the corresponding text chunks.
    """

    embeddings = []
    texts = []
    try:
        # Transcribe audio using Whisper; whisper loads the file itself, so no
        # separate librosa.load step is needed.
        model = whisper.load_model("base")  # Or a larger model if available
        result = model.transcribe(file_path)
        transcribed_text = result["text"].strip()

        # Split transcribed text into chunks (adjust chunk_size as needed)
        chunk_size = 1000
        for i in range(0, len(transcribed_text), chunk_size):
            chunk = transcribed_text[i : i + chunk_size]
            # get_llm_response is assumed to be provided elsewhere in this
            # package and to return either an embedding or an error dict.
            text_embedding_response = get_llm_response(
                f"Generate an embedding for: {chunk}",
                model="text-embedding-ada-002",
                provider="openai",
            )  # Use a text embedding model
            if (
                isinstance(text_embedding_response, dict)
                and "error" in text_embedding_response
            ):
                print(
                    f"Error generating text embedding: {text_embedding_response['error']}"
                )
            else:
                embeddings.append(text_embedding_response)  # Store the embedding
                texts.append(chunk)  # Store the corresponding text chunk

        return embeddings, texts

    except Exception as e:
        print(f"Error processing audio: {e}")
        return [], []  # Return empty lists in case of error
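# A minimal end-to-end sketch of how these pieces compose: record from the
# microphone, bridge to whisper via the save_wav helper defined above, then
# speak the transcription back. The model name and file path are illustrative.
if __name__ == "__main__":
    threshold = calibrate_silence()
    audio_bytes = record_audio(silence_threshold=threshold)
    wav_path = save_wav(audio_bytes, "recording.wav")

    model = whisper.load_model("base")
    transcription = model.transcribe(wav_path)["text"].strip()
    print(f"Transcribed: {transcription}")

    speak_text(transcription)
    os.unlink(wav_path)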