-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
transcribe.py
92 lines (80 loc) · 3.74 KB
/
transcribe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
import os
import threading
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
import wave
import json
def convert_to_wav(audio_path, output_directory):
    """Return a path to a WAV version of *audio_path*, converting if needed.

    A path that already ends in ``.wav`` (case-insensitive) is returned
    unchanged. Any other file is decoded with pydub and written to
    *output_directory* as ``<original stem>.wav``. The converted audio is
    downmixed to mono with 16-bit samples, because the recognizer in this
    file consumes the raw frames and Vosk expects mono 16-bit PCM.

    Args:
        audio_path: Path of the source audio file.
        output_directory: Directory that receives the converted WAV file.

    Returns:
        *audio_path* itself for WAV input, otherwise the path of the newly
        written WAV file.
    """
    if audio_path.lower().endswith(".wav"):
        return audio_path

    stem, extension = os.path.splitext(os.path.basename(audio_path))
    # Only pass a format hint when the file name really has an extension.
    # Splitting the whole path on "." would turn "/music.v2/track" into the
    # format "v2/track"; None leaves format detection to pydub instead.
    source_format = extension[1:].lower() or None
    output_path = os.path.join(output_directory, stem + ".wav")

    audio = AudioSegment.from_file(audio_path, format=source_format)
    # Normalise to mono / 2-byte samples so the result is directly usable by
    # the Vosk recognizer.
    audio = audio.set_channels(1).set_sample_width(2)
    audio.export(output_path, format="wav")
    return output_path
def transcribe_audio_vosk(audio_path, model_path, callback):
    """Transcribe a WAV file with Vosk and pass the text to *callback*.

    Args:
        audio_path: Path to a WAV file. It must be mono, 16-bit,
            uncompressed PCM; anything else is rejected with an error.
        model_path: Directory containing the Vosk model to load.
        callback: Callable invoked once with the complete transcription
            (a single stripped string) when recognition succeeds.

    Any exception raised while reading the file, loading the model,
    recognizing, or running *callback* is reported through an error dialog
    rather than propagated to the caller.
    """
    try:
        with wave.open(audio_path, "rb") as wf:
            # The recognizer is fed raw frames, so the sample layout has to
            # be checked up front; other layouts would be misinterpreted.
            if (
                wf.getnchannels() != 1
                or wf.getsampwidth() != 2
                or wf.getcomptype() != "NONE"
            ):
                raise ValueError("audio file must be WAV format mono 16-bit PCM")

            # Load the model only after the file has been validated, so a bad
            # file fails before the expensive model load.
            model = Model(model_path)
            recognizer = KaldiRecognizer(model, wf.getframerate())

            pieces = []
            # readframes() returns b"" at end of file, which ends the loop.
            for data in iter(lambda: wf.readframes(4000), b""):
                if recognizer.AcceptWaveform(data):
                    part_result = json.loads(recognizer.Result())
                    pieces.append(part_result.get('text', ''))
            part_result = json.loads(recognizer.FinalResult())
            pieces.append(part_result.get('text', ''))

        # Skip empty partial results so they do not leave double spaces.
        full_transcription = " ".join(piece for piece in pieces if piece)
        callback(full_transcription.strip())
    except Exception as e:
        messagebox.showerror("Error", f"Failed to transcribe audio. Error: {e}")
def update_transcription_text(transcription):
    """Replace the contents of the transcription box with *transcription*.

    The text widget is temporarily switched to the 'normal' state so it can
    be modified, and is set back to 'disabled' afterwards even if the update
    itself fails.

    Args:
        transcription: Text to display in the transcription widget.
    """
    transcription_text.configure(state='normal')
    try:
        # Text indices are "line.column" strings; "1.0" is the very first
        # character of the widget.
        transcription_text.delete("1.0", tk.END)
        transcription_text.insert(tk.END, transcription)
    finally:
        transcription_text.configure(state='disabled')
def select_model_path():
    """Ask the user for a directory and store it as the Vosk model path.

    An empty result from the dialog leaves the current value untouched.
    """
    chosen_directory = filedialog.askdirectory()
    if not chosen_directory:
        return
    model_path.set(chosen_directory)
def select_file():
    """Let the user pick an audio file and start transcribing it.

    Steps:
      1. Check that the configured Vosk model path is an existing directory.
      2. Ask for the audio file; an empty selection ends the function.
      3. For non-WAV files, ask for an output directory and convert the file
         to WAV there.
      4. Run the transcription on a daemon thread, passing
         ``update_transcription_text`` as the result callback.

    Every failure is reported with an error dialog and aborts the operation.
    """
    model_path_value = model_path.get()
    # The model must be a directory; a path that merely exists (for example
    # a regular file) is not a usable model location.
    if not model_path_value or not os.path.isdir(model_path_value):
        messagebox.showerror("Error", "Please select a valid Vosk model directory.")
        return

    file_path = filedialog.askopenfilename()
    if not file_path:
        return

    if not file_path.lower().endswith('.wav'):
        output_directory = filedialog.askdirectory(title="Select Output Directory for WAV Conversion")
        if not output_directory:
            messagebox.showerror("Error", "Output directory is required for non-WAV files.")
            return
        try:
            file_path = convert_to_wav(file_path, output_directory)
        except Exception as e:
            # Without this, a conversion failure would escape the Tk callback
            # and the user would get no feedback at all.
            messagebox.showerror("Error", f"Failed to convert audio to WAV. Error: {e}")
            return

    # Transcription is run on a separate thread to keep GUI responsive
    threading.Thread(
        target=transcribe_audio_vosk,
        args=(file_path, model_path_value, update_transcription_text),
        daemon=True,
    ).start()
# ---------------------------------------------------------------------------
# Main window
# ---------------------------------------------------------------------------
root = tk.Tk()
root.title("AudioDictate")

# Model path row: label, editable entry bound to `model_path`, browse button.
model_path = tk.StringVar()
model_path_frame = tk.Frame(root)
model_path_label = tk.Label(model_path_frame, text="Vosk Model Path:")
model_path_entry = tk.Entry(
    model_path_frame,
    textvariable=model_path,
    width=50,
)
model_path_button = tk.Button(
    model_path_frame,
    text="Select",
    command=select_model_path,
)

# Children are packed left to right; only the entry stretches horizontally.
model_path_label.pack(side=tk.LEFT, padx=(0, 10))
model_path_entry.pack(side=tk.LEFT, expand=True, fill=tk.X)
model_path_button.pack(side=tk.LEFT)
model_path_frame.pack(pady=5, padx=5, fill=tk.X)

# Transcription display area: a scrolled text box that starts out disabled.
transcription_frame = tk.LabelFrame(root, text="Transcription")
transcription_text = scrolledtext.ScrolledText(
    transcription_frame,
    width=60,
    height=15,
    state='disabled',
)
transcription_text.pack(expand=True, fill=tk.BOTH, padx=5, pady=5)
transcription_frame.pack(pady=10, padx=5, fill=tk.BOTH, expand=True)

# Button that starts the file selection / transcription workflow.
select_file_button = tk.Button(
    root,
    text="Select Audio File",
    command=select_file,
)
select_file_button.pack(pady=5)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()