-
Notifications
You must be signed in to change notification settings - Fork 5
/
deep_speech.py
110 lines (98 loc) · 4.08 KB
/
deep_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import deepspeech
import wave
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# One-time NLTK data downloads needed below: 'punkt' for word_tokenize,
# 'stopwords' for the English stopword list. NOTE(review): these run on
# every import and hit the network; consider guarding with a cache check.
nltk.download('punkt')
nltk.download('stopwords')
##Author : Govardhan Chitrada
##Last Modified on: 19/09/20
def func(audio_file, transcript):
    """Transcribe a WAV file with DeepSpeech and compute speech metrics.

    Loads the DeepSpeech 0.6.x model, language model and trie from
    ``ds_models061/``, runs speech-to-text on *audio_file*, then derives:

      * total recognised words,
      * words per minute (sum of word durations + inter-word gaps),
      * total filler words (tokens removed by NLTK English stopword filter).

    The transcript plus HTML-formatted metrics is written to
    ``Files/Transcript/output.txt`` and returned.

    Args:
        audio_file: path to a 16-bit PCM WAV file.
        transcript: currently unused; kept for caller compatibility.

    Returns:
        str: transcript text with appended HTML metric summary.
    """
    print(audio_file)

    # --- model setup (DeepSpeech 0.6.x external-scorer API) ---
    model_file_path = 'ds_models061/output_graph.pbmm'
    beam_width = 500
    model = deepspeech.Model(model_file_path, beam_width)
    lm_file_path = 'ds_models061/lm.binary'
    trie_file_path = 'ds_models061/trie'
    lm_alpha = 0.75   # language-model weight
    lm_beta = 1.85    # word-insertion bonus
    model.enableDecoderWithLM(lm_file_path, trie_file_path, lm_alpha, lm_beta)

    # --- read raw 16-bit PCM samples (context manager closes the file,
    # fixing the original's leaked wave handle) ---
    with wave.open(audio_file, 'r') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
    data16 = np.frombuffer(buffer, dtype=np.int16)
    print(rate)

    # --- transcription: plain text plus per-character timing metadata ---
    text = model.stt(data16)
    metadata = model.sttWithMetadata(data16)
    generated_words = words_from_metadata(metadata)

    total_words = len(generated_words)
    print('Total Recognised Words:', total_words)

    # Speaking time = sum of per-word durations + gaps between words.
    count_time = 0.0
    count_delay = 0.0
    for i, word_info in enumerate(generated_words):
        count_time += float(word_info['duration'])
        if i != total_words - 1:
            # Gap between this word's end and the next word's start.
            # NOTE: the metadata key really is 'start_time ' (with a
            # trailing space) -- it must match words_from_metadata().
            count_delay += abs(word_info['start_time '] + word_info['duration']
                               - generated_words[i + 1]['start_time '])
    total_time = round(count_time + count_delay, 3)

    # Words per minute. Guard BOTH zero cases: no recognised words, and a
    # zero total_time (the original divided by zero there).
    if total_words == 0 or total_time == 0:
        print("words per minute: 0 \n Not Recognised")
        wpm = 0
    else:
        print('words per minute:', round((60 / total_time) * total_words))
        wpm = (60 / total_time) * total_words

    # Filler words = tokens that the English stopword filter removes.
    content_only = ' '.join(
        token.lower() for token in nltk.word_tokenize(text)
        if token.lower() not in stopwords.words('english')
    )
    total_filler_words = len(word_tokenize(text)) - len(word_tokenize(content_only))
    print('total filler words:', total_filler_words)

    # Append HTML-formatted metrics to the transcript and persist it.
    text = (text + "<br><br>" + "<b>Total Recognised Words:</b>" + str(total_words)
            + '<br>' + '<b>Words Per Minute:</b>' + str(wpm)
            + '<br>' + '<b>Total Filler Words:</b>' + str(total_filler_words))
    with open("Files/Transcript/output.txt", "w+") as f:
        f.write(text)
    return text
def words_from_metadata(metadata):
    """Group DeepSpeech per-character metadata into per-word timing dicts.

    Walks ``metadata.items`` (one entry per decoded character) and, for each
    whitespace-delimited word, emits a dict with keys:

      * ``'word'``        -- the word text
      * ``'start_time '`` -- word start time rounded to 4 dp
        (the key deliberately keeps its trailing space: ``func`` indexes
        it exactly like this)
      * ``'duration'``    -- word duration, clamped to >= 0, rounded to 4 dp

    Returns:
        list[dict]: one entry per recognised word, in order.
    """
    words = []
    current = ""
    current_start = 0
    for idx in range(metadata.num_items):
        entry = metadata.items[idx]
        is_space = entry.character == " "
        if not is_space:
            current += entry.character
        if is_space or idx == metadata.num_items - 1:
            # Word boundary (space, or final character): close out the word.
            duration = max(entry.start_time - current_start, 0)
            words.append({
                "word": current,
                "start_time ": round(current_start, 4),
                "duration": round(duration, 4),
            })
            current = ""
            current_start = 0
        elif len(current) == 1:
            # First character of a new word: record its start time.
            current_start = entry.start_time
    return words
##ddd