-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkazmodel.py
163 lines (141 loc) · 5.73 KB
/
kazmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math
class SpeechDetector:
    """Record phrases from the microphone and decode them with pocketsphinx.

    The detector reads fixed-size chunks from a PyAudio input stream,
    measures an intensity value for every chunk and treats a chunk as
    "sound" when that value exceeds ``THRESHOLD``.  Consecutive sound is
    buffered; once a whole sliding window of ``SILENCE_LIMIT`` seconds holds
    no sound, the buffered audio is written to a temporary WAV file, decoded
    and the detected words are printed.
    """

    # Default location of the pocketsphinx model files.  Pass ``model_dir``
    # to ``__init__`` when pocketsphinx is installed somewhere else.
    MODELDIR = "/anaconda3/lib/python3.6/site-packages/pocketsphinx/model/"

    def __init__(self, model_dir=None):
        """Configure the microphone parameters and build the decoder.

        Args:
            model_dir: Directory holding the acoustic model (``kaz``), the
                language model (``yourOutputFile.lm.bin``) and the dictionary
                (``kz.dict``).  Defaults to ``SpeechDetector.MODELDIR``.
        """
        # Microphone stream config.
        self.CHUNK = 1024  # frames requested from the mic per stream.read()
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000

        # Silence limit in seconds: length of the sliding window that must
        # contain no sound before the recording finishes and is decoded.
        self.SILENCE_LIMIT = 1

        # Previous audio (in seconds) to prepend.  When sound is detected,
        # this much of the audio recorded just before it is prepended, which
        # helps to avoid chopping the beginning of the phrase.
        self.PREV_AUDIO = 0.5

        # Intensity above which a chunk counts as sound; setup_mic() may
        # raise it after sampling the microphone.
        self.THRESHOLD = 500

        # Number of phrases run() decodes before returning; a negative value
        # (the default) means "keep listening until interrupted".
        self.num_phrases = -1

        if model_dir is None:
            model_dir = self.MODELDIR

        # Create a decoder with the configured model.
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_dir, 'kaz'))
        config.set_string('-lm', os.path.join(model_dir, 'yourOutputFile.lm.bin'))
        config.set_string('-dict', os.path.join(model_dir, 'kz.dict'))

        # Decoder object used for every recorded phrase.
        self.decoder = Decoder(config)

    def _open_stream(self, p):
        """Open an input stream on PyAudio instance *p* with this config."""
        return p.open(format=self.FORMAT,
                      channels=self.CHANNELS,
                      rate=self.RATE,
                      input=True,
                      frames_per_buffer=self.CHUNK)

    @staticmethod
    def _intensity(chunk):
        """Return the intensity value used to compare *chunk* to THRESHOLD."""
        # NOTE(review): a sample width of 4 is passed although FORMAT is
        # 16-bit.  THRESHOLD and the limits in setup_mic() are calibrated
        # against this exact metric, so it is kept as is -- confirm before
        # changing the width.
        return math.sqrt(abs(audioop.avg(chunk, 4)))

    @staticmethod
    def _top_average(values, fraction=0.2):
        """Return the mean of the largest *fraction* of *values*.

        At least one value is always averaged, so short inputs no longer
        divide by zero; an empty input yields 0.
        """
        count = max(1, int(len(values) * fraction))
        loudest = sorted(values, reverse=True)[:count]
        return sum(loudest) / count

    def setup_mic(self, num_samples=50):
        """Calibrate ``THRESHOLD`` from the current microphone input.

        Reads *num_samples* chunks, averages the largest 20% of their
        intensities and sets ``THRESHOLD`` to 500 when that average is below
        400, otherwise to the average plus 5.

        Args:
            num_samples: Number of chunks to read from the microphone.
        """
        print("Getting intensity values from mic.")
        p = pyaudio.PyAudio()
        try:
            stream = self._open_stream(p)
            try:
                values = [self._intensity(stream.read(self.CHUNK))
                          for _ in range(num_samples)]
            finally:
                stream.close()
        finally:
            # Release the audio device even when opening or reading failed.
            p.terminate()

        r = self._top_average(values)
        print(" Finished ")
        print(" Average audio intensity is {}".format(r))

        if r < 400:
            self.THRESHOLD = 500
        else:
            self.THRESHOLD = r + 5

    def save_speech(self, data, p):
        """Write recorded chunks to a temporary WAV file.

        Args:
            data: Iterable of ``bytes`` chunks as returned by the stream.
            p: PyAudio instance, used to look up the sample size of FORMAT.

        Returns:
            The name of the WAV file, created in the current directory.
        """
        filename = 'output_' + str(int(time.time())) + '.wav'
        frames = b''.join(data)

        # The header mirrors the settings the stream was opened with, so the
        # file stays valid if CHANNELS, RATE or FORMAT are changed.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(p.get_sample_size(self.FORMAT))
            wf.setframerate(self.RATE)
            wf.writeframes(frames)
        return filename

    def decode_phrase(self, wav_file):
        """Feed the contents of *wav_file* to the decoder.

        Args:
            wav_file: Path of the file to decode.

        Returns:
            List with the ``word`` of every segment the decoder reports.
        """
        # The file is opened before the utterance starts so that a missing
        # file does not leave the decoder inside an open utterance.
        # NOTE(review): the file is read as raw bytes, so a WAV header is
        # passed to the decoder together with the samples.
        with open(wav_file, 'rb') as stream:
            self.decoder.start_utt()
            for buf in iter(lambda: stream.read(1024), b''):
                self.decoder.process_raw(buf, False, False)
            self.decoder.end_utt()
        return [seg.word for seg in self.decoder.seg()]

    def _decode_recording(self, chunks, p):
        """Save *chunks* to a temp WAV, decode it and always delete the file."""
        filename = self.save_speech(chunks, p)
        try:
            return self.decode_phrase(filename)
        finally:
            os.remove(filename)

    def run(self):
        """Listen to the microphone and print every decoded phrase.

        Calibrates the threshold with setup_mic(), then records phrases and
        decodes each one with pocketsphinx.  Listening continues until
        ``num_phrases`` phrases have been decoded; with the default of -1 it
        continues until the process is interrupted.  The stream and the
        PyAudio instance are released on every exit path.
        """
        self.setup_mic()

        p = pyaudio.PyAudio()
        stream = None
        try:
            stream = self._open_stream(p)
            print("* Mic set up and listening. ")

            rel = self.RATE / self.CHUNK  # chunks per second
            slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
            # Audio from just before sound was detected, to be prepended.
            prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
            audio2send = []
            started = False
            remaining = self.num_phrases

            while remaining != 0:
                cur_data = stream.read(self.CHUNK)
                slid_win.append(self._intensity(cur_data))

                if any(x > self.THRESHOLD for x in slid_win):
                    if not started:
                        print("Starting recording of phrase")
                        started = True
                    audio2send.append(cur_data)
                elif started:
                    print("Finished recording, decoding phrase")
                    r = self._decode_recording(list(prev_audio) + audio2send, p)
                    print("DETECTED: {}".format(r))

                    # Reset all state for the next phrase.
                    started = False
                    slid_win.clear()
                    prev_audio.clear()
                    audio2send = []
                    if remaining > 0:
                        remaining -= 1
                    if remaining != 0:
                        print("Listening ...")
                else:
                    prev_audio.append(cur_data)

            print("* Done listening")
        finally:
            if stream is not None:
                stream.close()
            p.terminate()
if __name__ == "__main__":
    # Script entry point: build a detector and start listening on the mic.
    detector = SpeechDetector()
    detector.run()