from __future__ import division
from __future__ import print_function
import json
import math
import multiprocessing as mp
import os
import random

import numpy as np

from utils import wave_utils
from utils import cspecgram

# TODO, awni fix this. Baking the tokens in for now.
BLANK = 0
TOKENS = list("abcdefghijklmnopqrstuvwxyz '")
TOKEN_DICT = dict(zip(TOKENS, range(1, len(TOKENS) + 1)))

class AudioProducer(object):

    def __init__(self, data_jsons, batch_size, sample_rate=8000,
                 min_duration=.3, max_duration=16.0):
        """
        Args:
            data_jsons : List of paths to files with speech data in json
                format. Each line should be a new example and the required
                fields for each example are 'duration' (the length of the
                audio in seconds), 'key' (the path to the wave file) and
                'text' (the transcription).
            batch_size : Size of the batches for training.
            sample_rate : Rate to resample audio prior to feature computation.
            min_duration : Minimum length of allowed audio in seconds.
            max_duration : Maximum length of allowed audio in seconds.
        """
        self.tokens = TOKEN_DICT
        self.batch_size = batch_size
        self.sample_rate = sample_rate

        data = []
        for dj in data_jsons:
            data.extend(_read_data_json(dj))
        data = sorted(data, key=lambda x: x['duration'])

        def keep_data_fn(d):
            """Returns True for examples that should be kept."""
            if d['duration'] > max_duration or d['duration'] < min_duration:
                return False
            # For CTC the number of input steps has to be greater
            # than the number of labels.
            # TODO, awni, better way to do this.
            if len(d['text']) >= _spec_time_steps(d['duration']) / 3:
                return False
            return True

        # Wrap in list() so len(data) below works under Python 3,
        # where filter returns an iterator.
        data = list(filter(keep_data_fn, data))

        # *NB* this cuts off the longest data items in the last segment
        # if len(data) is not a multiple of batch_size.
        batches = [data[i:i + batch_size]
                   for i in range(0, len(data) - batch_size + 1, batch_size)]
        random.shuffle(batches)
        self.batches = batches

    def estimate_mean_std(self, sample_size=2048):
        """Estimate the per-frequency mean and standard deviation of the
        features from a random sample of examples."""
        keys = [random.choice(random.choice(self.batches))['key']
                for _ in range(sample_size)]
        feats = np.hstack([compute_features(k, self.sample_rate)
                           for k in keys])
        mean = np.mean(feats, axis=1)
        std = np.std(feats, axis=1)
        return mean, std

    def iterator(self, sort=False, max_size=64,
                 num_workers=8, max_examples=None):
        if sort:
            batches = sorted(self.batches,
                             key=lambda b: b[-1]['duration'])
        else:
            batches = self.batches
        if max_examples is not None:
            batches = batches[:int(max_examples / self.batch_size)]

        # Producer/consumer queues: workers pull batches of raw examples
        # from 'consumer' and push featurized batches onto 'producer'.
        consumer = mp.Queue()
        producer = mp.Queue(max_size)
        for b in batches:
            consumer.put(b)
        procs = [mp.Process(target=queue_featurize_example,
                            args=(consumer, producer,
                                  self.sample_rate, self.tokens))
                 for _ in range(num_workers)]
        for p in procs:
            p.start()
        for _ in batches:
            yield producer.get()

    @property
    def alphabet_size(self):
        return len(self.tokens)

    @property
    def input_dim(self):
        return _spec_freq_dim(self.sample_rate)
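
# A minimal usage sketch (hypothetical 'train.json' path; the expected file
# format is described in AudioProducer.__init__):
#
#     producer = AudioProducer(['train.json'], batch_size=32)
#     mean, std = producer.estimate_mean_std()
#     for inputs, labels in producer.iterator():
#         # Each element of inputs is a (freq_bins, time_steps) array;
#         # normalize with (x - mean[:, None]) / std[:, None] before training.
#         pass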

def to_ints(text):
    return [TOKEN_DICT[t] for t in text]


def to_text(inds):
    return ''.join([TOKENS[i - 1] for i in inds])
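
# Example round trip (follows from TOKEN_DICT above, where 'a' maps to 1):
#
#     >>> to_ints('hi')
#     [8, 9]
#     >>> to_text([8, 9])
#     'hi'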

def queue_featurize_example(consumer, producer, sample_rate, tokens):
    """Worker loop: pull a batch of examples from the consumer queue,
    featurize it and push an (inputs, labels) pair onto the producer
    queue. Exits once no batch arrives within the timeout."""
    while True:
        try:
            batch = consumer.get(block=True, timeout=5)
        except mp.queues.Empty:
            return
        labels = [[tokens[t] for t in b['text']] for b in batch]
        inputs = [compute_features(b['key'], sample_rate) for b in batch]
        producer.put((inputs, labels))

def compute_features(audio_file, sample_rate):
    data, d_sample_rate = wave_utils.read_wave_array(audio_file)
    return compute_features_raw(data, sample_rate, d_sample_rate)


def compute_features_raw(data, output_sample_rate,
                         input_sample_rate):
    if input_sample_rate != output_sample_rate:
        data = wave_utils.resample(data, input_sample_rate,
                                   output_sample_rate)
    # Log-compress the spectrogram; epsilon guards against log(0).
    epsilon = 1e-7
    features = _compute_specgram(data, output_sample_rate)
    features = np.log(features + epsilon)
    return features

def _read_data_json(file_name):
    with open(file_name, 'r') as fid:
        return [json.loads(l) for l in fid]

def _spec_time_steps(duration, window_size=25, hop_size=10):
    """
    Compute the number of time steps of a spectrogram.

    Args:
        duration : Length of audio in seconds.
        window_size : Size of specgram window in milliseconds.
        hop_size : Size of steps between ffts in
            specgram in milliseconds.

    Returns:
        The number of time-steps in the
        output of the spectrogram.
    """
    duration_ms = duration * 1000
    return math.ceil((duration_ms - window_size) / hop_size)
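
# For example, a 2.0 second clip with the default 25 ms window and 10 ms hop
# gives ceil((2000 - 25) / 10) = ceil(197.5) = 198 time steps.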

def _spec_freq_dim(sample_rate, window_size=25):
    """
    Compute the number of frequency bins of a spectrogram.

    Args:
        sample_rate : Sample rate of the audio in Hz.
        window_size : Size of specgram window in milliseconds.

    Returns:
        An integer representing the number of dimensions in
        the spectrogram.
    """
    return int(((sample_rate / 1000) * window_size) / 2) + 1
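
# For example, 8 kHz audio with the default 25 ms window gives
# int((8000 / 1000) * 25 / 2) + 1 = 101 frequency bins.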

def _compute_specgram(data, sample_rate,
                      window_size=25, hop_size=10):
    """
    Compute the spectrogram of the given audio data.

    Args:
        data : 1D numpy array with wave data.
        sample_rate : Sample rate of the audio in Hz.
        window_size : Size in milliseconds of the stft window.
        hop_size : Size in milliseconds of the step between windows.

    Returns:
        spectrum : M x N numpy array where M is the number of
            spectral bins and N is the number of time-steps.
    """
    if data.dtype != np.int16:
        data = data.astype(np.int16)
    NFFT = int(window_size * (sample_rate / 1000))
    noverlap = int((window_size - hop_size) * (sample_rate / 1000))
    n_freqs = int(NFFT / 2 + 1)
    time_steps = int((data.size - noverlap) / (NFFT - noverlap))
    spectrum = np.empty((n_freqs, time_steps),
                        dtype=np.float32, order='F')
    cspecgram.specgram(data, NFFT, sample_rate,
                       noverlap, spectrum)
    return spectrum
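
# A worked example of the output shape (using the defaults above): one second
# of 8 kHz audio has data.size = 8000, NFFT = 200 and noverlap = 120, so
# time_steps = int((8000 - 120) / 80) = 98 and the returned spectrum has
# shape (101, 98), matching _spec_freq_dim(8000) = 101.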