-from concurrent.futures import ProcessPoolExecutor
-from functools import partial
 import numpy as np
 import os
-import audio
+import Audio


-def build_from_path(in_dir, out_dir, num_workers=16, tqdm=lambda x: x):
-    '''Preprocesses the LJ Speech dataset from a given input path into a given output directory.
-
-    Args:
-        in_dir: The directory where you have downloaded the LJ Speech dataset
-        out_dir: The directory to write the output into
-        num_workers: Optional number of worker processes to parallelize across
-        tqdm: You can optionally pass tqdm to get a nice progress bar
-
-    Returns:
-        A list of tuples describing the training examples. This should be written to train.txt
-    '''
-
-    # We use ProcessPoolExecutor to parallelize across processes. This is just an optimization and you
-    # can omit it and just call _process_utterance on each input if you want.
-
-    executor = ProcessPoolExecutor(max_workers=num_workers)
-    futures = []
+def build_from_path(in_dir, out_dir):
     index = 1
+    out = list()

     with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
         for line in f:
             parts = line.strip().split('|')
             wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
             text = parts[2]
-            futures.append(executor.submit(
-                partial(_process_utterance, out_dir, index, wav_path, text)))
+            out.append(_process_utterance(out_dir, index, wav_path, text))

             if index % 100 == 0:
                 print("Done %d" % index)
             index = index + 1

-    return [future.result() for future in tqdm(futures)]
+    return out


 def _process_utterance(out_dir, index, wav_path, text):
-    '''Preprocesses a single utterance audio/text pair.
-
-    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
-    to the train.txt file.
-
-    Args:
-        out_dir: The directory to write the spectrograms into
-        index: The numeric index to use in the spectrogram filenames.
-        wav_path: Path to the audio file containing the speech input
-        text: The text spoken in the input audio file
-
-    Returns:
-        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
-    '''
-
-    # Load the audio to a numpy array:
-    wav = audio.load_wav(wav_path)
-
-    # Compute the linear-scale spectrogram from the wav:
-    spectrogram = audio.spectrogram(wav).astype(np.float32)
-    n_frames = spectrogram.shape[1]
-
     # Compute a mel-scale spectrogram from the wav:
-    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
+    mel_spectrogram = Audio.tools.get_mel(wav_path).numpy().astype(np.float32)
+    # print(mel_spectrogram)

     # Write the spectrograms to disk:
-    # spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
     mel_filename = 'ljspeech-mel-%05d.npy' % index
     np.save(os.path.join(out_dir, mel_filename),
             mel_spectrogram.T, allow_pickle=False)

-    # Return a tuple describing this training example:
-    # return (spectrogram_filename, mel_filename, n_frames, text)
-    return (mel_filename, n_frames, text)
+    return text
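
The new code calls Audio.tools.get_mel(wav_path) and immediately converts the result with .numpy(), so get_mel is expected to return a torch tensor of shape (n_mels, n_frames). For readers without the repo's Audio module, a rough stand-in with that shape might look like the sketch below; the STFT parameters (n_fft=1024, hop_length=256, n_mels=80) and the log compression are illustrative assumptions, not the module's actual implementation. LJSpeech audio is 22050 Hz.

import librosa
import numpy as np
import torch

# Illustrative stand-in for Audio.tools.get_mel (assumed behavior, not the
# repo's real implementation): load a wav and return a torch tensor of
# log-mel features with shape (n_mels, n_frames).
def get_mel(wav_path, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    wav, _ = librosa.load(wav_path, sr=sr)
    mel = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    # Log compression; the floor avoids log(0) on silent frames.
    mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))
    return torch.from_numpy(mel)

Returning a tensor keeps the call site's .numpy().astype(np.float32) chain valid, and mel_spectrogram.T then stores a frames-major array, matching the np.save call above.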
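
With the executor gone, build_from_path now runs sequentially and returns the raw transcript strings rather than (mel_filename, n_frames, text) tuples. A hypothetical driver, consuming the new return value and writing train.txt, could look like this; the ljspeech module name and the directory layout are assumptions:

import os
import ljspeech  # hypothetical name for the module in this diff

in_dir = 'LJSpeech-1.1'  # extracted dataset root containing metadata.csv and wavs/
out_dir = 'mels'
os.makedirs(out_dir, exist_ok=True)

# build_from_path writes one ljspeech-mel-%05d.npy per utterance into out_dir
# and returns the transcript of each utterance, in metadata.csv order.
texts = ljspeech.build_from_path(in_dir, out_dir)

with open('train.txt', 'w', encoding='utf-8') as f:
    for text in texts:
        f.write(text + '\n')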