|
| 1 | +# coding=utf-8 |
| 2 | +# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo |
| 3 | +# All rights reserved. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# ============================================================================== |
| 17 | +# Only support eager mode and TF>=2.0.0 |
| 18 | +# pylint: disable=no-member, invalid-name, relative-beyond-top-level |
| 19 | +# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes |
| 20 | +''' voxceleb 1 & 2 ''' |
| 21 | + |
| 22 | +import os |
| 23 | +import sys |
| 24 | +import zipfile |
| 25 | +import subprocess |
| 26 | +import hashlib |
| 27 | +import pandas |
| 28 | +from absl import logging |
| 29 | +import tensorflow as tf |
| 30 | +import soundfile as sf |
| 31 | + |
| 32 | +gfile = tf.compat.v1.gfile |
| 33 | + |
# Download urls of every supported subset, keyed by subset name. A subset is
# either a single ".zip" archive or a list of "_partXX" chunks which are
# concatenated into one zip archive after downloading.
SUBSETS = {
    "vox1_dev_wav": [
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
    ],
    "vox1_test_wav": [
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip",
    ],
    "vox2_dev_aac": [
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
    ],
    "vox2_test_aac": [
        "http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip",
    ],
}

# Expected md5 checksum of the zip archive of each subset; the archive is
# compared against it before being extracted.
MD5SUM = {
    "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
    "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
    "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
    "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
}

# Credentials handed to wget; empty by default and filled in from the command
# line arguments when this file is run as a script.
USER = {
    "user": "",
    "password": "",
}

# Maps a speaker name to its integer speaker id. It is module-level so that
# the ids keep counting up across all subsets processed in the same run.
speaker_id_dict = {}
| 68 | + |
def _md5sum(filepath, chunk_size=1 << 20):
    """Return the hex md5 digest of a file, read in chunks of `chunk_size` bytes."""
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def _concatenate_files(part_paths, output_path, chunk_size=1 << 20):
    """Write the content of `part_paths`, in the given order, into `output_path`."""
    with open(output_path, "wb") as output:
        for part_path in part_paths:
            with open(part_path, "rb") as part:
                while True:
                    chunk = part.read(chunk_size)
                    if not chunk:
                        break
                    output.write(chunk)


def download_and_extract(directory, subset, urls):
    """Download, verify and extract the given split of dataset.

    Every url is fetched with wget into `directory`, using the credentials
    stored in the module-level `USER` dict; a file which already exists there
    is not downloaded again. If the last downloaded file is not a ".zip"
    file, the downloaded files are treated as "_partXX" chunks and are
    concatenated, in the order of `urls`, into "<prefix>.zip". The archive is
    checked against `MD5SUM[subset]`, extracted into `directory`, and the first
    entry of the archive is moved to the archive path without its ".zip" suffix.

    The downloaded files are kept on disk, so a later run can skip the download.

    Args:
        directory: the directory where to put the downloaded data.
        subset: subset name of the corpus, used to look up the expected md5sum.
        urls: the list of urls to download the data file.

    Raises:
        ValueError: if `urls` is empty or the md5sum of the archive mismatches.
        RuntimeError: if wget exits with a non-zero return code.
    """
    if not urls:
        raise ValueError("no url given for subset %s" % subset)

    if not gfile.Exists(directory):
        gfile.MakeDirs(directory)

    downloaded = []
    for url in urls:
        filepath = os.path.join(directory, url.split("/")[-1])
        downloaded.append(filepath)
        if os.path.exists(filepath):
            continue
        logging.info("Downloading %s to %s" % (url, filepath))
        # An argument list (no shell) keeps special characters in the url,
        # the credentials or the path from being interpreted by a shell.
        retcode = subprocess.call(
            ["wget", url, "--user", USER["user"], "--password", USER["password"],
             "-O", filepath])
        if retcode != 0:
            # "wget -O" creates the output file even when the download fails;
            # remove it, otherwise the next run would skip this url.
            if os.path.exists(filepath):
                os.remove(filepath)
            raise RuntimeError(
                "Failed to download %s, wget return code: %d" % (url, retcode))

        statinfo = os.stat(filepath)
        logging.info(
            "Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size)
        )

    # concatenate all parts into one zip file
    zip_filepath = downloaded[-1]
    if not zip_filepath.endswith(".zip"):
        # drop the trailing "_partXX" to get the archive name
        zip_filepath = zip_filepath.rsplit("_", 1)[0] + ".zip"
        _concatenate_files(downloaded, zip_filepath)
    # Remove the ".zip" suffix only; str.strip(".zip") would also eat leading
    # and trailing '.', 'z', 'i', 'p' characters of the path itself.
    extract_path = zip_filepath[:-len(".zip")]

    # check zip file md5sum
    if _md5sum(zip_filepath) != MD5SUM[subset]:
        raise ValueError("md5sum of %s mismatch" % zip_filepath)

    with zipfile.ZipFile(zip_filepath, "r") as zfile:
        zfile.extractall(directory)
        extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)

    # Nothing to move when the archive already extracts to the wanted path.
    if os.path.normpath(extract_path_ori) != os.path.normpath(extract_path):
        retcode = subprocess.call(["mv", extract_path_ori, extract_path])
        if retcode != 0:
            logging.warning(
                "Failed to move %s to %s, mv return code: %d"
                % (extract_path_ori, extract_path, retcode))
| 114 | + |
| 115 | + |
def exec_cmd(cmd):
    """Execute a command line through the shell and report its exit status.
    Args:
        cmd: command line to be executed.
    Return:
        int, the return code of the command; -999 if the command could not
        be executed at all (OSError).
    """
    try:
        status = subprocess.call(cmd, shell=True)
    except OSError as err:
        logging.info(f"Execution failed: {err}")
        return -999
    # subprocess reports a child killed by a signal as a negative return code
    if status < 0:
        logging.info(f"Child was terminated by signal {status}")
    return status
| 131 | + |
| 132 | + |
def decode_aac_with_ffmpeg(aac_file, wav_file):
    """Decode a given AAC file into WAV using ffmpeg.

    Both paths are shell-quoted before they are put on the command line, so
    file names with spaces or shell metacharacters are passed to ffmpeg intact.
    If decoding fails, an output file created by this call is removed, so that
    a later run does not mistake it for a finished conversion.

    Args:
        aac_file: file path to input AAC file.
        wav_file: file path to output WAV file.
    Return:
        bool, True if success.
    """
    import shlex  # pylint: disable=import-outside-toplevel

    # Remember whether the output was there before, so that a failure never
    # deletes a file this call did not create.
    existed_before = os.path.exists(wav_file)

    cmd = f"ffmpeg -i {shlex.quote(aac_file)} {shlex.quote(wav_file)}"
    logging.info(f"Decoding aac file using command line: {cmd}")
    ret = exec_cmd(cmd)
    if ret != 0:
        logging.error(f"Failed to decode aac file with retcode {ret}")
        logging.error("Please check your ffmpeg installation.")
        if not existed_before and os.path.exists(wav_file):
            os.remove(wav_file)
        return False
    return True
| 149 | + |
| 150 | + |
def convert_audio_and_make_label(input_dir, subset,
                                 output_dir, output_file):
    """Optionally convert AAC to WAV and make speaker labels.

    Walks `<input_dir>/<subset>`, decodes every ".m4a" file into
    "<name>.m4a.wav" with ffmpeg (unless that file already exists), and writes
    a tab-separated csv with one row per audio file. The speaker name is taken
    from the second to last component of the directory holding the file, and
    speaker ids are assigned in order of first appearance through the
    module-level `speaker_id_dict`.

    Args:
        input_dir: the directory which holds the input dataset.
        subset: the name of the specified subset. e.g. vox1_dev_wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv

    Raises:
        RuntimeError: if an AAC file can not be decoded.
    """

    logging.info("Preprocessing audio and label for subset %s" % subset)
    source_dir = os.path.join(input_dir, subset)

    files = []
    # Convert all AAC file into WAV format. At the same time, generate the csv
    for root, _, filenames in gfile.Walk(source_dir):
        for filename in filenames:
            name, ext = os.path.splitext(filename)
            ext = ext.lower()
            if ext == ".wav":
                # A name like "xxx.m4a.wav" is the decoded copy of an AAC file;
                # it is picked up through the ".m4a" branch, so skip it here
                # to avoid listing it twice.
                if os.path.splitext(name)[1]:
                    continue
                wav_file = os.path.join(root, filename)
            elif ext == ".m4a":
                # Convert AAC to WAV.
                aac_file = os.path.join(root, filename)
                wav_file = aac_file + ".wav"
                if not gfile.Exists(wav_file):
                    if not decode_aac_with_ffmpeg(aac_file, wav_file):
                        raise RuntimeError("Audio decoding failed.")
            else:
                continue
            speaker_name = root.split(os.path.sep)[-2]
            # len() is evaluated before the insertion, so a new speaker gets
            # the next unused id and a known speaker keeps its id.
            speaker_id = speaker_id_dict.setdefault(speaker_name, len(speaker_id_dict))
            # Take the number of frames from the file header instead of
            # decoding the whole audio file just to measure its length.
            wav_length = sf.info(wav_file).frames
            files.append(
                (os.path.abspath(wav_file), wav_length, speaker_id, speaker_name)
            )

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
    # NOTE(review): the value stored under "wav_length_ms" is a number of
    # sample frames, not milliseconds -- confirm what the consumers expect.
    if not gfile.Exists(output_dir):
        gfile.MakeDirs(output_dir)
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
| 200 | + |
| 201 | + |
def processor(directory, subset, force_process):
    """Download and process one subset, returning the path of its csv file.

    Args:
        directory: the directory which holds the downloaded data and the csv.
        subset: the name of the subset, must be a key of SUBSETS.
        force_process: if false and the csv file already exists, nothing is
            downloaded or processed again.
    Return:
        str, the path of "<directory>/<subset>.csv".
    Raises:
        ValueError: if subset is not a key of SUBSETS.
    """
    if subset not in SUBSETS:
        raise ValueError(subset, "is not in voxceleb")

    csv_name = subset + ".csv"
    csv_path = os.path.join(directory, csv_name)

    # Work is needed when it is forced or when there is no csv file yet.
    if force_process or not os.path.exists(csv_path):
        logging.info("Downloading and process the voxceleb in %s", directory)
        logging.info("Preparing subset %s", subset)
        download_and_extract(directory, subset, SUBSETS[subset])
        convert_audio_and_make_label(directory, subset, directory, csv_name)
        logging.info("Finished downloading and processing")
    return csv_path
| 223 | + |
| 224 | + |
if __name__ == "__main__":
    logging.set_verbosity(logging.INFO)
    # Exactly three arguments are expected: the directory to save the data
    # to, and the user name and password handed to wget.
    if len(sys.argv) != 4:
        print("Usage: python prepare_data.py save_directory user password")
        # Exit with a non-zero status so that a calling script can tell a
        # usage error apart from a successful run.
        sys.exit(1)

    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
    # Prepare every known subset; a subset whose csv file already exists is skipped.
    for SUBSET in SUBSETS:
        processor(DIR, SUBSET, False)
0 commit comments