Commit
Showing 230 changed files with 1,393,770 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,85 @@
import argparse


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def get_args():
    parser = argparse.ArgumentParser()

    # experiment info
    parser.add_argument('-project', type=str, default='IPET')
    parser.add_argument('-name', type=str, required=True)
    parser.add_argument('-tags', type=str, required=True)
    parser.add_argument('-description', type=str, default='')

    # dir
    parser.add_argument('-path_logging', type=str, default='/source')
    parser.add_argument('-path_training_dataset', type=str, default='../datafiles/esc50_train_data.json')
    parser.add_argument('-path_evaluation_dataset', type=str, default='../datafiles/esc50_eval_data.json')
    parser.add_argument('-path_data_label', type=str, default='../datafiles/class_labels_indices.csv')

    # device
    parser.add_argument('-num_workers', type=int, default=4)
    parser.add_argument('-usable_gpu', type=str, default='0,1,2,3')

    # hyper-parameters
    parser.add_argument('-epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=48)
    parser.add_argument('-amsgrad', type=str2bool, nargs='?', const=True, default=False)
    parser.add_argument('-lr', type=float, default=1e-3)
    parser.add_argument('-lr_decay_start_epoch', type=int, default=5)
    parser.add_argument('-lr_decay_end_epoch', type=int, default=26)
    parser.add_argument('-gamma', type=float, default=0.85)
    parser.add_argument('-weigth_decay', type=float, default=5e-7)
    parser.add_argument('-classification_loss', type=str, default='cce')

    # training setting
    parser.add_argument('-fold', type=int, default=5)
    parser.add_argument('-number_iteration_for_log', type=int, default=10)
    parser.add_argument('-rand_seed', type=int, default=1234)
    parser.add_argument('-flag_reproduciable', type=str2bool, nargs='?', const=True, default=True)

    # model architectures
    parser.add_argument('-main_module_name', type=str, default='AST')
    parser.add_argument('-backend_module_name', type=str, default='head')
    parser.add_argument('-fstride', type=int, default=10)
    parser.add_argument('-tstride', type=int, default=10)
    parser.add_argument('-imagenet_pretrain', type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-audioset_pretrain', type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-model_size', type=str, default='base384')

    # parameter-efficient transfer learning methods
    parser.add_argument('-input_prompt', type=str2bool, nargs='?', const=True, default=False)
    parser.add_argument('-input_prompt_num', type=int, default=10)
    parser.add_argument('-embedding_prompt', type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-embedding_prompt_num', type=int, default=4)
    parser.add_argument('-adapter', type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-adapter_scalar', type=float, default=0.1)
    parser.add_argument('-adapter_hidden_dim', type=int, default=32)

    # data processing
    parser.add_argument('-frame_length', type=int, default=512)
    parser.add_argument('-winlen', type=int, default=25)
    parser.add_argument('-winstep', type=int, default=10)
    parser.add_argument('-winfunc', type=str, default='hanning')
    parser.add_argument('-nfilts', type=int, default=128)
    parser.add_argument('-norm_mean', type=float, default=-6.6268077)
    parser.add_argument('-norm_std', type=float, default=5.358466)
    parser.add_argument('-skip_norm', type=str2bool, nargs='?', const=True, default=False)

    # data augmentation
    parser.add_argument('-mixup', type=float, default=0)
    parser.add_argument('-freqm', type=int, default=24)
    parser.add_argument('-timem', type=int, default=96)
    parser.add_argument('-add_noise', type=str2bool, nargs='?', const=True, default=False)

    args = parser.parse_args()

    return args
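For reference, the parser can be exercised on its own; `-name` and `-tags` are the only required flags. The sketch below is illustrative only: the module name `arguments` and the flag values are assumptions, not taken from the commit.

# minimal sketch: calling get_args() from the file above (module name is an assumption)
import sys
from arguments import get_args  # hypothetical module name

sys.argv = ['train.py', '-name', 'esc50_run1', '-tags', 'ast,adapter', '-adapter', 'true']
args = get_args()
print(args.adapter, args.adapter_hidden_dim)   # -> True 32
print(args.embedding_prompt_num, args.lr)      # -> 4 0.001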
@@ -0,0 +1,200 @@
import math
import random
import warnings

import numpy as np
import torch
import torch.nn.functional
import torch.utils.data as td
import torchaudio as ta

import utils.util as util


class Kaldi_Fbank():
    """Extract a Kaldi-style fbank from a raw waveform x using torchaudio."""

    def __init__(self, args):
        super(Kaldi_Fbank, self).__init__()

        self.args = args

    def __call__(self, x, sr, x2=None):
        # mixup: blend a second waveform x2 into x before feature extraction
        if x2 is not None:
            if x.shape[1] != x2.shape[1]:
                if x.shape[1] > x2.shape[1]:
                    # padding
                    temp_wav = torch.zeros(1, x.shape[1])
                    temp_wav[0, 0:x2.shape[1]] = x2
                    x2 = temp_wav
                else:
                    # cutting
                    x2 = x2[0, 0:x.shape[1]]

            # sample lambda from a beta distribution
            # (a uniform draw, random.random(), would also work)
            mix_lambda = np.random.beta(10, 10)

            mix_x = mix_lambda * x + (1 - mix_lambda) * x2.to(self.args.device)
            x = mix_x - mix_x.mean()

        fbank = ta.compliance.kaldi.fbank(x, frame_length=self.args.winlen, frame_shift=self.args.winstep,
                                          htk_compat=True, sample_frequency=sr, window_type=self.args.winfunc,
                                          num_mel_bins=self.args.nfilts, dither=0.0)

        n_frames = fbank.shape[0]

        p = self.args.frame_length - n_frames

        # pad short clips / cut long ones to exactly frame_length frames
        if p > 0:
            m = torch.nn.ZeroPad2d((0, 0, 0, p))
            fbank = m(fbank)
        elif p < 0:
            fbank = fbank[0:self.args.frame_length, :]

        if x2 is None:
            return fbank, 0
        else:
            return fbank, mix_lambda
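Because the extractor pads or truncates along the time axis to `frame_length` frames, its output shape is fixed regardless of clip duration. A minimal sketch of calling it on a dummy clip (the SimpleNamespace stands in for the parsed args; the values mirror the defaults above):

# sketch: exercising Kaldi_Fbank on fake audio; SimpleNamespace stands in for the parsed args
from types import SimpleNamespace
import torch

fb_args = SimpleNamespace(winlen=25, winstep=10, winfunc='hanning', nfilts=128,
                          frame_length=512, device=torch.device('cpu'))
extractor = Kaldi_Fbank(fb_args)

x = torch.randn(1, 5 * 16000)            # 5 s of random audio at 16 kHz
fbank, mix_lambda = extractor(x, 16000)  # no second waveform, so no mixup
print(fbank.shape, mix_lambda)           # -> torch.Size([512, 128]) 0 (498 frames zero-padded to 512)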
def get_loaders(args, dataset):
    train_set = TrainSet(args, dataset)
    train_set_sampler = td.DistributedSampler(train_set, shuffle=True)

    evaluation_set = TestSet(args, dataset.evaluation_set, dataset.classes_labels)
    evaluation_set_sampler = td.DistributedSampler(evaluation_set, shuffle=False)

    train_loader = td.DataLoader(
        train_set,
        batch_size=args.batch_size,
        pin_memory=True,
        num_workers=args.num_workers,
        sampler=train_set_sampler
    )

    evaluation_loader = td.DataLoader(
        evaluation_set,
        batch_size=args.batch_size * 2,
        pin_memory=True,
        num_workers=args.num_workers,
        sampler=evaluation_set_sampler
    )

    return train_set, train_set_sampler, train_loader, evaluation_set, evaluation_loader
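Both loaders are built on DistributedSampler, so `get_loaders` assumes the default process group is already initialized, and the training loop has to advance the sampler epoch to get a different shuffle each epoch. A sketch of the expected call pattern, here with a single-process 'gloo' group for illustration (the rendezvous settings and the surrounding loop are assumptions, not shown in the commit):

# sketch: single-process distributed setup around get_loaders (args/dataset assumed to exist)
import os
import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', 'localhost')  # assumed rendezvous settings
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='gloo', rank=0, world_size=1)

train_set, train_sampler, train_loader, eval_set, eval_loader = get_loaders(args, dataset)

for epoch in range(args.epoch):
    train_sampler.set_epoch(epoch)  # reshuffles the distributed split each epoch
    for fbank, label_indices in train_loader:
        pass  # forward/backward step goes here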
class TrainSet(td.Dataset):

    def __init__(self, args, dataset):
        self.args = args
        self.dataset = dataset
        self.items = dataset.train_set
        self.fbank = Kaldi_Fbank(args)
        # set label
        count = 0  # (unused)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        item = self.items[index]

        # read wav
        x, sr = ta.load(item.path)
        x = x - x.mean()
        x = x.to(self.args.device)
        mix_x = None

        # with probability args.mixup, draw a second clip to mix in
        if random.random() < self.args.mixup:
            mix_sample_idx = random.randint(0, len(self.items) - 1)
            mix_item = self.items[mix_sample_idx]

            mix_x, sr = ta.load(mix_item.path)
            mix_x = mix_x - mix_x.mean()
            mix_x = mix_x.to(self.args.device)

        fbank, mix_lambda = self.fbank(x, sr, mix_x)

        # SpecAugment (not applied to the eval set)
        freqm = ta.transforms.FrequencyMasking(self.args.freqm)
        timem = ta.transforms.TimeMasking(self.args.timem)
        fbank = torch.transpose(fbank, 0, 1)
        # add a channel dim: newer torchaudio masking only accepts [1, freq, time]
        fbank = fbank.unsqueeze(0)
        if self.args.freqm != 0:
            fbank = freqm(fbank)
        if self.args.timem != 0:
            fbank = timem(fbank)
        # squeeze it back; just a trick to satisfy the newer torchaudio version
        fbank = fbank.squeeze(0)
        fbank = torch.transpose(fbank, 0, 1)
        fbank = fbank.cpu()

        # normalize the input for both training and test
        if not self.args.skip_norm:
            fbank = (fbank - self.args.norm_mean) / (self.args.norm_std * 2)
        else:
            # skip input normalization only when estimating the normalization stats
            pass

        if self.args.add_noise:
            fbank = fbank + torch.rand(fbank.shape[0], fbank.shape[1]) * np.random.rand() / 10
            fbank = torch.roll(fbank, np.random.randint(-10, 10), 0)

        if mix_lambda == 0:
            # one-hot labels for multi-label classification (speech, AudioSet)
            label_indices = np.zeros(self.args.num_classes)

            for label_str in item.label.split(','):
                label_indices[int(self.dataset.classes_labels[label_str])] = 1.0

            label_indices = torch.FloatTensor(label_indices)
        else:
            # mixup: soft labels weighted by mix_lambda
            label_indices = np.zeros(self.args.num_classes)
            # add sample 1 labels
            for label_str in item.label.split(','):
                label_indices[int(self.dataset.classes_labels[label_str])] += mix_lambda
            # add sample 2 labels
            for label_str in mix_item.label.split(','):
                label_indices[int(self.dataset.classes_labels[label_str])] += 1.0 - mix_lambda
            label_indices = torch.FloatTensor(label_indices)

        return fbank, label_indices


class TestSet(td.Dataset):

    def __init__(self, args, dataset, classes_labels):
        self.args = args
        self.items = dataset
        self.classes_labels = classes_labels
        self.fbank = Kaldi_Fbank(args)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        item = self.items[index]

        # read wav
        x, sr = ta.load(item.path)
        x = x - x.mean()
        x = x.to(self.args.device)

        fbank, mix_lambda = self.fbank(x, sr)
        fbank = fbank.cpu()
        if not self.args.skip_norm:
            fbank = (fbank - self.args.norm_mean) / (self.args.norm_std * 2)
        else:
            # skip input normalization only when estimating the normalization stats
            pass

        # one-hot labels for multi-label classification (speech, AudioSet)
        label_indices = np.zeros(self.args.num_classes)

        for label_str in item.label.split(','):
            label_indices[int(self.classes_labels[label_str])] = 1.0

        label_indices = torch.FloatTensor(label_indices)

        return fbank, label_indices, item.path
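The mixup branch above turns the usual one-hot targets into soft targets weighted by `mix_lambda`. A toy illustration of that label arithmetic (class count and label indices are made up):

# sketch: the soft-label arithmetic from the mixup branch, with toy values (num_classes=5 assumed)
import numpy as np

mix_lambda = 0.7                       # e.g. a draw from np.random.beta(10, 10)
label_indices = np.zeros(5)
label_indices[2] += mix_lambda         # sample 1 carries class 2
label_indices[4] += 1.0 - mix_lambda   # sample 2 carries class 4
# -> class 2 gets 0.7, class 4 gets 0.3, everything else stays 0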
@@ -0,0 +1,58 @@
import csv
import json
from dataclasses import dataclass


@dataclass
class DataItem:
    path: str
    label: str


class Datasets:
    @property
    def train_set(self):
        return self.__train_set

    @property
    def classes_labels(self):
        return self.__classes_labels

    @property
    def evaluation_set(self):
        return self.__evaluation_set

    def __init__(self, args, fold):
        # train_set
        self.__train_set = []
        path_fold_training_dataset = args.path_training_dataset.replace('.json', '_{}.json'.format(fold))
        with open(path_fold_training_dataset, 'r') as fp:
            data = json.load(fp)['data']
            for d in data:
                self.__train_set.append(
                    DataItem(
                        path=d['wav'],
                        label=d['labels']
                    )
                )

        # class-label mapping: 'mid' -> 'index', read from the label csv
        classes_labels = {}
        with open(args.path_data_label, 'r') as f:
            csv_reader = csv.DictReader(f)
            line_count = 0
            for row in csv_reader:
                classes_labels[row['mid']] = row['index']
                line_count += 1
        self.__classes_labels = classes_labels

        # eval_set
        self.__evaluation_set = []
        path_fold_evaluation_dataset = args.path_evaluation_dataset.replace('.json', '_{}.json'.format(fold))
        with open(path_fold_evaluation_dataset, 'r') as fp:
            data = json.load(fp)['data']
            for d in data:
                self.__evaluation_set.append(
                    DataItem(
                        path=d['wav'],
                        label=d['labels']
                    )
                )
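Taken together, the three files suggest a per-fold entry point roughly like the sketch below. The module names (`arguments`, `datasets`, `data_loaders`), the device handling, and `num_classes=50` (ESC-50) are assumptions; the trainer itself is not shown in this commit, and `num_classes` is read by the datasets but not defined by the parser above.

# sketch of how the pieces are presumably wired per fold (module names and values are assumptions)
import torch
from arguments import get_args       # hypothetical module names for the files above
from datasets import Datasets
from data_loaders import get_loaders

args = get_args()
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # __getitem__ reads args.device
args.num_classes = 50                 # ESC-50; not provided by the argument parser

for fold in range(1, args.fold + 1):  # fold indexing is a guess; splits are named *_<fold>.json
    dataset = Datasets(args, fold)    # reads the fold's json splits and the label csv
    # assumes torch.distributed has been initialized (see the sampler note earlier)
    train_set, train_sampler, train_loader, eval_set, eval_loader = get_loaders(args, dataset)
    # ... build the AST model with the PETL flags (adapter, prompts) and train ...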