Skip to content

Commit

Permalink
first
Browse files Browse the repository at this point in the history
  • Loading branch information
wngh1187 committed Oct 28, 2022
1 parent 1f3e6f7 commit 0a2053d
Show file tree
Hide file tree
Showing 230 changed files with 1,393,770 additions and 0 deletions.
Binary file added ESC50/.DS_Store
Binary file not shown.
85 changes: 85 additions & 0 deletions ESC50/AST/arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse

def str2bool(v):
    """Convert a command-line token into a boolean.

    Values that are already ``bool`` are returned unchanged. Strings are
    compared case-insensitively against a fixed set of spellings.

    Args:
        v: a ``bool`` or a string such as ``'yes'``, ``'t'``, ``'0'``.

    Returns:
        bool: the parsed value.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    token = v.lower()
    if token in truthy:
        return True
    if token in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

def get_args():
    """Build the experiment argument parser and parse the process command line.

    Every option is spelled with a single leading dash (e.g. ``-name``).
    ``-name`` and ``-tags`` are required; every other option has a default.
    Boolean options accept an optional value parsed by ``str2bool``; giving
    the option with no value yields ``True``.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    def add_flag(option, default):
        # Boolean switch: bare ``-flag`` -> True, ``-flag <text>`` -> str2bool(text).
        add(option, type=str2bool, nargs='?', const=True, default=default)

    # experiment info
    add('-project', type=str, default='IPET')
    add('-name', type=str, required=True)
    add('-tags', type=str, required=True)
    add('-description', type=str, default='')

    # directories / data files
    add('-path_logging', type=str, default='/source')
    add('-path_training_dataset', type=str, default='../datafiles/esc50_train_data.json')
    add('-path_evaluation_dataset', type=str, default='../datafiles/esc50_eval_data.json')
    add('-path_data_label', type=str, default='../datafiles/class_labels_indices.csv')

    # device
    add('-num_workers', type=int, default=4)
    add('-usable_gpu', type=str, default='0,1,2,3')

    # hyper-parameters
    add('-epoch', type=int, default=30)
    add('-batch_size', type=int, default=48)
    add_flag('-amsgrad', False)
    add('-lr', type=float, default=1e-3)
    add('-lr_decay_start_epoch', type=int, default=5)
    add('-lr_decay_end_epoch', type=int, default=26)
    add('-gamma', type=float, default=0.85)
    # NOTE: the option name is misspelled ("weigth"); it is kept as-is because
    # the resulting attribute name is part of this function's interface.
    add('-weigth_decay', type=float, default=5e-7)
    add('-classification_loss', type=str, default='cce')

    # training setting
    add('-fold', type=int, default=5)
    add('-number_iteration_for_log', type=int, default=10)
    add('-rand_seed', type=int, default=1234)
    add_flag('-flag_reproduciable', True)

    # model architectures
    add('-main_module_name', type=str, default='AST')
    add('-backend_module_name', type=str, default='head')
    add('-fstride', type=int, default=10)
    add('-tstride', type=int, default=10)
    add_flag('-imagenet_pretrain', True)
    add_flag('-audioset_pretrain', True)
    add('-model_size', type=str, default='base384')

    # parameter-efficient transfer learning methods
    add_flag('-input_prompt', False)
    add('-input_prompt_num', type=int, default=10)
    add_flag('-embedding_prompt', True)
    add('-embedding_prompt_num', type=int, default=4)
    add_flag('-adapter', True)
    add('-adapter_scalar', type=float, default=0.1)
    add('-adapter_hidden_dim', type=int, default=32)

    # data processing
    add('-frame_length', type=int, default=512)
    add('-winlen', type=int, default=25)
    add('-winstep', type=int, default=10)
    add('-winfunc', type=str, default='hanning')
    add('-nfilts', type=int, default=128)
    add('-norm_mean', type=float, default=-6.6268077)
    add('-norm_std', type=float, default=5.358466)
    add_flag('-skip_norm', False)

    # data augmentation
    add('-mixup', type=float, default=0)
    add('-freqm', type=int, default=24)
    add('-timem', type=int, default=96)
    add_flag('-add_noise', False)

    return parser.parse_args()
200 changes: 200 additions & 0 deletions ESC50/AST/data/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import math
from random import random
import torch
import torch.utils.data as td
import torchaudio as ta
import numpy as np
import warnings
import torch.nn.functional
import utils.util as util
import random

class Kaldi_Fbank():
    """Extract Kaldi-compatible filterbank (fbank) features from a raw waveform using torchaudio.

    When a second waveform is supplied, the two are mixed (mixup) before the
    features are computed. The result is zero-padded or cropped along its
    first axis so that it always has ``args.frame_length`` rows.
    """

    def __init__(self, args):
        super(Kaldi_Fbank, self).__init__()

        # Options read later: winlen, winstep, winfunc, nfilts, frame_length, device.
        self.args = args

    def __call__(self, x, sr, x2=None):
        """Compute fbank features for ``x``, optionally mixed with ``x2``.

        Args:
            x: waveform tensor; its length is read from ``x.shape[1]``
                (assumes a (channels, samples) layout - TODO confirm).
            sr: sample rate, forwarded as ``sample_frequency``.
            x2: optional second waveform to mix into ``x``.

        Returns:
            tuple: ``(fbank, mix_lambda)``. ``mix_lambda`` is ``0`` when no
            second waveform was given, otherwise the sampled mixing weight
            applied to ``x`` (``x2`` receives ``1 - mix_lambda``).
        """
        mix_lambda = 0

        # mixup
        if x2 is not None:
            n_samples = x.shape[1]
            if x2.shape[1] < n_samples:
                # x2 is shorter: zero-pad it at the end to the length of x
                padded = torch.zeros(1, n_samples)
                padded[0, 0:x2.shape[1]] = x2
                x2 = padded
            elif x2.shape[1] > n_samples:
                # x2 is longer: keep the first channel, cropped to the length of x.
                # The 0:1 slice keeps the tensor 2-D, matching the padded branch.
                x2 = x2[0:1, 0:n_samples]

            # sample lambda from a beta distribution
            mix_lambda = np.random.beta(10, 10)

            mix_x = mix_lambda * x + (1 - mix_lambda) * x2.to(self.args.device)
            # remove the DC offset of the mixture
            x = mix_x - mix_x.mean()

        fbank = ta.compliance.kaldi.fbank(
            x,
            frame_length=self.args.winlen,
            frame_shift=self.args.winstep,
            htk_compat=True,
            sample_frequency=sr,
            window_type=self.args.winfunc,
            num_mel_bins=self.args.nfilts,
            dither=0.0,
        )

        # cut and pad along the frame axis to exactly args.frame_length rows
        missing = self.args.frame_length - fbank.shape[0]
        if missing > 0:
            fbank = torch.nn.ZeroPad2d((0, 0, 0, missing))(fbank)
        elif missing < 0:
            fbank = fbank[0:self.args.frame_length, :]

        # mix_lambda is still 0 here when no second waveform was supplied
        return fbank, mix_lambda

def get_loaders(args, dataset):
    """Create the datasets, distributed samplers and data loaders.

    The training sampler shuffles; the evaluation sampler does not. The
    evaluation loader uses twice the training batch size.

    Args:
        args: options providing ``batch_size`` and ``num_workers`` (and
            whatever ``TrainSet`` / ``TestSet`` read).
        dataset: object exposing ``train_set``, ``evaluation_set`` and
            ``classes_labels``.

    Returns:
        tuple: ``(train_set, train_sampler, train_loader, evaluation_set,
        evaluation_loader)``.
    """
    train_set = TrainSet(args, dataset)
    train_sampler = td.DistributedSampler(train_set, shuffle=True)

    eval_set = TestSet(args, dataset.evaluation_set, dataset.classes_labels)
    eval_sampler = td.DistributedSampler(eval_set, shuffle=False)

    # Settings common to both loaders.
    shared = dict(pin_memory=True, num_workers=args.num_workers)

    train_loader = td.DataLoader(
        train_set,
        batch_size=args.batch_size,
        sampler=train_sampler,
        **shared
    )
    eval_loader = td.DataLoader(
        eval_set,
        batch_size=args.batch_size * 2,
        sampler=eval_sampler,
        **shared
    )

    return train_set, train_sampler, train_loader, eval_set, eval_loader

class TrainSet(td.Dataset):
    """Training dataset yielding ``(fbank, label_vector)`` pairs.

    Each item is produced by loading a waveform, optionally mixing it with a
    randomly chosen second training item (mixup), extracting fbank features,
    applying frequency/time masking, normalising, and optionally adding noise
    and a random roll.
    """

    def __init__(self, args, dataset):
        self.args = args
        self.dataset = dataset
        self.items = dataset.train_set
        self.fbank = Kaldi_Fbank(args)

    def __len__(self):
        return len(self.items)

    def _load_wav(self, path):
        """Load a waveform, subtract its mean and move it to ``args.device``.

        Returns:
            tuple: ``(waveform, sample_rate)``.
        """
        wav, sr = ta.load(path)
        wav = wav - wav.mean()
        return wav.to(self.args.device), sr

    def _build_label(self, item, mix_item, mix_lambda):
        """Build the label vector of length ``args.num_classes``.

        Without mixup every label of ``item`` is set to 1.0. With mixup the
        labels of ``item`` receive ``mix_lambda`` and those of ``mix_item``
        receive ``1 - mix_lambda`` (accumulated, so shared labels sum up).
        """
        classes = self.dataset.classes_labels
        label_indices = np.zeros(self.args.num_classes)

        if mix_item is None:
            # one-hot (multi-hot when the label string lists several classes)
            for label_str in item.label.split(','):
                label_indices[int(classes[label_str])] = 1.0
        else:
            # add sample 1 labels
            for label_str in item.label.split(','):
                label_indices[int(classes[label_str])] += mix_lambda
            # add sample 2 labels
            for label_str in mix_item.label.split(','):
                label_indices[int(classes[label_str])] += 1.0 - mix_lambda

        return torch.FloatTensor(label_indices)

    def __getitem__(self, index):
        """Return ``(fbank, label_vector)`` for the item at ``index``."""
        item = self.items[index]

        # read wav
        x, sr = self._load_wav(item.path)

        # Mixup partner, chosen with probability args.mixup.
        mix_item = None
        mix_x = None
        if random.random() < self.args.mixup:
            mix_item = self.items[random.randint(0, len(self.items) - 1)]
            # Keep the sample rate of the primary clip: the features are
            # extracted with `sr`, so it must not be replaced by the partner's.
            mix_x, _ = self._load_wav(mix_item.path)

        fbank, mix_lambda = self.fbank(x, sr, mix_x)

        # SpecAug, not done for the eval set.
        # The masking transforms are applied on a [1, freq, time] tensor to
        # satisfy newer torchaudio versions, then the layout is restored.
        fbank = torch.transpose(fbank, 0, 1).unsqueeze(0)
        if self.args.freqm != 0:
            fbank = ta.transforms.FrequencyMasking(self.args.freqm)(fbank)
        if self.args.timem != 0:
            fbank = ta.transforms.TimeMasking(self.args.timem)(fbank)
        fbank = torch.transpose(fbank.squeeze(0), 0, 1)
        fbank = fbank.cpu()

        # Normalise the input; skipped when collecting normalisation stats.
        if not self.args.skip_norm:
            fbank = (fbank - self.args.norm_mean) / (self.args.norm_std * 2)

        if self.args.add_noise:
            fbank = fbank + torch.rand(fbank.shape[0], fbank.shape[1]) * np.random.rand() / 10
            fbank = torch.roll(fbank, np.random.randint(-10, 10), 0)

        # Branch on whether mixup happened rather than on the numeric value
        # of mix_lambda, which the feature extractor reports as 0 without mixup.
        label_indices = self._build_label(item, mix_item, mix_lambda)

        return fbank, label_indices


class TestSet(td.Dataset):
    """Evaluation dataset yielding ``(fbank, label_vector, path)`` triples.

    No mixup, masking or noise is applied; only feature extraction and
    (unless ``args.skip_norm`` is set) normalisation.
    """

    def __init__(self, args, dataset, classes_labels):
        self.args = args
        self.items = dataset
        self.classes_labels = classes_labels
        self.fbank = Kaldi_Fbank(args)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, index):
        """Return ``(fbank, label_vector, wav_path)`` for the item at ``index``."""
        entry = self.items[index]

        # read wav, remove its mean and move it to the configured device
        wav, sr = ta.load(entry.path)
        wav = (wav - wav.mean()).to(self.args.device)

        # the mixing weight returned by the extractor is unused here
        feats, _ = self.fbank(wav, sr)
        feats = feats.cpu()

        # Normalise the input; skipped when collecting normalisation stats.
        if not self.args.skip_norm:
            feats = (feats - self.args.norm_mean) / (self.args.norm_std * 2)

        # multi-hot target: 1.0 at the index of every listed label
        target = np.zeros(self.args.num_classes)
        for name in entry.label.split(','):
            target[int(self.classes_labels[name])] = 1.0

        return feats, torch.FloatTensor(target), entry.path
58 changes: 58 additions & 0 deletions ESC50/AST/data/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import csv
import json
from dataclasses import dataclass

@dataclass
class DataItem:
    """One dataset entry: a waveform location and its label string."""

    # location of the audio file for this entry
    path: str
    # label key(s) for this entry, as a single string
    label: str

class Datasets:
    """Training items, evaluation items and the class-label index for one fold.

    The per-fold JSON files are located by replacing ``'.json'`` in the
    configured paths with ``'_<fold>.json'``.
    """

    def __init__(self, args, fold):
        """Load all three tables.

        Args:
            args: options providing ``path_training_dataset``,
                ``path_evaluation_dataset`` and ``path_data_label``.
            fold: fold identifier inserted into the JSON file names.
        """
        # train_set
        self.__train_set = self._load_items(args.path_training_dataset, fold)

        # label key -> class index (kept as the string read from the CSV)
        self.__classes_labels = self._load_class_index(args.path_data_label)

        # eval_set
        self.__evaluation_set = self._load_items(args.path_evaluation_dataset, fold)

    @staticmethod
    def _load_items(path_template, fold):
        """Read the fold's JSON file and return its ``'data'`` records as DataItems."""
        path = path_template.replace('.json', '_{}.json'.format(fold))
        with open(path, 'r') as fp:
            records = json.load(fp)['data']
        return [DataItem(path=rec['wav'], label=rec['labels']) for rec in records]

    @staticmethod
    def _load_class_index(path):
        """Read the label CSV and map each row's ``'mid'`` to its ``'index'``."""
        with open(path, 'r') as f:
            return {row['mid']: row['index'] for row in csv.DictReader(f)}

    @property
    def train_set(self):
        """List of training ``DataItem`` objects."""
        return self.__train_set

    @property
    def classes_labels(self):
        """Dict mapping a label key (CSV column ``mid``) to its ``index`` string."""
        return self.__classes_labels

    @property
    def evaluation_set(self):
        """List of evaluation ``DataItem`` objects."""
        return self.__evaluation_set
Loading

0 comments on commit 0a2053d

Please sign in to comment.