open-mmlab · Nugine · May 4, 2024 · May 4, 2024 · May 4, 2024 · May 4, 2024
diff --git a/bins/svc/inference.py b/bins/svc/inference.py
@@ -50,7 +50,7 @@ def prepare_for_audio_file(args, cfg, num_workers=1):
     acoustic_extractor.extract_utt_acoustic_features_serial(
         metadata, temp_audio_dir, cfg
     )
-    if cfg.preprocess.use_min_max_norm_mel == True:
+    if cfg.preprocess.use_min_max_norm_mel is True:
         acoustic_extractor.cal_mel_min_max(
             dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
         )

diff --git a/bins/svc/preprocess.py b/bins/svc/preprocess.py
@@ -101,7 +101,7 @@ def preprocess(cfg, args):
                 new_datasets_list.extend(new_datasets)
             cfg.dataset.extend(new_datasets_list)
             print("Augmentation datasets: ", cfg.dataset)
-    except:
+    except Exception:  # TODO: better exception handling
         print("No Data Augmentation.")
 
     # Dump metadata of datasets (singers, train/test durations, etc.)
@@ -145,7 +145,7 @@ def preprocess(cfg, args):
             continue
         dataset_dir = os.path.join(output_path, dataset)
         metadata = []
-        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
             metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
             with open(metadata_file_path, "r") as f:
                 metadata.extend(json.load(f))

diff --git a/bins/svc/train.py b/bins/svc/train.py
@@ -80,7 +80,7 @@ def main():
 
     # Data Augmentation
     if (
-        type(cfg.preprocess.data_augment) == list
+        isinstance(cfg.preprocess.data_augment, list)
         and len(cfg.preprocess.data_augment) > 0
     ):
         new_datasets_list = []

diff --git a/bins/tta/preprocess.py b/bins/tta/preprocess.py
@@ -108,7 +108,7 @@ def preprocess(cfg, args):
                 new_datasets_list.extend(new_datasets)
             cfg.dataset.extend(new_datasets_list)
             print("Augmentation datasets: ", cfg.dataset)
-    except:
+    except Exception:  # TODO: better exception handling
         print("No Data Augmentation.")
 
     # Dump metadata of datasets (singers, train/test durations, etc.)
@@ -157,7 +157,7 @@ def preprocess(cfg, args):
             continue
         dataset_dir = os.path.join(output_path, dataset)
         metadata = []
-        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
             metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
             with open(metadata_file_path, "r") as f:
                 metadata.extend(json.load(f))

diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py
@@ -134,7 +134,7 @@ def preprocess(cfg, args):
                 new_datasets_list.extend(new_datasets)
             cfg.dataset.extend(new_datasets_list)
             print("Augmentation datasets: ", cfg.dataset)
-    except:
+    except Exception:  # TODO: better exception handling
         print("No Data Augmentation.")
 
     # json files
@@ -198,7 +198,7 @@ def preprocess(cfg, args):
             continue
         dataset_dir = os.path.join(output_path, dataset)
         metadata = []
-        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
             metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
             with open(metadata_file_path, "r") as f:
                 metadata.extend(json.load(f))

diff --git a/bins/tts/train.py b/bins/tts/train.py
@@ -79,7 +79,7 @@ def main():
 
     # Data Augmentation
     if (
-        type(cfg.preprocess.data_augment) == list
+        isinstance(cfg.preprocess.data_augment, list)
         and len(cfg.preprocess.data_augment) > 0
     ):
         new_datasets_list = []

diff --git a/bins/vocoder/preprocess.py b/bins/vocoder/preprocess.py
@@ -78,7 +78,7 @@ def preprocess(cfg, args):
                 new_datasets_list.extend(new_datasets)
             cfg.dataset.extend(new_datasets_list)
             print("Augmentation datasets: ", cfg.dataset)
-    except:
+    except Exception:  # TODO: better exception handling
         print("No Data Augmentation.")
 
     # Dump metadata of datasets (singers, train/test durations, etc.)
@@ -119,7 +119,7 @@ def preprocess(cfg, args):
             continue
         dataset_dir = os.path.join(output_path, dataset)
         metadata = []
-        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+        for split in ["train", "test"] if "eval" not in dataset else ["test"]:
             metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
             with open(metadata_file_path, "r") as f:
                 metadata.extend(json.load(f))

diff --git a/evaluation/features/long_term_average_spectrum.py b/evaluation/features/long_term_average_spectrum.py
@@ -9,7 +9,7 @@
 
 def extract_ltas(audio, fs=None, n_fft=1024, hop_length=256):
     """Extract Long-Term Average Spectrum for a given audio."""
-    if fs != None:
+    if fs is not None:
         y, _ = librosa.load(audio, sr=fs)
     else:
         y, fs = librosa.load(audio)

diff --git a/evaluation/features/signal_to_noise_ratio.py b/evaluation/features/signal_to_noise_ratio.py
@@ -79,7 +79,7 @@ def getHarmonics(fund, sr, nHarmonics=6, aliased=False):
 
 def extract_snr(audio, sr=None):
     """Extract Signal-to-Noise Ratio for a given audio."""
-    if sr != None:
+    if sr is not None:
         audio, _ = librosa.load(audio, sr=sr)
     else:
         audio, sr = librosa.load(audio, sr=sr)

diff --git a/evaluation/features/singing_power_ratio.py b/evaluation/features/singing_power_ratio.py
@@ -37,7 +37,7 @@ def extract_spr(
     pitch_min: lower limit for f0 quantization.
     """
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio, _ = librosa.load(audio, sr=fs)
     else:
         audio, fs = librosa.load(audio)

diff --git a/evaluation/metrics/energy/energy_pearson_coefficients.py b/evaluation/metrics/energy/energy_pearson_coefficients.py
@@ -42,7 +42,7 @@ def extract_energy_pearson_coeffcients(
     pearson = PearsonCorrCoef()
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/energy/energy_rmse.py b/evaluation/metrics/energy/energy_rmse.py
@@ -37,7 +37,7 @@ def extract_energy_rmse(
     db_scale = kwargs["db_scale"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/f0/f0_pearson_coefficients.py b/evaluation/metrics/f0/f0_pearson_coefficients.py
@@ -46,7 +46,7 @@ def extract_fpc(
     pearson = PearsonCorrCoef()
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/f0/f0_periodicity_rmse.py b/evaluation/metrics/f0/f0_periodicity_rmse.py
@@ -31,7 +31,7 @@ def extract_f0_periodicity_rmse(
     method = kwargs["method"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/f0/f0_rmse.py b/evaluation/metrics/f0/f0_rmse.py
@@ -45,7 +45,7 @@ def extract_f0rmse(
     need_mean = kwargs["need_mean"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/f0/v_uv_f1.py b/evaluation/metrics/f0/v_uv_f1.py
@@ -44,7 +44,7 @@ def extract_f1_v_uv(
     method = kwargs["method"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/similarity/speaker_similarity.py b/evaluation/metrics/similarity/speaker_similarity.py
@@ -108,7 +108,7 @@ def extract_similarity(path_ref, path_deg, **kwargs):
                 "microsoft/wavlm-base-plus-sv"
             )
             model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv")
-        except:
+        except Exception:  # TODO: better exception handling
             feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
                 "pretrained/wavlm", sampling_rate=16000
             )

diff --git a/evaluation/metrics/spectrogram/mel_cepstral_distortion.py b/evaluation/metrics/spectrogram/mel_cepstral_distortion.py
@@ -17,7 +17,7 @@ def extract_mcd(audio_ref, audio_deg, **kwargs):
     fs = kwargs["fs"]
 
     mcd_toolbox = Calculate_MCD(MCD_mode="dtw_sl")
-    if fs != None:
+    if fs is not None:
         mcd_toolbox.SAMPLING_RATE = fs
     mcd_value = mcd_toolbox.calculate_mcd(audio_ref, audio_deg)
 

diff --git a/evaluation/metrics/spectrogram/multi_resolution_stft_distance.py b/evaluation/metrics/spectrogram/multi_resolution_stft_distance.py
@@ -29,7 +29,7 @@ def extract_mstft(
     method = kwargs["method"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/spectrogram/pesq.py b/evaluation/metrics/spectrogram/pesq.py
@@ -24,7 +24,7 @@ def extract_pesq(audio_ref, audio_deg, **kwargs):
     method = kwargs["method"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py b/evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py
@@ -19,7 +19,7 @@ def extract_si_sdr(audio_ref, audio_deg, **kwargs):
 
     si_sdr = ScaleInvariantSignalDistortionRatio()
 
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py b/evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py
@@ -19,7 +19,7 @@ def extract_si_snr(audio_ref, audio_deg, **kwargs):
 
     si_snr = ScaleInvariantSignalNoiseRatio()
 
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/evaluation/metrics/spectrogram/short_time_objective_intelligibility.py b/evaluation/metrics/spectrogram/short_time_objective_intelligibility.py
@@ -25,7 +25,7 @@ def extract_stoi(audio_ref, audio_deg, **kwargs):
     method = kwargs["method"]
 
     # Load audio
-    if fs != None:
+    if fs is not None:
         audio_ref, _ = librosa.load(audio_ref, sr=fs)
         audio_deg, _ = librosa.load(audio_deg, sr=fs)
     else:

diff --git a/models/base/base_inference.py b/models/base/base_inference.py
@@ -14,7 +14,6 @@
 from tqdm import tqdm
 
 from models.vocoders.vocoder_inference import synthesis
-from torch.utils.data import DataLoader
 from utils.util import set_all_random_seed
 from utils.util import load_config
 

diff --git a/models/codec/ns3_codec/quantize/rvq.py b/models/codec/ns3_codec/quantize/rvq.py
@@ -15,7 +15,7 @@ class ResidualVQ(nn.Module):
     def __init__(self, *, num_quantizers, codebook_size, **kwargs):
         super().__init__()
         VQ = FactorizedVectorQuantize
-        if type(codebook_size) == int:
+        if isinstance(codebook_size, int):
             codebook_size = [codebook_size] * num_quantizers
         self.layers = nn.ModuleList(
             [VQ(codebook_size=2**size, **kwargs) for size in codebook_size]

diff --git a/models/codec/ns3_codec/transformer.py b/models/codec/ns3_codec/transformer.py
@@ -129,7 +129,7 @@ def forward(self, x, key_padding_mask, conditon=None):
         else:
             x = self.ln_1(x)
 
-        if key_padding_mask != None:
+        if key_padding_mask is not None:
             key_padding_mask_input = ~(key_padding_mask.bool())
         else:
             key_padding_mask_input = None
@@ -186,7 +186,7 @@ def __init__(
         )
         self.use_cln = use_cln if use_cln is not None else cfg.use_cln
 
-        if enc_emb_tokens != None:
+        if enc_emb_tokens is not None:
             self.use_enc_emb = True
             self.enc_emb_tokens = enc_emb_tokens
         else:

diff --git a/models/svc/base/svc_dataset.py b/models/svc/base/svc_dataset.py
@@ -317,7 +317,7 @@ def __init__(self, args, cfg, infer_type):
         target_singer = args.target_singer
         self.cfg = cfg
         self.trans_key = args.trans_key
-        assert type(target_singer) == str
+        assert isinstance(target_singer, str)
 
         self.target_singer = target_singer.split("_")[-1]
         self.target_dataset = target_singer.replace(
@@ -481,9 +481,9 @@ def __getitem__(self, index):
             if self.trans_key:
                 try:
                     self.trans_key = int(self.trans_key)
-                except:
+                except Exception:  # TODO: better exception handling
                     pass
-                if type(self.trans_key) == int:
+                if isinstance(self.trans_key, int):
                     frame_pitch = transpose_key(frame_pitch, self.trans_key)
                 elif self.trans_key:
                     assert self.target_singer

diff --git a/models/tta/autoencoder/autoencoder_dataset.py b/models/tta/autoencoder/autoencoder_dataset.py
@@ -77,9 +77,6 @@ def __getitem__(self, index):
     def __len__(self):
         return len(self.metadata)
 
-    def __len__(self):
-        return len(self.metadata)
-
 
 class AutoencoderKLCollator(BaseOfflineCollator):
     def __init__(self, cfg):

diff --git a/models/tta/autoencoder/autoencoder_trainer.py b/models/tta/autoencoder/autoencoder_trainer.py
@@ -91,7 +91,7 @@ def build_criterion(self):
         return AutoencoderLossWithDiscriminator(self.cfg.model.loss)
 
     def get_state_dict(self):
-        if self.scheduler != None:
+        if self.scheduler is not None:
             state_dict = {
                 "model": self.model.state_dict(),
                 "optimizer_ae": self.optimizer["opt_ae"].state_dict(),
@@ -119,7 +119,7 @@ def load_model(self, checkpoint):
         self.model.load_state_dict(checkpoint["model"])
         self.optimizer["opt_ae"].load_state_dict(checkpoint["optimizer_ae"])
         self.optimizer["opt_disc"].load_state_dict(checkpoint["optimizer_disc"])
-        if self.scheduler != None:
+        if self.scheduler is not None:
             self.scheduler.load_state_dict(checkpoint["scheduler"])
 
     def build_model(self):

diff --git a/models/tta/ldm/audioldm_trainer.py b/models/tta/ldm/audioldm_trainer.py
@@ -15,7 +15,6 @@
 from torch.utils.data import ConcatDataset, DataLoader
 
 from transformers import T5EncoderModel
-from diffusers import DDPMScheduler
 
 
 class AudioLDMTrainer(BaseTrainer):
@@ -122,7 +121,7 @@ def build_criterion(self):
         return criterion
 
     def get_state_dict(self):
-        if self.scheduler != None:
+        if self.scheduler is not None:
             state_dict = {
                 "model": self.model.state_dict(),
                 "optimizer": self.optimizer.state_dict(),
@@ -147,7 +146,7 @@ def load_model(self, checkpoint):
 
         self.model.load_state_dict(checkpoint["model"])
         self.optimizer.load_state_dict(checkpoint["optimizer"])
-        if self.scheduler != None:
+        if self.scheduler is not None:
             self.scheduler.load_state_dict(checkpoint["scheduler"])
 
     def build_model(self):

diff --git a/models/tta/ldm/inference_utils/vocoder.py b/models/tta/ldm/inference_utils/vocoder.py
@@ -216,7 +216,7 @@ class DiscriminatorP(torch.nn.Module):
     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
         super(DiscriminatorP, self).__init__()
         self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
         self.convs = nn.ModuleList(
             [
                 norm_f(
@@ -314,7 +314,7 @@ def forward(self, y, y_hat):
 class DiscriminatorS(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
         self.convs = nn.ModuleList(
             [
                 norm_f(Conv1d(1, 128, 15, 1, padding=7)),

diff --git a/models/tts/base/tts_inferece.py b/models/tts/base/tts_inferece.py
@@ -158,7 +158,7 @@ def _load_model(
             assert checkpoint_dir is not None
             # Load the latest accelerator state dicts
             ls = [
-                str(i) for i in Path(checkpoint_dir).glob("*") if not "audio" in str(i)
+                str(i) for i in Path(checkpoint_dir).glob("*") if "audio" not in str(i)
             ]
             ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
             checkpoint_path = ls[0]

diff --git a/models/tts/base/tts_trainer.py b/models/tts/base/tts_trainer.py
@@ -9,7 +9,6 @@
 import torch
 import time
 from pathlib import Path
-import torch
 from tqdm import tqdm
 import re
 import logging
@@ -175,7 +174,7 @@ def _check_resume(self):
                     self.args.resume_type = "finetune"
                     checkpoint_dir = self.args.ar_model_ckpt_dir
                     self.logger.info(
-                        f"Training NAR model at stage 2 using the checkpoint of AR model at stage 1."
+                        "Training NAR model at stage 2 using the checkpoint of AR model at stage 1."
                     )
 
             self.logger.info(f"Resuming from checkpoint: {checkpoint_dir}")