migrate to diffusers!
teticio committed Dec 8, 2022
1 parent 52ca0bd commit 8a3fb2e
Showing 8 changed files with 195 additions and 105 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -23,6 +23,8 @@ Go to https://soundcloud.com/teticio2/sets/audio-diffusion-loops for more examples
---
#### Updates

**5/12/2022**. 🤗 Exciting news! `AudioDiffusionPipeline` has been migrated to the Hugging Face `diffusers` package, making it even easier for others to use and to contribute.
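In practice, generation now goes through `diffusers` directly. A minimal sketch (the model id is a real checkpoint from this project; the output attributes and `get_sample_rate` call are assumed from the `diffusers` 0.10 API):

```python
import torch
from diffusers import AudioDiffusionPipeline

# Minimal sketch, assuming the diffusers 0.10 API: load a pre-trained
# pipeline from the Hub and generate one spectrogram/audio pair.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)

output = pipe()
image = output.images[0]    # PIL image of the Mel spectrogram
audio = output.audios[0]    # numpy array with the decoded audio
sample_rate = pipe.mel.get_sample_rate()
```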

**2/12/2022**. Added Mel to pipeline and updated the pretrained models to save Mel config (they are now no longer compatible with previous versions of this repo). It is relatively straightforward to migrate previously trained models to the new format (see https://huggingface.co/teticio/audio-diffusion-256).

**7/11/2022**. Added pre-trained latent audio diffusion models [teticio/latent-audio-diffusion-256](https://huggingface.co/teticio/latent-audio-diffusion-256) and [teticio/latent-audio-diffusion-ddim-256](https://huggingface.co/teticio/latent-audio-diffusion-ddim-256). You can use the pre-trained VAE to train your own latent diffusion models on a different set of audio files.
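Loading that VAE on its own might look like the following sketch (the `vqvae` subfolder name is an assumption about how the pipeline repo is laid out, not something stated in this commit):

```python
from diffusers import AutoencoderKL

# Sketch, not taken from this commit: pull just the VAE weights out of the
# pre-trained latent pipeline repo (the "vqvae" subfolder name is assumed).
vae = AutoencoderKL.from_pretrained(
    "teticio/latent-audio-diffusion-256", subfolder="vqvae"
)
```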
@@ -62,12 +64,20 @@ You can play around with some pre-trained models on [Google Colab](https://colab

## Generate Mel spectrogram dataset from directory of audio files

#### Install
#### Install from GitHub (includes training scripts)

```bash
git clone https://github.com/teticio/audio-diffusion.git
cd audio-diffusion
pip install .
```

#### Install from PyPI

```bash
pip install audiodiffusion
```
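Once installed, a quick smoke test might look like this sketch (the `AudioDiffusion` wrapper class and `generate_spectrogram_and_audio` method are defined in `audiodiffusion/__init__.py`; the model id is one of the pre-trained checkpoints mentioned above):

```python
from audiodiffusion import AudioDiffusion

# Sketch using the AudioDiffusion wrapper from this repo; generation is
# delegated to the underlying diffusers pipeline.
audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
```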

#### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial-grade GPU (e.g. an RTX 2080 Ti); the `hop_length` should be set to 1024 for better results

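The diff view truncates the command itself; a hypothetical invocation (script name from this repo, flag names assumed rather than taken from this commit) might look like:

```bash
# Hypothetical flags -- check scripts/train_unconditional.py --help for the real ones
python scripts/train_unconditional.py \
  --dataset_name data/audio-diffusion-64 \
  --hop_length 1024 \
  --output_dir models/audio-diffusion-64
```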
93 changes: 7 additions & 86 deletions audiodiffusion/__init__.py
@@ -1,13 +1,13 @@
from typing import Iterable, Tuple, Union
from typing import Iterable, Tuple

import torch
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from librosa.beat import beat_track
#from diffusers import DiffusionPipeline
from diffusers import AudioDiffusionPipeline

VERSION = "1.3.1"
VERSION = "1.3.2"


class AudioDiffusion:
@@ -131,6 +131,7 @@ def loop_it(audio: np.ndarray,
return None


'''
# This code will be migrated to diffusers shortly
#-----------------------------------------------------------------------------#
@@ -140,6 +141,7 @@ def loop_it(audio: np.ndarray,
from typing import Any, Dict, Optional, Union
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.schedulers.scheduling_utils import SchedulerMixin
warnings.filterwarnings("ignore")
@@ -150,7 +152,7 @@ def loop_it(audio: np.ndarray,
from PIL import Image # noqa: E402
class Mel(ConfigMixin):
class Mel(ConfigMixin, SchedulerMixin):
"""
Parameters:
x_res (`int`): x resolution of spectrogram (time)
@@ -272,88 +274,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray:
)
return audio
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Dict[str, Any] = None,
subfolder: Optional[str] = None,
return_unused_kwargs=False,
**kwargs,
):
r"""
Instantiate a Mel class from a pre-defined JSON configuration file inside a directory or Hub repo.
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
organization name, like `google/ddpm-celebahq-256`.
- A path to a *directory* containing the mel configurations saved using [`~Mel.save_pretrained`],
e.g., `./my_model_directory/`.
subfolder (`str`, *optional*):
In case the relevant files are located inside a subfolder of the model repo (either remote in
huggingface.co or downloaded locally), you can specify the folder name here.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
Whether kwargs that are not consumed by the Python class should be returned or not.
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
use_auth_token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
<Tip>
It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
models](https://huggingface.co/docs/hub/models-gated#gated-models).
</Tip>
<Tip>
Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
use this method in a firewalled environment.
</Tip>
"""
config, kwargs = cls.load_config(
pretrained_model_name_or_path=pretrained_model_name_or_path,
subfolder=subfolder,
return_unused_kwargs=True,
**kwargs,
)
return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)

def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
"""
Save a mel configuration object to the directory `save_directory`, so that it can be re-loaded using the
[`~Mel.from_pretrained`] class method.
Args:
save_directory (`str` or `os.PathLike`):
Directory where the configuration JSON file will be saved (will be created if it does not exist).
"""
self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)

#-----------------------------------------------------------------------------#
from math import acos, sin
@@ -603,3 +523,4 @@ class audio_diffusion():
setattr(diffusers, AudioDiffusionPipeline.__name__, AudioDiffusionPipeline)
diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion'] = {}
diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion']['Mel'] = ["save_pretrained", "from_pretrained"]
'''
179 changes: 170 additions & 9 deletions notebooks/test_model.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
torch
numpy
Pillow
diffusers>=0.9.0
diffusers>=0.10.0
librosa
datasets
gradio
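This bump presumably tracks the `diffusers` release in which `AudioDiffusionPipeline` first shipped; existing environments can be brought up to date with:

```bash
pip install --upgrade "diffusers>=0.10.0"
```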
3 changes: 1 addition & 2 deletions scripts/audio_to_images.py
@@ -7,10 +7,9 @@
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from diffusers.pipelines.audio_diffusion import Mel
from datasets import Dataset, DatasetDict, Features, Image, Value

from audiodiffusion import Mel

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger('audio_to_images')

3 changes: 2 additions & 1 deletion scripts/train_unconditional.py
@@ -11,11 +11,13 @@
from accelerate.logging import get_logger
from datasets import load_from_disk, load_dataset
from diffusers import (
AudioDiffusionPipeline,
DDPMScheduler,
UNet2DModel,
DDIMScheduler,
AutoencoderKL,
)
from diffusers.pipelines.audio_diffusion import Mel
from huggingface_hub import HfFolder, Repository, whoami
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel
@@ -27,7 +29,6 @@
import numpy as np
from tqdm.auto import tqdm
from librosa.util import normalize
from audiodiffusion import AudioDiffusionPipeline, Mel

logger = get_logger(__name__)

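With these imports, the training script can assemble and save the model entirely from `diffusers` components. A sketch of the idea (constructor signature assumed from the `diffusers` 0.10 API; the hyperparameters are placeholders, not the values used by this repo's training script):

```python
from diffusers import AudioDiffusionPipeline, DDPMScheduler, UNet2DModel
from diffusers.pipelines.audio_diffusion import Mel

# Sketch, assuming the diffusers 0.10 constructor signature; hyperparameter
# values here are placeholders.
pipeline = AudioDiffusionPipeline(
    vqvae=None,  # or an AutoencoderKL, for latent diffusion
    unet=UNet2DModel(sample_size=64, in_channels=1, out_channels=1),
    mel=Mel(x_res=64, y_res=64, hop_length=1024),
    scheduler=DDPMScheduler(),
)
pipeline.save_pretrained("path/to/output")
```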
6 changes: 2 additions & 4 deletions scripts/train_vae.py
@@ -14,13 +14,11 @@
from pytorch_lightning.trainer import Trainer
from torch.utils.data import DataLoader, Dataset
from datasets import load_from_disk, load_dataset
from diffusers.pipelines.audio_diffusion import Mel
from audiodiffusion.utils import convert_ldm_to_hf_vae
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.utilities.distributed import rank_zero_only

#from diffusers import Mel
from audiodiffusion import Mel
from audiodiffusion.utils import convert_ldm_to_hf_vae


class AudioDiffusion(Dataset):

2 changes: 1 addition & 1 deletion setup.cfg
@@ -15,6 +15,6 @@ install_requires =
torch
numpy
Pillow
diffusers>=0.9.0
diffusers>=0.10.0
librosa
datasets
