migrate to diffusers!
teticio committed Dec 8, 2022
1 parent 52ca0bd commit 8a3fb2e
Showing 8 changed files with 195 additions and 105 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -23,6 +23,8 @@ Go to https://soundcloud.com/teticio2/sets/audio-diffusion-loops for more examples
---
#### Updates

**5/12/2022**. 🤗 Exciting news! `AudioDiffusionPipeline` has been migrated to the Hugging Face `diffusers` package, making it even easier for others to use and to contribute.
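In practice, generation now goes through `diffusers` directly. A minimal sketch (the model id is a real checkpoint from this project; the output attributes and `get_sample_rate` call are assumed from the `diffusers` 0.10 API):

```python
import torch
from diffusers import AudioDiffusionPipeline

# Minimal sketch, assuming the diffusers 0.10 API: load a pre-trained
# pipeline from the Hub and generate one spectrogram/audio pair.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device)

output = pipe()
image = output.images[0]    # PIL image of the Mel spectrogram
audio = output.audios[0]    # numpy array with the decoded audio
sample_rate = pipe.mel.get_sample_rate()
```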

**2/12/2022**. Added Mel to pipeline and updated the pretrained models to save Mel config (they are now no longer compatible with previous versions of this repo). It is relatively straightforward to migrate previously trained models to the new format (see https://huggingface.co/teticio/audio-diffusion-256).

**7/11/2022**. Added pre-trained latent audio diffusion models [teticio/latent-audio-diffusion-256](https://huggingface.co/teticio/latent-audio-diffusion-256) and [teticio/latent-audio-diffusion-ddim-256](https://huggingface.co/teticio/latent-audio-diffusion-ddim-256). You can use the pre-trained VAE to train your own latent diffusion models on a different set of audio files.
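Loading that VAE on its own might look like the following sketch (the `vqvae` subfolder name is an assumption about how the pipeline repo is laid out, not something stated in this commit):

```python
from diffusers import AutoencoderKL

# Sketch, not taken from this commit: pull just the VAE weights out of the
# pre-trained latent pipeline repo (the "vqvae" subfolder name is assumed).
vae = AutoencoderKL.from_pretrained(
    "teticio/latent-audio-diffusion-256", subfolder="vqvae"
)
```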
@@ -62,12 +64,20 @@ You can play around with some pre-trained models on [Google Colab](https://colab

## Generate Mel spectrogram dataset from directory of audio files

#### Install
#### Install from GitHub (includes training scripts)

```bash
git clone https://github.com/teticio/audio-diffusion.git
cd audio-diffusion
pip install .
```

#### Install from PyPI

```bash
pip install audiodiffusion
```
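Once installed, a quick smoke test might look like this sketch (the `AudioDiffusion` wrapper class and `generate_spectrogram_and_audio` method are defined in `audiodiffusion/__init__.py`; the model id is one of the pre-trained checkpoints mentioned above):

```python
from audiodiffusion import AudioDiffusion

# Sketch using the AudioDiffusion wrapper from this repo; generation is
# delegated to the underlying diffusers pipeline.
audio_diffusion = AudioDiffusion(model_id="teticio/audio-diffusion-256")
image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio()
```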

#### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial-grade GPU (e.g. an RTX 2080 Ti); the `hop_length` should be set to 1024 for better results

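The diff view truncates the command itself; a hypothetical invocation (script name from this repo, flag names assumed rather than taken from this commit) might look like:

```bash
# Hypothetical flags -- check scripts/train_unconditional.py --help for the real ones
python scripts/train_unconditional.py \
  --dataset_name data/audio-diffusion-64 \
  --hop_length 1024 \
  --output_dir models/audio-diffusion-64
```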
93 changes: 7 additions & 86 deletions audiodiffusion/__init__.py
@@ -1,13 +1,13 @@
from typing import Iterable, Tuple, Union
from typing import Iterable, Tuple

import torch
import numpy as np
from PIL import Image
from tqdm.auto import tqdm
from librosa.beat import beat_track
#from diffusers import DiffusionPipeline
from diffusers import AudioDiffusionPipeline

VERSION = "1.3.1"
VERSION = "1.3.2"


class AudioDiffusion:
@@ -131,6 +131,7 @@ def loop_it(audio: np.ndarray,
return None


'''
# This code will be migrated to diffusers shortly
#-----------------------------------------------------------------------------#
@@ -140,6 +141,7 @@ def loop_it(audio: np.ndarray,
from typing import Any, Dict, Optional, Union
from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.schedulers.scheduling_utils import SchedulerMixin
warnings.filterwarnings("ignore")
@@ -150,7 +152,7 @@ def loop_it(audio: np.ndarray,
from PIL import Image # noqa: E402
class Mel(ConfigMixin):
class Mel(ConfigMixin, SchedulerMixin):
"""
Parameters:
x_res (`int`): x resolution of spectrogram (time)
@@ -272,88 +274,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray:
)
return audio
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Dict[str, Any] = None,
subfolder: Optional[str] = None,
return_unused_kwargs=False,
**kwargs,
):
r"""
Instantiate a Mel class from a pre-defined JSON configuration file inside a directory or Hub repo.
Parameters:
pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
Can be either:
- A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
organization name, like `google/ddpm-celebahq-256`.
- A path to a *directory* containing the mel configurations saved using [`~Mel.save_pretrained`],
e.g., `./my_model_directory/`.
subfolder (`str`, *optional*):
In case the relevant files are located inside a subfolder of the model repo (either remote in
huggingface.co or downloaded locally), you can specify the folder name here.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
Whether kwargs that are not consumed by the Python class should be returned or not.
cache_dir (`Union[str, os.PathLike]`, *optional*):
Path to a directory in which a downloaded pretrained model configuration should be cached if the
standard cache should not be used.
force_download (`bool`, *optional*, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
use_auth_token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
<Tip>
It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
models](https://huggingface.co/docs/hub/models-gated#gated-models).
</Tip>
<Tip>
Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
use this method in a firewalled environment.
</Tip>
"""
config, kwargs = cls.load_config(
pretrained_model_name_or_path=pretrained_model_name_or_path,
subfolder=subfolder,
return_unused_kwargs=True,
**kwargs,
)
return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)

def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
"""
Save a mel configuration object to the directory `save_directory`, so that it can be re-loaded using the
[`~Mel.from_pretrained`] class method.
Args:
save_directory (`str` or `os.PathLike`):
Directory where the configuration JSON file will be saved (will be created if it does not exist).
"""
self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)

#-----------------------------------------------------------------------------#
from math import acos, sin
@@ -603,3 +523,4 @@ class audio_diffusion():
setattr(diffusers, AudioDiffusionPipeline.__name__, AudioDiffusionPipeline)
diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion'] = {}
diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion']['Mel'] = ["save_pretrained", "from_pretrained"]
'''
179 changes: 170 additions & 9 deletions notebooks/test_model.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
torch
numpy
Pillow
diffusers>=0.9.0
diffusers>=0.10.0
librosa
datasets
gradio
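This bump presumably tracks the `diffusers` release in which `AudioDiffusionPipeline` first shipped; existing environments can be brought up to date with:

```bash
pip install --upgrade "diffusers>=0.10.0"
```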
3 changes: 1 addition & 2 deletions scripts/audio_to_images.py
@@ -7,10 +7,9 @@
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from diffusers.pipelines.audio_diffusion import Mel
from datasets import Dataset, DatasetDict, Features, Image, Value

from audiodiffusion import Mel

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger('audio_to_images')

3 changes: 2 additions & 1 deletion scripts/train_unconditional.py
@@ -11,11 +11,13 @@
from accelerate.logging import get_logger
from datasets import load_from_disk, load_dataset
from diffusers import (
AudioDiffusionPipeline,
DDPMScheduler,
UNet2DModel,
DDIMScheduler,
AutoencoderKL,
)
from diffusers.pipelines.audio_diffusion import Mel
from huggingface_hub import HfFolder, Repository, whoami
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel
@@ -27,7 +29,6 @@
import numpy as np
from tqdm.auto import tqdm
from librosa.util import normalize
from audiodiffusion import AudioDiffusionPipeline, Mel

logger = get_logger(__name__)

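With these imports, the training script can assemble and save the model entirely from `diffusers` components. A sketch of the idea (constructor signature assumed from the `diffusers` 0.10 API; the hyperparameters are placeholders, not the values used by this repo's training script):

```python
from diffusers import AudioDiffusionPipeline, DDPMScheduler, UNet2DModel
from diffusers.pipelines.audio_diffusion import Mel

# Sketch, assuming the diffusers 0.10 constructor signature; hyperparameter
# values here are placeholders.
pipeline = AudioDiffusionPipeline(
    vqvae=None,  # or an AutoencoderKL, for latent diffusion
    unet=UNet2DModel(sample_size=64, in_channels=1, out_channels=1),
    mel=Mel(x_res=64, y_res=64, hop_length=1024),
    scheduler=DDPMScheduler(),
)
pipeline.save_pretrained("path/to/output")
```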
6 changes: 2 additions & 4 deletions scripts/train_vae.py
@@ -14,13 +14,11 @@
from pytorch_lightning.trainer import Trainer
from torch.utils.data import DataLoader, Dataset
from datasets import load_from_disk, load_dataset
from diffusers.pipelines.audio_diffusion import Mel
from audiodiffusion.utils import convert_ldm_to_hf_vae
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.utilities.distributed import rank_zero_only

#from diffusers import Mel
from audiodiffusion import Mel
from audiodiffusion.utils import convert_ldm_to_hf_vae


class AudioDiffusion(Dataset):

2 changes: 1 addition & 1 deletion setup.cfg
@@ -15,6 +15,6 @@ install_requires =
torch
numpy
Pillow
diffusers>=0.9.0
diffusers>=0.10.0
librosa
datasets
