From 8a3fb2e3e88ef9686be271436b1157a7ac76d0d1 Mon Sep 17 00:00:00 2001 From: teticio Date: Thu, 8 Dec 2022 21:11:15 +0000 Subject: [PATCH] migrate to diffusers! --- README.md | 12 ++- audiodiffusion/__init__.py | 93 ++--------------- notebooks/test_model.ipynb | 179 +++++++++++++++++++++++++++++++-- requirements.txt | 2 +- scripts/audio_to_images.py | 3 +- scripts/train_unconditional.py | 3 +- scripts/train_vae.py | 6 +- setup.cfg | 2 +- 8 files changed, 195 insertions(+), 105 deletions(-) diff --git a/README.md b/README.md index 11e0dca..5287a64 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ Go to https://soundcloud.com/teticio2/sets/audio-diffusion-loops for more exampl --- #### Updates +**5/12/2022**. 🤗 Exciting news! `AudioDiffusionPipeline` has been migrated to the Hugging Face `diffusers` package so that it is even easier for others to use and contribute to. + **2/12/2022**. Added Mel to pipeline and updated the pretrained models to save Mel config (they are now no longer compatible with previous versions of this repo). It is relatively straightforward to migrate previously trained models to the new format (see https://huggingface.co/teticio/audio-diffusion-256). **7/11/2022**. Added pre-trained latent audio diffusion models [teticio/latent-audio-diffusion-256](https://huggingface.co/teticio/latent-audio-diffusion-256) and [teticio/latent-audio-diffusion-ddim-256](https://huggingface.co/teticio/latent-audio-diffusion-ddim-256). You can use the pre-trained VAE to train your own latent diffusion models on a different set of audio files. @@ -62,12 +64,20 @@ You can play around with some pre-trained models on [Google Colab](https://colab ## Generate Mel spectrogram dataset from directory of audio files -#### Install +#### Install from GitHub (includes training scripts) ```bash +git clone https://github.com/teticio/audio-diffusion.git +cd audio-diffusion pip install . ``` +#### Install from PyPI + +```bash +pip install audiodiffusion +``` + #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti).
The `hop_length` should be set to 1024 for better results ```bash diff --git a/audiodiffusion/__init__.py b/audiodiffusion/__init__.py index 27fda4c..22ff5ec 100644 --- a/audiodiffusion/__init__.py +++ b/audiodiffusion/__init__.py @@ -1,13 +1,13 @@ -from typing import Iterable, Tuple, Union +from typing import Iterable, Tuple import torch import numpy as np from PIL import Image from tqdm.auto import tqdm from librosa.beat import beat_track -#from diffusers import DiffusionPipeline +from diffusers import AudioDiffusionPipeline -VERSION = "1.3.1" +VERSION = "1.3.2" class AudioDiffusion: @@ -131,6 +131,7 @@ def loop_it(audio: np.ndarray, return None +''' # This code will be migrated to diffusers shortly #-----------------------------------------------------------------------------# @@ -140,6 +141,7 @@ def loop_it(audio: np.ndarray, from typing import Any, Dict, Optional, Union from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin warnings.filterwarnings("ignore") @@ -150,7 +152,7 @@ def loop_it(audio: np.ndarray, from PIL import Image # noqa: E402 -class Mel(ConfigMixin): +class Mel(ConfigMixin, SchedulerMixin): """ Parameters: x_res (`int`): x resolution of spectrogram (time) @@ -272,88 +274,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray: ) return audio - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Dict[str, Any] = None, - subfolder: Optional[str] = None, - return_unused_kwargs=False, - **kwargs, - ): - r""" - Instantiate a Mel class from a pre-defined JSON configuration file inside a directory or Hub repo. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an - organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing the mel configurations saved using [`~Mel.save_pretrained`], - e.g., `./my_model_directory/`. - subfolder (`str`, *optional*): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - return_unused_kwargs (`bool`, *optional*, defaults to `False`): - Whether kwargs that are not consumed by the Python class should be returned or not. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). 
- use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - - - - - Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to - use this method in a firewalled environment. - - - - """ - config, kwargs = cls.load_config( - pretrained_model_name_or_path=pretrained_model_name_or_path, - subfolder=subfolder, - return_unused_kwargs=True, - **kwargs, - ) - return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save a mel configuration object to the directory `save_directory`, so that it can be re-loaded using the - [`~Mel.from_pretrained`] class method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). - """ - self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) - #-----------------------------------------------------------------------------# from math import acos, sin @@ -603,3 +523,4 @@ class audio_diffusion(): setattr(diffusers, AudioDiffusionPipeline.__name__, AudioDiffusionPipeline) diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion'] = {} diffusers.pipeline_utils.LOADABLE_CLASSES['audio_diffusion']['Mel'] = ["save_pretrained", "from_pretrained"] +''' diff --git a/notebooks/test_model.ipynb b/notebooks/test_model.ipynb index 560e1c7..95e4b35 100644 --- a/notebooks/test_model.ipynb +++ b/notebooks/test_model.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "6c7800a6", "metadata": {}, "outputs": [], @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "b447e2c4", "metadata": {}, "outputs": [], @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "c2fc0e7a", "metadata": {}, "outputs": [], @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "b294a94a", "metadata": {}, "outputs": [], @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "97f24046", "metadata": {}, "outputs": [], @@ -98,10 +98,61 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, + "id": "88bebba3", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'AudioDiffusion' object has no attribute 'Mel'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [11]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43maudio_diffusion\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mMel\u001b[49m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'AudioDiffusion' object has no attribute 'Mel'" + ] + } + ], + "source": [ + "audio_diffusion.Mel" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "a3d45c36", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "89e8b4345bab47378576244f4d3f7b44", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/244 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Seed = 2275699277188148\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e049bd4fb00542feba252d9f9da2334d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1000 [00:00\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSeed = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mseed\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 4\u001b[0m generator\u001b[38;5;241m.\u001b[39mmanual_seed(seed)\n\u001b[1;32m 5\u001b[0m image, (sample_rate,\n\u001b[0;32m----> 6\u001b[0m audio) \u001b[38;5;241m=\u001b[39m \u001b[43maudio_diffusion\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_spectrogram_and_audio\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 8\u001b[0m display(image)\n\u001b[1;32m 9\u001b[0m display(Audio(audio, rate\u001b[38;5;241m=\u001b[39msample_rate))\n", + "File \u001b[0;32m~/ML/huggingface/audio-diffusion/audiodiffusion/__init__.py:54\u001b[0m, in \u001b[0;36mAudioDiffusion.generate_spectrogram_and_audio\u001b[0;34m(self, steps, generator, step_generator, eta, noise)\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mgenerate_spectrogram_and_audio\u001b[39m(\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 34\u001b[0m steps: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 38\u001b[0m noise: torch\u001b[38;5;241m.\u001b[39mTensor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 39\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tuple[Image\u001b[38;5;241m.\u001b[39mImage, Tuple[\u001b[38;5;28mint\u001b[39m, np\u001b[38;5;241m.\u001b[39mndarray]]:\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03m\"\"\"Generate random mel spectrogram and convert to audio.\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \n\u001b[1;32m 42\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m (float, np.ndarray): sample rate and raw audio\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 53\u001b[0m images, (sample_rate,\n\u001b[0;32m---> 54\u001b[0m audios) 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpipe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 55\u001b[0m \u001b[43m \u001b[49m\u001b[43msteps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msteps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgenerator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 57\u001b[0m \u001b[43m \u001b[49m\u001b[43mstep_generator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 58\u001b[0m \u001b[43m \u001b[49m\u001b[43meta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 59\u001b[0m \u001b[43m \u001b[49m\u001b[43mnoise\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnoise\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m images[\u001b[38;5;241m0\u001b[39m], (sample_rate, audios[\u001b[38;5;241m0\u001b[39m])\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27\u001b[0m, in \u001b[0;36m_DecoratorContextManager.__call__..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclone():\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/ML/huggingface/diffusers/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py:160\u001b[0m, in \u001b[0;36mAudioDiffusionPipeline.__call__\u001b[0;34m(self, batch_size, audio_file, raw_audio, slice, start_step, steps, generator, mask_start_secs, mask_end_secs, step_generator, eta, noise, return_dict)\u001b[0m\n\u001b[1;32m 157\u001b[0m mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39madd_noise(input_images, noise, torch\u001b[38;5;241m.\u001b[39mtensor(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mtimesteps[start_step:]))\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m step, t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprogress_bar(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mtimesteps[start_step:])):\n\u001b[0;32m--> 160\u001b[0m model_output \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munet\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msample\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 162\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler, DDIMScheduler):\n\u001b[1;32m 163\u001b[0m images \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mstep(\n\u001b[1;32m 164\u001b[0m model_output\u001b[38;5;241m=\u001b[39mmodel_output, timestep\u001b[38;5;241m=\u001b[39mt, sample\u001b[38;5;241m=\u001b[39mimages, eta\u001b[38;5;241m=\u001b[39meta, generator\u001b[38;5;241m=\u001b[39mstep_generator\n\u001b[1;32m 165\u001b[0m )[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprev_sample\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", + "File \u001b[0;32m~/ML/huggingface/diffusers/src/diffusers/models/unet_2d.py:247\u001b[0m, in \u001b[0;36mUNet2DModel.forward\u001b[0;34m(self, sample, timestep, return_dict)\u001b[0m\n\u001b[1;32m 245\u001b[0m sample, skip_sample \u001b[38;5;241m=\u001b[39m upsample_block(sample, res_samples, emb, skip_sample)\n\u001b[1;32m 246\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 247\u001b[0m sample \u001b[38;5;241m=\u001b[39m \u001b[43mupsample_block\u001b[49m\u001b[43m(\u001b[49m\u001b[43msample\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mres_samples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43memb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 249\u001b[0m \u001b[38;5;66;03m# 6. 
post-process\u001b[39;00m\n\u001b[1;32m 250\u001b[0m sample \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv_norm_out(sample)\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", + "File \u001b[0;32m~/ML/huggingface/diffusers/src/diffusers/models/unet_2d_blocks.py:1317\u001b[0m, in \u001b[0;36mUpBlock2D.forward\u001b[0;34m(self, hidden_states, res_hidden_states_tuple, temb, upsample_size)\u001b[0m\n\u001b[1;32m 1315\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mcheckpoint\u001b[38;5;241m.\u001b[39mcheckpoint(create_custom_forward(resnet), hidden_states, temb)\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1317\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[43mresnet\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupsamplers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1320\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m upsampler \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupsamplers:\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", + "File \u001b[0;32m~/ML/huggingface/diffusers/src/diffusers/models/resnet.py:467\u001b[0m, in \u001b[0;36mResnetBlock2D.forward\u001b[0;34m(self, input_tensor, temb)\u001b[0m\n\u001b[1;32m 464\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv1(hidden_states)\n\u001b[1;32m 466\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m temb \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 467\u001b[0m temb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtime_emb_proj\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnonlinearity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtemb\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m[:, :, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m]\n\u001b[1;32m 468\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m hidden_states \u001b[38;5;241m+\u001b[39m temb\n\u001b[1;32m 470\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm2(hidden_states)\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1131\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mlinear(\u001b[38;5;28minput\u001b[39m, \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbias)\n", + "File \u001b[0;32m~/.local/share/virtualenvs/huggingface-OfWfm_Zx/lib/python3.10/site-packages/torch/nn/modules/module.py:1194\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1191\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_is_full_backward_hook\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m:\n\u001b[1;32m 1192\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_is_full_backward_hook \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1194\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getattr__\u001b[39m(\u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tensor, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mModule\u001b[39m\u001b[38;5;124m'\u001b[39m]:\n\u001b[1;32m 1195\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_parameters\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m:\n\u001b[1;32m 1196\u001b[0m _parameters \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__dict__\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_parameters\u001b[39m\u001b[38;5;124m'\u001b[39m]\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "for _ in range(10):\n", " seed = generator.seed()\n", diff --git a/requirements.txt b/requirements.txt index ab0083d..38e2823 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ torch numpy Pillow -diffusers>=0.9.0 +diffusers>=0.10.0 librosa datasets gradio diff --git a/scripts/audio_to_images.py b/scripts/audio_to_images.py index 54f5a85..12ac448 100644 --- a/scripts/audio_to_images.py +++ b/scripts/audio_to_images.py @@ -7,10 +7,9 @@ import numpy as np import pandas as pd from tqdm.auto import tqdm +from diffusers.pipelines.audio_diffusion import Mel from datasets import Dataset, DatasetDict, Features, Image, Value -from audiodiffusion import Mel - logging.basicConfig(level=logging.WARN) logger = logging.getLogger('audio_to_images') diff --git a/scripts/train_unconditional.py b/scripts/train_unconditional.py index 
f82d019..52e322b 100644 --- a/scripts/train_unconditional.py +++ b/scripts/train_unconditional.py @@ -11,11 +11,13 @@ from accelerate.logging import get_logger from datasets import load_from_disk, load_dataset from diffusers import ( + AudioDiffusionPipeline, DDPMScheduler, UNet2DModel, DDIMScheduler, AutoencoderKL, ) +from diffusers.pipelines.audio_diffusion import Mel from huggingface_hub import HfFolder, Repository, whoami from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel @@ -27,7 +29,6 @@ import numpy as np from tqdm.auto import tqdm from librosa.util import normalize -from audiodiffusion import AudioDiffusionPipeline, Mel logger = get_logger(__name__) diff --git a/scripts/train_vae.py b/scripts/train_vae.py index 9f40268..fc3306d 100644 --- a/scripts/train_vae.py +++ b/scripts/train_vae.py @@ -14,13 +14,11 @@ from pytorch_lightning.trainer import Trainer from torch.utils.data import DataLoader, Dataset from datasets import load_from_disk, load_dataset +from diffusers.pipelines.audio_diffusion import Mel +from audiodiffusion.utils import convert_ldm_to_hf_vae from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.utilities.distributed import rank_zero_only -#from diffusers import Mel -from audiodiffusion import Mel -from audiodiffusion.utils import convert_ldm_to_hf_vae - class AudioDiffusion(Dataset): diff --git a/setup.cfg b/setup.cfg index db35447..290b46f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,6 @@ install_requires = torch numpy Pillow - diffusers>=0.9.0 + diffusers>=0.10.0 librosa datasets
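After this patch, generation is driven by the `diffusers` implementation of `AudioDiffusionPipeline` rather than the copy that previously lived in `audiodiffusion/__init__.py`. Below is a minimal usage sketch: the call signature and the `return_dict=False` output shape follow `AudioDiffusionPipeline.__call__` as it appears in the notebook traceback above, while the model id `teticio/audio-diffusion-256`, the seed and the output filename are illustrative assumptions.

```python
import torch
from diffusers import AudioDiffusionPipeline

# Load a pre-trained model from the Hugging Face Hub (requires diffusers>=0.10.0).
pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-256")
pipe.to("cuda" if torch.cuda.is_available() else "cpu")

# With return_dict=False the pipeline returns the spectrogram images and a
# (sample_rate, audios) tuple, the same unpacking used by
# AudioDiffusion.generate_spectrogram_and_audio in audiodiffusion/__init__.py.
generator = torch.Generator(device=pipe.device).manual_seed(42)
images, (sample_rate, audios) = pipe(batch_size=1, generator=generator, return_dict=False)

images[0].save("spectrogram.png")  # PIL image of the generated mel spectrogram
audio = audios[0]                  # raw audio as a numpy array at sample_rate
```

The thin `AudioDiffusion` wrapper kept in this repo exposes the same call as `generate_spectrogram_and_audio(generator=...)`, returning `image, (sample_rate, audio)` exactly as the notebook does.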
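The dataset and training scripts now import `Mel` from `diffusers.pipelines.audio_diffusion` instead of from this package. A sketch of the slicing loop in the spirit of `scripts/audio_to_images.py` follows; the constructor arguments and method names (`load_audio`, `get_number_of_slices`, `audio_slice_to_image`) are assumed to match the `Mel` implementation this patch removes, and `example.wav` is a placeholder input.

```python
from diffusers.pipelines.audio_diffusion import Mel

# 256x256 spectrograms with the usual STFT settings; the README recommends
# hop_length=1024 when training at 64x64.
mel = Mel(x_res=256, y_res=256, sample_rate=22050, n_fft=2048, hop_length=512)

mel.load_audio("example.wav")  # placeholder audio file
for i in range(mel.get_number_of_slices()):
    image = mel.audio_slice_to_image(i)  # PIL image for one window of x_res * hop_length samples
    image.save(f"slice_{i}.png")

# The inverse direction is Mel.image_to_audio, shown in the (now commented-out)
# local implementation above, which reconstructs raw audio from a spectrogram image.
# audio = mel.image_to_audio(image)
```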