From db9c7ca086e816705e6967cf3a4faa0028ed0e46 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 11 Jul 2024 01:37:18 +0200 Subject: [PATCH 1/8] Stable audio model example --- .ci/skipped_notebooks.yml | 5 +- notebooks/stable-audio/README.md | 26 + notebooks/stable-audio/stable-audio.ipynb | 701 ++++++++++++++++++++++ 3 files changed, 731 insertions(+), 1 deletion(-) create mode 100644 notebooks/stable-audio/README.md create mode 100644 notebooks/stable-audio/stable-audio.ipynb diff --git a/.ci/skipped_notebooks.yml b/.ci/skipped_notebooks.yml index 11db89667b9..a7d7be35e46 100644 --- a/.ci/skipped_notebooks.yml +++ b/.ci/skipped_notebooks.yml @@ -531,4 +531,7 @@ - ubuntu-20.04 - ubuntu-22.04 - windows-2019 - +- notebook: notebooks/stable-audio/stable-audio.ipynb + skips: + - python: + - '3.8' diff --git a/notebooks/stable-audio/README.md b/notebooks/stable-audio/README.md new file mode 100644 index 00000000000..b6afd227750 --- /dev/null +++ b/notebooks/stable-audio/README.md @@ -0,0 +1,26 @@ +# Sound Generation with Stable Audio Open 1.0 and OpenVINO™ + +[Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) is an open-source model optimized for generating short audio samples, sound effects, and production elements using text prompts. The model was trained on data from Freesound and the Free Music Archive, respecting creator rights. + + + +#### Key Takeaways: + + - Stable Audio Open is an open source text-to-audio model for generating up to 47 seconds of samples and sound effects. + - Users can create drum beats, instrument riffs, ambient sounds, foley and production elements. + - The model enables audio variations and style transfer of audio samples. + +This model is made to be used with the [stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) library for inference. + +## Notebook contents +This tutorial consists of the following steps: +- Prerequisites +- Load the original model and inference +- Convert the model to OpenVINO IR +- Compiling models and inference +- Interactive inference + +## Installation instructions +This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. +For details, please refer to [Installation Guide](../../README.md). diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb new file mode 100644 index 00000000000..15141716799 --- /dev/null +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -0,0 +1,701 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d5ec3b65-37b0-42cf-a0c3-01682f763e5b", + "metadata": {}, + "source": [ + "# Sound Generation with Stable Audio Open 1.0 and OpenVINO™\n", + "\n", + "[Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) is an open-source model optimized for generating short audio samples, sound effects, and production elements using text prompts. The model was trained on data from Freesound and the Free Music Archive, respecting creator rights.\n", + "\n", + "![stable-audio](https://huggingface.co/stabilityai/stable-audio-open-1.0/resolve/main/stable_audio_light.png)\n", + "\n", + "#### Key Takeaways:\n", + "\n", + " - Stable Audio Open is an open source text-to-audio model for generating up to 47 seconds of samples and sound effects.\n", + "\n", + " - Users can create drum beats, instrument riffs, ambient sounds, foley and production elements.\n", + "\n", + " - The model enables audio variations and style transfer of audio samples.\n", + "\n", + "This model is made to be used with the [stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) library for inference.\n", + "\n", + "#### Table of contents:\n", + "- [Prerequisites](#Prerequisites)\n", + "- [Load the original model and inference](#Load-the-original-model-and-inference)\n", + "- [Convert the model to OpenVINO IR](#Convert-the-model-to-OpenVINO-IR)\n", + "- [Compiling models and inference](#Compiling-models-and-inference)\n", + "- [Interactive inference](#Interactive-inference)" + ] + }, + { + "cell_type": "markdown", + "id": "8ec22045-23e8-4c7e-9f9b-5aa9d4ef4d13", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e76d1f84-9692-4bdb-8f06-b5fd442a70f8", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%pip install -q \"torch>=2.1\" \"torchaudio\" einops \"stable-audio-tools\" \"gradio>=4.19\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install -Uq --pre \"openvino\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" + ] + }, + { + "cell_type": "markdown", + "id": "a4615956-78e7-4ca4-b1e6-07fa53d344d6", + "metadata": {}, + "source": [ + "## Load the original model and inference\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + ">**Note**: run model with notebook, you will need to accept license agreement. \n", + ">You must be a registered user in 🤗 Hugging Face Hub. Please visit [HuggingFace model card](https://huggingface.co/stabilityai/stable-audio-open-1.0), carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. 
For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n", + ">You can login on Hugging Face Hub in notebook environment, using following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "392ecaf4-46c1-4ed9-8dbb-b0b5abfec902", + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment these lines to login to huggingfacehub to get access to pretrained model\n", + "# from huggingface_hub import notebook_login, whoami\n", + "\n", + "# try:\n", + "# whoami()\n", + "# print('Authorization token already provided')\n", + "# except OSError:\n", + "# notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "536daa43-4620-49ba-975c-171d25162253", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torchaudio\n", + "from einops import rearrange\n", + "from stable_audio_tools import get_pretrained_model\n", + "from stable_audio_tools.inference.generation import generate_diffusion_cond\n", + "\n", + "\n", + "# Download model\n", + "model, model_config = get_pretrained_model(\"stabilityai/stable-audio-open-1.0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6d6af561-c1aa-43f8-a6a6-6a70e1cc9500", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/maleksandr/test_notebooks/stable-audio/openvino_notebooks/notebooks/stable-audio/venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b78f46dc3b5d4d06a01e9d2868096012", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100 [00:00=t0 but got ta=0.29999998211860657 and t0=0.3.\n", + " warnings.warn(f\"Should have ta>=t0 but got ta={ta} and t0={self._start}.\")\n" + ] + } + ], + "source": [ + "sample_rate = model_config[\"sample_rate\"]\n", + "\n", + "model = model.to(\"cpu\")\n", + "total_seconds = 20\n", + "\n", + "# Set up text and timing conditioning\n", + "conditioning = [{\"prompt\": \"128 BPM tech house drum loop\", \"seconds_start\": 0, \"seconds_total\": total_seconds}]\n", + "\n", + "# Generate stereo audio\n", + "output = generate_diffusion_cond(\n", + " model,\n", + " steps=100,\n", + " seed=42,\n", + " cfg_scale=7,\n", + " conditioning=conditioning,\n", + " sample_size=sample_rate * total_seconds,\n", + " sigma_min=0.3,\n", + " sigma_max=500,\n", + " sampler_type=\"dpmpp-3m-sde\",\n", + " device=\"cpu\",\n", + ")\n", + "\n", + "# Rearrange audio batch to a single sequence\n", + "output = rearrange(output, \"b d n -> d (b n)\")\n", + "\n", + "# Peak normalize, clip, convert to int16, and save to file\n", + "output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()\n", + "torchaudio.save(\"output.wav\", output, sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f12a8b46-eb52-439b-9411-023738259543", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Audio\n", + "\n", + "Audio(\"output.wav\")" + ] + }, + { + 
"cell_type": "markdown", + "id": "97961f42-3f6a-485a-95b4-763ff2dd11e2", + "metadata": {}, + "source": [ + "## Convert the model to OpenVINO IR\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Let's define the conversion function for PyTorch modules. We use `ov.convert_model` function to obtain OpenVINO Intermediate Representation object and `ov.save_model` function to save it as XML file." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0e12081f-057e-4b18-b54d-b47483ba3bd2", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import torch\n", + "\n", + "import openvino as ov\n", + "\n", + "\n", + "def convert(model: torch.nn.Module, xml_path: str, example_input):\n", + " xml_path = Path(xml_path)\n", + " if not xml_path.exists():\n", + " xml_path.parent.mkdir(parents=True, exist_ok=True)\n", + " model.eval()\n", + " with torch.no_grad():\n", + " converted_model = ov.convert_model(model, example_input=example_input)\n", + " ov.save_model(converted_model, xml_path)\n", + "\n", + " # cleanup memory\n", + " torch._C._jit_clear_class_registry()\n", + " torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()\n", + " torch.jit._state._clear_class_state()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "107a10f1-48b9-4767-afa2-003b88d88d40", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_DIR = Path(\"model\")\n", + "\n", + "CONDITIONER_ENCODER_PATH = MODEL_DIR / \"conditioner_encoder.xml\"\n", + "DIFFUSION_PATH = MODEL_DIR / \"diffusion.xml\"\n", + "PRETRANSFORM_PATH = MODEL_DIR / \"pretransform.xml\"" + ] + }, + { + "cell_type": "markdown", + "id": "876da018-8a16-4212-9c05-b5a731858af7", + "metadata": {}, + "source": [ + "The pipeline comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder. In this example an init audio is not used, so we need to convert T5-based text embedding model, transformer-based diffusion (DiT) model and only decoder part of autoencoder." + ] + }, + { + "cell_type": "markdown", + "id": "34382cd1-21f8-4f83-942c-29b8873d598c", + "metadata": {}, + "source": [ + "T5-based text embedding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "247b3db7-46e0-4d75-92fa-3945113c8eef", + "metadata": {}, + "outputs": [], + "source": [ + "example_input = {\n", + " \"input_ids\": torch.zeros(1, 120, dtype=torch.int64),\n", + " \"attention_mask\": torch.zeros(1, 120, dtype=torch.int64),\n", + "}\n", + "\n", + "convert(model.conditioner.conditioners[\"prompt\"].model, CONDITIONER_ENCODER_PATH, example_input)" + ] + }, + { + "cell_type": "markdown", + "id": "a2447c3b-5ef2-4a06-964f-70ad86566e71", + "metadata": {}, + "source": [ + "Transformer-based diffusion (DiT) model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fac667b8-1845-40ba-9d74-ea0645de22c1", + "metadata": {}, + "outputs": [], + "source": [ + "class DiffusionWrapper(torch.nn.Module):\n", + " def __init__(self, diffusion):\n", + " super().__init__()\n", + " self.diffusion = diffusion\n", + "\n", + " def forward(self, x=None, t=None, cross_attn_cond=None, cross_attn_cond_mask=None, global_embed=None):\n", + " model_inputs = {\"cross_attn_cond\": cross_attn_cond, \"cross_attn_cond_mask\": cross_attn_cond_mask, \"global_embed\": global_embed}\n", + "\n", + " return self.diffusion.forward(x, t, cfg_scale=7, **model_inputs)\n", + "\n", + "\n", + "example_input = {\n", + " \"x\": torch.rand([1, 64, 1024], dtype=torch.float32),\n", + " \"t\": torch.rand([1], dtype=torch.float32),\n", + " \"cross_attn_cond\": torch.rand([1, 130, 768], dtype=torch.float32),\n", + " \"cross_attn_cond_mask\": torch.ones([1, 130], dtype=torch.float32),\n", + " \"global_embed\": torch.rand(torch.Size([1, 1536]), dtype=torch.float32),\n", + "}\n", + "\n", + "\n", + "convert(DiffusionWrapper(model.model.model), DIFFUSION_PATH, example_input)" + ] + }, + { + "cell_type": "markdown", + "id": "350d6149-9cc8-43be-b1fe-d1bde58425a9", + "metadata": {}, + "source": [ + "Decoder part of autoencoder." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0ae74584-5583-4a84-ab16-ce4e85c59d81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['x']\n" + ] + } + ], + "source": [ + "convert(model.pretransform.model.decoder, PRETRANSFORM_PATH, torch.rand([1, 64, 215], dtype=torch.float32))" + ] + }, + { + "cell_type": "markdown", + "id": "c02eec28-0be3-48a4-b27b-4eba351989e4", + "metadata": {}, + "source": [ + "## Compiling models and inference\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Select device from dropdown list for running inference using OpenVINO." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9043a56d-224c-4429-867e-3d3663f4e361", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6c64ca8207d14d71894b9781b03a9fbd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "core = ov.Core()\n", + "device = widgets.Dropdown(\n", + " options=core.available_devices + [\"AUTO\"],\n", + " value=\"AUTO\",\n", + " description=\"Device:\",\n", + " disabled=False,\n", + ")\n", + "\n", + "device" + ] + }, + { + "cell_type": "markdown", + "id": "8ebb00bc-3f50-49e1-8092-3a9ee134fb7e", + "metadata": {}, + "source": [ + "Let's create callable wrapper classes for compiled models to allow interaction with original pipeline. Note that all of wrapper classes return `torch.Tensor`s instead of `np.array`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "20892219-5610-402c-9949-a3f36e4850ac", + "metadata": {}, + "outputs": [], + "source": [ + "class TextEncoderWrapper(torch.nn.Module):\n", + " def __init__(self, text_encoder, dtype, device=\"CPU\"):\n", + " super().__init__()\n", + " self.text_encoder = core.compile_model(text_encoder, device)\n", + " self.dtype = dtype\n", + "\n", + " def __call__(self, input_ids=None, attention_mask=None):\n", + " inputs = {\n", + " \"input_ids\": input_ids,\n", + " \"attention_mask\": attention_mask,\n", + " }\n", + " last_hidden_state = self.text_encoder(inputs)[0]\n", + "\n", + " return {\"last_hidden_state\": torch.from_numpy(last_hidden_state)}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f25a2659-f134-4578-b42b-49ca37ed4c59", + "metadata": {}, + "outputs": [], + "source": [ + "class OVWrapper(torch.nn.Module):\n", + " def __init__(self, ov_model, old_model, device=\"CPU\") -> None:\n", + " super().__init__()\n", + " self.mock = torch.nn.Parameter(torch.zeros(1)) # this is only mock to not change the pipeline\n", + " self.dif_transformer = core.compile_model(ov_model, device)\n", + "\n", + " def forward(self, x=None, t=None, cross_attn_cond=None, cross_attn_cond_mask=None, global_embed=None, **kwargs):\n", + " inputs = {\n", + " \"x\": x,\n", + " \"t\": t,\n", + " \"cross_attn_cond\": cross_attn_cond,\n", + " \"cross_attn_cond_mask\": cross_attn_cond_mask,\n", + " \"global_embed\": global_embed,\n", + " }\n", + " result = self.dif_transformer(inputs)\n", + "\n", + " return torch.from_numpy(result[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "dd380ae0-7ed5-43f4-b2c2-e03c2ef7494c", + "metadata": {}, + "outputs": [], + "source": [ + "class PretransformDecoderWrapper(torch.nn.Module):\n", + " def __init__(self, ov_model, device=\"CPU\"):\n", + " super().__init__()\n", + " self.decoder = core.compile_model(ov_model, device)\n", + "\n", + " def forward(self, latents=None):\n", + "\n", + " result = self.decoder(latents)\n", + "\n", + " return torch.from_numpy(result[0])" + ] + }, + { + "cell_type": "markdown", + "id": "b99ea15b-be33-4df6-9d8c-54f51fab0cf6", + "metadata": {}, + "source": [ + "Now we can replace the original models by our wrapped OpenVINO models and run inference. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "767ade24-48d8-4ae2-897e-9ece47a0981f", + "metadata": {}, + "outputs": [], + "source": [ + "model.model.model = OVWrapper(DIFFUSION_PATH, model.model.model, device.value)\n", + "model.conditioner.conditioners[\"prompt\"].model = TextEncoderWrapper(\n", + " CONDITIONER_ENCODER_PATH, model.conditioner.conditioners[\"prompt\"].model.dtype, device.value\n", + ")\n", + "model.pretransform.model.decoder = PretransformDecoderWrapper(PRETRANSFORM_PATH, device.value)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "50a0f42c-8c81-4bca-8a9b-4bb637b8e78d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2944cc9d5abe4326828625f493b01bae", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100 [00:00 d (b n)\")\n", + "\n", + "# Peak normalize, clip, convert to int16, and save to file\n", + "output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()\n", + "torchaudio.save(\"output.wav\", output, sample_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f6f14886-3184-4002-94d9-5d5b30dbb7fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Audio(\"output.wav\")" + ] + }, + { + "cell_type": "markdown", + "id": "09ca8a1a-ab99-4c8e-b0fa-bfd181bde3a1", + "metadata": {}, + "source": [ + "## Interactive inference\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b3ba7e98-5703-42c3-a7b7-acf696bb0a3d", + "metadata": {}, + "outputs": [], + "source": [ + "def _generate(prompt, total_seconds, steps, seed):\n", + " sample_rate = model_config[\"sample_rate\"]\n", + "\n", + " # Set up text and timing conditioning\n", + " conditioning = [{\"prompt\": prompt, \"seconds_start\": 0, \"seconds_total\": total_seconds}]\n", + "\n", + " output = generate_diffusion_cond(\n", + " model,\n", + " steps=steps,\n", + " seed=seed,\n", + " cfg_scale=7,\n", + " conditioning=conditioning,\n", + " sample_size=sample_rate * total_seconds,\n", + " sigma_min=0.3,\n", + " sigma_max=500,\n", + " sampler_type=\"dpmpp-3m-sde\",\n", + " device=\"cpu\",\n", + " )\n", + "\n", + " # Rearrange audio batch to a single sequence\n", + " output = rearrange(output, \"b d n -> d (b n)\")\n", + "\n", + " # Peak normalize, clip, convert to int16, and save to file\n", + " output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()\n", + " return (sample_rate, output.numpy().transpose())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eeff99cd-d90d-4439-8c82-351a893d1fd0", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "import numpy as np\n", + "\n", + "\n", + "demo = gr.Interface(\n", + " _generate,\n", + " inputs=[\n", + " gr.Textbox(label=\"Text Prompt\"),\n", + " gr.Slider(1, 47, label=\"Total seconds\", step=1, value=10),\n", + " gr.Slider(10, 100, label=\"Number of steps\", step=1, value=100),\n", + " gr.Slider(0, np.iinfo(np.int32).max, label=\"Seed\", step=1),\n", + " ],\n", + " outputs=[\"audio\"],\n", + " examples=[\n", + " [\"128 BPM tech house drum 
loop\"],\n", + " [\"Blackbird song, summer, dusk in the forest\"],\n", + " [\"Rock beat played in a treated studio, session drumming on an acoustic kit\"],\n", + " [\"Calmful melody and nature sounds for restful sleep\"],\n", + " ],\n", + " allow_flagging=\"never\",\n", + ")\n", + "try:\n", + " demo.launch(debug=True)\n", + "except Exception:\n", + " demo.launch(share=True, debug=True)\n", + "\n", + "# If you are launching remotely, specify server_name and server_port\n", + "# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`\n", + "# To learn more please refer to the Gradio docs: https://gradio.app/docs/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "openvino_notebooks": { + "imageUrl": "https://huggingface.co/stabilityai/stable-audio-open-1.0/resolve/main/stable_audio_light.png?raw=true", + "tags": { + "categories": [ + "Model Demos", + "AI Trends" + ], + "libraries": [], + "other": [], + "tasks": [ + "Text-to-Audio" + ] + } + }, + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 76cccb0ad444e6b743ae4a14c8ff3e86f52b4076 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 11 Jul 2024 01:40:01 +0200 Subject: [PATCH 2/8] Stable audio model example --- notebooks/stable-audio/stable-audio.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index 15141716799..4a29ae4d23f 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -694,7 +694,7 @@ "Text-to-Audio" ] } - }, + } }, "nbformat": 4, "nbformat_minor": 5 From 76804a42765fbfe344ea7e97cf8b48e670b0ebfb Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 11 Jul 2024 01:50:14 +0200 Subject: [PATCH 3/8] spellcheck --- .ci/spellcheck/.pyspelling.wordlist.txt | 2 ++ notebooks/stable-audio/stable-audio.ipynb | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index f7d6ee6eb1d..d91be4fdd39 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -244,11 +244,13 @@ finetuned finetuning FLAC floyd +foley Formatter formatter fp FP FPN +Freesound FreeVC freevc frisbee diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index 4a29ae4d23f..baab8528740 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -267,7 +267,7 @@ "id": "876da018-8a16-4212-9c05-b5a731858af7", "metadata": {}, "source": [ - "The pipeline comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder. In this example an init audio is not used, so we need to convert T5-based text embedding model, transformer-based diffusion (DiT) model and only decoder part of autoencoder." 
+ "The pipeline comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder. In this example an initial audio is not used, so we need to convert T5-based text embedding model, transformer-based diffusion (DiT) model and only decoder part of autoencoder." ] }, { @@ -632,7 +632,6 @@ "outputs": [], "source": [ "import gradio as gr\n", - "import numpy as np\n", "\n", "\n", "demo = gr.Interface(\n", From bc15c509db3b1503ab5d07ca63c0f9f748481b2b Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 11 Jul 2024 13:17:46 +0200 Subject: [PATCH 4/8] Links --- .ci/skipped_notebooks.yml | 5 +++++ notebooks/stable-audio/README.md | 2 +- notebooks/stable-audio/stable-audio.ipynb | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.ci/skipped_notebooks.yml b/.ci/skipped_notebooks.yml index a7d7be35e46..2b566e648dd 100644 --- a/.ci/skipped_notebooks.yml +++ b/.ci/skipped_notebooks.yml @@ -535,3 +535,8 @@ skips: - python: - '3.8' + - os: + - macos-12 + - ubuntu-20.04 + - ubuntu-22.04 + - windows-2019 diff --git a/notebooks/stable-audio/README.md b/notebooks/stable-audio/README.md index b6afd227750..1d7c2d45420 100644 --- a/notebooks/stable-audio/README.md +++ b/notebooks/stable-audio/README.md @@ -2,7 +2,7 @@ [Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) is an open-source model optimized for generating short audio samples, sound effects, and production elements using text prompts. The model was trained on data from Freesound and the Free Music Archive, respecting creator rights. - + #### Key Takeaways: diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index baab8528740..f80a01f19df 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -681,7 +681,7 @@ "version": "3.10.12" }, "openvino_notebooks": { - "imageUrl": "https://huggingface.co/stabilityai/stable-audio-open-1.0/resolve/main/stable_audio_light.png?raw=true", + "imageUrl": "https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/ed4aa0f2-0501-4519-8b24-c1c3072b4ef2", "tags": { "categories": [ "Model Demos", From 52e8e400fa7d07330ce0c51c3cad80bf4a8c71a4 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Tue, 16 Jul 2024 01:32:33 +0200 Subject: [PATCH 5/8] Weight Compression, extend headers --- notebooks/stable-audio/README.md | 2 +- notebooks/stable-audio/stable-audio.ipynb | 119 +++++++--------------- 2 files changed, 36 insertions(+), 85 deletions(-) diff --git a/notebooks/stable-audio/README.md b/notebooks/stable-audio/README.md index 1d7c2d45420..1b4f729e1fd 100644 --- a/notebooks/stable-audio/README.md +++ b/notebooks/stable-audio/README.md @@ -1,4 +1,4 @@ -# Sound Generation with Stable Audio Open 1.0 and OpenVINO™ +# Sound Generation with Stable Audio Open and OpenVINO™ [Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) is an open-source model optimized for generating short audio samples, sound effects, and production elements using text prompts. The model was trained on data from Freesound and the Free Music Archive, respecting creator rights. 
diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index f80a01f19df..57759b35263 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -5,11 +5,11 @@ "id": "d5ec3b65-37b0-42cf-a0c3-01682f763e5b", "metadata": {}, "source": [ - "# Sound Generation with Stable Audio Open 1.0 and OpenVINO™\n", + "# Sound Generation with Stable Audio Open and OpenVINO™\n", "\n", "[Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) is an open-source model optimized for generating short audio samples, sound effects, and production elements using text prompts. The model was trained on data from Freesound and the Free Music Archive, respecting creator rights.\n", "\n", - "![stable-audio](https://huggingface.co/stabilityai/stable-audio-open-1.0/resolve/main/stable_audio_light.png)\n", + "![stable-audio](https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/ed4aa0f2-0501-4519-8b24-c1c3072b4ef2)\n", "\n", "#### Key Takeaways:\n", "\n", @@ -25,6 +25,9 @@ "- [Prerequisites](#Prerequisites)\n", "- [Load the original model and inference](#Load-the-original-model-and-inference)\n", "- [Convert the model to OpenVINO IR](#Convert-the-model-to-OpenVINO-IR)\n", + " - [T5-based text embedding](#T5-based-text-embedding)\n", + " - [Transformer-based diffusion (DiT) model](#Transformer-based-diffusion-(DiT)-model)\n", + " - [Decoder part of autoencoder](#Decoder-part-of-autoencoder)\n", "- [Compiling models and inference](#Compiling-models-and-inference)\n", "- [Interactive inference](#Interactive-inference)" ] @@ -47,7 +50,7 @@ }, "outputs": [], "source": [ - "%pip install -q \"torch>=2.1\" \"torchaudio\" einops \"stable-audio-tools\" \"gradio>=4.19\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install -q \"torch>=2.1\" \"torchaudio\" einops \"stable-audio-tools\" \"gradio>=4.19\" \"nncf>=2.11.0\" --extra-index-url https://download.pytorch.org/whl/cpu\n", "%pip install -Uq --pre \"openvino\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" ] }, @@ -101,50 +104,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "6d6af561-c1aa-43f8-a6a6-6a70e1cc9500", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "42\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/maleksandr/test_notebooks/stable-audio/openvino_notebooks/notebooks/stable-audio/venv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b78f46dc3b5d4d06a01e9d2868096012", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/100 [00:00=t0 but got ta=0.29999998211860657 and t0=0.3.\n", - " warnings.warn(f\"Should have ta>=t0 but got ta={ta} and t0={self._start}.\")\n" - ] - } - ], + "outputs": [], "source": [ "sample_rate = model_config[\"sample_rate\"]\n", "\n", @@ -215,7 +178,20 @@ "## Convert the model to OpenVINO IR\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "Let's define the conversion function for PyTorch modules. We use `ov.convert_model` function to obtain OpenVINO Intermediate Representation object and `ov.save_model` function to save it as XML file." + "Let's define the conversion function for PyTorch modules. 
We use `ov.convert_model` function to obtain OpenVINO Intermediate Representation object and `ov.save_model` function to save it as XML file.\n", + "\n", + "For reducing memory consumption, weights compression optimization can be applied using [NNCF](https://github.com/openvinotoolkit/nncf). Weight compression aims to reduce the memory footprint of a model.\n", + "models, which require extensive memory to store the weights during inference, can benefit from weight compression in the following ways:\n", + "\n", + "* enabling the inference of exceptionally large models that cannot be accommodated in the memory of the device;\n", + "\n", + "* improving the inference performance of the models by reducing the latency of the memory access when computing the operations with weights, for example, Linear layers.\n", + "\n", + "[Neural Network Compression Framework (NNCF)](https://github.com/openvinotoolkit/nncf) provides 4-bit / 8-bit mixed weight quantization as a compression method. The main difference between weights compression and full model quantization (post-training quantization) is that activations remain floating-point in the case of weights compression which leads to a better accuracy. In addition, weight compression is data-free and does not require a calibration dataset, making it easy to use.\n", + "\n", + "`nncf.compress_weights` function can be used for performing weights compression. The function accepts an OpenVINO model and other compression parameters. Different parameters may be suitable for different models. In this case default parmeters give bad results. But we can change mode to `CompressWeightsMode.INT8_SYM` to [compress weights symmetrically to 8-bit integer data type](https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/post_training_compression/weights_compression/Usage.md#user-guide) and get the inference results the same as original. \n", + "\n", + "More details about weights compression can be found in [OpenVINO documentation](https://docs.openvino.ai/2023.3/weight_compression.html)." ] }, { @@ -230,6 +206,7 @@ "import numpy as np\n", "import torch\n", "\n", + "from nncf import compress_weights, CompressWeightsMode\n", "import openvino as ov\n", "\n", "\n", @@ -240,6 +217,7 @@ " model.eval()\n", " with torch.no_grad():\n", " converted_model = ov.convert_model(model, example_input=example_input)\n", + " converted_model = compress_weights(converted_model, mode=CompressWeightsMode.INT8_SYM)\n", " ov.save_model(converted_model, xml_path)\n", "\n", " # cleanup memory\n", @@ -275,7 +253,8 @@ "id": "34382cd1-21f8-4f83-942c-29b8873d598c", "metadata": {}, "source": [ - "T5-based text embedding." + "### T5-based text embedding\n", + "[back to top ⬆️](#Table-of-contents:)" ] }, { @@ -298,7 +277,8 @@ "id": "a2447c3b-5ef2-4a06-964f-70ad86566e71", "metadata": {}, "source": [ - "Transformer-based diffusion (DiT) model." + "### Transformer-based diffusion (DiT) model\n", + "[back to top ⬆️](#Table-of-contents:)" ] }, { @@ -336,23 +316,16 @@ "id": "350d6149-9cc8-43be-b1fe-d1bde58425a9", "metadata": {}, "source": [ - "Decoder part of autoencoder." 
+ "### Decoder part of autoencoder\n", + "[back to top ⬆️](#Table-of-contents:)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "0ae74584-5583-4a84-ab16-ce4e85c59d81", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['x']\n" - ] - } - ], + "outputs": [], "source": [ "convert(model.pretransform.model.decoder, PRETRANSFORM_PATH, torch.rand([1, 64, 215], dtype=torch.float32))" ] @@ -504,32 +477,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "50a0f42c-8c81-4bca-8a9b-4bb637b8e78d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "42\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2944cc9d5abe4326828625f493b01bae", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/100 [00:00 Date: Tue, 16 Jul 2024 02:23:09 +0200 Subject: [PATCH 6/8] Fix link --- notebooks/stable-audio/stable-audio.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index 57759b35263..06f654cb30e 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -189,9 +189,9 @@ "\n", "[Neural Network Compression Framework (NNCF)](https://github.com/openvinotoolkit/nncf) provides 4-bit / 8-bit mixed weight quantization as a compression method. The main difference between weights compression and full model quantization (post-training quantization) is that activations remain floating-point in the case of weights compression which leads to a better accuracy. In addition, weight compression is data-free and does not require a calibration dataset, making it easy to use.\n", "\n", - "`nncf.compress_weights` function can be used for performing weights compression. The function accepts an OpenVINO model and other compression parameters. Different parameters may be suitable for different models. In this case default parmeters give bad results. But we can change mode to `CompressWeightsMode.INT8_SYM` to [compress weights symmetrically to 8-bit integer data type](https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/post_training_compression/weights_compression/Usage.md#user-guide) and get the inference results the same as original. \n", + "`nncf.compress_weights` function can be used for performing weights compression. The function accepts an OpenVINO model and other compression parameters. Different parameters may be suitable for different models. In this case default parameters give bad results. But we can change mode to `CompressWeightsMode.INT8_SYM` to [compress weights symmetrically to 8-bit integer data type](https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/post_training_compression/weights_compression/Usage.md#user-guide) and get the inference results the same as original. \n", "\n", - "More details about weights compression can be found in [OpenVINO documentation](https://docs.openvino.ai/2023.3/weight_compression.html)." + "More details about weights compression can be found in [OpenVINO documentation](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html)." 
] }, { From 5cfca4f1a56518a5b5c8c6e87131aae7356935f2 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Wed, 17 Jul 2024 13:28:10 +0200 Subject: [PATCH 7/8] Change dependencies --- notebooks/stable-audio/stable-audio.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index 06f654cb30e..bf3d24407aa 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -38,7 +38,8 @@ "metadata": {}, "source": [ "## Prerequisites\n", - "[back to top ⬆️](#Table-of-contents:)" + "[back to top ⬆️](#Table-of-contents:)\n", + ">**Note**: using python3.8 can take a long time to resolve dependency conflicts." ] }, { @@ -50,7 +51,7 @@ }, "outputs": [], "source": [ - "%pip install -q \"torch>=2.1\" \"torchaudio\" einops \"stable-audio-tools\" \"gradio>=4.19\" \"nncf>=2.11.0\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install \"stable-audio-tools\" \"nncf>=2.11.0\" --extra-index-url https://download.pytorch.org/whl/cpu\n", "%pip install -Uq --pre \"openvino\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" ] }, From 1095c65fc7cf0efe97db5fdb8049782a9d243602 Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Thu, 18 Jul 2024 14:22:35 +0200 Subject: [PATCH 8/8] CPU as default device --- notebooks/stable-audio/stable-audio.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/stable-audio/stable-audio.ipynb b/notebooks/stable-audio/stable-audio.ipynb index bf3d24407aa..bd8e05afe63 100644 --- a/notebooks/stable-audio/stable-audio.ipynb +++ b/notebooks/stable-audio/stable-audio.ipynb @@ -370,7 +370,7 @@ "core = ov.Core()\n", "device = widgets.Dropdown(\n", " options=core.available_devices + [\"AUTO\"],\n", - " value=\"AUTO\",\n", + " value=\"CPU\",\n", " description=\"Device:\",\n", " disabled=False,\n", ")\n",