Stability-AI
diff --git a/‎LICENSES/LICENSE_NVIDIA.txt‎
Lines changed: 21 additions & 0 deletions b/‎LICENSES/LICENSE_NVIDIA.txt‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎scripts/ds_zero_to_pl_ckpt.py‎
Lines changed: 14 additions & 0 deletions b/‎scripts/ds_zero_to_pl_ckpt.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 9 additions & 7 deletions b/‎setup.py‎
Lines changed: 9 additions & 7 deletions
diff --git a/‎stable_audio_tools/data/dataset.py‎
Lines changed: 12 additions & 3 deletions b/‎stable_audio_tools/data/dataset.py‎
Lines changed: 12 additions & 3 deletions
diff --git a/‎stable_audio_tools/data/utils.py‎
Lines changed: 14 additions & 1 deletion b/‎stable_audio_tools/data/utils.py‎
Lines changed: 14 additions & 1 deletion
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 NVIDIA CORPORATION.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,14 @@
+import argparse
+from lightning.pytorch.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict
+
+if __name__ == "__main__":
+    # Convert a DeepSpeed ZeRO checkpoint into a single fp32 state dict file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save_path", type=str, required=True, help="Path to the zero checkpoint")
+    parser.add_argument("--output_path", type=str, help="Path to the output checkpoint", default="lightning_model.pt")
+    args = parser.parse_args()
+
+    # lightning deepspeed has saved a directory instead of a file
+    save_path = args.save_path
+    output_path = args.output_path
+    convert_zero_checkpoint_to_fp32_state_dict(save_path, output_path)
@@ -2,7 +2,7 @@
 
 setup(
     name='stable-audio-tools',
-    version='0.0.7',
+    version='0.0.8',
     url='https://github.com/Stability-AI/stable-audio-tools.git',
     author='Stability AI',
     description='Training and inference tools for generative audio models from Stability AI',
@@ -13,31 +13,33 @@
         'alias-free-torch==0.0.6',
         'auraloss==0.4.0',
         'descript-audio-codec==1.0.0',
-        'einops==0.6.1',
+        'einops==0.7.0',
         'einops-exts==0.0.4',
         'ema-pytorch==0.2.3',
         'encodec==0.1.1',
         'gradio==3.42.0',
         'importlib-resources==5.12.0',
-        'k-diffusion==0.0.15',
+        'k-diffusion==0.1.1',
         'laion-clap==1.1.4',
         'local-attention==1.8.6',
+        'nwt-pytorch==0.0.4',
         'pandas==2.0.2',
         'pedalboard==0.7.4',
         'prefigure==0.0.9',
-        'pytorch_lightning==2.0.9', 
+        'pytorch_lightning==2.1.0', 
         'PyWavelets==1.4.1',
+        'safetensors',
         'sentencepiece==0.1.99',
-        's3fs==2023.6.0',
+        's3fs',
         'torch>=2.0.1',
         'torchaudio>=2.0.2',
         'torchmetrics==0.11.4',
         'tqdm',
         'transformers==4.33.3',
         'v-diffusion-pytorch==0.0.2',
-        'vector-quantize-pytorch==1.6.21',
+        'vector-quantize-pytorch==1.9.14',
         'wandb==0.15.4',
         'webdataset==0.2.48',
-        'x-transformers==1.16.16'
+        'x-transformers>=1.25.15'
     ],
 )
@@ -168,7 +168,7 @@ def __getitem__(self, idx):
             start_time = time.time()
             audio = self.load_file(audio_filename)
 
-            audio, t_start, t_end, seconds_start, seconds_total = self.pad_crop(audio)
+            audio, t_start, t_end, seconds_start, seconds_total, padding_mask = self.pad_crop(audio)
 
             # Run augmentations on this sample (including random crop)
             if self.augs is not None:
@@ -190,6 +190,7 @@ def __getitem__(self, idx):
             info["timestamps"] = (t_start, t_end)
             info["seconds_start"] = seconds_start
             info["seconds_total"] = seconds_total
+            info["padding_mask"] = padding_mask
 
             end_time = time.time()
 
@@ -199,6 +200,9 @@ def __getitem__(self, idx):
                 custom_metadata = self.custom_metadata_fn(info, audio)
                 info.update(custom_metadata)
 
+                if "__reject__" in info and info["__reject__"]:
+                    return self[random.randrange(len(self))]
+
             return (audio, info)
         except Exception as e:
             print(f'Couldn\'t load file {audio_filename}: {e}')
@@ -339,8 +343,12 @@ def log_and_continue(exn):
 
 
 def is_valid_sample(sample):
-    return "json" in sample and "audio" in sample and not is_silence(sample["audio"])
+    # Check key presence first: indexing a missing "json"/"audio" key would raise KeyError.
+    if "json" not in sample or "audio" not in sample:
+        return False
+    is_rejected = "__reject__" in sample["json"] and sample["json"]["__reject__"]
 
+    return not is_rejected and not is_silence(sample["audio"])
 
 class S3DatasetConfig:
     def __init__(
@@ -446,10 +454,11 @@ def wds_preprocess(self, sample):
             # Pad/crop and get the relative timestamp
             pad_crop = PadCrop_Normalized_T(
                 self.sample_size, randomize=self.random_crop, sample_rate=self.sample_rate)
-            audio, t_start, t_end, seconds_start, seconds_total = pad_crop(
+            audio, t_start, t_end, seconds_start, seconds_total, padding_mask = pad_crop(
                 audio)
             sample["json"]["seconds_start"] = seconds_start
             sample["json"]["seconds_total"] = seconds_total
+            sample["json"]["padding_mask"] = padding_mask
         else:
             t_start, t_end = 0, 1
 
 
@@ -33,27 +33,40 @@ def __call__(self, source: torch.Tensor) -> Tuple[torch.Tensor, float, float, in
 
         n_channels, n_samples = source.shape
 
+        # Largest valid crop offset; 0 when the audio is no longer than the desired length
         upper_bound = max(0, n_samples - self.n_samples)
 
+        # If randomize is False, always start at the beginning of the audio
         offset = 0
         if(self.randomize and n_samples > self.n_samples):
             offset = random.randint(0, upper_bound)
 
+        # Calculate the start and end times of the chunk
         t_start = offset / (upper_bound + self.n_samples)
         t_end = (offset + self.n_samples) / (upper_bound + self.n_samples)
 
+        # Create the chunk
         chunk = source.new_zeros([n_channels, self.n_samples])
+
+        # Copy the audio into the chunk
         chunk[:, :min(n_samples, self.n_samples)] = source[:, offset:offset + self.n_samples]
 
+        # Calculate the chunk's start offset and the source's total length, in whole seconds
         seconds_start = math.floor(offset / self.sample_rate)
         seconds_total = math.ceil(n_samples / self.sample_rate)
+
+        # Create a mask the same length as the chunk with 1s where the audio is and 0s where it isn't
+        padding_mask = torch.zeros([self.n_samples])
+        padding_mask[:min(n_samples, self.n_samples)] = 1
+        
 
         return (
             chunk,
             t_start,
             t_end,
             seconds_start,
-            seconds_total
+            seconds_total,
+            padding_mask
         )
 
 class PhaseFlipper(nn.Module):