-
Notifications
You must be signed in to change notification settings - Fork 445
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feature: stable diffusion video (SVD)
- Loading branch information
1 parent
80ff006
commit e8fe8d7
Showing
55 changed files
with
9,453 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import logging | ||
|
||
import click | ||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
@click.command()
@click.option(
    "--start-image",
    default="other/images/sound-music.jpg",
    help="Input path for image file.",
)
@click.option("--num-frames", default=None, type=int, help="Number of frames.")
@click.option("--num-steps", default=None, type=int, help="Number of steps.")
@click.option(
    "--model",
    default="svd",
    help="Model to use. One of: svd, svd_xt, svd_image_decoder, svd_xt_image_decoder",
)
@click.option(
    "--fps", default=6, type=int, help="FPS for the AI to target when generating video"
)
@click.option("--output-fps", default=None, type=int, help="FPS for the output video")
@click.option(
    "--motion-amount",
    default=127,
    type=int,
    help="How much motion to generate. value between 0 and 255.",
)
@click.option(
    "-r",
    "--repeats",
    default=1,
    show_default=True,
    type=int,
    help="How many times to repeat the renders. ",
)
@click.option("--cond-aug", default=0.02, type=float, help="Conditional augmentation.")
@click.option(
    "--seed", default=None, type=int, help="Seed for random number generator."
)
# The dashed spelling matches the other options; the original underscore
# spelling is kept as an alias so existing invocations keep working. The bare
# third name pins the Python parameter name explicitly.
@click.option(
    "--decoding-t",
    "--decoding_t",
    "decoding_t",
    default=1,
    type=int,
    help="Number of frames decoded at a time.",
)
@click.option("--device", default=None, help="Device to use.")
@click.option(
    "--output-folder",
    "--output_folder",
    "output_folder",
    default=None,
    help="Output folder.",
)
def videogen_cmd(
    start_image,
    num_frames,
    num_steps,
    model,
    fps,
    output_fps,
    motion_amount,
    repeats,
    cond_aug,
    seed,
    decoding_t,
    device,
    output_folder,
):
    """
    AI generate a video from an image

    Example:

        aimg videogen --start-image assets/rocket-wide.png

    """
    # NOTE: click renders this docstring as the command's --help text, so
    # per-parameter documentation lives in the option `help=` strings above.

    # Imported inside the command rather than at module top (presumably to keep
    # CLI startup light -- confirm before hoisting these).
    from imaginairy.log_utils import configure_logging
    from imaginairy.video_sample import generate_video

    configure_logging()

    # Fall back to the generation FPS when no output FPS was supplied.
    output_fps = output_fps or fps

    # The loop index is not needed; every repeat uses the same arguments.
    for _ in range(repeats):
        # Lazy %-style args: the message is only formatted if it is emitted.
        logger.info("Generating video from image %s", start_image)
        generate_video(
            input_path=start_image,
            num_frames=num_frames,
            num_steps=num_steps,
            model_name=model,
            fps_id=fps,
            output_fps=output_fps,
            motion_bucket_id=motion_amount,
            cond_aug=cond_aug,
            seed=seed,
            decoding_t=decoding_t,
            device=device,
            output_folder=output_folder,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
# Model configuration. Each `target` is a dotted path and `params` holds the
# values paired with it (presumably constructor arguments for the target
# class -- confirm against the config loader).
# NOTE(review): nesting below is reconstructed from key order; verify it
# against the committed file before relying on it.
model:
  target: imaginairy.modules.sgm.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: False

    # Denoiser and its scaling scheme.
    denoiser_config:
      target: imaginairy.modules.sgm.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: imaginairy.modules.sgm.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # Video UNet network.
    network_config:
      target: imaginairy.modules.sgm.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: False
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        # NOTE(review): the "-xformers" attention types here and below
        # presumably require the xformers package -- confirm.
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    # Conditioning embedders, one list entry per input key.
    conditioner_config:
      target: imaginairy.modules.sgm.encoders.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: cond_frames_without_noise
            target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
            params:
              n_cond_frames: 1
              n_copies: 1
              open_clip_embedding_config:
                target: imaginairy.modules.sgm.encoders.modules.FrozenOpenCLIPImageEmbedder
                params:
                  freeze: True

          - input_key: fps_id
            is_trainable: False
            target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - input_key: motion_bucket_id
            is_trainable: False
            target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - input_key: cond_frames
            is_trainable: False
            target: imaginairy.modules.sgm.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              disable_encoder_autocast: False
              n_cond_frames: 1
              n_copies: 1
              is_ae: True
              encoder_config:
                target: imaginairy.modules.sgm.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: True
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

          - input_key: cond_aug
            is_trainable: False
            target: imaginairy.modules.sgm.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    # First-stage autoencoder: image encoder paired with a video decoder.
    first_stage_config:
      target: imaginairy.modules.sgm.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: imaginairy.modules.sgm.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: imaginairy.modules.sgm.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: imaginairy.modules.sgm.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    # Sampler with its discretization and guider.
    # NOTE(review): guider_config is placed under the sampler's params here;
    # the scraped source cannot confirm that level -- verify.
    sampler_config:
      target: imaginairy.modules.sgm.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: imaginairy.modules.sgm.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: imaginairy.modules.sgm.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 2.5
            min_scale: 1.0
Oops, something went wrong.