Fix bugs for xformers disable, a better way to fix the "RuntimeError" for "lms" and "euler_a", and fix FP16 NaN values #90

Open · wants to merge 3 commits into base: main
10 changes: 5 additions & 5 deletions requirements.txt
@@ -1,7 +1,7 @@
 bitsandbytes==0.41.1
 dadaptation==3.1
-diffusers==0.20.2
-ipython==8.7.0
+diffusers<=0.27.2
+ipython<=8.18.1
 lion_pytorch==0.1.2
 lpips==0.1.4
 matplotlib==3.6.2
@@ -15,10 +15,10 @@ pydantic==2.6.3
 PyYAML==6.0.1
 Requests==2.31.0
 safetensors==0.3.1
-torch==2.0.1
-torchvision==0.15.2
+torch
+torchvision
 tqdm==4.64.1
 transformers==4.27.4
 wandb==0.12.21
 xformers==0.0.21
-accelerate==0.16.0
+accelerate<=0.29.2
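
The pins on diffusers, ipython, and accelerate become upper bounds, and torch/torchvision are left unpinned so an install can match the local CUDA build (that motivation is inferred; the PR does not state it). A quick post-install check, as a sketch assuming the `packaging` package is available:

import accelerate
import diffusers
from packaging.version import Version

# Confirm the relaxed upper bounds from requirements.txt after installation.
assert Version(diffusers.__version__) <= Version("0.27.2")
assert Version(accelerate.__version__) <= Version("0.29.2")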
23 changes: 23 additions & 0 deletions requirements_nonxformers.txt
@@ -0,0 +1,23 @@
+bitsandbytes==0.41.1
+dadaptation==3.1
+diffusers<=0.27.2
+ipython<=8.18.1
+lion_pytorch==0.1.2
+lpips==0.1.4
+matplotlib==3.6.2
+numpy==1.23.5
+opencv_python==4.5.5.64
+opencv_python_headless==4.7.0.68
+pandas==1.5.2
+Pillow==10.1.0
+prodigyopt==1.0
+pydantic==2.6.3
+PyYAML==6.0.1
+Requests==2.31.0
+safetensors==0.3.1
+torch
+torchvision
+tqdm==4.64.1
+transformers==4.27.4
+wandb==0.12.21
+accelerate<=0.29.2
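
requirements_nonxformers.txt mirrors requirements.txt but drops the xformers pin entirely, for environments where xformers cannot be built or is deliberately disabled. A training script that supports both installs needs to guard the attention switch; a minimal sketch, assuming `unet` is an already-loaded UNet2DConditionModel (the helper name is hypothetical, not from this PR):

def maybe_enable_xformers(unet) -> bool:
    # With requirements_nonxformers.txt installed, the import fails and we
    # fall back to PyTorch's built-in attention instead of crashing.
    try:
        import xformers  # noqa: F401
    except ImportError:
        return False
    unet.enable_xformers_memory_efficient_attention()
    return True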
28 changes: 21 additions & 7 deletions trainscripts/imagesliders/model_util.py
@@ -32,6 +32,7 @@ def load_diffusers_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
     # VAE is not needed

@@ -49,6 +50,7 @@
             num_hidden_layers=24 - (clip_skip - 1) if clip_skip is not None else 23,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )
     else:
         tokenizer = CLIPTokenizer.from_pretrained(
@@ -63,16 +65,18 @@
             num_hidden_layers=12 - (clip_skip - 1) if clip_skip is not None else 12,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )

     unet = UNet2DConditionModel.from_pretrained(
         pretrained_model_name_or_path,
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

-    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
+    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", variant=variant)

     return tokenizer, text_encoder, unet, vae

@@ -82,12 +86,14 @@ def load_checkpoint_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
-    pipe = StableDiffusionPipeline.from_ckpt(
+    pipe = StableDiffusionPipeline.from_pretrained(
         checkpoint_path,
         upcast_attention=True if v2 else False,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -111,16 +117,17 @@ def load_models(
     v2: bool = False,
     v_pred: bool = False,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel, SchedulerMixin,]:
     if pretrained_model_name_or_path.endswith(
         ".ckpt"
     ) or pretrained_model_name_or_path.endswith(".safetensors"):
         tokenizer, text_encoder, unet, vae = load_checkpoint_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )
     else:  # diffusers
         tokenizer, text_encoder, unet, vae = load_diffusers_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )

     # VAE is not needed
@@ -136,6 +143,7 @@
 def load_diffusers_model_xl(
     pretrained_model_name_or_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     # returns tokenizer, tokenizer_2, text_encoder, text_encoder_2, unet

@@ -161,12 +169,14 @@ def load_diffusers_model_xl(
             subfolder="text_encoder",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
         CLIPTextModelWithProjection.from_pretrained(
             pretrained_model_name_or_path,
             subfolder="text_encoder_2",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
     ]

@@ -175,19 +185,22 @@ def load_diffusers_model_xl(
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )
-    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
+    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", variant=variant)
     return tokenizers, text_encoders, unet, vae


 def load_checkpoint_model_xl(
     checkpoint_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -205,6 +218,7 @@ def load_models_xl(
     pretrained_model_name_or_path: str,
     scheduler_name: AVAILABLE_SCHEDULERS,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[
     list[CLIPTokenizer],
     list[SDXL_TEXT_ENCODER_TYPE],
@@ -219,14 +233,14 @@
             text_encoders,
             unet,
             vae
-        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype, variant)
     else:  # diffusers
         (
             tokenizers,
             text_encoders,
             unet,
             vae
-        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype, variant)

     scheduler = create_noise_scheduler(scheduler_name)
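
Every loader in model_util.py now takes a `variant` argument and forwards it to the diffusers `from_pretrained` calls. In diffusers, `variant="fp16"` selects weight files saved as `*.fp16.safetensors`, which were serialized in half precision; per the PR title, loading these directly is what avoids the FP16 NaN values. A standalone sketch of the mechanism (the model id is illustrative, not taken from this PR):

import torch
from diffusers import UNet2DConditionModel

# variant="fp16" makes from_pretrained look for files like
# diffusion_pytorch_model.fp16.safetensors instead of the fp32 default.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="unet",
    torch_dtype=torch.float16,
    variant="fp16",
)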
2 changes: 2 additions & 0 deletions trainscripts/imagesliders/train_lora-scale-xl.py
@@ -80,6 +80,8 @@ def train(
     ) = model_util.load_models_xl(
         config.pretrained_model.name_or_path,
         scheduler_name=config.train.noise_scheduler,
+        weight_dtype=weight_dtype,
+        variant="fp16" if weight_dtype == torch.float16 else None
     )

     for text_encoder in text_encoders:
2 changes: 2 additions & 0 deletions trainscripts/imagesliders/train_lora-scale.py
@@ -74,6 +74,8 @@ def train(
         scheduler_name=config.train.noise_scheduler,
         v2=config.pretrained_model.v2,
         v_pred=config.pretrained_model.v_pred,
+        weight_dtype=weight_dtype,
+        variant="fp16" if weight_dtype == torch.float16 else None
     )

     text_encoder.to(device, dtype=weight_dtype)
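
Both train scripts now pass `weight_dtype` through and derive the checkpoint variant from it, so half-precision runs request the fp16 variant and full-precision runs keep the default files. The selection logic, isolated as a sketch (the helper name is hypothetical):

import torch
from typing import Optional

def pick_variant(weight_dtype: torch.dtype) -> Optional[str]:
    # Mirrors the expression added to both scripts: only fp16 training
    # pulls the fp16-variant weights; anything else loads the defaults.
    return "fp16" if weight_dtype == torch.float16 else None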
13 changes: 7 additions & 6 deletions trainscripts/imagesliders/train_util.py
@@ -19,7 +19,7 @@


 def get_random_noise(
-    batch_size: int, height: int, width: int, generator: torch.Generator = None
+    batch_size: int, height: int, width: int, device: torch.device, generator: torch.Generator = None
 ) -> torch.Tensor:
     return torch.randn(
         (
@@ -28,8 +28,8 @@ def get_random_noise(
             height // VAE_SCALE_FACTOR,  # not sure the height/width order is right here, but it causes no real problem either way, so this is fine
             width // VAE_SCALE_FACTOR,
         ),
-        generator=generator,
-        device="cpu",
+        device=device,
+        generator=generator
     )


@@ -47,13 +47,14 @@ def get_initial_latents(
     height: int,
     width: int,
     n_prompts: int,
-    generator=None,
+    device: torch.device,
+    generator=None
 ) -> torch.Tensor:
-    noise = get_random_noise(n_imgs, height, width, generator=generator).repeat(
+    noise = get_random_noise(n_imgs, height, width, device, generator=generator).repeat(
         n_prompts, 1, 1, 1
     )

-    latents = noise * scheduler.init_noise_sigma
+    latents = noise * torch.tensor(scheduler.init_noise_sigma).to(device)

     return latents
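
This hunk is the "better way" from the title: noise is created on the training device instead of on the CPU, and `init_noise_sigma` is pinned to the same device before scaling. Schedulers like "lms" and "euler_a" expose `init_noise_sigma` as a tensor derived from their sigma schedule (for DDPM/DDIM it is simply 1.0), so keeping latents and sigma on one device is what sidesteps the device-mismatch RuntimeError. A condensed sketch of the fixed flow, with an illustrative latent shape and a default-configured scheduler:

import torch
from diffusers import EulerAncestralDiscreteScheduler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scheduler = EulerAncestralDiscreteScheduler()  # default config, illustrative

# Noise now starts on the target device (the old code used device="cpu").
noise = torch.randn((1, 4, 64, 64), device=device)

# euler_a/lms report init_noise_sigma as a tensor; move it alongside the
# noise before multiplying, exactly as the updated get_initial_latents does.
latents = noise * torch.tensor(scheduler.init_noise_sigma).to(device)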
24 changes: 19 additions & 5 deletions trainscripts/textsliders/model_util.py
@@ -31,6 +31,7 @@ def load_diffusers_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
     # VAE is not needed

@@ -48,6 +49,7 @@
             num_hidden_layers=24 - (clip_skip - 1) if clip_skip is not None else 23,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )
     else:
         tokenizer = CLIPTokenizer.from_pretrained(
@@ -62,13 +64,15 @@
             num_hidden_layers=12 - (clip_skip - 1) if clip_skip is not None else 12,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )

     unet = UNet2DConditionModel.from_pretrained(
         pretrained_model_name_or_path,
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     return tokenizer, text_encoder, unet
@@ -79,12 +83,14 @@ def load_checkpoint_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
-    pipe = StableDiffusionPipeline.from_ckpt(
+    pipe = StableDiffusionPipeline.from_pretrained(
         checkpoint_path,
         upcast_attention=True if v2 else False,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -107,16 +113,17 @@ def load_models(
     v2: bool = False,
     v_pred: bool = False,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel, SchedulerMixin,]:
     if pretrained_model_name_or_path.endswith(
         ".ckpt"
     ) or pretrained_model_name_or_path.endswith(".safetensors"):
         tokenizer, text_encoder, unet = load_checkpoint_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )
     else:  # diffusers
         tokenizer, text_encoder, unet = load_diffusers_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )

     # VAE is not needed
@@ -132,6 +139,7 @@
 def load_diffusers_model_xl(
     pretrained_model_name_or_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     # returns tokenizer, tokenizer_2, text_encoder, text_encoder_2, unet

@@ -157,12 +165,14 @@ def load_diffusers_model_xl(
             subfolder="text_encoder",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
         CLIPTextModelWithProjection.from_pretrained(
             pretrained_model_name_or_path,
             subfolder="text_encoder_2",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
     ]

@@ -171,6 +181,7 @@
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     return tokenizers, text_encoders, unet
@@ -179,11 +190,13 @@ def load_diffusers_model_xl(
 def load_checkpoint_model_xl(
     checkpoint_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -201,6 +214,7 @@ def load_models_xl(
     pretrained_model_name_or_path: str,
     scheduler_name: AVAILABLE_SCHEDULERS,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[
     list[CLIPTokenizer],
     list[SDXL_TEXT_ENCODER_TYPE],
@@ -214,13 +228,13 @@
             tokenizers,
             text_encoders,
             unet,
-        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype, variant)
     else:  # diffusers
         (
             tokenizers,
             text_encoders,
             unet,
-        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype, variant)

     scheduler = create_noise_scheduler(scheduler_name)
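
The same `variant` plumbing is applied to the textsliders copy of model_util.py; apart from these loaders returning no VAE, the hunks are identical to the imagesliders file above, so the fp16-variant sketch shown there applies here unchanged.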