Fix bugs for xformers disable, a better way to fix the "RuntimeError" for "lms" and "euler_a", and fix FP16 NaN values #90

Open · wants to merge 3 commits into base: main
10 changes: 5 additions & 5 deletions requirements.txt
@@ -1,7 +1,7 @@
 bitsandbytes==0.41.1
 dadaptation==3.1
-diffusers==0.20.2
-ipython==8.7.0
+diffusers<=0.27.2
+ipython<=8.18.1
 lion_pytorch==0.1.2
 lpips==0.1.4
 matplotlib==3.6.2
@@ -15,10 +15,10 @@ pydantic==2.6.3
 PyYAML==6.0.1
 Requests==2.31.0
 safetensors==0.3.1
-torch==2.0.1
-torchvision==0.15.2
+torch
+torchvision
 tqdm==4.64.1
 transformers==4.27.4
 wandb==0.12.21
 xformers==0.0.21
-accelerate==0.16.0
+accelerate<=0.29.2
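
The pins on diffusers, ipython, and accelerate become upper bounds, and torch/torchvision are left unpinned so an install can match the local CUDA build (that motivation is inferred; the PR does not state it). A quick post-install check, as a sketch assuming the `packaging` package is available:

import accelerate
import diffusers
from packaging.version import Version

# Confirm the relaxed upper bounds from requirements.txt after installation.
assert Version(diffusers.__version__) <= Version("0.27.2")
assert Version(accelerate.__version__) <= Version("0.29.2")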
23 changes: 23 additions & 0 deletions requirements_nonxformers.txt
@@ -0,0 +1,23 @@
+bitsandbytes==0.41.1
+dadaptation==3.1
+diffusers<=0.27.2
+ipython<=8.18.1
+lion_pytorch==0.1.2
+lpips==0.1.4
+matplotlib==3.6.2
+numpy==1.23.5
+opencv_python==4.5.5.64
+opencv_python_headless==4.7.0.68
+pandas==1.5.2
+Pillow==10.1.0
+prodigyopt==1.0
+pydantic==2.6.3
+PyYAML==6.0.1
+Requests==2.31.0
+safetensors==0.3.1
+torch
+torchvision
+tqdm==4.64.1
+transformers==4.27.4
+wandb==0.12.21
+accelerate<=0.29.2
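
requirements_nonxformers.txt mirrors requirements.txt but drops the xformers pin entirely, for environments where xformers cannot be built or is deliberately disabled. A training script that supports both installs needs to guard the attention switch; a minimal sketch, assuming `unet` is an already-loaded UNet2DConditionModel (the helper name is hypothetical, not from this PR):

def maybe_enable_xformers(unet) -> bool:
    # With requirements_nonxformers.txt installed, the import fails and we
    # fall back to PyTorch's built-in attention instead of crashing.
    try:
        import xformers  # noqa: F401
    except ImportError:
        return False
    unet.enable_xformers_memory_efficient_attention()
    return True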
28 changes: 21 additions & 7 deletions trainscripts/imagesliders/model_util.py
@@ -32,6 +32,7 @@ def load_diffusers_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
     # VAE is not needed

@@ -49,6 +50,7 @@
             num_hidden_layers=24 - (clip_skip - 1) if clip_skip is not None else 23,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )
     else:
         tokenizer = CLIPTokenizer.from_pretrained(
@@ -63,16 +65,18 @@
             num_hidden_layers=12 - (clip_skip - 1) if clip_skip is not None else 12,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )

     unet = UNet2DConditionModel.from_pretrained(
         pretrained_model_name_or_path,
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

-    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
+    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", variant=variant)

     return tokenizer, text_encoder, unet, vae

@@ -82,12 +86,14 @@ def load_checkpoint_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
-    pipe = StableDiffusionPipeline.from_ckpt(
+    pipe = StableDiffusionPipeline.from_pretrained(
         checkpoint_path,
         upcast_attention=True if v2 else False,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -111,16 +117,17 @@ def load_models(
     v2: bool = False,
     v_pred: bool = False,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel, SchedulerMixin,]:
     if pretrained_model_name_or_path.endswith(
         ".ckpt"
     ) or pretrained_model_name_or_path.endswith(".safetensors"):
         tokenizer, text_encoder, unet, vae = load_checkpoint_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )
     else:  # diffusers
         tokenizer, text_encoder, unet, vae = load_diffusers_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )

     # VAE is not needed
@@ -136,6 +143,7 @@
 def load_diffusers_model_xl(
     pretrained_model_name_or_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     # returns tokenizer, tokenizer_2, text_encoder, text_encoder_2, unet

@@ -161,12 +169,14 @@ def load_diffusers_model_xl(
             subfolder="text_encoder",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
         CLIPTextModelWithProjection.from_pretrained(
             pretrained_model_name_or_path,
             subfolder="text_encoder_2",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
     ]

@@ -175,19 +185,22 @@ def load_diffusers_model_xl(
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )
-    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
+    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", variant=variant)
     return tokenizers, text_encoders, unet, vae


 def load_checkpoint_model_xl(
     checkpoint_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -205,6 +218,7 @@ def load_models_xl(
     pretrained_model_name_or_path: str,
     scheduler_name: AVAILABLE_SCHEDULERS,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[
     list[CLIPTokenizer],
     list[SDXL_TEXT_ENCODER_TYPE],
@@ -219,14 +233,14 @@
             text_encoders,
             unet,
             vae
-        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype, variant)
     else:  # diffusers
         (
             tokenizers,
             text_encoders,
             unet,
             vae
-        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype, variant)

     scheduler = create_noise_scheduler(scheduler_name)
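
Every loader in model_util.py now takes a `variant` argument and forwards it to the diffusers `from_pretrained` calls. In diffusers, `variant="fp16"` selects weight files saved as `*.fp16.safetensors`, which were serialized in half precision; per the PR title, loading these directly is what avoids the FP16 NaN values. A standalone sketch of the mechanism (the model id is illustrative, not taken from this PR):

import torch
from diffusers import UNet2DConditionModel

# variant="fp16" makes from_pretrained look for files like
# diffusion_pytorch_model.fp16.safetensors instead of the fp32 default.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="unet",
    torch_dtype=torch.float16,
    variant="fp16",
)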
2 changes: 2 additions & 0 deletions trainscripts/imagesliders/train_lora-scale-xl.py
@@ -80,6 +80,8 @@ def train(
     ) = model_util.load_models_xl(
         config.pretrained_model.name_or_path,
         scheduler_name=config.train.noise_scheduler,
+        weight_dtype=weight_dtype,
+        variant="fp16" if weight_dtype == torch.float16 else None
     )

     for text_encoder in text_encoders:
2 changes: 2 additions & 0 deletions trainscripts/imagesliders/train_lora-scale.py
@@ -74,6 +74,8 @@ def train(
         scheduler_name=config.train.noise_scheduler,
         v2=config.pretrained_model.v2,
         v_pred=config.pretrained_model.v_pred,
+        weight_dtype=weight_dtype,
+        variant="fp16" if weight_dtype == torch.float16 else None
     )

     text_encoder.to(device, dtype=weight_dtype)
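
Both train scripts now pass `weight_dtype` through and derive the checkpoint variant from it, so half-precision runs request the fp16 variant and full-precision runs keep the default files. The selection logic, isolated as a sketch (the helper name is hypothetical):

import torch
from typing import Optional

def pick_variant(weight_dtype: torch.dtype) -> Optional[str]:
    # Mirrors the expression added to both scripts: only fp16 training
    # pulls the fp16-variant weights; anything else loads the defaults.
    return "fp16" if weight_dtype == torch.float16 else None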
13 changes: 7 additions & 6 deletions trainscripts/imagesliders/train_util.py
@@ -19,7 +19,7 @@


 def get_random_noise(
-    batch_size: int, height: int, width: int, generator: torch.Generator = None
+    batch_size: int, height: int, width: int, device: torch.device, generator: torch.Generator = None
 ) -> torch.Tensor:
     return torch.randn(
         (
@@ -28,8 +28,8 @@ def get_random_noise(
             height // VAE_SCALE_FACTOR,  # not sure the height/width order is right here, but it causes no real problem either way, so this is fine
             width // VAE_SCALE_FACTOR,
         ),
-        generator=generator,
-        device="cpu",
+        device=device,
+        generator=generator
     )


@@ -47,13 +47,14 @@ def get_initial_latents(
     height: int,
     width: int,
     n_prompts: int,
-    generator=None,
+    device: torch.device,
+    generator=None
 ) -> torch.Tensor:
-    noise = get_random_noise(n_imgs, height, width, generator=generator).repeat(
+    noise = get_random_noise(n_imgs, height, width, device, generator=generator).repeat(
         n_prompts, 1, 1, 1
     )

-    latents = noise * scheduler.init_noise_sigma
+    latents = noise * torch.tensor(scheduler.init_noise_sigma).to(device)

     return latents
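
This hunk is the "better way" from the title: noise is created on the training device instead of on the CPU, and `init_noise_sigma` is pinned to the same device before scaling. Schedulers like "lms" and "euler_a" expose `init_noise_sigma` as a tensor derived from their sigma schedule (for DDPM/DDIM it is simply 1.0), so keeping latents and sigma on one device is what sidesteps the device-mismatch RuntimeError. A condensed sketch of the fixed flow, with an illustrative latent shape and a default-configured scheduler:

import torch
from diffusers import EulerAncestralDiscreteScheduler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scheduler = EulerAncestralDiscreteScheduler()  # default config, illustrative

# Noise now starts on the target device (the old code used device="cpu").
noise = torch.randn((1, 4, 64, 64), device=device)

# euler_a/lms report init_noise_sigma as a tensor; move it alongside the
# noise before multiplying, exactly as the updated get_initial_latents does.
latents = noise * torch.tensor(scheduler.init_noise_sigma).to(device)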
24 changes: 19 additions & 5 deletions trainscripts/textsliders/model_util.py
@@ -31,6 +31,7 @@ def load_diffusers_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
     # VAE is not needed

@@ -48,6 +49,7 @@
             num_hidden_layers=24 - (clip_skip - 1) if clip_skip is not None else 23,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )
     else:
         tokenizer = CLIPTokenizer.from_pretrained(
@@ -62,13 +64,15 @@
             num_hidden_layers=12 - (clip_skip - 1) if clip_skip is not None else 12,
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         )

     unet = UNet2DConditionModel.from_pretrained(
         pretrained_model_name_or_path,
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     return tokenizer, text_encoder, unet
@@ -79,12 +83,14 @@ def load_checkpoint_model(
     v2: bool = False,
     clip_skip: Optional[int] = None,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
-    pipe = StableDiffusionPipeline.from_ckpt(
+    pipe = StableDiffusionPipeline.from_pretrained(
         checkpoint_path,
         upcast_attention=True if v2 else False,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -107,16 +113,17 @@ def load_models(
     v2: bool = False,
     v_pred: bool = False,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel, SchedulerMixin,]:
     if pretrained_model_name_or_path.endswith(
         ".ckpt"
     ) or pretrained_model_name_or_path.endswith(".safetensors"):
         tokenizer, text_encoder, unet = load_checkpoint_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )
     else:  # diffusers
         tokenizer, text_encoder, unet = load_diffusers_model(
-            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
+            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype, variant=variant
         )

     # VAE is not needed
@@ -132,6 +139,7 @@
 def load_diffusers_model_xl(
     pretrained_model_name_or_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     # returns tokenizer, tokenizer_2, text_encoder, text_encoder_2, unet

@@ -157,12 +165,14 @@ def load_diffusers_model_xl(
             subfolder="text_encoder",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
         CLIPTextModelWithProjection.from_pretrained(
             pretrained_model_name_or_path,
             subfolder="text_encoder_2",
             torch_dtype=weight_dtype,
             cache_dir=DIFFUSERS_CACHE_DIR,
+            variant=variant
         ),
     ]

@@ -171,6 +181,7 @@
         subfolder="unet",
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     return tokenizers, text_encoders, unet
@@ -179,11 +190,13 @@ def load_diffusers_model_xl(
 def load_checkpoint_model_xl(
     checkpoint_path: str,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
     pipe = StableDiffusionXLPipeline.from_single_file(
         checkpoint_path,
         torch_dtype=weight_dtype,
         cache_dir=DIFFUSERS_CACHE_DIR,
+        variant=variant
     )

     unet = pipe.unet
@@ -201,6 +214,7 @@ def load_models_xl(
     pretrained_model_name_or_path: str,
     scheduler_name: AVAILABLE_SCHEDULERS,
     weight_dtype: torch.dtype = torch.float32,
+    variant: Optional[str] = None
 ) -> tuple[
     list[CLIPTokenizer],
     list[SDXL_TEXT_ENCODER_TYPE],
@@ -214,13 +228,13 @@
             tokenizers,
             text_encoders,
             unet,
-        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype, variant)
     else:  # diffusers
         (
             tokenizers,
             text_encoders,
             unet,
-        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype)
+        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype, variant)

     scheduler = create_noise_scheduler(scheduler_name)
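
The same `variant` plumbing is applied to the textsliders copy of model_util.py; apart from these loaders returning no VAE, the hunks are identical to the imagesliders file above, so the fp16-variant sketch shown there applies here unchanged.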