From 14c452a269f63f19c2c0fbb1938b5c04adbe3bc8 Mon Sep 17 00:00:00 2001
From: Suprhimp
Date: Fri, 17 Jan 2025 18:47:14 +0900
Subject: [PATCH 01/12] [feat] add strength in flux_fill pipeline

---
 .../pipelines/flux/pipeline_flux_fill.py     | 150 ++++++++++++++----
 1 file changed, 115 insertions(+), 35 deletions(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index ed8623e31733..51f4765a4344 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -225,10 +225,9 @@ def __init__(
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=latent_channels,
+            vae_latent_channels=self.vae.config.latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -493,10 +492,40 @@ def encode_prompt(
 
         return prompt_embeds, pooled_prompt_embeds, text_ids
 
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
+    def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
+        if isinstance(generator, list):
+            image_latents = [
+                retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
+                for i in range(image.shape[0])
+            ]
+            image_latents = torch.cat(image_latents, dim=0)
+        else:
+            image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
+
+        image_latents = (
+            image_latents - self.vae.config.shift_factor
+        ) * self.vae.config.scaling_factor
+
+        return image_latents
+
+    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+        t_start = int(max(num_inference_steps - init_timestep, 0))
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
     def check_inputs(
         self,
         prompt,
         prompt_2,
+        strength,
         height,
         width,
         prompt_embeds=None,
@@ -507,6 +536,9 @@ def check_inputs(
         mask_image=None,
         masked_image_latents=None,
     ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
+
         if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
             logger.warning(
                 f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
             )
@@ -627,6 +659,8 @@ def disable_vae_tiling(self):
     # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
     def prepare_latents(
         self,
+        image,
+        timestep,
         batch_size,
         num_channels_latents,
         height,
@@ -643,22 +677,37 @@ def prepare_latents(
         shape = (batch_size, num_channels_latents, height, width)
 
-        if latents is not None:
-            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
-            return latents.to(device=device, dtype=dtype), latent_image_ids
+        # if latents is not None:
+        image = image.to(device=device, dtype=dtype)
+        image_latents = self._encode_vae_image(image=image, generator=generator)
 
-        if isinstance(generator, list) and len(generator) != batch_size:
+        latent_image_ids = self._prepare_latent_image_ids(
+            batch_size, height // 2, width // 2, device, dtype
+        )
+        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            additional_image_per_prompt = batch_size // image_latents.shape[0]
+            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
+        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
             raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
             )
+        else:
+            image_latents = torch.cat([image_latents], dim=0)
 
-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-
-        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+        if latents is None:
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+            latents = self.scheduler.scale_noise(image_latents, timestep, noise)
+        else:
+            noise = latents.to(device)
+            latents = noise
 
-        return latents, latent_image_ids
+        noise = self._pack_latents(noise, batch_size, num_channels_latents, height, width)
+        image_latents = self._pack_latents(
+            image_latents, batch_size, num_channels_latents, height, width
+        )
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+        return latents, noise, image_latents, latent_image_ids
 
     @property
     def guidance_scale(self):
@@ -687,6 +736,7 @@ def __call__(
         masked_image_latents: Optional[torch.FloatTensor] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
+        strength: float = 1.0,
        num_inference_steps: int = 50,
         sigmas: Optional[List[float]] = None,
         guidance_scale: float = 30.0,
@@ -731,6 +781,12 @@ def __call__(
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
+            strength (`float`, *optional*, defaults to 1.0):
+                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+                on the amount of noise initially added.
+                When `strength` is 1, added noise is maximum and the denoising
+                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+                essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
@@ -794,6 +850,7 @@ def __call__(
         self.check_inputs(
             prompt,
             prompt_2,
+            strength,
             height,
             width,
             prompt_embeds=prompt_embeds,
@@ -809,6 +866,10 @@ def __call__(
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False
 
+        original_image = image
+        init_image = self.image_processor.preprocess(image, height=height, width=width)
+        init_image = init_image.to(dtype=torch.float32)
+
         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
@@ -821,7 +882,9 @@ def __call__(
 
         # 3. Prepare prompt embeddings
         lora_scale = (
-            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
+            self.joint_attention_kwargs.get("scale", None)
+            if self.joint_attention_kwargs is not None
+            else None
         )
         (
             prompt_embeds,
@@ -838,9 +901,43 @@ def __call__(
             lora_scale=lora_scale,
         )
 
+        # 6. Prepare timesteps
+        sigmas = (
+            np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+            if sigmas is None
+            else sigmas
+        )
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (
+            int(width) // self.vae_scale_factor // 2
+        )
+        mu = calculate_shift(
+            image_seq_len,
+            self.scheduler.config.base_image_seq_len,
+            self.scheduler.config.max_image_seq_len,
+            self.scheduler.config.base_shift,
+            self.scheduler.config.max_shift,
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            sigmas=sigmas,
+            mu=mu,
+        )
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
+
+        if num_inference_steps < 1:
+            raise ValueError(
+                f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline "
+                f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
+            )
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
         # 4. Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
-        latents, latent_image_ids = self.prepare_latents(
+        latents, noise, image_latents, latent_image_ids = self.prepare_latents(
+            init_image,
+            latent_timestep,
             batch_size * num_images_per_prompt,
             num_channels_latents,
             height,
@@ -855,13 +952,13 @@ def __call__(
         if masked_image_latents is not None:
             masked_image_latents = masked_image_latents.to(latents.device)
         else:
-            image = self.image_processor.preprocess(image, height=height, width=width)
+            # image = self.image_processor.preprocess(image, height=height, width=width)
             mask_image = self.mask_processor.preprocess(mask_image, height=height, width=width)
 
-            masked_image = image * (1 - mask_image)
+            masked_image = init_image * (1 - mask_image)
             masked_image = masked_image.to(device=device, dtype=prompt_embeds.dtype)
 
-            height, width = image.shape[-2:]
+            height, width = init_image.shape[-2:]
             mask, masked_image_latents = self.prepare_mask_latents(
                 mask_image,
                 masked_image,
@@ -876,23 +973,6 @@ def __call__(
             )
             masked_image_latents = torch.cat((masked_image_latents, mask), dim=-1)
 
-        # 6. Prepare timesteps
-        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
-        image_seq_len = latents.shape[1]
-        mu = calculate_shift(
-            image_seq_len,
-            self.scheduler.config.get("base_image_seq_len", 256),
-            self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.16),
-        )
-        timesteps, num_inference_steps = retrieve_timesteps(
-            self.scheduler,
-            num_inference_steps,
-            device,
-            sigmas=sigmas,
-            mu=mu,
-        )
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)

From a7e15017e2f525665cdb28a74b9998a6169bb688 Mon Sep 17 00:00:00 2001
From: Suprhimp <73486185+Suprhimp@users.noreply.github.com>
Date: Fri, 17 Jan 2025 19:55:47 +0900
Subject: [PATCH 02/12] Update src/diffusers/pipelines/flux/pipeline_flux_fill.py

Co-authored-by: hlky

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 51f4765a4344..93938aec20a1 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -952,7 +952,6 @@ def __call__(
         if masked_image_latents is not None:
             masked_image_latents = masked_image_latents.to(latents.device)
         else:
-            # image = self.image_processor.preprocess(image, height=height, width=width)
             mask_image = self.mask_processor.preprocess(mask_image, height=height, width=width)
 
             masked_image = init_image * (1 - mask_image)

From cf60e52a3e8134aaa6f32b3cea11043981b520f1 Mon Sep 17 00:00:00 2001
From: Suprhimp <73486185+Suprhimp@users.noreply.github.com>
Date: Fri, 17 Jan 2025 19:55:54 +0900
Subject: [PATCH 03/12] Update src/diffusers/pipelines/flux/pipeline_flux_fill.py

Co-authored-by: hlky

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 93938aec20a1..68372b76c229 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -901,7 +901,7 @@ def __call__(
             lora_scale=lora_scale,
         )
 
-        # 6. Prepare timesteps
+        # 4. Prepare timesteps
         sigmas = (
             np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
             if sigmas is None
             else sigmas

From 25fa97c4ec129dff1513457934c25370b241bc66 Mon Sep 17 00:00:00 2001
From: Suprhimp <73486185+Suprhimp@users.noreply.github.com>
Date: Fri, 17 Jan 2025 19:56:10 +0900
Subject: [PATCH 04/12] Update src/diffusers/pipelines/flux/pipeline_flux_fill.py

Co-authored-by: hlky

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 68372b76c229..8ff340cdb7ca 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -933,7 +933,7 @@ def __call__(
         )
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
-        # 4. Prepare latent variables
+        # 5. Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
         latents, noise, image_latents, latent_image_ids = self.prepare_latents(
             init_image,

From 5d6b78cd09b5e8a7e822e0a4a886fe9582148994 Mon Sep 17 00:00:00 2001
From: Suprhimp
Date: Fri, 17 Jan 2025 19:59:20 +0900
Subject: [PATCH 05/12] [refactor] refactor after review

---
 .../pipelines/flux/pipeline_flux_fill.py     | 39 ++++++++-----------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 8ff340cdb7ca..3ae53a101707 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -225,9 +225,10 @@ def __init__(
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
+        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=self.vae.config.latent_channels,
+            vae_latent_channels=latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -656,7 +657,7 @@ def disable_vae_tiling(self):
         """
         self.vae.disable_tiling()
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
+    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
         image,
@@ -670,20 +671,24 @@ def prepare_latents(
         generator,
         latents=None,
     ):
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
         # VAE applies 8x compression on images but we must also account for packing which requires
         # latent height and width to be divisible by 2.
         height = 2 * (int(height) // (self.vae_scale_factor * 2))
         width = 2 * (int(width) // (self.vae_scale_factor * 2))
-
         shape = (batch_size, num_channels_latents, height, width)
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype), latent_image_ids
 
-        # if latents is not None:
         image = image.to(device=device, dtype=dtype)
         image_latents = self._encode_vae_image(image=image, generator=generator)
-        latent_image_ids = self._prepare_latent_image_ids(
-            batch_size, height // 2, width // 2, device, dtype
-        )
         if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
             # expand init_latents for batch_size
             additional_image_per_prompt = batch_size // image_latents.shape[0]
             image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
         elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
             raise ValueError(
                 f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
             )
         else:
             image_latents = torch.cat([image_latents], dim=0)
 
-        if latents is None:
-            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            latents = self.scheduler.scale_noise(image_latents, timestep, noise)
-        else:
-            noise = latents.to(device)
-            latents = noise
-
-        noise = self._pack_latents(noise, batch_size, num_channels_latents, height, width)
-        image_latents = self._pack_latents(
-            image_latents, batch_size, num_channels_latents, height, width
-        )
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self.scheduler.scale_noise(image_latents, timestep, noise)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
-        return latents, noise, image_latents, latent_image_ids
+        return latents, latent_image_ids
 
     @property
     def guidance_scale(self):
@@ -866,7 +862,6 @@ def __call__(
         self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False
 
-        original_image = image
         init_image = self.image_processor.preprocess(image, height=height, width=width)
         init_image = init_image.to(dtype=torch.float32)
 
@@ -935,7 +930,7 @@ def __call__(
         # 5. Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
-        latents, noise, image_latents, latent_image_ids = self.prepare_latents(
+        latents, latent_image_ids = self.prepare_latents(
             init_image,
             latent_timestep,

From 3a1ea2e5ebfb5adfc517afa6ad188a599eff4b4e Mon Sep 17 00:00:00 2001
From: Suprhimp
Date: Fri, 17 Jan 2025 20:38:42 +0900
Subject: [PATCH 06/12] [fix] change comment

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 3ae53a101707..c6227b1b801a 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -943,7 +943,7 @@ def __call__(
             latents,
         )
 
-        # 5. Prepare mask and masked image latents
+        # 6. Prepare mask and masked image latents
         if masked_image_latents is not None:
             masked_image_latents = masked_image_latents.to(latents.device)
         else:
             mask_image = self.mask_processor.preprocess(mask_image, height=height, width=width)

From a6737f1f382bde39eac9d41d6262c0134c2a6d0e Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 4 Apr 2025 08:56:03 +0000
Subject: [PATCH 07/12] Apply style fixes

---
 .../pipelines/flux/pipeline_flux_fill.py     | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 8e9cb940fe89..3e11bae177bd 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -504,9 +504,7 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
         else:
             image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
 
-        image_latents = (
-            image_latents - self.vae.config.shift_factor
-        ) * self.vae.config.scaling_factor
+        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
 
         return image_latents
 
@@ -877,9 +875,7 @@ def __call__(
 
         # 3. Prepare prompt embeddings
         lora_scale = (
-            self.joint_attention_kwargs.get("scale", None)
-            if self.joint_attention_kwargs is not None
-            else None
+            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
         )
         (
             prompt_embeds,
@@ -897,14 +893,8 @@ def __call__(
         )
 
         # 4. Prepare timesteps
-        sigmas = (
-            np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
-            if sigmas is None
-            else sigmas
-        )
-        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (
-            int(width) // self.vae_scale_factor // 2
-        )
+        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
+        image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
             self.scheduler.config.base_image_seq_len,
@@ -967,7 +957,6 @@ def __call__(
             )
             masked_image_latents = torch.cat((masked_image_latents, mask), dim=-1)
 
-
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)

From c489b57370d07a2816c5128ce48cddc8b8389aab Mon Sep 17 00:00:00 2001
From: asomoza
Date: Fri, 4 Apr 2025 11:06:47 +0200
Subject: [PATCH 08/12] empty

From e87c9eb6ce14dc2ad35d652f6d8a25f9e98105d8 Mon Sep 17 00:00:00 2001
From: asomoza
Date: Fri, 4 Apr 2025 11:15:50 +0200
Subject: [PATCH 09/12] fix

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 3e11bae177bd..00ade46b39e3 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -655,7 +655,7 @@ def disable_vae_tiling(self):
         """
         self.vae.disable_tiling()
 
-    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxImg2ImgPipeline.prepare_latents
+    # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
     def prepare_latents(
         self,
         image,

From cb43412fb37557300091c6cb53a8ad98a784da4f Mon Sep 17 00:00:00 2001
From: Suprhimp
Date: Fri, 4 Apr 2025 20:03:55 +0900
Subject: [PATCH 10/12] update prepare_latents from flux.img2img pipeline

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 00ade46b39e3..c2afa9fac0e2 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -224,11 +224,11 @@ def __init__(
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
-        latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2,vae_latent_channels=self.latent_channels)
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
-            vae_latent_channels=latent_channels,
+            vae_latent_channels=self.latent_channels,
             do_normalize=False,
             do_binarize=True,
             do_convert_grayscale=True,
@@ -686,7 +686,10 @@ def prepare_latents(
             return latents.to(device=device, dtype=dtype), latent_image_ids
 
         image = image.to(device=device, dtype=dtype)
-        image_latents = self._encode_vae_image(image=image, generator=generator)
+        if image.shape[1] != self.latent_channels:
+            image_latents = self._encode_vae_image(image=image, generator=generator)
+        else:
+            image_latents = image
         if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
             # expand init_latents for batch_size
             additional_image_per_prompt = batch_size // image_latents.shape[0]

From f0fac627f90e5e777a5253faf67c80aa5729d250 Mon Sep 17 00:00:00 2001
From: asomoza
Date: Fri, 4 Apr 2025 13:44:50 +0200
Subject: [PATCH 11/12] style

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index c2afa9fac0e2..2058f391d5c1 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -225,7 +225,9 @@ def __init__(
         # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
         # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
         self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2,vae_latent_channels=self.latent_channels)
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.vae_scale_factor * 2, vae_latent_channels=self.latent_channels
+        )
         self.mask_processor = VaeImageProcessor(
             vae_scale_factor=self.vae_scale_factor * 2,
             vae_latent_channels=self.latent_channels,
             do_normalize=False,

From a0ffed1417d1335d4b5a51a02879142b7e48aced Mon Sep 17 00:00:00 2001
From: hlky
Date: Fri, 4 Apr 2025 14:18:33 +0100
Subject: [PATCH 12/12] Update src/diffusers/pipelines/flux/pipeline_flux_fill.py

---
 src/diffusers/pipelines/flux/pipeline_flux_fill.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
index 2058f391d5c1..546a225aa999 100644
--- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py
+++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py
@@ -902,10 +902,10 @@ def __call__(
         image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
         mu = calculate_shift(
             image_seq_len,
-            self.scheduler.config.base_image_seq_len,
-            self.scheduler.config.max_image_seq_len,
-            self.scheduler.config.base_shift,
-            self.scheduler.config.max_shift,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,
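The sketches below illustrate the mechanics the patches above rely on; they are standalone examples, not part of the diffs. First, the strength-to-schedule mapping that `get_timesteps` (patch 01) copies from the SD3 img2img pipeline. `truncated_timesteps` is a hypothetical name; the scheduler is assumed to expose `timesteps` and `order` (1 for the flow-matching scheduler used by Flux):

    def truncated_timesteps(timesteps, num_inference_steps, strength, order=1):
        # strength=1.0 keeps the full schedule; strength=0.4 keeps only the
        # last 40% of the steps, so the input image is denoised less.
        init_timestep = min(num_inference_steps * strength, num_inference_steps)
        t_start = int(max(num_inference_steps - init_timestep, 0))
        return timesteps[t_start * order :], num_inference_steps - t_start

    # e.g. num_inference_steps=50, strength=0.4 -> t_start=30, 20 steps remain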
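`prepare_latents` seeds the denoising loop with `self.scheduler.scale_noise(image_latents, timestep, noise)`. For the flow-matching Euler scheduler this is, to my understanding, a linear interpolation between the clean latents and pure noise at the sigma matching the truncated start timestep; a sketch under that assumption:

    import torch

    def scale_noise_sketch(image_latents: torch.Tensor, noise: torch.Tensor, sigma: float) -> torch.Tensor:
        # sigma is in [0, 1]; strength=1.0 starts the loop at sigma=1.0,
        # i.e. pure noise, which is why strength=1.0 essentially ignores `image`.
        return sigma * noise + (1.0 - sigma) * image_latents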
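The `(latents - shift_factor) * scaling_factor` normalization in `_encode_vae_image` (patch 01) must be inverted before decoding at the end of the pipeline; both constants come from the VAE config. A sketch of the pair, with hypothetical helper names:

    def normalize_latents(latents, shift_factor, scaling_factor):
        # applied after vae.encode(), as in _encode_vae_image
        return (latents - shift_factor) * scaling_factor

    def denormalize_latents(latents, shift_factor, scaling_factor):
        # algebraic inverse, applied before vae.decode()
        return latents / scaling_factor + shift_factor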
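The `image_seq_len` computation that appears in patches 01 and 07 reflects the 2x2 patch packing mentioned in `__init__`: the VAE compresses spatially by `vae_scale_factor`, then latents are packed into 2x2 patches, halving each spatial dimension again. A sketch of the resulting sequence length:

    def packed_seq_len(height: int, width: int, vae_scale_factor: int = 8) -> int:
        # 8x VAE compression, then 2x2 patch packing in each spatial dimension
        return (int(height) // vae_scale_factor // 2) * (int(width) // vae_scale_factor // 2)

    # packed_seq_len(1024, 1024) == 4096, matching the max_image_seq_len fallback in patch 12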
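Patch 12 swaps direct attribute access for `config.get(...)` with fallbacks so schedulers whose configs lack these keys still work. Assuming `calculate_shift` uses the usual linear-interpolation implementation, `mu` grows with the packed sequence length between the two anchor points; a sketch with defaults mirroring the patch 12 fallbacks:

    def calculate_shift_sketch(image_seq_len, base_seq_len=256, max_seq_len=4096, base_shift=0.5, max_shift=1.15):
        # mu interpolates linearly between base_shift and max_shift as the
        # packed sequence length grows from base_seq_len to max_seq_len
        m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
        b = base_shift - m * base_seq_len
        return image_seq_len * m + b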
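Finally, a hypothetical end-to-end call once this series lands; the prompt and image paths are placeholders, and the model id is assumed to be the standard Flux Fill checkpoint:

    import torch
    from diffusers import FluxFillPipeline
    from diffusers.utils import load_image

    pipe = FluxFillPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
    ).to("cuda")

    image = load_image("input.png")  # placeholder path
    mask = load_image("mask.png")    # placeholder path

    # strength < 1.0 starts denoising from a partially noised version of `image`,
    # preserving more of the original content; strength=1.0 matches the old behavior.
    result = pipe(
        prompt="a red sofa",
        image=image,
        mask_image=mask,
        strength=0.85,
        num_inference_steps=50,
    ).images[0]
    result.save("output.png")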