Fix Wan I2V prepare_latents: build the conditioning mask at latent-frame resolution and return first_frame_mask

prishajain1 · prishajain1 · commit bb9a24253a3f · 2025-12-31T11:20:22.000+05:30
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py
@@ -121,21 +121,19 @@ def prepare_latents(
                         nlf=latents.shape[1],
                         exp=num_latent_frames)
         latent_condition, _ = self.prepare_latents_i2v_base(image, num_frames, dtype, last_image)
-        mask_lat_size = jnp.ones((batch_size, 1, num_frames, latent_height, latent_width), dtype=dtype)
+        # 1. Create a base mask at the latent frame level
+        mask_lat_size = jnp.ones((batch_size, 1, num_latent_frames, latent_height, latent_width), dtype=dtype)
+        # 2. Apply masking based on last_image
         if last_image is None:
             mask_lat_size = mask_lat_size.at[:, :, 1:, :, :].set(0)
         else:
-            mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)     
-        first_frame_mask = mask_lat_size[:, :, 0:1]
-        first_frame_mask = jnp.repeat(first_frame_mask, self.vae_scale_factor_temporal, axis=2)
-        jax.debug.print("first_frame_mask.shape:{shape}, is None:{isnone}",
-                        shape = first_frame_mask.shape if first_frame_mask is not None else (-1,),
-                        isnone = first_frame_mask is None)
-        jax.debug.print("first_frame_mask_stats: min={mn}, max={mx}, mean={mean}",
-                        mn=jnp.min(first_frame_mask) if first_frame_mask is not None else 0.0,
-                        mx=jnp.max(first_frame_mask) if first_frame_mask is not None else 0.0,
-                        mean=jnp.mean(first_frame_mask) if first_frame_mask is not None else 0.0)
-        mask_lat_size = jnp.concatenate([first_frame_mask, mask_lat_size[:, :, 1:]], axis=2)
+            mask_lat_size = mask_lat_size.at[:, :, 1:-1, :, :].set(0)
+        
+        # 3. Expand the mask to match the temporal scale factor during reshape
+        mask_lat_size = jnp.repeat(mask_lat_size, self.vae_scale_factor_temporal, axis=2)
+        jax.debug.print("mask_lat_size shape after repeat: {shape}", shape=mask_lat_size.shape)
+
+        # 4. Reshape to combine latent frames and temporal scale factor
         mask_lat_size = mask_lat_size.reshape(
           batch_size, 
           1,
@@ -144,16 +142,21 @@ def prepare_latents(
           latent_height, 
           latent_width
         )
+        # 5. Transpose and squeeze to get the final mask shape (B, F_l, H_l, W_l, T_sf)
         mask_lat_size = jnp.transpose(mask_lat_size, (0, 2, 4, 5, 3, 1)).squeeze(-1)
+        jax.debug.print("mask_lat_size final shape: {shape}", shape=mask_lat_size.shape)
+
+        # 6. Concatenate with latent_condition
         condition = jnp.concatenate([mask_lat_size, latent_condition], axis=-1)
         jax.debug.print("condition shape: {shape}, channel dim: {c}",
                         shape=condition.shape,
                         c=condition.shape[-1])
         jax.debug.print("condition stats: mask_mean={mm}, latent_mean={lm}",
-                        mm=jnp.mean(condition[..., 0]),
-                        lm=jnp.mean(condition[..., 1:]))
+                        mm=jnp.mean(condition[..., :self.vae_scale_factor_temporal]),
+                        lm=jnp.mean(condition[..., self.vae_scale_factor_temporal:]))
 
-        return latents, condition, None
+        first_frame_mask = mask_lat_size[:, 0:1, :, :, :] # (B, 1, H_l, W_l, 4)
+        return latents, condition, first_frame_mask
 
 
   def __call__(