4 changes: 0 additions & 4 deletions docs/source/en/api/pipelines/ltx2.md
@@ -106,8 +106,6 @@ video, audio = pipe(
    output_type="np",
    return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
@@ -185,8 +183,6 @@ video, audio = pipe(
    output_type="np",
    return_dict=False,
)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

encode_video(
    video[0],
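
With this change, `encode_video` performs the uint8 conversion itself when it receives the pipeline's float `[0, 1]` numpy output, so the two removed lines are no longer needed. A minimal editorial sketch of the new behavior (the synthetic video, fps, and output path are assumptions, and it assumes the optional audio arguments accept None as their type hints suggest):

import numpy as np

from diffusers.pipelines.ltx2.export_utils import encode_video

# A tiny synthetic "video": 8 frames of 64x64 RGB noise as floats in [0, 1],
# mimicking what a pipeline returns with output_type="np".
video = np.random.rand(8, 64, 64, 3).astype(np.float32)

# No manual (video * 255).round().astype("uint8") / torch.from_numpy step anymore:
# encode_video detects the [0, 1] float array and converts it internally.
encode_video(video, fps=24, audio=None, audio_sample_rate=None, output_path="demo.mp4")
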
69 changes: 61 additions & 8 deletions src/diffusers/pipelines/ltx2/export_utils.py
@@ -13,10 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Generator, Iterator
from fractions import Fraction
from typing import Optional
from typing import List, Optional, Tuple, Union

import numpy as np
import PIL.Image
import torch
from tqdm import tqdm

from ...utils import is_av_available

@@ -101,11 +105,52 @@ def _write_audio(


def encode_video(
    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
    video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
    fps: int,
    audio: Optional[torch.Tensor],
    audio_sample_rate: Optional[int],
    output_path: str,
    video_chunks_number: int = 1,
) -> None:
    video_np = video.cpu().numpy()
    """
    Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
    https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182

    Args:
        video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor` or `Iterator[torch.Tensor]`):
            A video of shape [frames, height, width, channels] with integer pixel values in [0, 255], given as a
            tensor, a list of PIL images, or an iterator over such tensors. If the input is a `np.ndarray`, it is
            expected to be a float array with values in [0, 1] (which is what pipelines usually return with
            `output_type="np"`).
        fps (`int`):
            The frames per second (FPS) of the encoded video.
        audio (`torch.Tensor`, *optional*):
            An audio waveform of shape [audio_channels, samples].
        audio_sample_rate (`int`, *optional*):
            The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
        output_path (`str`):
            The path to save the encoded video to.
        video_chunks_number (`int`, *optional*, defaults to `1`):
Member: When is this option helpful?

Collaborator Author: The original LTX-2 code will use a video_chunks_number calculated from the video VAE tiling config, for example in two-stage inference:

https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py#L257

For the default num_frames value of 121 and default tiling config TilingConfig.default(), I believe this works out to 3 chunks. The idea seems to be that the chunks correspond to each tiled stride when decoding.

In practice, I haven't had any issues with the current code, which is equivalent to just using one chunk. I don't fully understand the reasoning behind why the original code supports it; my guess is that it is useful for very long videos or if there are compute constraints.
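
For illustration, a sketch of the streaming case that video_chunks_number is aimed at: passing encode_video an iterator that yields one decoded chunk at a time. This is an editorial example, not part of the PR; the decode_tile_by_tile generator, shapes, fps, and output path are hypothetical, and it assumes the audio arguments accept None, as their Optional type hints suggest.

import torch

from diffusers.pipelines.ltx2.export_utils import encode_video


def decode_tile_by_tile(latents: torch.Tensor, num_chunks: int):
    # Hypothetical stand-in for a tiled VAE decode that yields one decoded temporal tile
    # at a time as a uint8 tensor of shape [frames, height, width, channels].
    for latent_tile in latents.chunk(num_chunks, dim=0):
        yield (torch.rand(latent_tile.shape[0] * 8, 64, 64, 3) * 255).to(torch.uint8)


num_chunks = 3  # the value the author reports for 121 frames with the default tiling config
encode_video(
    decode_tile_by_tile(torch.randn(16, 4, 8, 8), num_chunks),  # fake latents, chunked along the frame axis
    fps=24,
    audio=None,
    audio_sample_rate=None,
    output_path="chunked_output.mp4",
    video_chunks_number=num_chunks,  # lets tqdm report progress per decoded tile
)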

Collaborator Author: See #13057 (comment) for discussion about some complications for supporting video_chunks_number.

            The number of chunks to split the video into for encoding. Each chunk will be encoded separately. The
            number of chunks to use often depends on the tiling config for the video VAE.
    """
    if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
        # Pipeline output_type="pil"
        video_frames = [np.array(frame) for frame in video]
        video = np.stack(video_frames, axis=0)
        video = torch.from_numpy(video)
    elif isinstance(video, np.ndarray):
        # Pipeline output_type="np"
        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
        if np.all(is_denormalized):
            video = (video * 255).round().astype("uint8")
        video = torch.from_numpy(video)

    if isinstance(video, torch.Tensor):
        video = iter([video])

    first_chunk = next(video)

    _, height, width, _ = video_np.shape
    _, height, width, _ = first_chunk.shape

    container = av.open(output_path, mode="w")
    stream = container.add_stream("libx264", rate=int(fps))
@@ -119,10 +164,18 @@ def encode_video(

    audio_stream = _prepare_audio_stream(container, audio_sample_rate)

    for frame_array in video_np:
        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
        for packet in stream.encode(frame):
            container.mux(packet)
    def all_tiles(
        first_chunk: torch.Tensor, tiles_generator: Generator[Tuple[torch.Tensor, int], None, None]
    ) -> Generator[Tuple[torch.Tensor, int], None, None]:
        yield first_chunk
        yield from tiles_generator

    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
Member: WDYT of getting rid of all_tiles() and doing it like so?

from itertools import chain

for video_chunk in tqdm(chain([first_chunk], video), total=video_chunks_number):
    video_chunk_cpu = video_chunk.to("cpu").numpy()
    for frame_array in video_chunk_cpu:
        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
        for packet in stream.encode(frame):
            container.mux(packet)

Collaborator Author: This does the right thing but appears not to work well with tqdm, which doesn't update properly from the chain object:

 33%|████████████████████████████████▋                                                                 | 1/3 [00:04<00:09,  4.57s/it]

Member: Okay!

Collaborator Author (dg845, Jan 30, 2026): Actually, I think #13057 (comment) is wrong. Since we generally supply a single torch.Tensor to encode_video (for example, from a pipeline output), this line creates an iterator with one element:

if isinstance(video, torch.Tensor):
    video = iter([video])

first_chunk = next(video)

So when we call next(video) on the following line, the video iterator is exhausted. Even if we set video_chunks_number > 1 in this case, our for loop through first_chunk and video will only yield one element in total, whether that's using all_tiles or chain. Thus, the progress bar will end up being wrong, since we tell tqdm that we have video_chunks_number > 1 elements when we in fact only have one.

I think the underlying difference is that the original LTX 2 code returns an iterator over decoded tiles when performing tiled VAE decoding, whereas we return the whole decoded output as a single tensor with the tiles stitched back together. So maybe it doesn't make sense to support video_chunks_number, as this will only work well when we supply an Iterator[torch.Tensor] to encode_video (in the current implementation).
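
A minimal illustration of the exhaustion described above (the tensor shape is arbitrary; this is an editorial sketch, not part of the PR):

import torch

video = torch.zeros(8, 64, 64, 3, dtype=torch.uint8)  # a single decoded video tensor
it = iter([video])                # what encode_video does for a plain tensor input
first_chunk = next(it)            # consumes the only element
remaining_chunks = list(it)       # [] -- the iterator is already exhausted
print(1 + len(remaining_chunks))  # prints 1, regardless of video_chunks_number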

        video_chunk_cpu = video_chunk.to("cpu").numpy()
        for frame_array in video_chunk_cpu:
            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
Member: Should we let the users control this format? 👀

Collaborator Author (dg845, Jan 30, 2026): I think we could allow the users to specify the format, but this would be in tension with value checking as suggested in #13057 (comment): for example, if we always convert denormalized inputs with values in $[0, 1]$ to uint8 values in $\{0, 1, \ldots, 255\}$, that would probably make it difficult to support a variety of formats.

We could conditionally convert based on the supplied video_format, but my understanding is that there are a lot of video formats, and I don't think we can anticipate all of the use cases that users may have. So I think we could support a video_format argument with a "use at your own risk" caveat:

    elif isinstance(video, np.ndarray):
        # Pipeline output_type="np"
        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
        if np.all(is_denormalized) and video_format == "rgb24":
            video = (video * 255).round().astype("uint8")
        else:
            logger.warning(
                f"The video will be encoded using the input `video` values as-is with format {video_format}. Make sure"
                " the values are in the proper range for the supplied format."
            )
        video = torch.from_numpy(video)

An alternative would be to only support "rgb24" as the original LTX-2 code does, with the idea that power users can use their own video encoding code if they have a different use case.

EDIT: the right terminology here might be "pixel format" rather than "video format".

Member (quoting the above): An alternative would be to only support "rgb24" as the original LTX-2 code does with the idea that power users can use their own video encoding code if they have a different use case.

Okay, let's go with this.

            for packet in stream.encode(frame):
                container.mux(packet)

    # Flush encoder
    for packet in stream.encode():
2 changes: 0 additions & 2 deletions src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -69,8 +69,6 @@
...     output_type="np",
...     return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
...     video[0],
2 changes: 0 additions & 2 deletions src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
@@ -75,8 +75,6 @@
...     output_type="np",
...     return_dict=False,
... )
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
...     video[0],
2 changes: 0 additions & 2 deletions src/diffusers/pipelines/ltx2/pipeline_ltx2_latent_upsample.py
@@ -76,8 +76,6 @@
...     output_type="np",
...     return_dict=False,
... )[0]
>>> video = (video * 255).round().astype("uint8")
>>> video = torch.from_numpy(video)

>>> encode_video(
...     video[0],