Skip to content

Add modular pipeline for HunyuanVideo 1.5 #13389

Open
akshan-main wants to merge 14 commits into huggingface:main from
akshan-main:modular-hunyuan1.5
Open

Add modular pipeline for HunyuanVideo 1.5 #13389
akshan-main wants to merge 14 commits into huggingface:main from
akshan-main:modular-hunyuan1.5

Conversation

@akshan-main
Copy link
Copy Markdown

@akshan-main akshan-main commented Apr 2, 2026

What does this PR do?

Adds modular pipeline blocks for HunyuanVideo 1.5 with both text-to-video (HunyuanVideo15Blocks) and image-to-video (HunyuanVideo15Image2VideoBlocks).

Parity verified on Colab G4 GPU:

  • T2V: MAD (mean absolute difference) 0.000000 vs HunyuanVideo15Pipeline
hv15_t2v_standard.mp4
hv15_t2v_modular.mp4
T2V reproduction code
import gc
import numpy as np
import torch
from diffusers import (
    HunyuanVideo15Pipeline,
    HunyuanVideo15ImageToVideoPipeline,
    HunyuanVideo15Blocks,
    HunyuanVideo15ModularPipeline,
)
from diffusers.utils import load_image, export_to_video

# Device and dtype used for every pipeline loaded in this script.
device = "cuda"
dtype = torch.bfloat16

# Model identifiers passed to `from_pretrained` / `init_pipeline` for the
# text-to-video (T2V) and image-to-video (I2V) runs respectively.
T2V_ID = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v"
I2V_ID = "hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v"

def to_np(x):
    """Normalize a pipeline result so it can be compared with NumPy.

    If ``x`` exposes a ``frames`` attribute, that attribute is used in place
    of ``x``. A ``torch.Tensor`` is cast to float, moved to the CPU and
    converted to a NumPy array; a ``list`` is wrapped with ``np.array``.
    Any other value is returned unchanged.
    """
    payload = getattr(x, "frames", x)
    if isinstance(payload, torch.Tensor):
        return payload.float().cpu().numpy()
    if isinstance(payload, list):
        return np.array(payload)
    return payload
prompt = "A cinematic drone shot over snowy mountains at sunrise."

# ---------------------------------------------------------------------------
# Reference run: standard text-to-video pipeline.
# ---------------------------------------------------------------------------
print("=== Standard T2V ===")

ref_pipe = HunyuanVideo15Pipeline.from_pretrained(T2V_ID, torch_dtype=dtype)
ref_pipe = ref_pipe.to(device)
g = torch.Generator(device=device).manual_seed(1234)
ref_out = ref_pipe(
    prompt=prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output_type="np",
).frames
print(f"Shape: {np.array(ref_out).shape}")
export_to_video(ref_out[0], "/content/hv15_t2v_standard.mp4", fps=24)

# Drop the reference pipeline and clear the CUDA cache before the modular
# pipeline is loaded.
del ref_pipe
gc.collect()
torch.cuda.empty_cache()

# ---------------------------------------------------------------------------
# Modular run: same prompt, frame count, step count and seed as above.
# ---------------------------------------------------------------------------
print("\n=== Modular T2V ===")
blocks = HunyuanVideo15Blocks()
pipe = blocks.init_pipeline(T2V_ID)
pipe.load_components(torch_dtype=dtype)
pipe.to(device)

# Print the guider configuration the modular pipeline ended up with.
print("Guider type:", type(pipe.guider).__name__)
print("Guider scale:", pipe.guider.guidance_scale)
print("Guider enabled:", pipe.guider._enabled)
print("Guider num_conditions:", pipe.guider.num_conditions)

# Fresh generator with the same seed as the reference run.
g = torch.Generator(device=device).manual_seed(1234)
mod_out = pipe(
    prompt=prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output="videos",
    output_type="np",
)
print(f"Shape: {np.array(mod_out).shape}")
export_to_video(mod_out[0], "/content/hv15_t2v_modular.mp4", fps=24)

# Mean absolute difference between the reference and modular outputs.
diff = np.mean(np.abs(to_np(ref_out).astype(float) - to_np(mod_out).astype(float)))
print(f"\nT2V MAD: {diff:.6f}")

del pipe, blocks
gc.collect()
torch.cuda.empty_cache()
  • I2V: MAD (mean absolute difference) 0.000000 vs HunyuanVideo15ImageToVideoPipeline
hv15_i2v_standard.mp4
hv15_i2v_modular.mp4
I2V reproduction code
from diffusers.modular_pipelines import HunyuanVideo15Blocks, HunyuanVideo15Image2VideoBlocks, HunyuanVideo15ModularPipeline

# Conditioning image and prompt shared by the reference and modular runs.
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
image = load_image(image_url).convert("RGB")
i2v_prompt = "A cat turns its head"

# ---------------------------------------------------------------------------
# Reference run: standard image-to-video pipeline.
# ---------------------------------------------------------------------------
print("=== Standard I2V ===")
ref_pipe = HunyuanVideo15ImageToVideoPipeline.from_pretrained(I2V_ID, torch_dtype=dtype)
ref_pipe = ref_pipe.to(device)
g = torch.Generator(device=device).manual_seed(1234)
ref_out = ref_pipe(
    image=image,
    prompt=i2v_prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output_type="np",
).frames
print(f"Shape: {np.array(ref_out).shape}")
export_to_video(ref_out[0], "/content/hv15_i2v_standard.mp4", fps=24)

# Drop the reference pipeline and clear the CUDA cache before the modular
# pipeline is loaded.
del ref_pipe
gc.collect()
torch.cuda.empty_cache()

# ---------------------------------------------------------------------------
# Modular run: same image, prompt, frame count, step count and seed as above.
# ---------------------------------------------------------------------------
print("\n=== Modular I2V ===")
blocks = HunyuanVideo15Image2VideoBlocks()
pipe = blocks.init_pipeline(I2V_ID)
pipe.load_components(torch_dtype=dtype)
pipe.to(device)

# Fresh generator with the same seed as the reference run.
g = torch.Generator(device=device).manual_seed(1234)
mod_out = pipe(
    image=image,
    prompt=i2v_prompt,
    num_frames=55,
    num_inference_steps=6,
    generator=g,
    output="videos",
    output_type="np",
)
print(f"Shape: {np.array(mod_out).shape}")
export_to_video(mod_out[0], "/content/hv15_i2v_modular.mp4", fps=24)

# Mean absolute difference between the reference and modular outputs.
diff = np.mean(np.abs(to_np(ref_out).astype(float) - to_np(mod_out).astype(float)))
print(f"\nI2V MAD: {diff:.6f}")
print("\n=== Done ===")

Addresses #13295 (HunyuanVideo 1.5 contribution)

Before submitting

Who can review?

@sayakpaul @yiyixuxu @asomoza

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant