[Feature] Add video-to-video (V2V) pipeline #829
Changes from 5 commits
@@ -0,0 +1,36 @@
+from fastvideo import VideoGenerator
+
+# from fastvideo.configs.sample import SamplingParam
+
+OUTPUT_PATH = "video_samples_wan2_1_Fun"
+OUTPUT_NAME = "wan2.1_test"
+
+
+def main():
+    # FastVideo will automatically use the optimal default arguments for the
+    # model.
+    # If a local path is provided, FastVideo will make a best-effort
+    # attempt to identify the optimal arguments.
+    generator = VideoGenerator.from_pretrained(
+        "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers",
+        # "alibaba-pai/Wan2.2-Fun-A14B-Control",
+        # FastVideo will automatically handle the distributed setup
+        num_gpus=1,
+        use_fsdp_inference=True,
+        dit_cpu_offload=True,  # the DiT needs to be offloaded for MoE
+        vae_cpu_offload=False,
+        text_encoder_cpu_offload=True,
+        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
+        # frequent CPU-GPU transfers
+        pin_cpu_memory=True,
+        # image_encoder_cpu_offload=False,
+    )
+
+    prompt = "A young woman wearing a pink dress with white trim and pink buttons. Her hair is purple, and she wears a large red bow on her head, looking very cute and delicate. She also wears a red bow tie, and the overall look is full of girlish charm and vitality. Her expression is gentle, with her hands lightly crossed in front of her in an elegant pose. The background is plain gray with no extra decoration, making the subject stand out. Her makeup is light and natural, highlighting her fresh temperament. The overall picture feels sweet and dreamy, as if set in a fairy-tale world."
+    negative_prompt = "Garish colors, overexposed, static, blurry details, subtitles, style, artwork, painting, picture, still, overall grayish, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, motionless frame, cluttered background, three legs, crowded background, walking backwards"
+    # prompt = "A young woman with beautiful, clear eyes and blonde hair stands in the forest, wearing a white dress and a crown. Her expression is serene, reminiscent of a movie star, with fair and youthful skin. Her brown long hair flows in the wind. The video quality is very high, with a clear view. High quality, masterpiece, best quality, high resolution, ultra-fine, fantastical."
+    # negative_prompt = "Twisted body, limb deformities, text captions, comic, static, ugly, error, messy code."
+    image_path = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/8.png"
+    control_video_path = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4"
+
+    video = generator.generate_video(prompt,
+                                     negative_prompt=negative_prompt,
+                                     image_path=image_path,
+                                     video_path=control_video_path,
+                                     output_path=OUTPUT_PATH,
+                                     output_video_name=OUTPUT_NAME,
+                                     save_video=True)
+
+
+if __name__ == "__main__":
+    main()
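For reference, the commented-out `SamplingParam` import is how per-call sampling settings are usually overridden in FastVideo examples. A minimal sketch, assuming `SamplingParam.from_pretrained` and the `sampling_param` keyword behave here as in the rest of the library (the field names below are assumptions, not part of this PR):

```python
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam

# Start from the model's default sampling configuration, then override fields.
params = SamplingParam.from_pretrained("IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers")
params.num_frames = 81        # assumed field: number of output frames
params.guidance_scale = 6.0   # assumed field: classifier-free guidance strength

generator = VideoGenerator.from_pretrained(
    "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers", num_gpus=1)
generator.generate_video("A young woman in a pink dress with a red bow.",
                         sampling_param=params,
                         output_path="video_samples_wan2_1_Fun",
                         save_video=True)
```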
@@ -3,6 +3,7 @@
 import os
 import tempfile
 from collections.abc import Callable
+from typing import Any
 from urllib.parse import unquote, urlparse

 import imageio
@@ -11,6 +12,8 @@
 import PIL.ImageOps
 import requests
 import torch
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
 from packaging import version

 if version.parse(version.parse(
@@ -136,7 +139,7 @@ def load_video(
     video: str,
     convert_method: Callable[[list[PIL.Image.Image]], list[PIL.Image.Image]]
    | None = None,
-) -> list[PIL.Image.Image]:
+) -> tuple[list[Any], float | Any]:
     """
     Loads `video` to a list of PIL Image.
     Args:
@@ -175,39 +178,57 @@ def load_video(
         video_data = response.iter_content(chunk_size=8192)
         for chunk in video_data:
             temp_file.write(chunk)
-        video = video_path
-
-    pil_images = []
-    if video.endswith(".gif"):
-        gif = PIL.Image.open(video)
-        try:
-            while True:
-                pil_images.append(gif.copy())
-                gif.seek(gif.tell() + 1)
-        except EOFError:
-            pass
-    else:
-        try:
-            imageio.plugins.ffmpeg.get_exe()
-        except AttributeError:
-            raise AttributeError(
-                "Unable to find an ffmpeg installation on your machine. Please install via `pip install imageio-ffmpeg`"
-            ) from None
-        video_path = video
-
-        with imageio.get_reader(video) as reader:
-            # Read all frames
-            for frame in reader:
-                pil_images.append(PIL.Image.fromarray(frame))
-
-    if was_tempfile_created:
-        os.remove(video_path)
+        was_tempfile_created = True
+
+    pil_images = []
+    original_fps = None
+
+    try:
+        if video_path.endswith(".gif"):
+            gif = PIL.Image.open(video_path)
+            try:
+                # GIF FPS estimation: `duration` is the per-frame delay in ms
+                if hasattr(gif, 'info') and 'duration' in gif.info:
+                    duration_ms = gif.info['duration']
+                    if duration_ms > 0:
+                        original_fps = 1000.0 / duration_ms
+
+                while True:
+                    pil_images.append(gif.copy())
+                    gif.seek(gif.tell() + 1)
+            except EOFError:
+                pass
+        else:
+            try:
+                imageio.plugins.ffmpeg.get_exe()
+            except AttributeError:
+                raise AttributeError(
+                    "Unable to find an ffmpeg installation on your machine. Please install via `pip install imageio-ffmpeg`"
+                ) from None
+
+            with imageio.get_reader(video_path) as reader:
+                try:
+                    original_fps = reader.get_meta_data().get('fps', None)
+                except Exception:
+                    # Fallback: try to get from format-specific metadata
+                    try:
+                        original_fps = reader.get_meta_data().get('source_size', {}).get('fps', None)
+                    except Exception:
+                        pass
+
+                for frame in reader:
+                    pil_images.append(PIL.Image.fromarray(frame))
+    finally:
+        # Clean up the temporary file if one was created
+        if was_tempfile_created and os.path.exists(video_path):
+            os.remove(video_path)
Review comment on lines 264 to 275: I know the original code was not the best, but could you clean this up and remove all of these try except blocks?

Reply: refactored
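For context, one way the nested fallbacks could be flattened into a single helper (a sketch of the requested cleanup, not necessarily the refactor that landed; `_get_reader_fps` is a hypothetical name):

```python
def _get_reader_fps(reader) -> float | None:
    """Best-effort FPS lookup from imageio metadata, without nested try/except."""
    try:
        meta = reader.get_meta_data()
    except Exception:
        return None
    fps = meta.get('fps')
    if fps is None:
        # Mirrors the PR's format-specific fallback path.
        source = meta.get('source_size')
        fps = source.get('fps') if isinstance(source, dict) else None
    return fps
```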
     if convert_method is not None:
         pil_images = convert_method(pil_images)

-    return pil_images
+    return pil_images, original_fps


 def get_default_height_width(
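Since `load_video` now returns a tuple, call sites unpack both values. A minimal usage sketch (the fallback value is an assumption, not taken from this PR):

```python
frames, original_fps = load_video(
    "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4")
fps = original_fps or 16.0  # assumed fallback when the container has no FPS metadata
print(f"Loaded {len(frames)} frames at {fps} fps")
```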
@@ -297,3 +318,53 @@ def resize(
     else:
         raise ValueError(f"resize_mode {resize_mode} is not supported")
     return image
+
+
+def create_default_image(width: int = 512,
+                         height: int = 512,
+                         color: tuple[int, int, int] = (0, 0, 0)) -> PIL.Image.Image:
+    """
+    Create a default solid-color PIL image (black by default).
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        color: RGB color tuple
+
+    Returns:
+        PIL.Image.Image: A new PIL image with the specified dimensions and color
+    """
+    return PIL.Image.new("RGB", (width, height), color=color)
+
+
+def preprocess_reference_image_for_clip(image: PIL.Image.Image,
+                                        device: torch.device | None = None) -> PIL.Image.Image:
+    """
+    Preprocess a reference image to match the CLIP encoder's requirements.
+
+    Applies normalization, resizing to 224x224, and denormalization to ensure
+    the image is in the correct format for CLIP processing.
+
+    Args:
+        image: Input PIL image
+        device: Target device for tensor operations
+
+    Returns:
+        Preprocessed PIL image ready for the CLIP encoder
+    """
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Convert PIL to tensor and normalize to the [-1, 1] range
+    image_tensor = TF.to_tensor(image).sub_(0.5).div_(0.5).to(device)
+
+    # Resize to CLIP's expected input size (224x224) using bicubic interpolation
+    resized_tensor = F.interpolate(
+        image_tensor.unsqueeze(0),
+        size=(224, 224),
+        mode='bicubic',
+        align_corners=False
+    ).squeeze(0)
+
+    # Denormalize back to the [0, 1] range
+    denormalized_tensor = resized_tensor.mul_(0.5).add_(0.5)
+
+    return TF.to_pil_image(denormalized_tensor)
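A quick sketch of how the two new helpers fit together (standalone snippet; the file path and dimensions are placeholders):

```python
import PIL.Image
import torch

# Placeholder frame for runs without a reference image (e.g. control-only V2V).
placeholder = create_default_image(width=832, height=480)

# Reference image normalized and resized the way the CLIP image encoder expects.
ref = PIL.Image.open("reference.png").convert("RGB")
clip_ready = preprocess_reference_image_for_clip(ref, device=torch.device("cpu"))
assert clip_ready.size == (224, 224)
```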