[Feature] Add video-to-video (V2V) pipeline #829
Changes from 5 commits
@@ -0,0 +1,36 @@
+from fastvideo import VideoGenerator
+
+# from fastvideo.configs.sample import SamplingParam
+
+OUTPUT_PATH = "video_samples_wan2_1_Fun"
+OUTPUT_NAME = "wan2.1_test"
+
+
+def main():
+    # FastVideo will automatically use the optimal default arguments for the
+    # model.
+    # If a local path is provided, FastVideo will make a best-effort
+    # attempt to identify the optimal arguments.
+    generator = VideoGenerator.from_pretrained(
+        "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers",
+        # "alibaba-pai/Wan2.2-Fun-A14B-Control",
+        # FastVideo will automatically handle the distributed setup
+        num_gpus=1,
+        use_fsdp_inference=True,
+        dit_cpu_offload=True,  # the DiT needs to be offloaded for MoE
+        vae_cpu_offload=False,
+        text_encoder_cpu_offload=True,
+        # Set pin_cpu_memory to False if CPU RAM is limited and there are no
+        # frequent CPU-GPU transfers
+        pin_cpu_memory=True,
+        # image_encoder_cpu_offload=False,
+    )
+
+    prompt = "A young woman wearing a pink dress with white trim and pink buttons. Her hair is purple, and she wears a large red bow on her head, looking very cute and delicate. She also wears a red bow tie, and the overall look is full of girlish charm and vitality. Her expression is gentle, with her hands lightly crossed in front of her in an elegant pose. The background is plain gray with no extra decoration, making the subject stand out. Her makeup is light and natural, highlighting her fresh temperament. The overall picture feels sweet and dreamy, as if set in a fairy-tale world."
+    negative_prompt = "Garish colors, overexposed, static, blurry details, subtitles, style, artwork, painting, picture, still, overall grayish, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fused fingers, motionless frame, cluttered background, three legs, crowded background, walking backwards"
+    # prompt = "A young woman with beautiful, clear eyes and blonde hair stands in the forest, wearing a white dress and a crown. Her expression is serene, reminiscent of a movie star, with fair and youthful skin. Her brown long hair flows in the wind. The video quality is very high, with a clear view. High quality, masterpiece, best quality, high resolution, ultra-fine, fantastical."
+    # negative_prompt = "Twisted body, limb deformities, text captions, comic, static, ugly, error, messy code."
+    image_path = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/8.png"
+    control_video_path = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4"
+
+    video = generator.generate_video(prompt,
+                                     negative_prompt=negative_prompt,
+                                     image_path=image_path,
+                                     video_path=control_video_path,
+                                     output_path=OUTPUT_PATH,
+                                     output_video_name=OUTPUT_NAME,
+                                     save_video=True)
+
+
+if __name__ == "__main__":
+    main()
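For reference, the commented-out `SamplingParam` import is how per-call sampling settings are usually overridden in FastVideo examples. A minimal sketch, assuming `SamplingParam.from_pretrained` and the `sampling_param` keyword behave here as in the rest of the library (the field names below are assumptions, not part of this PR):

```python
from fastvideo import VideoGenerator
from fastvideo.configs.sample import SamplingParam

# Start from the model's default sampling configuration, then override fields.
params = SamplingParam.from_pretrained("IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers")
params.num_frames = 81        # assumed field: number of output frames
params.guidance_scale = 6.0   # assumed field: classifier-free guidance strength

generator = VideoGenerator.from_pretrained(
    "IRMChen/Wan2.1-Fun-1.3B-Control-Diffusers", num_gpus=1)
generator.generate_video("A young woman in a pink dress with a red bow.",
                         sampling_param=params,
                         output_path="video_samples_wan2_1_Fun",
                         save_video=True)
```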
@@ -3,6 +3,7 @@
 import os
 import tempfile
 from collections.abc import Callable
+from typing import Any
 from urllib.parse import unquote, urlparse

 import imageio
@@ -11,6 +12,8 @@
 import PIL.ImageOps
 import requests
 import torch
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
 from packaging import version

 if version.parse(version.parse(
@@ -136,7 +139,7 @@ def load_video(
     video: str,
     convert_method: Callable[[list[PIL.Image.Image]], list[PIL.Image.Image]]
    | None = None,
-) -> list[PIL.Image.Image]:
+) -> tuple[list[Any], float | Any]:
     """
     Loads `video` to a list of PIL Image.
     Args:
@@ -175,39 +178,57 @@ def load_video(
         video_data = response.iter_content(chunk_size=8192)
         for chunk in video_data:
             temp_file.write(chunk)
-        video = video_path
-
-    pil_images = []
-    if video.endswith(".gif"):
-        gif = PIL.Image.open(video)
-        try:
-            while True:
-                pil_images.append(gif.copy())
-                gif.seek(gif.tell() + 1)
-        except EOFError:
-            pass
-    else:
-        try:
-            imageio.plugins.ffmpeg.get_exe()
-        except AttributeError:
-            raise AttributeError(
-                "Unable to find an ffmpeg installation on your machine. Please install via `pip install imageio-ffmpeg`"
-            ) from None
-        video_path = video
-
-        with imageio.get_reader(video) as reader:
-            # Read all frames
-            for frame in reader:
-                pil_images.append(PIL.Image.fromarray(frame))
-
-    if was_tempfile_created:
-        os.remove(video_path)
+        was_tempfile_created = True
+
+    pil_images = []
+    original_fps = None
+
+    try:
+        if video_path.endswith(".gif"):
+            gif = PIL.Image.open(video_path)
+            try:
+                # GIF FPS estimation: `duration` is the per-frame delay in ms
+                if hasattr(gif, 'info') and 'duration' in gif.info:
+                    duration_ms = gif.info['duration']
+                    if duration_ms > 0:
+                        original_fps = 1000.0 / duration_ms
+
+                while True:
+                    pil_images.append(gif.copy())
+                    gif.seek(gif.tell() + 1)
+            except EOFError:
+                pass
+        else:
+            try:
+                imageio.plugins.ffmpeg.get_exe()
+            except AttributeError:
+                raise AttributeError(
+                    "Unable to find an ffmpeg installation on your machine. Please install via `pip install imageio-ffmpeg`"
+                ) from None
+
+            with imageio.get_reader(video_path) as reader:
+                try:
+                    original_fps = reader.get_meta_data().get('fps', None)
+                except Exception:
+                    # Fallback: try to get from format-specific metadata
+                    try:
+                        original_fps = reader.get_meta_data().get('source_size', {}).get('fps', None)
+                    except Exception:
+                        pass
+
+                for frame in reader:
+                    pil_images.append(PIL.Image.fromarray(frame))
+    finally:
+        # Clean up the temporary file if one was created
+        if was_tempfile_created and os.path.exists(video_path):
+            os.remove(video_path)
Review comment on lines 264 to 275: I know the original code was not the best, but could you clean this up and remove all of these try except blocks?

Reply: refactored
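For context, one way the nested fallbacks could be flattened into a single helper (a sketch of the requested cleanup, not necessarily the refactor that landed; `_get_reader_fps` is a hypothetical name):

```python
def _get_reader_fps(reader) -> float | None:
    """Best-effort FPS lookup from imageio metadata, without nested try/except."""
    try:
        meta = reader.get_meta_data()
    except Exception:
        return None
    fps = meta.get('fps')
    if fps is None:
        # Mirrors the PR's format-specific fallback path.
        source = meta.get('source_size')
        fps = source.get('fps') if isinstance(source, dict) else None
    return fps
```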
     if convert_method is not None:
         pil_images = convert_method(pil_images)

-    return pil_images
+    return pil_images, original_fps


 def get_default_height_width(
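Since `load_video` now returns a tuple, call sites unpack both values. A minimal usage sketch (the fallback value is an assumption, not taken from this PR):

```python
frames, original_fps = load_video(
    "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/wan_fun/asset_Wan2_2/v1.0/pose.mp4")
fps = original_fps or 16.0  # assumed fallback when the container has no FPS metadata
print(f"Loaded {len(frames)} frames at {fps} fps")
```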
@@ -297,3 +318,53 @@ def resize(
     else:
         raise ValueError(f"resize_mode {resize_mode} is not supported")
     return image
+
+
+def create_default_image(width: int = 512,
+                         height: int = 512,
+                         color: tuple[int, int, int] = (0, 0, 0)) -> PIL.Image.Image:
+    """
+    Create a default solid-color PIL image (black by default).
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        color: RGB color tuple
+
+    Returns:
+        PIL.Image.Image: A new PIL image with the specified dimensions and color
+    """
+    return PIL.Image.new("RGB", (width, height), color=color)
+
+
+def preprocess_reference_image_for_clip(image: PIL.Image.Image,
+                                        device: torch.device | None = None) -> PIL.Image.Image:
+    """
+    Preprocess a reference image to match the CLIP encoder's requirements.
+
+    Applies normalization, resizing to 224x224, and denormalization to ensure
+    the image is in the correct format for CLIP processing.
+
+    Args:
+        image: Input PIL image
+        device: Target device for tensor operations
+
+    Returns:
+        Preprocessed PIL image ready for the CLIP encoder
+    """
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Convert PIL to tensor and normalize to the [-1, 1] range
+    image_tensor = TF.to_tensor(image).sub_(0.5).div_(0.5).to(device)
+
+    # Resize to CLIP's expected input size (224x224) using bicubic interpolation
+    resized_tensor = F.interpolate(
+        image_tensor.unsqueeze(0),
+        size=(224, 224),
+        mode='bicubic',
+        align_corners=False
+    ).squeeze(0)
+
+    # Denormalize back to the [0, 1] range
+    denormalized_tensor = resized_tensor.mul_(0.5).add_(0.5)
+
+    return TF.to_pil_image(denormalized_tensor)
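A quick sketch of how the two new helpers fit together (standalone snippet; the file path and dimensions are placeholders):

```python
import PIL.Image
import torch

# Placeholder frame for runs without a reference image (e.g. control-only V2V).
placeholder = create_default_image(width=832, height=480)

# Reference image normalized and resized the way the CLIP image encoder expects.
ref = PIL.Image.open("reference.png").convert("RGB")
clip_ready = preprocess_reference_image_for_clip(ref, device=torch.device("cpu"))
assert clip_ready.size == (224, 224)
```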