From a4b59f851016c3a95dadd0d4d3944790b9953cf8 Mon Sep 17 00:00:00 2001 From: Kazancev Danil <48238046+vaskers5@users.noreply.github.com> Date: Wed, 9 Jul 2025 18:31:53 +0300 Subject: [PATCH 1/3] Implement NAG, DCM and TaylorSeer --- .../lora/wan_multi_lora_inference.py | 32 +++++ fastvideo/v1/configs/pipelines/base.py | 35 +++++ fastvideo/v1/pipelines/stages/denoising.py | 125 +++++++++++++++++- 3 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 examples/inference/lora/wan_multi_lora_inference.py diff --git a/examples/inference/lora/wan_multi_lora_inference.py b/examples/inference/lora/wan_multi_lora_inference.py new file mode 100644 index 000000000..f17d59eee --- /dev/null +++ b/examples/inference/lora/wan_multi_lora_inference.py @@ -0,0 +1,32 @@ +from fastvideo import VideoGenerator + +OUTPUT_PATH = "./multi_lora" + + +def main(): + # Create a generator for WanVideo2.1 I2V + generator = VideoGenerator.from_pretrained( + "Wan-AI/Wan2.1-I2V-14B-480P", + num_gpus=1, + ) + + # Load three LoRA adapters into the pipeline + generator.set_lora_adapter("lora1", "path/to/first_lora") + generator.set_lora_adapter("lora2", "path/to/second_lora") + generator.set_lora_adapter("lora3", "path/to/third_lora") + + # The last call activates lora3. Generate a video with it + prompt = "An astronaut explores a strange new world, cinematic scene" + generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True) + + # Switch to lora1 and generate another video + generator.set_lora_adapter("lora1") + generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True) + + # Switch to lora2 and generate one more video + generator.set_lora_adapter("lora2") + generator.generate_video(prompt, output_path=OUTPUT_PATH, save_video=True) + + +if __name__ == "__main__": + main() diff --git a/fastvideo/v1/configs/pipelines/base.py b/fastvideo/v1/configs/pipelines/base.py index 7ec4027b2..8c04b90e5 100644 --- a/fastvideo/v1/configs/pipelines/base.py +++ b/fastvideo/v1/configs/pipelines/base.py @@ -89,6 +89,12 @@ class PipelineConfig: STA_mode: STA_Mode = STA_Mode.STA_INFERENCE skip_time_steps: int = 15 + # Additional guidance/optimization parameters + skip_layer_guidance: float | None = None # fraction of denoise steps without CFG + use_normalized_attention: bool = False + use_dcm: bool = False + use_taylor_seer: bool = False + # Compilation # enable_torch_compile: bool = False @@ -206,6 +212,35 @@ def add_cli_args(parser: FlexibleArgumentParser, "Bool for applying scheduler scale in set_timesteps, used in stepvideo", ) + parser.add_argument( + f"--{prefix_with_dot}skip-layer-guidance", + type=float, + dest=f"{prefix_with_dot.replace('-', '_')}skip_layer_guidance", + default=PipelineConfig.skip_layer_guidance, + help="Fraction of steps to disable CFG for SkipLayerGuidance", + ) + parser.add_argument( + f"--{prefix_with_dot}use-normalized-attention", + action=StoreBoolean, + dest=f"{prefix_with_dot.replace('-', '_')}use_normalized_attention", + default=PipelineConfig.use_normalized_attention, + help="Enable Normalized Attention Guidance", + ) + parser.add_argument( + f"--{prefix_with_dot}use-dcm", + action=StoreBoolean, + dest=f"{prefix_with_dot.replace('-', '_')}use_dcm", + default=PipelineConfig.use_dcm, + help="Enable Dynamic Convolution Module", + ) + parser.add_argument( + f"--{prefix_with_dot}use-taylor-seer", + action=StoreBoolean, + dest=f"{prefix_with_dot.replace('-', '_')}use_taylor_seer", + default=PipelineConfig.use_taylor_seer, + help="Enable TaylorSeer optimization", + ) + # Add VAE configuration arguments from fastvideo.v1.configs.models.vaes.base import VAEConfig VAEConfig.add_cli_args(parser, prefix=f"{prefix_with_dot}vae-config") diff --git a/fastvideo/v1/pipelines/stages/denoising.py b/fastvideo/v1/pipelines/stages/denoising.py index 46040644e..0634034e2 100644 --- a/fastvideo/v1/pipelines/stages/denoising.py +++ b/fastvideo/v1/pipelines/stages/denoising.py @@ -45,6 +45,99 @@ logger = init_logger(__name__) +def apply_normalized_attention_guidance( + pos: torch.Tensor, + neg: torch.Tensor | None = None, + nag_scale: float = 1.5, + nag_tau: float = 2.5, + nag_alpha: float = 0.125, +) -> torch.Tensor: + """Apply Normalized Attention Guidance (NAG) to noise predictions. + + This implementation follows the formula from the official NAG repository + and operates on the positive and negative noise predictions. + """ + + if neg is None: + flat = pos.flatten(2) + mean = flat.mean(dim=-1, keepdim=True) + var = flat.var(dim=-1, unbiased=False, keepdim=True) + normalized = (flat - mean) / (var + 1e-6).sqrt() + return normalized.view_as(pos) + + pos_flat = pos.flatten(2) + neg_flat = neg.flatten(2) + + guidance = pos_flat * nag_scale - neg_flat * (nag_scale - 1) + norm_pos = pos_flat.norm(p=2, dim=-1, keepdim=True) + norm_guidance = guidance.norm(p=2, dim=-1, keepdim=True) + scale = norm_guidance / (norm_pos + 1e-7) + guidance = guidance * torch.minimum(scale, scale.new_ones(1) * nag_tau) / ( + scale + 1e-7 + ) + + out = guidance * nag_alpha + pos_flat * (1 - nag_alpha) + return out.view_as(pos) + + +_dcm_modules: dict[torch.device, tuple[torch.nn.Conv3d, torch.nn.Conv3d, torch.nn.Conv3d]] = {} + + +def apply_dcm(tensor: torch.Tensor) -> torch.Tensor: + """Apply Dynamic Convolution Module (DCM).""" + global _dcm_modules + conv_offset, conv_weight, conv_gate = _dcm_modules.get(tensor.device, (None, None, None)) + if conv_offset is None: + channels = tensor.size(1) + conv_offset = torch.nn.Conv3d(channels, channels, kernel_size=3, padding=1, bias=False).to( + tensor.device, tensor.dtype + ) + conv_weight = torch.nn.Conv3d(channels, channels, kernel_size=3, padding=1, bias=False).to( + tensor.device, tensor.dtype + ) + conv_gate = torch.nn.Conv3d(channels, channels, kernel_size=3, padding=1).to( + tensor.device, tensor.dtype + ) + _dcm_modules[tensor.device] = (conv_offset, conv_weight, conv_gate) + + offset = conv_offset(tensor) + out = conv_weight(tensor + offset) + gate = torch.sigmoid(conv_gate(tensor)) + return tensor + gate * out + + +_taylor_cache: dict[torch.device, dict[str, Any]] = {} + + +def apply_taylor_seer(tensor: torch.Tensor, step: int, order: int = 2) -> torch.Tensor: + """Apply TaylorSeer optimization using a simple derivative cache.""" + cache = _taylor_cache.setdefault(tensor.device, { + "prev": None, + "prev_diff": None, + "prev_step": None, + }) + + if cache["prev"] is None: + cache["prev"] = tensor.detach() + cache["prev_step"] = step + return tensor + + dt = step - cache["prev_step"] + if dt == 0: + return tensor + + diff = (tensor - cache["prev"]) / dt + result = cache["prev"] + diff * dt + if order >= 2 and cache["prev_diff"] is not None: + second = (diff - cache["prev_diff"]) / dt + result = result + 0.5 * second * dt * dt + + cache["prev"] = tensor.detach() + cache["prev_diff"] = diff + cache["prev_step"] = step + return result + + class DenoisingStage(PipelineStage): """ Stage for running the denoising loop in diffusion pipelines. @@ -83,6 +176,9 @@ def forward( Returns: The batch with denoised latents. """ + # Reset caches for optional optimizations + _taylor_cache.clear() + # Prepare extra step kwargs for scheduler extra_step_kwargs = self.prepare_extra_func_kwargs( self.scheduler.step, @@ -264,8 +360,25 @@ def forward( **neg_cond_kwargs, ) noise_pred_text = noise_pred - noise_pred = noise_pred_uncond + batch.guidance_scale * ( - noise_pred_text - noise_pred_uncond) + if fastvideo_args.pipeline_config.skip_layer_guidance and ( + i / len(timesteps) + < fastvideo_args.pipeline_config.skip_layer_guidance + ): + noise_pred = noise_pred_text + else: + noise_pred = noise_pred_uncond + batch.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + if fastvideo_args.pipeline_config.use_normalized_attention: + noise_pred = apply_normalized_attention_guidance( + noise_pred_text, + noise_pred_uncond, + nag_scale=batch.guidance_scale, + ) + if fastvideo_args.pipeline_config.use_dcm: + noise_pred = apply_dcm(noise_pred) + if fastvideo_args.pipeline_config.use_taylor_seer: + noise_pred = apply_taylor_seer(noise_pred, i) # Apply guidance rescale if needed if batch.guidance_rescale > 0.0: @@ -276,6 +389,14 @@ def forward( guidance_rescale=batch.guidance_rescale, ) + if not batch.do_classifier_free_guidance: + if fastvideo_args.pipeline_config.use_normalized_attention: + noise_pred = apply_normalized_attention_guidance(noise_pred) + if fastvideo_args.pipeline_config.use_dcm: + noise_pred = apply_dcm(noise_pred) + if fastvideo_args.pipeline_config.use_taylor_seer: + noise_pred = apply_taylor_seer(noise_pred, i) + # Compute the previous noisy sample latents = self.scheduler.step(noise_pred, t, From 22307a94bc0f7023dd3e4a97705de6a3d13290ce Mon Sep 17 00:00:00 2001 From: Kazancev Danil <48238046+vaskers5@users.noreply.github.com> Date: Thu, 10 Jul 2025 11:16:57 +0300 Subject: [PATCH 2/3] Add optimized Wan I2V inference example --- .../optimized_wan_i2v_example.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 examples/inference/optimizations/optimized_wan_i2v_example.py diff --git a/examples/inference/optimizations/optimized_wan_i2v_example.py b/examples/inference/optimizations/optimized_wan_i2v_example.py new file mode 100644 index 000000000..93dc68bd8 --- /dev/null +++ b/examples/inference/optimizations/optimized_wan_i2v_example.py @@ -0,0 +1,32 @@ +from fastvideo import VideoGenerator +from fastvideo.v1.configs.sample import SamplingParam + + +OUTPUT_PATH = "./optimized_output" + + +def main(): + """Run WanVideo2.1 I2V pipeline with all optimizations enabled.""" + generator = VideoGenerator.from_pretrained( + "Wan-AI/Wan2.1-I2V-14B-480P", + num_gpus=1, + skip_layer_guidance=0.2, + use_normalized_attention=True, + use_dcm=True, + use_taylor_seer=True, + ) + + sampling = SamplingParam.from_pretrained("Wan-AI/Wan2.1-I2V-14B-480P") + + prompt = "A lone explorer crosses a vast alien desert under twin moons" + generator.generate_video( + prompt, + sampling_param=sampling, + output_path=OUTPUT_PATH, + save_video=True, + ) + + +if __name__ == "__main__": + main() + From 7cd20636bcbcea2680d8def96c092efb8ca05ef7 Mon Sep 17 00:00:00 2001 From: Kazancev Danil <48238046+vaskers5@users.noreply.github.com> Date: Thu, 10 Jul 2025 11:17:04 +0300 Subject: [PATCH 3/3] expose optimization params and integrate in denoising --- .../optimized_wan_i2v_example.py | 4 +++ fastvideo/v1/configs/pipelines/base.py | 32 +++++++++++++++++++ fastvideo/v1/pipelines/stages/denoising.py | 23 ++++++++++--- 3 files changed, 55 insertions(+), 4 deletions(-) diff --git a/examples/inference/optimizations/optimized_wan_i2v_example.py b/examples/inference/optimizations/optimized_wan_i2v_example.py index 93dc68bd8..f029f431f 100644 --- a/examples/inference/optimizations/optimized_wan_i2v_example.py +++ b/examples/inference/optimizations/optimized_wan_i2v_example.py @@ -12,8 +12,12 @@ def main(): num_gpus=1, skip_layer_guidance=0.2, use_normalized_attention=True, + nag_scale=1.5, + nag_tau=2.5, + nag_alpha=0.125, use_dcm=True, use_taylor_seer=True, + taylor_seer_order=2, ) sampling = SamplingParam.from_pretrained("Wan-AI/Wan2.1-I2V-14B-480P") diff --git a/fastvideo/v1/configs/pipelines/base.py b/fastvideo/v1/configs/pipelines/base.py index 8c04b90e5..1b35b069a 100644 --- a/fastvideo/v1/configs/pipelines/base.py +++ b/fastvideo/v1/configs/pipelines/base.py @@ -92,8 +92,12 @@ class PipelineConfig: # Additional guidance/optimization parameters skip_layer_guidance: float | None = None # fraction of denoise steps without CFG use_normalized_attention: bool = False + nag_scale: float = 1.5 + nag_tau: float = 2.5 + nag_alpha: float = 0.125 use_dcm: bool = False use_taylor_seer: bool = False + taylor_seer_order: int = 2 # Compilation # enable_torch_compile: bool = False @@ -226,6 +230,27 @@ def add_cli_args(parser: FlexibleArgumentParser, default=PipelineConfig.use_normalized_attention, help="Enable Normalized Attention Guidance", ) + parser.add_argument( + f"--{prefix_with_dot}nag-scale", + type=float, + dest=f"{prefix_with_dot.replace('-', '_')}nag_scale", + default=PipelineConfig.nag_scale, + help="Scale for Normalized Attention Guidance", + ) + parser.add_argument( + f"--{prefix_with_dot}nag-tau", + type=float, + dest=f"{prefix_with_dot.replace('-', '_')}nag_tau", + default=PipelineConfig.nag_tau, + help="Tau parameter for Normalized Attention Guidance", + ) + parser.add_argument( + f"--{prefix_with_dot}nag-alpha", + type=float, + dest=f"{prefix_with_dot.replace('-', '_')}nag_alpha", + default=PipelineConfig.nag_alpha, + help="Alpha parameter for Normalized Attention Guidance", + ) parser.add_argument( f"--{prefix_with_dot}use-dcm", action=StoreBoolean, @@ -240,6 +265,13 @@ def add_cli_args(parser: FlexibleArgumentParser, default=PipelineConfig.use_taylor_seer, help="Enable TaylorSeer optimization", ) + parser.add_argument( + f"--{prefix_with_dot}taylor-seer-order", + type=int, + dest=f"{prefix_with_dot.replace('-', '_')}taylor_seer_order", + default=PipelineConfig.taylor_seer_order, + help="Derivative order for TaylorSeer optimization", + ) # Add VAE configuration arguments from fastvideo.v1.configs.models.vaes.base import VAEConfig diff --git a/fastvideo/v1/pipelines/stages/denoising.py b/fastvideo/v1/pipelines/stages/denoising.py index 0634034e2..e5f0d9b46 100644 --- a/fastvideo/v1/pipelines/stages/denoising.py +++ b/fastvideo/v1/pipelines/stages/denoising.py @@ -373,12 +373,18 @@ def forward( noise_pred = apply_normalized_attention_guidance( noise_pred_text, noise_pred_uncond, - nag_scale=batch.guidance_scale, + nag_scale=fastvideo_args.pipeline_config.nag_scale * batch.guidance_scale, + nag_tau=fastvideo_args.pipeline_config.nag_tau, + nag_alpha=fastvideo_args.pipeline_config.nag_alpha, ) if fastvideo_args.pipeline_config.use_dcm: noise_pred = apply_dcm(noise_pred) if fastvideo_args.pipeline_config.use_taylor_seer: - noise_pred = apply_taylor_seer(noise_pred, i) + noise_pred = apply_taylor_seer( + noise_pred, + i, + order=fastvideo_args.pipeline_config.taylor_seer_order, + ) # Apply guidance rescale if needed if batch.guidance_rescale > 0.0: @@ -391,11 +397,20 @@ def forward( if not batch.do_classifier_free_guidance: if fastvideo_args.pipeline_config.use_normalized_attention: - noise_pred = apply_normalized_attention_guidance(noise_pred) + noise_pred = apply_normalized_attention_guidance( + noise_pred, + nag_scale=fastvideo_args.pipeline_config.nag_scale, + nag_tau=fastvideo_args.pipeline_config.nag_tau, + nag_alpha=fastvideo_args.pipeline_config.nag_alpha, + ) if fastvideo_args.pipeline_config.use_dcm: noise_pred = apply_dcm(noise_pred) if fastvideo_args.pipeline_config.use_taylor_seer: - noise_pred = apply_taylor_seer(noise_pred, i) + noise_pred = apply_taylor_seer( + noise_pred, + i, + order=fastvideo_args.pipeline_config.taylor_seer_order, + ) # Compute the previous noisy sample latents = self.scheduler.step(noise_pred,