From c3bb59e4a6e3468a0214ba5b2b345e43b609294b Mon Sep 17 00:00:00 2001
From: junq <22017000+QiJune@users.noreply.github.com>
Date: Thu, 3 Jul 2025 11:14:18 +0800
Subject: [PATCH 1/2] always padding before allgather in deepseek

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_deepseekv3.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index d0b00450f3b..b7ae4220cd7 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -52,8 +52,7 @@
 from ..modules.attention import MLA
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
-from ..modules.fused_moe import (CutlassFusedMoE, DeepSeekV3MoeRoutingMethod,
-                                 create_moe)
+from ..modules.fused_moe import DeepSeekV3MoeRoutingMethod, create_moe
 from ..modules.gated_mlp import GatedMLP
 from ..modules.linear import Linear, TensorParallelMode, WeightsLoadingConfig
 from ..modules.multi_stream_utils import maybe_execute_in_parallel
@@ -504,6 +503,10 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
+            max_num_token = max(all_rank_num_tokens)
+            hidden_states = torch.nn.functional.pad(
+                hidden_states,
+                (0, 0, 0, max_num_token - hidden_states.shape[0]))
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
             if disable_fp4_allgather() and not self.experts.enable_alltoall:
@@ -511,14 +514,6 @@
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
                                           sizes=all_rank_num_tokens)
-            elif not isinstance(self.experts, CutlassFusedMoE) or (
-                    not self.experts.has_fp8_qdq and self.experts.has_nvfp4):
-                # Use padding when not using the cutlass path or when x_sf in self.experts is not None
-                use_dp_padding = True
-                max_num_token = max(all_rank_num_tokens)
-                hidden_states = torch.nn.functional.pad(
-                    hidden_states,
-                    (0, 0, 0, max_num_token - hidden_states.shape[0]))
 
         router_logits = self.gate(hidden_states)

From e30a4d185dd02c97ce8ba2459e974fc071c82259 Mon Sep 17 00:00:00 2001
From: junq <22017000+QiJune@users.noreply.github.com>
Date: Thu, 3 Jul 2025 11:35:50 +0800
Subject: [PATCH 2/2] fix

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_deepseekv3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index b7ae4220cd7..4c5f6ae3dc0 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -501,7 +501,6 @@ def _compute_shared_expert_tp_size(self, intermediate_size: int,
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
                               all_rank_num_tokens, do_finalize):
         # max-throughput
-        use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
             max_num_token = max(all_rank_num_tokens)
             hidden_states = torch.nn.functional.pad(
                 hidden_states,
                 (0, 0, 0, max_num_token - hidden_states.shape[0]))
@@ -523,7 +522,7 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
             do_finalize=do_finalize,
             output_dtype=hidden_states.dtype,
             all_rank_num_tokens=all_rank_num_tokens,
-            use_dp_padding=use_dp_padding,
+            use_dp_padding=True,
         )
 
         return routed_output
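
Reviewer note (not part of the patches): the series makes the data-parallel padding unconditional. Whenever attention DP is on and tp_size > 1, every rank first pads hidden_states up to max(all_rank_num_tokens), and the fused-MoE call then receives use_dp_padding=True instead of a per-quantization-path use_dp_padding flag. Below is a minimal standalone sketch of that padding step; the helper name pad_to_max_rank_tokens and the toy shapes are illustrative assumptions, only torch.nn.functional.pad and the (0, 0, 0, max_num_token - hidden_states.shape[0]) padding spec come from the diff.

    # Standalone sketch of the padding step made unconditional by this series.
    # Assumes plain PyTorch; pad_to_max_rank_tokens is a hypothetical helper,
    # not a function in tensorrt_llm.
    import torch
    import torch.nn.functional as F

    def pad_to_max_rank_tokens(hidden_states: torch.Tensor,
                               all_rank_num_tokens) -> torch.Tensor:
        # Pad the token dimension of a [num_tokens, hidden_size] tensor up to
        # the largest token count across data-parallel ranks, so every rank
        # contributes a uniformly shaped tensor to the subsequent allgather.
        max_num_token = max(all_rank_num_tokens)
        # F.pad consumes pairs from the last dim backwards: (0, 0) leaves the
        # hidden dim alone, (0, max_num_token - num_tokens) appends zero rows.
        return F.pad(hidden_states,
                     (0, 0, 0, max_num_token - hidden_states.shape[0]))

    # Example: this rank holds 3 tokens while the busiest rank holds 5,
    # so two zero rows are appended.
    hidden_states = torch.randn(3, 8)
    padded = pad_to_max_rank_tokens(hidden_states, all_rank_num_tokens=[3, 5, 4])
    assert padded.shape == (5, 8)

Because the padding now always happens on this path, the second patch can drop the local use_dp_padding variable entirely and pass use_dp_padding=True straight to self.experts.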