
Commit 67ad720

Gossity authored and zhaotianyi committed
fix: code format and fix moe dummy run.
1 parent 39b7067 commit 67ad720

8 files changed, +115 −28 lines changed

xllm/core/layers/common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@ cc_library(
     qwen3_moe_decoder_layer.h
     linear_impl.h
     word_embedding_impl.h
+    layer_utils.h
   SRCS
     qwen3_attention.cpp
     attention.cpp
@@ -26,6 +27,7 @@ cc_library(
     qwen3_decoder_layer.cpp
     qwen3_moe_decoder_layer.cpp
     linear_impl.cpp
+    layer_utils.cpp
   DEPS
     "-Wl,--whole-archive"
     "-Wl,--no-whole-archive"

xllm/core/layers/common/fused_moe.cpp

Lines changed: 2 additions & 15 deletions
@@ -130,28 +130,13 @@ torch::Tensor FusedMoEImpl::forward_expert(
 torch::Tensor FusedMoEImpl::forward(const torch::Tensor& hidden_states,
                                     const ModelInputParams& input_params) {
   auto input = hidden_states;
-  const auto& dp_tokens = input_params.dp_global_token_nums;
-  int dp_rank = 0;
   bool need_slice = false;
   if (parallel_args_.dp_size() > 1 && parallel_args_.ep_size() > 1) {
-    dp_rank = parallel_args_.dp_local_process_group_->rank();
     input = parallel_state::gather(input,
                                    parallel_args_.dp_local_process_group_,
                                    input_params.dp_global_token_nums);
     need_slice = true;
   }
-  // fake run for dp rank with zero tokens
-  if (dp_tokens[dp_rank] == 0) {
-    // If the current dp rank has zero tokens, return an empty tensor
-    input = parallel_state::reduce(input, tp_pg_);
-    if (need_slice) {
-      auto start =
-          std::accumulate(dp_tokens.begin(), dp_tokens.begin() + dp_rank, 0);
-      auto end = start + dp_tokens[dp_rank];
-      return input.slice(0, start, end);
-    }
-    return input;
-  }
 
   pack_params();
   std::optional<torch::Tensor> shared_output = std::nullopt;
@@ -162,6 +147,8 @@ torch::Tensor FusedMoEImpl::forward(const torch::Tensor& hidden_states,
   auto output = forward_expert(input, router_logits, shared_output);
 
   if (need_slice) {
+    const auto& dp_tokens = input_params.dp_global_token_nums;
+    const int dp_rank = parallel_args_.dp_local_process_group_->rank();
     auto start =
         std::accumulate(dp_tokens.begin(), dp_tokens.begin() + dp_rank, 0);
     auto end = start + dp_tokens[dp_rank];
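
Note: the slice bookkeeping retained in forward() is easiest to check with concrete numbers. A standalone sketch — the token counts are made up for illustration; only the std::accumulate prefix sum and the slice bounds come from the code above:

#include <iostream>
#include <numeric>
#include <vector>

// With dp_global_token_nums = {3, 0, 2}, the DP gather concatenates
// 3 + 0 + 2 = 5 rows; each rank then slices its own rows back out.
int main() {
  const std::vector<int> dp_tokens = {3, 0, 2};  // hypothetical counts
  for (int dp_rank = 0; dp_rank < 3; ++dp_rank) {
    const int start =
        std::accumulate(dp_tokens.begin(), dp_tokens.begin() + dp_rank, 0);
    const int end = start + dp_tokens[dp_rank];
    // rank 0 -> [0, 3), rank 1 -> [3, 3) (empty), rank 2 -> [3, 5)
    std::cout << "rank " << dp_rank << ": rows [" << start << ", " << end
              << ")\n";
  }
  return 0;
}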
xllm/core/layers/common/layer_utils.cpp

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "layer_utils.h"
+
+#include "framework/parallel_state/parallel_state.h"
+
+namespace xllm {
+namespace layer {
+
+bool is_dummy_run(const ModelInputParams& input_params,
+                  const ParallelArgs& parallel_args) {
+  int dp_rank = 0;
+  if (parallel_args.dp_size() > 1) {
+    dp_rank = parallel_args.dp_local_process_group_->rank();
+  }
+  return input_params.dp_global_token_nums[dp_rank] == 0;
+}
+
+torch::Tensor dummy_run(torch::Tensor& input,
+                        const ModelInputParams& input_params,
+                        const ParallelArgs& parallel_args) {
+  if (parallel_args.dp_size() <= 1 && parallel_args.ep_size() <= 1) {
+    return input;
+  }
+
+  auto tp_pg = parallel_args.tp_group_;
+  if (parallel_args.ep_size() > 1) {
+    tp_pg = parallel_args.process_group_;
+  }
+  bool need_slice = false;
+  if (parallel_args.dp_size() > 1 && parallel_args.ep_size() > 1) {
+    input = parallel_state::gather(input,
+                                   parallel_args.dp_local_process_group_,
+                                   input_params.dp_global_token_nums);
+    need_slice = true;
+  }
+  if (tp_pg->world_size() > 1) {
+    input = parallel_state::reduce(input, tp_pg);
+  }
+  if (need_slice) {
+    const auto& dp_tokens = input_params.dp_global_token_nums;
+    const int dp_rank = parallel_args.dp_local_process_group_->rank();
+    auto start =
+        std::accumulate(dp_tokens.begin(), dp_tokens.begin() + dp_rank, 0);
+    auto end = start + dp_tokens[dp_rank];
+    input = input.slice(0, start, end);
+  }
+  return input;
+}
+
+}  // namespace layer
+}  // namespace xllm
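
Note: the point of issuing gather/reduce even when this rank holds zero tokens is, presumably, to keep collective calls matched with the ranks that do have tokens. A hypothetical trace of the zero-token rank, assuming dp_global_token_nums = {4, 0}, dp_rank 1, and dp_size > 1 with ep_size > 1 (a local libtorch tensor stands in for the distributed calls):

#include <torch/torch.h>
#include <iostream>

int main() {
  // gather() would hand every rank all 4 + 0 = 4 rows; reduce() then sums
  // partial results across the TP/EP group.
  auto gathered = torch::randn({4, 8});  // stand-in for gather() + reduce()
  // dp_rank 1's prefix sum is 4, so its slice is rows [4, 4):
  auto out = gathered.slice(/*dim=*/0, /*start=*/4, /*end=*/4);
  std::cout << out.size(0) << " rows\n";  // 0 rows: an empty dummy output
  return 0;
}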
xllm/core/layers/common/layer_utils.h

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+#include "framework/model/model_input_params.h"
+#include "framework/parallel_state/parallel_args.h"
+
+namespace xllm {
+namespace layer {
+
+bool is_dummy_run(const ModelInputParams& input_params,
+                  const ParallelArgs& parallel_args);
+
+torch::Tensor dummy_run(torch::Tensor& input,
+                        const ModelInputParams& input_params,
+                        const ParallelArgs& parallel_args);
+
+}  // namespace layer
+}  // namespace xllm
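
Note: taken together, the intended call pattern is roughly the following sketch. SomeDecoderImpl is a hypothetical stand-in; the real call sites are the Qwen3 decoder hunks below.

torch::Tensor SomeDecoderImpl::forward(torch::Tensor& x,
                                       const ModelInputParams& input_params) {
  if (layer::is_dummy_run(input_params, parallel_args_)) {
    // No tokens on this DP rank: skip the layer body, but still issue the
    // matching gather/reduce collectives when dp/ep parallelism is active.
    return layer::dummy_run(x, input_params, parallel_args_);
  }
  // ... normal attention + MLP path ...
  return x;
}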

xllm/core/layers/common/qwen3_decoder_layer.cpp

Lines changed: 8 additions & 1 deletion
@@ -17,10 +17,13 @@ limitations under the License.
 
 #include <glog/logging.h>
 
+#include "layer_utils.h"
+
 namespace xllm {
 namespace layer {
 
-Qwen3DecoderImpl::Qwen3DecoderImpl(const ModelContext& context) {
+Qwen3DecoderImpl::Qwen3DecoderImpl(const ModelContext& context)
+    : parallel_args_(context.get_parallel_args()) {
   const auto& model_args = context.get_model_args();
   const auto& quant_args = context.get_quant_args();
   const auto& parallel_args = context.get_parallel_args();
@@ -65,6 +68,10 @@ torch::Tensor Qwen3DecoderImpl::forward(torch::Tensor& x,
     const AttentionMetadata& attn_metadata,
     KVCache& kv_cache,
     const ModelInputParams& input_params) {
+  bool is_dummy_run = layer::is_dummy_run(input_params, parallel_args_);
+  if (is_dummy_run) {
+    return x;
+  }
   // Pre-attention norm
   auto residual = x;
   x = input_norm_->forward(x);

xllm/core/layers/common/qwen3_decoder_layer.h

Lines changed: 2 additions & 0 deletions
@@ -53,6 +53,8 @@ class Qwen3DecoderImpl : public torch::nn::Module {
   DenseMLP mlp_{nullptr};
   RmsNorm input_norm_{nullptr};
   RmsNorm post_norm_{nullptr};
+
+  ParallelArgs parallel_args_;
 };
 
 }  // namespace layer

xllm/core/layers/common/qwen3_moe_decoder_layer.cpp

Lines changed: 5 additions & 7 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <glog/logging.h>
 
+#include "layer_utils.h"
+
 namespace xllm {
 namespace layer {
 
@@ -95,13 +97,9 @@ torch::Tensor Qwen3MoeDecoderImpl::forward(
     const AttentionMetadata& attn_metadata,
     KVCache& kv_cache,
     const ModelInputParams& input_params) {
-  const auto& dp_rank = parallel_args_.dp_local_process_group_->rank();
-  if (input_params.dp_global_token_nums[dp_rank] == 0) {
-    if (moe_mlp_) {
-      return moe_mlp_(x, input_params);
-    } else {
-      return x;
-    }
+  bool is_dummy_run = layer::is_dummy_run(input_params, parallel_args_);
+  if (is_dummy_run) {
+    return layer::dummy_run(x, input_params, parallel_args_);
   }
   // Pre-attention norm
   torch::Tensor residual = x;
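
Note: read against the commit title, this hunk appears to be the actual "fix moe dummy run": the zero-token ("fake run") handling that previously lived inside FusedMoEImpl::forward is now centralized in layer::dummy_run, so the decoder short-circuits before touching the MoE block instead of detouring through moe_mlp_ (or, when moe_mlp_ was null, skipping the collectives entirely).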

xllm/models/llm/llm_model_base.h

Lines changed: 0 additions & 5 deletions
@@ -308,11 +308,6 @@
   auto cancated_h = torch::cat(hs, 0);
   return norm_(cancated_h, 0);
 #elif defined(USE_MLU)
-  CHECK(input_params.size() == 1)
-      << "invalid input_params size: " << input_params.size();
-  if (input_params[0].q_max_seq_len == 0) {
-    return hs[0];
-  }
   bool is_prefill = input_params[0].q_max_seq_len > 1;
   auto attn_metadata =
       layer::AttentionMetadata::build(input_params[0], is_prefill);
