
Commit dff774e

[feat] support tensor parallelism for MQA/GQA models when num_kv_heads < world_size (#137)
1 parent d176e87 commit dff774e

File tree

13 files changed: +206 additions, -54 deletions


src/common/slice.h

Lines changed: 2 additions & 2 deletions
@@ -62,13 +62,13 @@ class Slice final {
 
 // help comparison operators between slices and std::vector
 template <typename T>
-bool operator==(const Slice<T>& lhs, const std::vector<T>& rhs) {
+inline bool operator==(const Slice<T>& lhs, const std::vector<T>& rhs) {
   return lhs.size() == rhs.size() &&
          std::equal(lhs.begin(), lhs.end(), rhs.begin());
 }
 
 template <typename T>
-bool operator==(const std::vector<T>& lhs, const Slice<T>& rhs) {
+inline bool operator==(const std::vector<T>& lhs, const Slice<T>& rhs) {
   return lhs.size() == rhs.size() &&
          std::equal(lhs.begin(), lhs.end(), rhs.begin());
 }

src/engine/llm_engine.cpp

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ bool LLMEngine::init_model(const std::string& model_weights_path) {
   const int world_size = static_cast<int>(workers_.size());
   const int64_t n_heads = args_.n_heads();
   const int64_t n_kv_heads = args_.n_kv_heads().value_or(n_heads);
-  n_local_kv_heads_ = n_kv_heads / world_size;
+  n_local_kv_heads_ = std::max<int64_t>(1, n_kv_heads / world_size);
   head_dim_ = args_.head_dim();
   dtype_ = parse_dtype(args_.dtype(), options_.devices()[0]);
 

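Why the clamp matters: when an MQA/GQA checkpoint is sharded across more workers than it has KV heads, the old integer division produced zero local KV heads (and therefore a zero-sized KV cache). Clamping to one keeps a full KV head per worker, which the new QKVColumnParallelLinear backs by replicating the KV projection weights. A minimal arithmetic sketch with illustrative values (not taken from the commit):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // hypothetical MQA model: 1 KV head, 4-way tensor parallelism
  const int64_t n_kv_heads = 1;
  const int64_t world_size = 4;

  const int64_t before = n_kv_heads / world_size;                       // 0
  const int64_t after = std::max<int64_t>(1, n_kv_heads / world_size);  // 1

  // replication ratio used when there are fewer KV heads than workers
  const int64_t kv_replication_ratio = world_size / n_kv_heads;         // 4

  std::cout << "old n_local_kv_heads = " << before
            << ", new n_local_kv_heads = " << after
            << ", kv_replication_ratio = " << kv_replication_ratio << '\n';
  return 0;
}
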
src/layers/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -6,9 +6,11 @@ cc_library(
   linear
   HDRS
     linear.h
+    qkv_linear.h
     linear_impl.h
   SRCS
     linear.cpp
+    qkv_linear.cpp
     linear_impl.cpp
   DEPS
     :state_dict

src/layers/qkv_linear.cpp

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+#include "qkv_linear.h"
+
+#include <absl/strings/match.h>
+#include <glog/logging.h>
+#include <torch/torch.h>
+
+namespace llm {
+QKVColumnParallelLinearImpl::QKVColumnParallelLinearImpl(
+    int64_t hidden_size,
+    int64_t n_heads,
+    int64_t n_kv_heads,
+    int64_t head_dim,
+    bool bias,
+    bool gather_output,
+    const QuantArgs& quant_args,
+    const ParallelArgs& parallel_args,
+    const torch::TensorOptions& options) {
+  // calculate logical kv heads with support of MQA/GQA
+  const int32_t world_size = parallel_args.world_size();
+  if (n_kv_heads >= world_size) {
+    // partition kv heads evenly across world_size for MHA
+    CHECK_EQ(n_kv_heads % world_size, 0)
+        << "kv_heads can't be partitioned evenly across world_size";
+    kv_replication_ratio_ = 1;
+  } else {
+    // replicate kv heads evenly across world_size for GQA/MQA
+    CHECK_EQ(world_size % n_kv_heads, 0)
+        << "kv heads can't be replicated evenly across world_size";
+    kv_replication_ratio_ = world_size / n_kv_heads;
+    n_kv_heads = world_size;
+  }
+
+  parallel_linear_ = ColumnParallelLinear(hidden_size,
+                                          (n_heads + 2 * n_kv_heads) * head_dim,
+                                          bias,
+                                          gather_output,
+                                          quant_args,
+                                          parallel_args,
+                                          options);
+}
+
+// special load_state_dict for fused cases
+void QKVColumnParallelLinearImpl::load_state_dict(
+    const StateDict& state_dict,
+    const std::vector<std::string_view>& prefixes,
+    const std::vector<std::string_view>& kv_prefixes) {
+  if (kv_replication_ratio_ > 1) {
+    // replicate kv heads
+    auto kv_replicated_state_dict = state_dict.select_with_transform(
+        "", [&](const std::string_view& name, const torch::Tensor& tensor) {
+          for (const auto& kv_prefix : kv_prefixes) {
+            if (absl::StartsWith(name, kv_prefix)) {
+              return tensor.repeat({kv_replication_ratio_, 1});
+            }
+          }
+          return tensor;
+        });
+    parallel_linear_->load_state_dict(kv_replicated_state_dict, prefixes);
+  } else {
+    parallel_linear_->load_state_dict(state_dict, prefixes);
+  }
+}
+
+}  // namespace llm

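The replication above relies on torch::Tensor::repeat tiling the K/V projection weight along its output dimension, so that the subsequent column-parallel shard on every rank still contains a complete KV head. A standalone libtorch sketch with made-up shapes (not part of the commit) showing the effect of tensor.repeat({kv_replication_ratio_, 1}):

#include <torch/torch.h>

#include <iostream>

int main() {
  // hypothetical GQA setup: 2 KV heads, head_dim 4, hidden_size 8, 4 workers
  const int64_t n_kv_heads = 2, head_dim = 4, hidden_size = 8, world_size = 4;
  const int64_t kv_replication_ratio = world_size / n_kv_heads;  // 2

  // k_proj.weight has shape [n_kv_heads * head_dim, hidden_size] = [8, 8]
  auto k_weight = torch::randn({n_kv_heads * head_dim, hidden_size});

  // tile along dim 0, as the transform in load_state_dict does
  auto replicated = k_weight.repeat({kv_replication_ratio, 1});

  // [16, 8]: splitting dim 0 into world_size chunks leaves every rank
  // with a full copy of one KV head's projection rows
  std::cout << replicated.size(0) << " x " << replicated.size(1) << '\n';
  return 0;
}
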
src/layers/qkv_linear.h

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <glog/logging.h>
+#include <torch/torch.h>
+
+#include "linear.h"
+#include "model_loader/state_dict.h"
+#include "model_parallel/parallel_args.h"
+#include "quantization/quant_args.h"
+
+namespace llm {
+
+// a thin wrapper to handle state_dict loading for QKV with
+// support of MQA/GQA
+class QKVColumnParallelLinearImpl : public torch::nn::Module {
+ public:
+  QKVColumnParallelLinearImpl(int64_t hidden_size,
+                              int64_t n_heads,
+                              int64_t n_kv_heads,
+                              int64_t head_dim,
+                              bool bias,
+                              bool gather_output,
+                              const QuantArgs& quant_args,
+                              const ParallelArgs& parallel_args,
+                              const torch::TensorOptions& options);
+
+  torch::Tensor forward(torch::Tensor input) const {
+    return parallel_linear_->forward(input);
+  }
+
+  // special load_state_dict for fused cases
+  void load_state_dict(const StateDict& state_dict,
+                       const std::vector<std::string_view>& prefixes,
+                       const std::vector<std::string_view>& kv_prefixes);
+
+  void verify_loaded_weights(const std::string& prefix = "") const {
+    parallel_linear_->verify_loaded_weights(prefix);
+  }
+
+ private:
+  ColumnParallelLinear parallel_linear_{nullptr};
+
+  // replication ratio of kv heads for MQA/GQA cases
+  int64_t kv_replication_ratio_ = 1;
+};
+TORCH_MODULE(QKVColumnParallelLinear);
+
+}  // namespace llm

src/model_loader/state_dict.cpp

Lines changed: 4 additions & 3 deletions
@@ -1,6 +1,7 @@
 #include "state_dict.h"
 
 #include <ATen/core/TensorBody.h>
+#include <absl/strings/match.h>
 #include <caffe2/serialize/inline_container.h>
 #include <glog/logging.h>
 #include <torch/csrc/jit/serialization/import_read.h>
@@ -183,7 +184,8 @@ torch::Tensor StateDict::get_tensor(const std::string_view& tensor_name) const {
     return torch::Tensor{nullptr};
   }
   // apply transform function if exists
-  return transform_func_ ? transform_func_(it->second) : it->second;
+  return transform_func_ ? transform_func_(tensor_name, it->second)
+                         : it->second;
 }
 
 torch::Tensor StateDict::get_sharded_tensor(const std::string_view& tensor_name,
@@ -231,8 +233,7 @@ torch::Tensor StateDict::get_sharded_tensor(const std::string_view& tensor_name,
 StateDict StateDict::select(const std::string_view& prefix) const {
   std::unordered_map<std::string, torch::Tensor> selected;
   for (const auto& [name, tensor] : dict_) {
-    std::size_t found = name.find(prefix);
-    if (found == 0) {
+    if (absl::StartsWith(name, prefix)) {
       selected[name.substr(prefix.length())] = tensor;
     }
   }

src/model_loader/state_dict.h

Lines changed: 2 additions & 1 deletion
@@ -44,7 +44,8 @@ class StateDict final {
 
   // select all tensors whose name starts with prefix and apply the transform
   // for each tensor.
-  using TensorTransform = std::function<torch::Tensor(torch::Tensor)>;
+  using TensorTransform = std::function<torch::Tensor(const std::string_view&,
+                                                      const torch::Tensor&)>;
   StateDict select_with_transform(const std::string_view& prefix,
                                   TensorTransform transform_func) const;
 

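With the widened TensorTransform signature, a transform can branch on the tensor name instead of applying the same function to every selected tensor; this is what enables the KV-only replication in qkv_linear.cpp. A minimal illustrative caller (the prefix, condition, and helper name are hypothetical, and the StateDict is assumed to be loaded elsewhere):

#include <absl/strings/match.h>
#include <torch/torch.h>

#include "model_loader/state_dict.h"

namespace llm {

// hypothetical helper: transpose only the tensors under "mlp." whose
// (prefix-stripped) name starts with "c_fc.", pass everything else through
StateDict transpose_c_fc(const StateDict& state_dict) {
  return state_dict.select_with_transform(
      "mlp.",
      [](const std::string_view& name, const torch::Tensor& tensor) {
        if (absl::StartsWith(name, "c_fc.")) {
          return tensor.t();
        }
        return tensor;
      });
}

}  // namespace llm
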
src/models/huggingface/baichuan.h

Lines changed: 2 additions & 1 deletion
@@ -437,7 +437,8 @@ class BaichuanForCausalLMImpl : public torch::nn::Module {
       // Baichuan2 normalizes the head weights:
       // https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L508
       lm_head_->load_state_dict(state_dict.select_with_transform(
-          "lm_head.", [](torch::Tensor tensor) {
+          "lm_head.",
+          [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
             return torch::nn::functional::normalize(tensor);
           }));
     } else {

src/models/huggingface/gemma.h

Lines changed: 30 additions & 16 deletions
@@ -1,14 +1,18 @@
 #pragma once
+#include <absl/strings/match.h>
 #include <glog/logging.h>
 #include <torch/torch.h>
 
+#include <string>
+
 #include "chat_template/coded_chat_template.h"
 #include "layers/activation.h"
 #include "layers/attention/attention.h"
 #include "layers/attention/handler.h"
 #include "layers/embedding.h"
 #include "layers/linear.h"
 #include "layers/normalization.h"
+#include "layers/qkv_linear.h"
 #include "memory/kv_cache.h"
 #include "models/model_args.h"
 #include "models/model_registry.h"
@@ -87,26 +91,28 @@ class GemmaAttentionImpl : public torch::nn::Module {
     const int32_t world_size = parallel_args.world_size();
     const int64_t hidden_size = args.hidden_size();
     const int64_t n_heads = args.n_heads();
-    const int64_t head_dim = args.head_dim();
     const int64_t n_kv_heads = args.n_kv_heads().value_or(n_heads);
+    const int64_t head_dim = args.head_dim();
     const int64_t n_local_heads = n_heads / world_size;
-    const int64_t n_local_kv_heads = n_kv_heads / world_size;
+    const int64_t n_local_kv_heads =
+        std::max<int64_t>(1, n_kv_heads / world_size);
 
     // size for q, k, v
     qkv_sizes_ = {n_local_heads * head_dim,
                   n_local_kv_heads * head_dim,
                   n_local_kv_heads * head_dim};
 
     // register submodules
-    qkv_proj_ = register_module(
-        "qkv_proj",
-        ColumnParallelLinear(hidden_size,
-                             (n_heads + 2 * n_kv_heads) * head_dim,
-                             /*bias=*/false,
-                             /*gather_output=*/false,
-                             quant_args,
-                             parallel_args,
-                             options));
+    qkv_proj_ = register_module("qkv_proj",
+                                QKVColumnParallelLinear(hidden_size,
+                                                        n_heads,
+                                                        n_kv_heads,
+                                                        head_dim,
+                                                        /*bias=*/false,
+                                                        /*gather_output=*/false,
+                                                        quant_args,
+                                                        parallel_args,
+                                                        options));
 
     o_proj_ = register_module("o_proj",
                               RowParallelLinear(n_heads * head_dim,
@@ -141,7 +147,8 @@ class GemmaAttentionImpl : public torch::nn::Module {
   // load the weight from the checkpoint
   void load_state_dict(const StateDict& state_dict) {
     // call each submodule's load_state_dict function
-    qkv_proj_->load_state_dict(state_dict, {"q_proj.", "k_proj.", "v_proj."});
+    qkv_proj_->load_state_dict(
+        state_dict, {"q_proj.", "k_proj.", "v_proj."}, {"k_proj.", "v_proj."});
     o_proj_->load_state_dict(state_dict.select("o_proj."));
   }
 
@@ -152,7 +159,7 @@ class GemmaAttentionImpl : public torch::nn::Module {
 
  private:
   // parameter members, must be registered
-  ColumnParallelLinear qkv_proj_{nullptr};
+  QKVColumnParallelLinear qkv_proj_{nullptr};
 
   RowParallelLinear o_proj_{nullptr};
 
@@ -207,12 +214,16 @@ class GemmaDecoderLayerImpl : public torch::nn::Module {
   void load_state_dict(const StateDict& state_dict) {
     input_layernorm_->load_state_dict((state_dict.select_with_transform(
         "input_layernorm.",
-        [](torch::Tensor tensor) { return tensor + 1.0f; })));
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor + 1.0f;
+        })));
     mlp_->load_state_dict(state_dict.select("mlp."));
     post_attention_layernorm_->load_state_dict(
         (state_dict.select_with_transform(
            "post_attention_layernorm.",
-            [](torch::Tensor tensor) { return tensor + 1.0f; })));
+            [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+              return tensor + 1.0f;
+            })));
     self_attn_->load_state_dict(state_dict.select("self_attn."));
   }
   void verify_loaded_weights(const std::string& prefix) const {
@@ -301,7 +312,10 @@ class GemmaModelImpl : public torch::nn::Module {
     // GemmaRMSNorm is different from Llama's in that it multiplies
     // (1 + weight) to the output, instead of just weight.
     norm_->load_state_dict((state_dict.select_with_transform(
-        "norm.", [](torch::Tensor tensor) { return tensor + 1.0f; })));
+        "norm.",
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor + 1.0f;
+        })));
   }
 
   void verify_loaded_weights(const std::string& prefix) const {

src/models/huggingface/gpt2.h

Lines changed: 16 additions & 4 deletions
@@ -57,9 +57,15 @@ class GPT2MLPImpl : public torch::nn::Module {
     // GPT-2 implementation uses Conv1D instead of Linear. As a result, we
     // need to transpose the weight.
     c_fc_->load_state_dict(state_dict.select_with_transform(
-        "c_fc.", [](torch::Tensor tensor) { return tensor.t(); }));
+        "c_fc.",
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor.t();
+        }));
     c_proj_->load_state_dict(state_dict.select_with_transform(
-        "c_proj.", [](torch::Tensor tensor) { return tensor.t(); }));
+        "c_proj.",
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor.t();
+        }));
   }
 
   void verify_loaded_weights(const std::string& prefix) const {
@@ -134,9 +140,15 @@ class GPT2AttentionImpl : public torch::nn::Module {
     // GPT-2 implementation uses Conv1D instead of Linear. As a result, we
     // need to transpose the weight.
     c_attn_->load_state_dict(state_dict.select_with_transform(
-        "c_attn.", [](torch::Tensor tensor) { return tensor.t(); }));
+        "c_attn.",
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor.t();
+        }));
     c_proj_->load_state_dict(state_dict.select_with_transform(
-        "c_proj.", [](torch::Tensor tensor) { return tensor.t(); }));
+        "c_proj.",
+        [](const std::string_view& /*name*/, const torch::Tensor& tensor) {
+          return tensor.t();
+        }));
   }
 
   void verify_loaded_weights(const std::string& prefix) const {
