
Commit 9b69c9c

[feat] added llama3 chat template support
1 parent 2aba2d3 commit 9b69c9c

File tree

- README.md
- src/chat_template/common_chat_template.cpp
- src/chat_template/common_chat_template.h
- src/models/huggingface/llama.h

4 files changed: +69, -20 lines changed

README.md

Lines changed: 8 additions & 8 deletions
@@ -57,7 +57,7 @@ ScaleLLM is a cutting-edge inference system engineered for large language models
 | GPT_NeoX | Yes | Yes | No | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) |
 | GPT2 | Yes | Yes | No | [gpt2](https://huggingface.co/gpt2)|
 | InternLM | Yes | Yes | Yes | [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) |
-| Llama3/2 | Yes | Yes | Yes | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b), [TheBloke/Llama-2-13B-chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ), [TheBloke/Llama-2-70B-AWQ](https://huggingface.co/TheBloke/Llama-2-70B-AWQ) |
+| Llama3/2 | Yes | Yes | Yes | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct), [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) |
 | Mistral | Yes | Yes | Yes | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) |
 | MPT | Yes | Yes | Yes | [mosaicml/mpt-30b](https://huggingface.co/mosaicml/mpt-30b) |
 | Phi2 | Yes | Yes | No | [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) |
@@ -89,7 +89,7 @@ Once you have Docker installed, you can run ScaleLLM Docker container with [late
 docker pull docker.io/vectorchai/scalellm:latest
 docker run -it --gpus=all --net=host --shm-size=1g \
   -v $HOME/.cache/huggingface/hub:/models \
-  -e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B \
+  -e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct \
   -e DEVICE=cuda:0 \
   docker.io/vectorchai/scalellm:latest --logtostderr
 ```
@@ -100,7 +100,7 @@ This command starts the Docker container with GPU support and various configurat
 - `HF_MODEL_REVISION` specifies which Hugging Face model revision you want to run. By default, it is set to `"main"`.
 - `DEVICE` specifies the device on which this model should run. By default, it is set to `"auto"`, using all available GPUs. You can also specify specific GPUs by using `"cuda:0,cuda:1"`, or use CPU by using `"cpu"`.
 - `HF_MODEL_ALLOW_PATTERN` specifies which types of files are allowed to be downloaded. By default, it will be configured automatically based on tensor type. Only use this option if the default configuration is not working for you.
-- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models.
+- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models. `-e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN`
 
 > **Warning**<br />
 > * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' could be changed to a new version upon new release. In order to use latest image, you may need to repull the image with specific tag.
@@ -155,7 +155,7 @@ Using Docker Compose is the easiest way to run ScaleLLM with all the services to
 
 ```bash
 curl https://raw.githubusercontent.com/vectorch-ai/ScaleLLM/main/scalellm.yml -sSf > scalellm_compose.yml
-HF_MODEL_ID=meta-llama/Meta-Llama-3-8B DEVICE=cuda docker compose -f ./scalellm_compose.yml up
+HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct DEVICE=cuda docker compose -f ./scalellm_compose.yml up
 ```
 
 you will get following running services:
@@ -173,7 +173,7 @@ You can get chat completions with the following example:
 curl http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Meta-Llama-3-8B",
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
     "messages": [
       {
         "role": "system",
@@ -198,7 +198,7 @@ openai.api_base = "http://localhost:8080/v1"
 print("==== Available models ====")
 models = openai.Model.list()
 
-model = "meta-llama/Meta-Llama-3-8B"
+model = "meta-llama/Meta-Llama-3-8B-Instruct"
 
 completion = openai.ChatCompletion.create(
     model=model,
@@ -225,7 +225,7 @@ For regular completions, you can use this example:
 curl http://localhost:8080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "meta-llama/Meta-Llama-3-8B",
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
     "prompt": "hello",
     "max_tokens": 32,
     "temperature": 0.7,
@@ -244,7 +244,7 @@ openai.api_base = "http://localhost:8080/v1"
 print("==== Available models ====")
 models = openai.Model.list()
 
-model = "meta-llama/Meta-Llama-3-8B"
+model = "meta-llama/Meta-Llama-3-8B-Instruct"
 
 completion = openai.Completion.create(
     model=model,

src/chat_template/common_chat_template.cpp

Lines changed: 32 additions & 0 deletions
@@ -42,4 +42,36 @@ std::optional<std::string> Llama2ChatTemplate::get_prompt(
   return ss.str();
 }
 
+// generate prompt from ChatTemplate
+std::optional<std::string> Llama3ChatTemplate::get_prompt(
+    const std::string_view& system_message,
+    const std::vector<std::string_view>& messages) const {
+  // at least one user message
+  if (messages.size() % 2 == 0) {
+    return std::nullopt;
+  }
+
+  std::stringstream ss;
+  ss << "<|begin_of_text|>";
+  auto add_message = [&ss](const std::string_view& role,
+                           const std::string_view& message) {
+    ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n";
+    ss << message << "<|eot_id|>";
+  };
+
+  // start with system message
+  if (!system_message.empty()) {
+    add_message("system", system_message);
+  }
+
+  // then user and assistant message pairs (u/a/u/a/u...)
+  for (size_t i = 0; i < messages.size(); ++i) {
+    const char* role = i % 2 == 0 ? "user" : "assistant";
+    add_message(role, messages[i]);
+  }
+  // end with assistant message
+  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
+  return ss.str();
+}
+
 } // namespace llm
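
For reference, here is a standalone sketch (not part of the commit) of the prompt layout the new template produces for one system message and one user turn. It mirrors the logic above with a plain `std::stringstream`; the message strings are made up for illustration.

```cpp
// Illustrative only: reproduces the Llama 3 prompt layout built by
// Llama3ChatTemplate::get_prompt above for one system + one user message.
#include <iostream>
#include <sstream>
#include <string_view>
#include <vector>

int main() {
  const std::string_view system_message = "You are a helpful assistant.";
  const std::vector<std::string_view> messages = {"hello"};  // single user turn

  std::stringstream ss;
  ss << "<|begin_of_text|>";
  auto add_message = [&ss](std::string_view role, std::string_view message) {
    ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n"
       << message << "<|eot_id|>";
  };
  if (!system_message.empty()) {
    add_message("system", system_message);
  }
  for (size_t i = 0; i < messages.size(); ++i) {
    add_message(i % 2 == 0 ? "user" : "assistant", messages[i]);
  }
  // leave an open assistant header for the model to complete
  ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";

  // Prints:
  // <|begin_of_text|><|start_header_id|>system<|end_header_id|>
  //
  // You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
  //
  // hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>
  std::cout << ss.str() << std::endl;
  return 0;
}
```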

src/chat_template/common_chat_template.h

Lines changed: 8 additions & 0 deletions
@@ -18,4 +18,12 @@ class Llama2ChatTemplate final : public CodedChatTemplate {
       const std::vector<std::string_view>& messages) const override;
 };
 
+class Llama3ChatTemplate final : public CodedChatTemplate {
+ public:
+  // generate prompt from dialogs
+  std::optional<std::string> get_prompt(
+      const std::string_view& system_message,
+      const std::vector<std::string_view>& messages) const override;
+};
+
 } // namespace llm

src/models/huggingface/llama.h

Lines changed: 21 additions & 12 deletions
@@ -365,32 +365,41 @@ TORCH_MODULE(LlamaForCausalLM);
 
 // register the causal model
 REGISTER_CAUSAL_MODEL(llama, LlamaForCausalLM);
+REGISTER_CAUSAL_MODEL(llama3, LlamaForCausalLM);
+
 REGISTER_DEFAULT_CHAT_TEMPLATE(llama, Llama2ChatTemplate);
+REGISTER_DEFAULT_CHAT_TEMPLATE(llama3, Llama3ChatTemplate);
 // register the model args
 // example config:
-// https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json set
-// default values for args explicitly with values from:
-// https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/configuration_llama.py#L112
+// https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct/blob/main/config.json
 REGISTER_MODEL_ARGS(llama, [&] {
   LOAD_ARG_OR(model_type, "model_type", "llama");
   LOAD_ARG_OR(dtype, "torch_dtype", "");
-  LOAD_ARG_OR(vocab_size, "vocab_size", 32000);
-  LOAD_ARG_OR(hidden_size, "hidden_size", 4096);
-  LOAD_ARG_OR(n_layers, "num_hidden_layers", 32);
-  LOAD_ARG_OR(n_heads, "num_attention_heads", 32);
+  LOAD_ARG_OR(vocab_size, "vocab_size", 128256);
+  LOAD_ARG_OR(hidden_size, "hidden_size", 8192);
+  LOAD_ARG_OR(n_layers, "num_hidden_layers", 80);
+  LOAD_ARG_OR(n_heads, "num_attention_heads", 64);
   LOAD_ARG(n_kv_heads, "num_key_value_heads");
-  LOAD_ARG_OR(intermediate_size, "intermediate_size", 11008);
+  LOAD_ARG_OR(intermediate_size, "intermediate_size", 28672);
   LOAD_ARG_OR(hidden_act, "hidden_act", "silu");
-  LOAD_ARG_OR(max_position_embeddings, "max_position_embeddings", 2048);
+  LOAD_ARG_OR(max_position_embeddings, "max_position_embeddings", 8192);
   LOAD_ARG_OR(rms_norm_eps, "rms_norm_eps", 1e-5);
-  LOAD_ARG_OR(bos_token_id, "bos_token_id", 1);
-  LOAD_ARG_OR(eos_token_id, "eos_token_id", 2);
-  LOAD_ARG_OR(rope_theta, "rope_theta", 10000.0f);
+  LOAD_ARG_OR(bos_token_id, "bos_token_id", 128000);
+  LOAD_ARG_OR(eos_token_id, "eos_token_id", 128001);
+  LOAD_ARG_OR(rope_theta, "rope_theta", 500000.0f);
   LOAD_ARG_OR(rope_scaling, "rope_scaling", 1.0f);
 
   LOAD_ARG_OR_FUNC(head_dim, "head_dim", [&] {
     return args->hidden_size() / args->n_heads();
   });
+
+  // decide model type based on vocab size
+  if (args->vocab_size() == 128256) {
+    // choose the right chat template
+    SET_ARG(model_type, "llama3");
+    // stop token ids: "<|end_of_text|>", "<|eot_id|>"
+    SET_ARG(stop_token_ids, std::unordered_set<int32_t>({128001, 128009}));
+  }
 });
 
 } // namespace llm::hf
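
The registration above keys off the vocabulary size because Llama 2 and Llama 3 checkpoints both report `"model_type": "llama"` in `config.json`. A minimal sketch of that dispatch follows; it is illustrative only, and the `LlamaConfig` struct is a hypothetical stand-in, not a ScaleLLM API.

```cpp
// Illustrative only: Llama 3 checkpoints keep "model_type": "llama" in
// config.json, so the loader distinguishes them by vocab size (128256)
// and then switches the chat template and stop tokens accordingly.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_set>

// Hypothetical stand-in for the parsed model args.
struct LlamaConfig {
  int64_t vocab_size = 32000;  // Llama 2 default
  std::string model_type = "llama";
  std::unordered_set<int32_t> stop_token_ids;
};

int main() {
  LlamaConfig cfg;
  cfg.vocab_size = 128256;  // value read from a Llama 3 config.json

  if (cfg.vocab_size == 128256) {
    cfg.model_type = "llama3";
    // "<|end_of_text|>" and "<|eot_id|>"
    cfg.stop_token_ids = {128001, 128009};
  }

  std::cout << "model_type=" << cfg.model_type << std::endl;
  return 0;
}
```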
