Commit 450e1c1

[refactor] merge Yi into llama model
1 parent 9b69c9c commit 450e1c1

File tree: 5 files changed, +41 -435 lines

src/model_loader/model_loader.cpp

Lines changed: 0 additions & 4 deletions
@@ -203,10 +203,6 @@ bool HFModelLoader::load_model_args(const std::string& model_weights_path) {
     return false;
   }
 
-  // override model type from gflag if exists
-  if (!FLAGS_model_type.empty()) {
-    model_type = FLAGS_model_type;
-  }
   auto args_loader = ModelRegistry::get_model_args_loader(model_type);
   if (args_loader == nullptr) {
     LOG(ERROR) << "Failed to find model args loader for model type "
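
The deleted lines let a non-empty --model_type gflag replace the type parsed from the checkpoint's config, so the registry lookup below now sees only the detected type. For reference, a standalone sketch of the removed override pattern, assuming the standard gflags setup (the main() harness and the "llama" fallback are illustrative, not from the codebase):

#include <gflags/gflags.h>
#include <iostream>
#include <string>

// Same flag the deleted code read; the definition is assumed here.
DEFINE_string(model_type, "", "optional override for the detected model type");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);

  // Pretend this came out of the model's config.json.
  std::string model_type = "llama";

  // The override this commit removes: a non-empty --model_type used to
  // replace whatever the checkpoint declared.
  if (!FLAGS_model_type.empty()) {
    model_type = FLAGS_model_type;
  }
  std::cout << "model_type = " << model_type << std::endl;
  return 0;
}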

src/models/huggingface/llama.h

Lines changed: 39 additions & 0 deletions
@@ -363,12 +363,45 @@ class LlamaForCausalLMImpl : public torch::nn::Module {
 };
 TORCH_MODULE(LlamaForCausalLM);
 
+class YiChatTemplate final : public CodedChatTemplate {
+ public:
+  // generate prompt from dialogs
+  // https://huggingface.co/01-ai/Yi-34B-Chat/blob/main/tokenizer_config.json#L60
+  // Prompt template:
+  // <|im_start|>user\n {message} <|im_end|>\n
+  // <|im_start|>assistant\n
+  std::optional<std::string> get_prompt(
+      const std::string_view& system_message,
+      const std::vector<std::string_view>& messages) const override {
+    // at least one user message
+    if (messages.size() % 2 == 0) {
+      return std::nullopt;
+    }
+
+    std::stringstream ss;
+    if (!system_message.empty()) {
+      ss << "<|im_start|>system\n" << system_message << "<|im_end|>\n";
+    }
+
+    // then user and assistant message pairs (u/a/u/a/u...)
+    for (size_t i = 0; i < messages.size(); ++i) {
+      const char* role = (i % 2) == 0 ? "user" : "assistant";
+      ss << "<|im_start|>" << role << "\n" << messages[i] << "<|im_end|>\n";
+    }
+    // end with assistant message
+    ss << "<|im_start|>assistant\n";
+    return ss.str();
+  }
+};
+
 // register the causal model
 REGISTER_CAUSAL_MODEL(llama, LlamaForCausalLM);
 REGISTER_CAUSAL_MODEL(llama3, LlamaForCausalLM);
+REGISTER_CAUSAL_MODEL(Yi, LlamaForCausalLM);
 
 REGISTER_DEFAULT_CHAT_TEMPLATE(llama, Llama2ChatTemplate);
 REGISTER_DEFAULT_CHAT_TEMPLATE(llama3, Llama3ChatTemplate);
+REGISTER_DEFAULT_CHAT_TEMPLATE(Yi, YiChatTemplate);
 // register the model args
 // example config:
 // https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct/blob/main/config.json
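
To make the new template concrete, here is a minimal standalone sketch of the same ChatML-style rendering; yi_prompt is a hypothetical free-function stand-in for YiChatTemplate::get_prompt, since CodedChatTemplate and the registration macros live elsewhere in the codebase:

#include <iostream>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>

std::optional<std::string> yi_prompt(
    std::string_view system_message,
    const std::vector<std::string_view>& messages) {
  // turns alternate user/assistant and must end on a user turn,
  // so an even message count is rejected
  if (messages.size() % 2 == 0) {
    return std::nullopt;
  }

  std::stringstream ss;
  if (!system_message.empty()) {
    ss << "<|im_start|>system\n" << system_message << "<|im_end|>\n";
  }
  for (size_t i = 0; i < messages.size(); ++i) {
    const char* role = (i % 2) == 0 ? "user" : "assistant";
    ss << "<|im_start|>" << role << "\n" << messages[i] << "<|im_end|>\n";
  }
  // leave the assistant turn open for the model to complete
  ss << "<|im_start|>assistant\n";
  return ss.str();
}

int main() {
  auto prompt = yi_prompt("You are a helpful assistant.",
                          {"What is the capital of France?"});
  if (prompt) {
    std::cout << *prompt;
    // <|im_start|>system
    // You are a helpful assistant.<|im_end|>
    // <|im_start|>user
    // What is the capital of France?<|im_end|>
    // <|im_start|>assistant
  }
  return 0;
}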
@@ -399,6 +432,12 @@ REGISTER_MODEL_ARGS(llama, [&] {
     SET_ARG(model_type, "llama3");
     // stop token ids: "<|end_of_text|>", "<|eot_id|>"
     SET_ARG(stop_token_ids, std::unordered_set<int32_t>({128001, 128009}));
+  } else if (args->vocab_size() == 64000) {
+    // choose the right chat template
+    SET_ARG(model_type, "Yi");
+    // stop token ids: "<|endoftext|>", "<|im_start|>", "<|im_end|>",
+    // "<|im_sep|>"
+    SET_ARG(stop_token_ids, std::unordered_set<int32_t>({2, 6, 7, 8}));
   }
 });

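The new branch keys off vocab_size() because Yi checkpoints declare model_type "llama" in config.json, so vocabulary size is what tells the variants apart and picks the matching chat template and stop tokens. A minimal sketch of that dispatch, where ModelArgs is a hypothetical stand-in for the real args struct and the llama3 condition (which sits above this hunk) is assumed to be vocab_size == 128256, Llama 3's vocabulary size:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_set>

// Hypothetical stand-in for the real model args object.
struct ModelArgs {
  int64_t vocab_size = 0;
  std::string model_type = "llama";
  std::unordered_set<int32_t> stop_token_ids;
};

void resolve_llama_variant(ModelArgs& args) {
  if (args.vocab_size == 128256) {  // assumed condition; Llama 3 vocab
    args.model_type = "llama3";
    args.stop_token_ids = {128001, 128009};  // "<|end_of_text|>", "<|eot_id|>"
  } else if (args.vocab_size == 64000) {  // matches the hunk above
    args.model_type = "Yi";
    // "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
    args.stop_token_ids = {2, 6, 7, 8};
  }
}

int main() {
  ModelArgs args;
  args.vocab_size = 64000;  // e.g. 01-ai/Yi-34B-Chat
  resolve_llama_variant(args);
  std::cout << args.model_type << std::endl;  // prints: Yi
  return 0;
}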