
Commit 2f179bc

load model with clean_cache_threshold

1 parent a4c0b8e

2 files changed: 13 additions, 1 deletion

controllers/llamaCPP.cc (11 additions, 0 deletions)
```diff
@@ -8,6 +8,7 @@
 #include <regex>
 #include <string>
 #include <thread>
+#include <trantor/utils/Logger.h>
 
 using namespace inferences;
 using json = nlohmann::json;
@@ -177,6 +178,14 @@ void llamaCPP::chatCompletion(
   // To set default value
 
   if (jsonBody) {
+    // Increase the number of chats received; clear the KV cache at the threshold
+    no_of_chats++;
+    if (no_of_chats % clean_cache_threshold == 0) {
+      LOG_INFO << "Clean cache threshold reached!";
+      llama.kv_cache_clear();
+      LOG_INFO << "Cache cleaned";
+    }
+
     // Default values to enable auto caching
     data["cache_prompt"] = caching_enabled;
     data["n_keep"] = -1;
@@ -390,6 +399,8 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
           .asInt();
   params.cont_batching = jsonBody.get("cont_batching", false).asBool();
 
+  this->clean_cache_threshold =
+      jsonBody.get("clean_cache_threshold", 5).asInt();
   this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
   this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
   this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
```
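
Taken together, these hunks make the server clear llama.cpp's KV cache after every `clean_cache_threshold` chat completions, with the threshold read from the load-model request body (defaulting to 5). A minimal sketch of such a request body follows; the `llama_model_path` key is an assumed illustration, and only the keys appearing in the hunks above are confirmed by this commit:

```json
{
  "llama_model_path": "/path/to/model.gguf",
  "cont_batching": false,
  "caching_enabled": true,
  "clean_cache_threshold": 10,
  "user_prompt": "USER: ",
  "ai_prompt": "ASSISTANT: "
}
```

With this configuration, every tenth completion handled by chatCompletion would log the threshold message and call llama.kv_cache_clear().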

controllers/llamaCPP.h (2 additions, 1 deletion)
```diff
@@ -1909,6 +1909,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   std::string pre_prompt;
   int repeat_last_n;
   bool caching_enabled;
-  std::atomic<int> no_of_chats = 0;
+  std::atomic<int> no_of_chats = 0;
+  int clean_cache_threshold;
 };
 }; // namespace inferences
```
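
One subtlety in the chatCompletion hunk: `no_of_chats++` and the later read in the modulo test are two separate operations on the atomic counter, so two concurrent requests can interleave between them and double-trigger or skip a cleanup. Below is a minimal race-free sketch using the value returned by `fetch_add`; this is an observation about the pattern, not part of the commit:

```cpp
#include <atomic>

std::atomic<int> no_of_chats{0};
int clean_cache_threshold = 5;  // assumed > 0, matching the commit's default

// fetch_add returns the counter's value *before* the increment, so every
// request observes a unique count and exactly one request per threshold
// window decides to clean, even under concurrent chat completions.
bool should_clean_cache() {
  int count = no_of_chats.fetch_add(1) + 1;
  return count % clean_cache_threshold == 0;
}
```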
