@@ -16,6 +16,7 @@ namespace inferences {
 namespace {
 constexpr static auto kLlamaEngine = "cortex.llamacpp";
 constexpr static auto kPythonRuntimeEngine = "cortex.python";
+constexpr static auto kOnnxEngine = "cortex.onnx";
 }  // namespace
 
 server::server(){
@@ -32,7 +33,7 @@ void server::ChatCompletion(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
@@ -91,7 +92,7 @@ void server::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
@@ -118,7 +119,7 @@ void server::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
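
Note: the hunks above change the fallback for the request's "engine" field from the hard-coded kLlamaEngine to the most recently loaded engine (cur_engine_type_). Below is a minimal standalone sketch of how jsoncpp's Value::get resolves that default; the request body and the current-engine value here are purely illustrative, not taken from the server code.

#include <json/json.h>

#include <iostream>
#include <string>

int main() {
  Json::Value body;  // request body without an "engine" field
  std::string cur_engine_type = "cortex.onnx";  // hypothetical current engine

  // get() returns the stored value when the key exists, otherwise the supplied default.
  std::cout << body.get("engine", cur_engine_type).asString() << "\n";  // cortex.onnx

  body["engine"] = "cortex.llamacpp";
  std::cout << body.get("engine", cur_engine_type).asString() << "\n";  // cortex.llamacpp
  return 0;
}
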
@@ -144,8 +145,7 @@ void server::ModelStatus(
 
 void server::GetModels(const HttpRequestPtr& req,
                        std::function<void(const HttpResponsePtr&)>&& callback) {
-  // TODO(sang) need to change this when we support Tensorrt-llm
-  if (!IsEngineLoaded(kLlamaEngine)) {
+  if (!IsEngineLoaded(cur_engine_type_)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -156,7 +156,7 @@ void server::GetModels(const HttpRequestPtr& req,
   }
 
   LOG_TRACE << "Start to get models";
-  auto& en = std::get<EngineI*>(engines_[kLlamaEngine].engine);
+  auto& en = std::get<EngineI*>(engines_[cur_engine_type_].engine);
   if (en->IsSupported("GetModels")) {
     en->GetModels(
         req->getJsonObject(),
@@ -257,11 +257,13 @@ void server::LoadModel(const HttpRequestPtr& req,
 
   // We have not loaded engine yet, should load it before using it
   if (engines_.find(engine_type) == engines_.end()) {
-    // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time.
-    // So need an unload engine machanism to handle.
+    // We only use a single engine, so unload all loaded engines before loading a new one
+    UnloadEngines();
     auto get_engine_path = [](std::string_view e) {
       if (e == kLlamaEngine) {
         return cortex_utils::kLlamaLibPath;
+      } else if (e == kOnnxEngine) {
+        return cortex_utils::kOnnxLibPath;
       }
       return cortex_utils::kLlamaLibPath;
     };
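
The get_engine_path lambda now routes cortex.onnx to its own library path, and anything unrecognized still falls back to the llama.cpp path. A minimal sketch of the same dispatch, using placeholder paths instead of the real cortex_utils constants:

#include <iostream>
#include <string_view>

// Placeholder constants for illustration; the server uses cortex_utils::kLlamaLibPath / kOnnxLibPath.
constexpr auto kLlamaEngine = "cortex.llamacpp";
constexpr auto kOnnxEngine = "cortex.onnx";
constexpr auto kLlamaLibPath = "./engines/cortex.llamacpp";
constexpr auto kOnnxLibPath = "./engines/cortex.onnx";

int main() {
  auto get_engine_path = [](std::string_view e) {
    if (e == kLlamaEngine) {
      return kLlamaLibPath;
    } else if (e == kOnnxEngine) {
      return kOnnxLibPath;
    }
    return kLlamaLibPath;  // unknown engine names fall back to the llama.cpp library
  };
  std::cout << get_engine_path(kOnnxEngine) << "\n";      // ./engines/cortex.onnx
  std::cout << get_engine_path("cortex.python") << "\n";  // ./engines/cortex.llamacpp (fallback)
  return 0;
}
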
@@ -292,6 +294,7 @@ void server::LoadModel(const HttpRequestPtr& req,
     callback(resp);
     return;
   }
+  cur_engine_type_ = engine_type;
 
   auto func =
       engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
@@ -358,4 +361,14 @@ bool server::IsEngineLoaded(const std::string& e) {
   return engines_.find(e) != engines_.end();
 }
 
+void server::UnloadEngines() {
+  // We unload all engines except the python engine
+  for (auto it = engines_.begin(); it != engines_.end();) {
+    if (it->first != kPythonRuntimeEngine) {
+      it = engines_.erase(it);
+    } else {
+      it++;
+    }
+  }
+}
+
 }  // namespace inferences
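
The new UnloadEngines() relies on the standard erase-while-iterating idiom for associative containers: erase() returns the iterator that follows the removed element, so the loop only advances manually when an entry is kept. A self-contained sketch of the same pattern with a hypothetical map (the real engines_ map stores engine handles, not ints):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> engines{
      {"cortex.llamacpp", 1}, {"cortex.onnx", 2}, {"cortex.python", 3}};

  // Drop every engine except the python runtime, mirroring UnloadEngines().
  for (auto it = engines.begin(); it != engines.end();) {
    if (it->first != "cortex.python") {
      it = engines.erase(it);  // erase() returns the iterator following the removed entry
    } else {
      ++it;
    }
  }

  for (const auto& [name, id] : engines) {
    std::cout << name << "\n";  // only cortex.python remains
  }
  return 0;
}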