Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit fa82078

Browse files
authored
feat: cortex.onnx (#660)
1 parent 5b7e6dc commit fa82078

File tree

3 files changed

+25
-8
lines changed

3 files changed

+25
-8
lines changed

cortex-cpp/controllers/server.cc

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ namespace inferences {
1616
namespace {
1717
constexpr static auto kLlamaEngine = "cortex.llamacpp";
1818
constexpr static auto kPythonRuntimeEngine = "cortex.python";
19+
constexpr static auto kOnnxEngine = "cortex.onnx";
1920
} // namespace
2021

2122
server::server(){
@@ -32,7 +33,7 @@ void server::ChatCompletion(
3233
const HttpRequestPtr& req,
3334
std::function<void(const HttpResponsePtr&)>&& callback) {
3435
auto engine_type =
35-
(*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
36+
(*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
3637
if (!IsEngineLoaded(engine_type)) {
3738
Json::Value res;
3839
res["message"] = "Engine is not loaded yet";
@@ -91,7 +92,7 @@ void server::UnloadModel(
9192
const HttpRequestPtr& req,
9293
std::function<void(const HttpResponsePtr&)>&& callback) {
9394
auto engine_type =
94-
(*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
95+
(*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
9596
if (!IsEngineLoaded(engine_type)) {
9697
Json::Value res;
9798
res["message"] = "Engine is not loaded yet";
@@ -118,7 +119,7 @@ void server::ModelStatus(
118119
const HttpRequestPtr& req,
119120
std::function<void(const HttpResponsePtr&)>&& callback) {
120121
auto engine_type =
121-
(*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
122+
(*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
122123
if (!IsEngineLoaded(engine_type)) {
123124
Json::Value res;
124125
res["message"] = "Engine is not loaded yet";
@@ -144,8 +145,7 @@ void server::ModelStatus(
144145

145146
void server::GetModels(const HttpRequestPtr& req,
146147
std::function<void(const HttpResponsePtr&)>&& callback) {
147-
// TODO(sang) need to change this when we support Tensorrt-llm
148-
if (!IsEngineLoaded(kLlamaEngine)) {
148+
if (!IsEngineLoaded(cur_engine_type_)) {
149149
Json::Value res;
150150
res["message"] = "Engine is not loaded yet";
151151
auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -156,7 +156,7 @@ void server::GetModels(const HttpRequestPtr& req,
156156
}
157157

158158
LOG_TRACE << "Start to get models";
159-
auto& en = std::get<EngineI*>(engines_[kLlamaEngine].engine);
159+
auto& en = std::get<EngineI*>(engines_[cur_engine_type_].engine);
160160
if (en->IsSupported("GetModels")) {
161161
en->GetModels(
162162
req->getJsonObject(),
@@ -257,11 +257,13 @@ void server::LoadModel(const HttpRequestPtr& req,
257257

258258
// We have not loaded engine yet, should load it before using it
259259
if (engines_.find(engine_type) == engines_.end()) {
260-
// TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time.
261-
// So need an unload engine machanism to handle.
260+
// We only use a single engine, so unload all engines before loading a new one
261+
UnloadEngines();
262262
auto get_engine_path = [](std::string_view e) {
263263
if (e == kLlamaEngine) {
264264
return cortex_utils::kLlamaLibPath;
265+
} else if(e == kOnnxEngine) {
266+
return cortex_utils::kOnnxLibPath;
265267
}
266268
return cortex_utils::kLlamaLibPath;
267269
};
@@ -292,6 +294,7 @@ void server::LoadModel(const HttpRequestPtr& req,
292294
callback(resp);
293295
return;
294296
}
297+
cur_engine_type_ = engine_type;
295298

296299
auto func =
297300
engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
@@ -358,4 +361,14 @@ bool server::IsEngineLoaded(const std::string& e) {
358361
return engines_.find(e) != engines_.end();
359362
}
360363

364+
void server::UnloadEngines() {
365+
// We unload all engines except the python engine
366+
for (auto it = engines_.begin(); it != engines_.end();) {
367+
if (it->first != kPythonRuntimeEngine) {
368+
it = engines_.erase(it);
369+
} else
370+
it++;
371+
}
372+
}
373+
361374
} // namespace inferences

cortex-cpp/controllers/server.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ class server : public drogon::HttpController<server>,
9999
SyncQueue& q);
100100
bool IsEngineLoaded(const std::string& e);
101101

102+
void UnloadEngines();
103+
102104
private:
103105
struct SyncQueue {
104106
void push(std::pair<Json::Value, Json::Value>&& p) {
@@ -145,5 +147,6 @@ class server : public drogon::HttpController<server>,
145147
EngineV engine;
146148
};
147149
std::unordered_map<std::string, EngineInfo> engines_;
150+
std::string cur_engine_type_;
148151
};
149152
}; // namespace inferences

cortex-cpp/utils/cortex_utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
namespace cortex_utils {
2828
constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp";
2929
constexpr static auto kPythonRuntimeLibPath = "/engines/cortex.python";
30+
constexpr static auto kOnnxLibPath = "/engines/cortex.onnx";
3031

3132
inline std::string models_folder = "./models";
3233

0 commit comments

Comments
 (0)