@@ -16,6 +16,7 @@ namespace inferences {
 namespace {
 constexpr static auto kLlamaEngine = "cortex.llamacpp";
 constexpr static auto kPythonRuntimeEngine = "cortex.python";
+constexpr static auto kOnnxEngine = "cortex.onnx";
 }  // namespace
 
 server::server(){
@@ -32,7 +33,7 @@ void server::ChatCompletion(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
@@ -91,7 +92,7 @@ void server::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
@@ -118,7 +119,7 @@ void server::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
-      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+      (*(req->getJsonObject())).get("engine", cur_engine_type_).asString();
   if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
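
Note: the hunks above change the fallback for the request's "engine" field from the hard-coded kLlamaEngine to the most recently loaded engine (cur_engine_type_). Below is a minimal standalone sketch of how jsoncpp's Value::get resolves that default; the request body and the current-engine value here are purely illustrative, not taken from the server code.

#include <json/json.h>

#include <iostream>
#include <string>

int main() {
  Json::Value body;  // request body without an "engine" field
  std::string cur_engine_type = "cortex.onnx";  // hypothetical current engine

  // get() returns the stored value when the key exists, otherwise the supplied default.
  std::cout << body.get("engine", cur_engine_type).asString() << "\n";  // cortex.onnx

  body["engine"] = "cortex.llamacpp";
  std::cout << body.get("engine", cur_engine_type).asString() << "\n";  // cortex.llamacpp
  return 0;
}
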
@@ -144,8 +145,7 @@ void server::ModelStatus(
 
 void server::GetModels(const HttpRequestPtr& req,
                        std::function<void(const HttpResponsePtr&)>&& callback) {
-  // TODO(sang) need to change this when we support Tensorrt-llm
-  if (!IsEngineLoaded(kLlamaEngine)) {
+  if (!IsEngineLoaded(cur_engine_type_)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -156,7 +156,7 @@ void server::GetModels(const HttpRequestPtr& req,
   }
 
   LOG_TRACE << "Start to get models";
-  auto& en = std::get<EngineI*>(engines_[kLlamaEngine].engine);
+  auto& en = std::get<EngineI*>(engines_[cur_engine_type_].engine);
   if (en->IsSupported("GetModels")) {
     en->GetModels(
         req->getJsonObject(),
@@ -257,11 +257,13 @@ void server::LoadModel(const HttpRequestPtr& req,
 
   // We have not loaded engine yet, should load it before using it
   if (engines_.find(engine_type) == engines_.end()) {
-    // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time.
-    // So need an unload engine machanism to handle.
+    // We only use a single engine, so unload all loaded engines before loading a new one
+    UnloadEngines();
     auto get_engine_path = [](std::string_view e) {
       if (e == kLlamaEngine) {
         return cortex_utils::kLlamaLibPath;
+      } else if (e == kOnnxEngine) {
+        return cortex_utils::kOnnxLibPath;
       }
       return cortex_utils::kLlamaLibPath;
     };
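
The get_engine_path lambda now routes cortex.onnx to its own library path, and anything unrecognized still falls back to the llama.cpp path. A minimal sketch of the same dispatch, using placeholder paths instead of the real cortex_utils constants:

#include <iostream>
#include <string_view>

// Placeholder constants for illustration; the server uses cortex_utils::kLlamaLibPath / kOnnxLibPath.
constexpr auto kLlamaEngine = "cortex.llamacpp";
constexpr auto kOnnxEngine = "cortex.onnx";
constexpr auto kLlamaLibPath = "./engines/cortex.llamacpp";
constexpr auto kOnnxLibPath = "./engines/cortex.onnx";

int main() {
  auto get_engine_path = [](std::string_view e) {
    if (e == kLlamaEngine) {
      return kLlamaLibPath;
    } else if (e == kOnnxEngine) {
      return kOnnxLibPath;
    }
    return kLlamaLibPath;  // unknown engine names fall back to the llama.cpp library
  };
  std::cout << get_engine_path(kOnnxEngine) << "\n";      // ./engines/cortex.onnx
  std::cout << get_engine_path("cortex.python") << "\n";  // ./engines/cortex.llamacpp (fallback)
  return 0;
}
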
@@ -292,6 +294,7 @@ void server::LoadModel(const HttpRequestPtr& req,
     callback(resp);
     return;
   }
+  cur_engine_type_ = engine_type;
 
   auto func =
       engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
@@ -358,4 +361,14 @@ bool server::IsEngineLoaded(const std::string& e) {
   return engines_.find(e) != engines_.end();
 }
 
+void server::UnloadEngines() {
+  // We unload all engines except the python engine
+  for (auto it = engines_.begin(); it != engines_.end();) {
+    if (it->first != kPythonRuntimeEngine) {
+      it = engines_.erase(it);
+    } else {
+      it++;
+    }
+  }
+}
+
 }  // namespace inferences
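
The new UnloadEngines() relies on the standard erase-while-iterating idiom for associative containers: erase() returns the iterator that follows the removed element, so the loop only advances manually when an entry is kept. A self-contained sketch of the same pattern with a hypothetical map (the real engines_ map stores engine handles, not ints):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> engines{
      {"cortex.llamacpp", 1}, {"cortex.onnx", 2}, {"cortex.python", 3}};

  // Drop every engine except the python runtime, mirroring UnloadEngines().
  for (auto it = engines.begin(); it != engines.end();) {
    if (it->first != "cortex.python") {
      it = engines.erase(it);  // erase() returns the iterator following the removed entry
    } else {
      ++it;
    }
  }

  for (const auto& [name, id] : engines) {
    std::cout << name << "\n";  // only cortex.python remains
  }
  return 0;
}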