
Commit 33c9540

Merge pull request #357 from janhq/215-epic-queue-system

215 epic queue system

2 parents: 18575c3 + 8edf8ae

2 files changed: 25 additions & 3 deletions

controllers/llamaCPP.cc

Lines changed: 22 additions & 2 deletions
@@ -293,20 +293,38 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false, -1);
+  int task_id;
+
+  if (llama.params.n_parallel == 1) {
+    while (true) {
+      if (!single_queue_is_busy) {
+        task_id = llama.request_completion(data, false, false, -1);
+        single_queue_is_busy = true;
+        break;
+      } else {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(500)); // Sleep for 500 milliseconds
+      }
+    }
+  } else {
+    task_id = llama.request_completion(data, false, false, -1);
+  }
+
   LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = createState(task_id, this);
 
     auto chunked_content_provider =
-        [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!pBuffer) {
         LOG_INFO << "Connection closed or buffer is null. Reset context";
         state->instance->llama.request_cancel(state->task_id);
+        single_queue_is_busy = false;
         return 0;
       }
       if (state->isStopped) {
+        single_queue_is_busy = false;
         return 0;
       }
 
@@ -339,8 +357,10 @@ void llamaCPP::chatCompletion(
       }
       return nRead;
     } else {
+      single_queue_is_busy = false;
      return 0;
     }
+    single_queue_is_busy = false;
     return 0;
   };
   auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
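For context, the gate above is a single-slot lock: when n_parallel is 1, each request spins until single_queue_is_busy is clear, claims the slot, and the streaming callback clears it again on completion, stop, or client disconnect. Below is a minimal standalone sketch of the same pattern; everything except the single_queue_is_busy name is illustrative, and it uses compare_exchange_strong instead of the patch's separate check-then-set, which removes the small window where two waiters could both observe the flag as free.

#include <atomic>
#include <chrono>
#include <thread>

std::atomic<bool> single_queue_is_busy{false};  // one completion slot

// Spin until the slot is free, then claim it in a single atomic step.
void acquire_single_slot() {
  bool expected = false;
  while (!single_queue_is_busy.compare_exchange_strong(expected, true)) {
    expected = false;  // compare_exchange overwrites `expected` on failure
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
}

// Clear the flag so the next waiting request can proceed.
void release_single_slot() { single_queue_is_busy = false; }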

controllers/llamaCPP.h

Lines changed: 3 additions & 1 deletion
@@ -2560,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  //std::atomic<bool> model_loaded = false;
+  // std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
@@ -2572,5 +2572,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   bool caching_enabled;
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
+  std::atomic<bool> single_queue_is_busy;  // This value is only used when
+                                           // n_parallel is 1
 };
 }; // namespace inferences
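One detail worth flagging: the new member is declared without an initializer, and before C++20 a default-constructed std::atomic holds an indeterminate value (value-initialization was only mandated by P0883). A hypothetical hardening, not part of this commit, would brace-initialize the flag so the first request can never observe a stale "busy" state:

std::atomic<bool> single_queue_is_busy{false};  // starts free; only used when n_parallel is 1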
