 using namespace inferences;
 using json = nlohmann::json;

-struct State {
+struct inferenceState {
   bool isStopped = false;
   int task_id;
   llamaCPP *instance;

-  State(int tid, llamaCPP *inst) : task_id(tid), instance(inst) {}
+  inferenceState(int tid, llamaCPP *inst) : task_id(tid), instance(inst) {}
 };

-std::shared_ptr<State> createState(int task_id, llamaCPP *instance) {
-  return std::make_shared<State>(task_id, instance);
+std::shared_ptr<inferenceState> create_inference_state(int task_id,
+                                                        llamaCPP *instance) {
+  return std::make_shared<inferenceState>(task_id, instance);
 }

 // --------------------------------------------
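Note on the factory above: the streaming callback introduced later in this diff captures the state by value and can outlive the request handler, so the per-request state is heap-allocated behind a std::shared_ptr rather than living on the handler's stack. Below is a minimal, self-contained sketch of that lifetime pattern; the names are illustrative and the llamaCPP pointer is dropped, so this is not the actual nitro code:

// Sketch only: why the per-request state is owned by a std::shared_ptr.
// The streaming callback is copied into the response object and may keep
// running after the request handler has returned, so it must share
// ownership of the state instead of pointing at a stack variable.
#include <cstddef>
#include <functional>
#include <memory>

struct inferenceState {
  bool isStopped = false;
  int task_id;
  explicit inferenceState(int tid) : task_id(tid) {}
};

std::shared_ptr<inferenceState> create_inference_state(int task_id) {
  return std::make_shared<inferenceState>(task_id);
}

int main() {
  auto state = create_inference_state(42);

  // The lambda copies the shared_ptr, so the state stays alive for as long
  // as any holder of this callable does (in the real server: the response).
  std::function<std::size_t(char *, std::size_t)> provider =
      [state](char *pBuffer, std::size_t /*nBuffSize*/) -> std::size_t {
        if (!pBuffer || state->isStopped) {
          return 0; // connection closed or generation stopped: end the stream
        }
        return 0; // the real provider would copy the next chunk into pBuffer
      };

  return static_cast<int>(provider(nullptr, 0));
}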
@@ -295,36 +296,21 @@ void llamaCPP::chatCompletion( |
 #endif
   int task_id;

-  if (llama.params.n_parallel == 1) {
-    while (true) {
-      if (!single_queue_is_busy) {
-        task_id = llama.request_completion(data, false, false, -1);
-        single_queue_is_busy = true;
-        break;
-      } else {
-        std::this_thread::sleep_for(
-            std::chrono::milliseconds(500)); // Sleep for 500 milliseconds
-      }
-    }
-  } else {
   task_id = llama.request_completion(data, false, false, -1);
-  }

   LOG_INFO << "Resolved request for task_id:" << task_id;

   if (is_streamed) {
-    auto state = createState(task_id, this);
+    auto state = create_inference_state(task_id, this);

     auto chunked_content_provider =
         [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
       if (!pBuffer) {
         LOG_INFO << "Connection closed or buffer is null. Reset context";
         state->instance->llama.request_cancel(state->task_id);
-        single_queue_is_busy = false;
         return 0;
       }
       if (state->isStopped) {
-        single_queue_is_busy = false;
         return 0;
       }

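The removed n_parallel == 1 branch above is the core of this change: scheduling used to be done by sleep-polling a shared single_queue_is_busy flag, and every exit path of the streaming callback (including the ones in the next hunk) had to remember to clear it. The diff now calls llama.request_completion unconditionally and lets the server's own task handling deal with concurrency. The following is a deliberately simplified reconstruction of the removed pattern and its hazard, not the actual nitro code:

// Simplified reconstruction of the removed pattern (illustration only).
#include <chrono>
#include <cstddef>
#include <thread>

static bool single_queue_is_busy = false;

// Old n_parallel == 1 path: claim the single slot by polling every 500 ms.
void wait_for_slot() {
  while (single_queue_is_busy) {
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
  single_queue_is_busy = true;
}

// Every exit path of the old provider had to clear the flag; missing one
// (e.g. an early return added later) would leave wait_for_slot() spinning
// forever. Submitting the request straight to llama.request_completion
// removes both the flag and the polling loop.
std::size_t provider_old_style(char *pBuffer) {
  if (!pBuffer) {
    single_queue_is_busy = false; // reset on disconnect ...
    return 0;
  }
  single_queue_is_busy = false; // ... and again on normal completion
  return 0;
}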
@@ -357,10 +343,8 @@ void llamaCPP::chatCompletion( |
         }
         return nRead;
       } else {
-        single_queue_is_busy = false;
         return 0;
       }
-      single_queue_is_busy = false;
       return 0;
     };
     auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
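For reference, the contract the chunked provider above follows (and that nitroStreamResponse is assumed to expect) is: the callable is invoked repeatedly to fill the outgoing buffer; a null buffer means the client disconnected, so the inference task is cancelled; returning 0 ends the stream, and any other value is the number of bytes written. A minimal sketch with stand-in types, not the actual nitro_utils or llama.cpp API:

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <string>

// Stand-in for the llama.cpp server task queue (illustrative only).
struct FakeTaskQueue {
  void cancel(int /*task_id*/) {}
  std::string next_chunk() { return ""; } // empty once generation is done
};

std::size_t fill_chunk(FakeTaskQueue &queue, int task_id, char *pBuffer,
                       std::size_t nBuffSize) {
  if (!pBuffer) {
    queue.cancel(task_id); // client disconnected: free the inference slot
    return 0;
  }
  const std::string chunk = queue.next_chunk();
  if (chunk.empty()) {
    return 0; // nothing left to send: terminates the chunked response
  }
  const std::size_t n = std::min(chunk.size(), nBuffSize);
  std::memcpy(pBuffer, chunk.data(), n);
  return n; // number of bytes actually written into pBuffer
}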