@@ -293,20 +293,38 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false, -1);
+  int task_id;
+
+  if (llama.params.n_parallel == 1) {
+    while (true) {
+      if (!single_queue_is_busy) {
+        task_id = llama.request_completion(data, false, false, -1);
+        single_queue_is_busy = true;
+        break;
+      } else {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(500));  // Sleep for 500 milliseconds
+      }
+    }
+  } else {
+    task_id = llama.request_completion(data, false, false, -1);
+  }
+
   LOG_INFO << "Resolved request for task_id:" << task_id;
 
   if (is_streamed) {
     auto state = createState(task_id, this);
 
     auto chunked_content_provider =
-        [state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
+        [this, state](char *pBuffer, std::size_t nBuffSize) -> std::size_t {
           if (!pBuffer) {
             LOG_INFO << "Connection closed or buffer is null. Reset context";
             state->instance->llama.request_cancel(state->task_id);
+            single_queue_is_busy = false;
             return 0;
           }
           if (state->isStopped) {
+            single_queue_is_busy = false;
             return 0;
           }
 
@@ -339,8 +357,10 @@ void llamaCPP::chatCompletion(
             }
             return nRead;
           } else {
+            single_queue_is_busy = false;
             return 0;
           }
+          single_queue_is_busy = false;
           return 0;
         };
     auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
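
For reference, below is a minimal standalone sketch (not the nitro code itself) of the single-slot busy-flag pattern this diff introduces: when `n_parallel == 1`, a new request waits until `single_queue_is_busy` is clear before calling `request_completion`, and every exit path of the streaming callback clears the flag again. The helper names (`request_completion_stub`, `acquire_single_slot`, `release_single_slot`) are hypothetical, and the sketch uses `std::atomic<bool>::exchange` so the claim itself is race-free, whereas the patch polls and sets a plain member flag with a 500 ms sleep.

```cpp
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Hypothetical stand-ins; in the patch the flag and the completion call
// are members of llamaCPP and llama::server respectively.
std::atomic<bool> single_queue_is_busy{false};

int request_completion_stub(int request_id) {
  // Placeholder for llama.request_completion(data, false, false, -1).
  return request_id;
}

int acquire_single_slot(int request_id) {
  // Poll every 500 ms until the single slot is free, then claim it.
  // exchange(true) returns the previous value, so only one caller can
  // observe "false" and win the slot.
  while (single_queue_is_busy.exchange(true)) {
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
  }
  return request_completion_stub(request_id);
}

void release_single_slot() {
  // Mirrors the `single_queue_is_busy = false;` lines in the streaming
  // callback: run this on every exit path (stop, error, connection closed).
  single_queue_is_busy.store(false);
}

int main() {
  int task_id = acquire_single_slot(42);
  std::cout << "claimed single slot for task " << task_id << "\n";
  release_single_slot();
  return 0;
}
```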