This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit d2e4ac5

Merge pull request #244 from tikikun/main

version bump

2 parents 531a713 + a1c6b97

File tree

3 files changed: +20 -284 lines

controllers/llamaCPP.cc

Lines changed: 14 additions & 1 deletion
@@ -145,6 +145,17 @@ void llamaCPP::warmupModel() {
   return;
 }
 
+void llamaCPP::chatCompletionPrelight(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+  auto resp = drogon::HttpResponse::newHttpResponse();
+  resp->setStatusCode(drogon::HttpStatusCode::k200OK);
+  resp->addHeader("Access-Control-Allow-Origin", "*");
+  resp->addHeader("Access-Control-Allow-Methods", "POST, OPTIONS");
+  resp->addHeader("Access-Control-Allow-Headers", "*");
+  callback(resp);
+}
+
 void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
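The handler above answers any OPTIONS request on the route with 200 and blanket CORS headers. As a quick sanity check, the route can be exercised with Drogon's own HttpClient. This is a minimal sketch, not part of the commit; it assumes a Nitro server is already listening on localhost:3928 (the host and port are assumptions, adjust to your build):

// Hypothetical smoke test for the new preflight route (not in this commit).
#include <drogon/drogon.h>
#include <iostream>

int main() {
  // Assumption: a Nitro server is listening locally on port 3928.
  auto client = drogon::HttpClient::newHttpClient("http://localhost:3928");
  auto req = drogon::HttpRequest::newHttpRequest();
  req->setMethod(drogon::Options);
  req->setPath("/v1/chat/completions");
  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok) {
      // chatCompletionPrelight should reply 200 with permissive CORS headers.
      std::cout << "status: " << resp->statusCode() << "\n"
                << "allow-origin: "
                << resp->getHeader("Access-Control-Allow-Origin") << "\n";
    }
    drogon::app().quit();  // stop the event loop once the check completes
  });
  drogon::app().run();  // drive the client's I/O loop
  return 0;
}

Note that Access-Control-Allow-Origin: * lets any web page call the endpoint; that is a reasonable default for a local inference server, but worth keeping in mind if the port is ever exposed.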
@@ -454,7 +465,9 @@ void llamaCPP::backgroundTask() {
     // model_loaded =
     llama.update_slots();
   }
-  LOG_INFO << "Background task stopped!";
+  LOG_INFO << "Background task stopped! ";
+  llama.kv_cache_clear();
+  LOG_INFO << "KV cache cleared!";
   return;
 }
 
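Clearing the KV cache as the background task exits means no attention-cache state survives a stop/start cycle; presumably this keeps an unload followed by a fresh model load from seeing stale slot data. (llama here is the embedded llama.cpp server context, which provides kv_cache_clear.)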

controllers/llamaCPP.h

Lines changed: 5 additions & 282 deletions
@@ -1775,288 +1775,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
          "LLaVA.\n");
   printf("\n");
 }
-
-static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params,
-                                llama_server_context &llama) {
-  gpt_params default_params;
-  server_params default_sparams;
-  std::string arg;
-  bool invalid_param = false;
-
-  for (int i = 1; i < argc; i++) {
-    arg = argv[i];
-    if (arg == "--port") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.port = std::stoi(argv[i]);
-    } else if (arg == "--host") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.hostname = argv[i];
-    } else if (arg == "--path") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.public_path = argv[i];
-    } else if (arg == "--timeout" || arg == "-to") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.read_timeout = std::stoi(argv[i]);
-      sparams.write_timeout = std::stoi(argv[i]);
-    } else if (arg == "-m" || arg == "--model") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model = argv[i];
-    } else if (arg == "-a" || arg == "--alias") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model_alias = argv[i];
-    } else if (arg == "-h" || arg == "--help") {
-      server_print_usage(argv[0], default_params, default_sparams);
-      exit(0);
-    } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_ctx = std::stoi(argv[i]);
-    } else if (arg == "--rope-scaling") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      std::string value(argv[i]);
-      /**/ if (value == "none") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE;
-      } else if (value == "linear") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
-      } else if (value == "yarn") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
-      } else {
-        invalid_param = true;
-        break;
-      }
-    } else if (arg == "--rope-freq-base") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_base = std::stof(argv[i]);
-    } else if (arg == "--rope-freq-scale") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_scale = std::stof(argv[i]);
-    } else if (arg == "--yarn-ext-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_ext_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-attn-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_attn_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-fast") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_fast = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-slow") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_slow = std::stof(argv[i]);
-    } else if (arg == "--memory-f32" || arg == "--memory_f32") {
-      params.memory_f16 = false;
-    } else if (arg == "--threads" || arg == "-t") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads = std::stoi(argv[i]);
-    } else if (arg == "--threads-batch" || arg == "-tb") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads_batch = std::stoi(argv[i]);
-    } else if (arg == "-b" || arg == "--batch-size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_batch = std::stoi(argv[i]);
-      params.n_batch = std::min(512, params.n_batch);
-    } else if (arg == "--gpu-layers" || arg == "-ngl" ||
-               arg == "--n-gpu-layers") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-      params.n_gpu_layers = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA(
-          "Not compiled with GPU offload support, --n-gpu-layers option will "
-          "be ignored. "
-          "See main README.md for information on enabling GPU BLAS support",
-          {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
-    } else if (arg == "--tensor-split" || arg == "-ts") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      std::string arg_next = argv[i];
-
-      // split string by , and /
-      const std::regex regex{R"([,/]+)"};
-      std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
-                                    -1};
-      std::vector<std::string> split_arg{it, {}};
-      GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
-      for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
-        if (i_device < split_arg.size()) {
-          params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-        } else {
-          params.tensor_split[i_device] = 0.0f;
-        }
-      }
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
-                        "possible to set a tensor split.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-      params.mul_mat_q = false;
-#else
-      LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. "
-                        "Disabling mul_mat_q kernels has no effect.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--main-gpu" || arg == "-mg") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      params.main_gpu = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
-                        "possible to set a main GPU.",
-                        {});
-#endif
-    } else if (arg == "--lora") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
-      params.use_mmap = false;
-    } else if (arg == "--lora-scaled") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      const char *lora_adapter = argv[i];
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_adapter.push_back(
-          std::make_tuple(lora_adapter, std::stof(argv[i])));
-      params.use_mmap = false;
-    } else if (arg == "--lora-base") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.lora_base = argv[i];
-    } else if (arg == "-v" || arg == "--verbose") {
-#if SERVER_VERBOSE != 1
-      LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {});
-#else
-      server_verbose = true;
-#endif
-    } else if (arg == "--mlock") {
-      params.use_mlock = true;
-    } else if (arg == "--no-mmap") {
-      params.use_mmap = false;
-    } else if (arg == "--numa") {
-      params.numa = true;
-    } else if (arg == "--embedding") {
-      params.embedding = true;
-    } else if (arg == "-cb" || arg == "--cont-batching") {
-      params.cont_batching = true;
-    } else if (arg == "-np" || arg == "--parallel") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_parallel = std::stoi(argv[i]);
-    } else if (arg == "-n" || arg == "--n-predict") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_predict = std::stoi(argv[i]);
-    } else if (arg == "-spf" || arg == "--system-prompt-file") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      std::ifstream file(argv[i]);
-      if (!file) {
-        fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-        invalid_param = true;
-        break;
-      }
-      std::string systm_content;
-      std::copy(std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(systm_content));
-      llama.process_system_prompt_data(json::parse(systm_content));
-    } else if (arg == "--mmproj") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.mmproj = argv[i];
-    } else {
-      fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-      server_print_usage(argv[0], default_params, default_sparams);
-      exit(1);
-    }
-  }
-
-  if (invalid_param) {
-    fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-    server_print_usage(argv[0], default_params, default_sparams);
-    exit(1);
-  }
-}
-
 static json
 format_partial_response(llama_server_context &llama, llama_client_slot *slot,
                         const std::string &content,
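This removes server_params_parse wholesale, the last of the argv-driven configuration inherited from llama.cpp's server example. As a Drogon controller, Nitro receives model settings over HTTP (note the loadModel handler declared in the class below), so these flags presumably no longer had any caller; the deletion accounts for 282 of the commit's 284 removed lines.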
@@ -2150,12 +1868,17 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
   // Openai compatible path
   ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post);
+  ADD_METHOD_TO(llamaCPP::chatCompletionPrelight, "/v1/chat/completions",
+                Options);
+
   ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);
 
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
+  void chatCompletionPrelight(const HttpRequestPtr &req,
+                              std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
   void loadModel(const HttpRequestPtr &req,
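Registering /v1/chat/completions twice, once for Post and once for Options, is the point of the change: before a cross-origin POST with a JSON body, browsers send an OPTIONS preflight, and a path registered only for Post would typically get a 405 Method Not Allowed back, blocking web clients from ever reaching the completion endpoint. The client sketch under the llamaCPP.cc hunk above shows one way to exercise the new route.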

llama.cpp

No text diff shown; given the totals above, this file accounts for the remaining +1 -1, consistent with a submodule pointer update and the "version bump" commit message.
