@@ -1775,288 +1775,6 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
17751775 " LLaVA.\n " );
17761776 printf (" \n " );
17771777}
1778-
1779- static void server_params_parse (int argc, char **argv, server_params &sparams,
1780- gpt_params ¶ms,
1781- llama_server_context &llama) {
1782- gpt_params default_params;
1783- server_params default_sparams;
1784- std::string arg;
1785- bool invalid_param = false ;
1786-
1787- for (int i = 1 ; i < argc; i++) {
1788- arg = argv[i];
1789- if (arg == " --port" ) {
1790- if (++i >= argc) {
1791- invalid_param = true ;
1792- break ;
1793- }
1794- sparams.port = std::stoi (argv[i]);
1795- } else if (arg == " --host" ) {
1796- if (++i >= argc) {
1797- invalid_param = true ;
1798- break ;
1799- }
1800- sparams.hostname = argv[i];
1801- } else if (arg == " --path" ) {
1802- if (++i >= argc) {
1803- invalid_param = true ;
1804- break ;
1805- }
1806- sparams.public_path = argv[i];
1807- } else if (arg == " --timeout" || arg == " -to" ) {
1808- if (++i >= argc) {
1809- invalid_param = true ;
1810- break ;
1811- }
1812- sparams.read_timeout = std::stoi (argv[i]);
1813- sparams.write_timeout = std::stoi (argv[i]);
1814- } else if (arg == " -m" || arg == " --model" ) {
1815- if (++i >= argc) {
1816- invalid_param = true ;
1817- break ;
1818- }
1819- params.model = argv[i];
1820- } else if (arg == " -a" || arg == " --alias" ) {
1821- if (++i >= argc) {
1822- invalid_param = true ;
1823- break ;
1824- }
1825- params.model_alias = argv[i];
1826- } else if (arg == " -h" || arg == " --help" ) {
1827- server_print_usage (argv[0 ], default_params, default_sparams);
1828- exit (0 );
1829- } else if (arg == " -c" || arg == " --ctx-size" || arg == " --ctx_size" ) {
1830- if (++i >= argc) {
1831- invalid_param = true ;
1832- break ;
1833- }
1834- params.n_ctx = std::stoi (argv[i]);
1835- } else if (arg == " --rope-scaling" ) {
1836- if (++i >= argc) {
1837- invalid_param = true ;
1838- break ;
1839- }
1840- std::string value (argv[i]);
1841- /* */ if (value == " none" ) {
1842- params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE;
1843- } else if (value == " linear" ) {
1844- params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
1845- } else if (value == " yarn" ) {
1846- params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
1847- } else {
1848- invalid_param = true ;
1849- break ;
1850- }
1851- } else if (arg == " --rope-freq-base" ) {
1852- if (++i >= argc) {
1853- invalid_param = true ;
1854- break ;
1855- }
1856- params.rope_freq_base = std::stof (argv[i]);
1857- } else if (arg == " --rope-freq-scale" ) {
1858- if (++i >= argc) {
1859- invalid_param = true ;
1860- break ;
1861- }
1862- params.rope_freq_scale = std::stof (argv[i]);
1863- } else if (arg == " --yarn-ext-factor" ) {
1864- if (++i >= argc) {
1865- invalid_param = true ;
1866- break ;
1867- }
1868- params.yarn_ext_factor = std::stof (argv[i]);
1869- } else if (arg == " --yarn-attn-factor" ) {
1870- if (++i >= argc) {
1871- invalid_param = true ;
1872- break ;
1873- }
1874- params.yarn_attn_factor = std::stof (argv[i]);
1875- } else if (arg == " --yarn-beta-fast" ) {
1876- if (++i >= argc) {
1877- invalid_param = true ;
1878- break ;
1879- }
1880- params.yarn_beta_fast = std::stof (argv[i]);
1881- } else if (arg == " --yarn-beta-slow" ) {
1882- if (++i >= argc) {
1883- invalid_param = true ;
1884- break ;
1885- }
1886- params.yarn_beta_slow = std::stof (argv[i]);
1887- } else if (arg == " --memory-f32" || arg == " --memory_f32" ) {
1888- params.memory_f16 = false ;
1889- } else if (arg == " --threads" || arg == " -t" ) {
1890- if (++i >= argc) {
1891- invalid_param = true ;
1892- break ;
1893- }
1894- params.n_threads = std::stoi (argv[i]);
1895- } else if (arg == " --threads-batch" || arg == " -tb" ) {
1896- if (++i >= argc) {
1897- invalid_param = true ;
1898- break ;
1899- }
1900- params.n_threads_batch = std::stoi (argv[i]);
1901- } else if (arg == " -b" || arg == " --batch-size" ) {
1902- if (++i >= argc) {
1903- invalid_param = true ;
1904- break ;
1905- }
1906- params.n_batch = std::stoi (argv[i]);
1907- params.n_batch = std::min (512 , params.n_batch );
1908- } else if (arg == " --gpu-layers" || arg == " -ngl" ||
1909- arg == " --n-gpu-layers" ) {
1910- if (++i >= argc) {
1911- invalid_param = true ;
1912- break ;
1913- }
1914- #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
1915- params.n_gpu_layers = std::stoi (argv[i]);
1916- #else
1917- LOG_WARNING_LLAMA (
1918- " Not compiled with GPU offload support, --n-gpu-layers option will "
1919- " be ignored. "
1920- " See main README.md for information on enabling GPU BLAS support" ,
1921- {{" n_gpu_layers" , params.n_gpu_layers }});
1922- #endif
1923- } else if (arg == " --tensor-split" || arg == " -ts" ) {
1924- if (++i >= argc) {
1925- invalid_param = true ;
1926- break ;
1927- }
1928- #ifdef GGML_USE_CUBLAS
1929- std::string arg_next = argv[i];
1930-
1931- // split string by , and /
1932- const std::regex regex{R"( [,/]+)" };
1933- std::sregex_token_iterator it{arg_next.begin (), arg_next.end (), regex,
1934- -1 };
1935- std::vector<std::string> split_arg{it, {}};
1936- GGML_ASSERT (split_arg.size () <= LLAMA_MAX_DEVICES);
1937-
1938- for (size_t i_device = 0 ; i_device < LLAMA_MAX_DEVICES; ++i_device) {
1939- if (i_device < split_arg.size ()) {
1940- params.tensor_split [i_device] = std::stof (split_arg[i_device]);
1941- } else {
1942- params.tensor_split [i_device] = 0 .0f ;
1943- }
1944- }
1945- #else
1946- LOG_WARNING_LLAMA (" llama.cpp was compiled without cuBLAS. It is not "
1947- " possible to set a tensor split.\n " ,
1948- {});
1949- #endif // GGML_USE_CUBLAS
1950- } else if (arg == " --no-mul-mat-q" || arg == " -nommq" ) {
1951- #ifdef GGML_USE_CUBLAS
1952- params.mul_mat_q = false ;
1953- #else
1954- LOG_WARNING_LLAMA (" warning: llama.cpp was compiled without cuBLAS. "
1955- " Disabling mul_mat_q kernels has no effect.\n " ,
1956- {});
1957- #endif // GGML_USE_CUBLAS
1958- } else if (arg == " --main-gpu" || arg == " -mg" ) {
1959- if (++i >= argc) {
1960- invalid_param = true ;
1961- break ;
1962- }
1963- #ifdef GGML_USE_CUBLAS
1964- params.main_gpu = std::stoi (argv[i]);
1965- #else
1966- LOG_WARNING_LLAMA (" llama.cpp was compiled without cuBLAS. It is not "
1967- " possible to set a main GPU." ,
1968- {});
1969- #endif
1970- } else if (arg == " --lora" ) {
1971- if (++i >= argc) {
1972- invalid_param = true ;
1973- break ;
1974- }
1975- params.lora_adapter .push_back (std::make_tuple (argv[i], 1 .0f ));
1976- params.use_mmap = false ;
1977- } else if (arg == " --lora-scaled" ) {
1978- if (++i >= argc) {
1979- invalid_param = true ;
1980- break ;
1981- }
1982- const char *lora_adapter = argv[i];
1983- if (++i >= argc) {
1984- invalid_param = true ;
1985- break ;
1986- }
1987- params.lora_adapter .push_back (
1988- std::make_tuple (lora_adapter, std::stof (argv[i])));
1989- params.use_mmap = false ;
1990- } else if (arg == " --lora-base" ) {
1991- if (++i >= argc) {
1992- invalid_param = true ;
1993- break ;
1994- }
1995- params.lora_base = argv[i];
1996- } else if (arg == " -v" || arg == " --verbose" ) {
1997- #if SERVER_VERBOSE != 1
1998- LOG_WARNING_LLAMA (" server.cpp is not built with verbose logging." , {});
1999- #else
2000- server_verbose = true ;
2001- #endif
2002- } else if (arg == " --mlock" ) {
2003- params.use_mlock = true ;
2004- } else if (arg == " --no-mmap" ) {
2005- params.use_mmap = false ;
2006- } else if (arg == " --numa" ) {
2007- params.numa = true ;
2008- } else if (arg == " --embedding" ) {
2009- params.embedding = true ;
2010- } else if (arg == " -cb" || arg == " --cont-batching" ) {
2011- params.cont_batching = true ;
2012- } else if (arg == " -np" || arg == " --parallel" ) {
2013- if (++i >= argc) {
2014- invalid_param = true ;
2015- break ;
2016- }
2017- params.n_parallel = std::stoi (argv[i]);
2018- } else if (arg == " -n" || arg == " --n-predict" ) {
2019- if (++i >= argc) {
2020- invalid_param = true ;
2021- break ;
2022- }
2023- params.n_predict = std::stoi (argv[i]);
2024- } else if (arg == " -spf" || arg == " --system-prompt-file" ) {
2025- if (++i >= argc) {
2026- invalid_param = true ;
2027- break ;
2028- }
2029- std::ifstream file (argv[i]);
2030- if (!file) {
2031- fprintf (stderr, " error: failed to open file '%s'\n " , argv[i]);
2032- invalid_param = true ;
2033- break ;
2034- }
2035- std::string systm_content;
2036- std::copy (std::istreambuf_iterator<char >(file),
2037- std::istreambuf_iterator<char >(),
2038- std::back_inserter (systm_content));
2039- llama.process_system_prompt_data (json::parse (systm_content));
2040- } else if (arg == " --mmproj" ) {
2041- if (++i >= argc) {
2042- invalid_param = true ;
2043- break ;
2044- }
2045- params.mmproj = argv[i];
2046- } else {
2047- fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
2048- server_print_usage (argv[0 ], default_params, default_sparams);
2049- exit (1 );
2050- }
2051- }
2052-
2053- if (invalid_param) {
2054- fprintf (stderr, " error: invalid parameter for argument: %s\n " , arg.c_str ());
2055- server_print_usage (argv[0 ], default_params, default_sparams);
2056- exit (1 );
2057- }
2058- }
2059-
20601778static json
20611779format_partial_response (llama_server_context &llama, llama_client_slot *slot,
20621780 const std::string &content,
@@ -2150,12 +1868,17 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
21501868
21511869 // Openai compatible path
21521870 ADD_METHOD_TO (llamaCPP::chatCompletion, " /v1/chat/completions" , Post);
1871+ ADD_METHOD_TO (llamaCPP::chatCompletionPrelight, " /v1/chat/completions" ,
1872+ Options);
1873+
21531874 ADD_METHOD_TO (llamaCPP::embedding, " /v1/embeddings" , Post);
21541875
21551876 // PATH_ADD("/llama/chat_completion", Post);
21561877 METHOD_LIST_END
21571878 void chatCompletion (const HttpRequestPtr &req,
21581879 std::function<void (const HttpResponsePtr &)> &&callback);
1880+ void chatCompletionPrelight (const HttpRequestPtr &req,
1881+ std::function<void (const HttpResponsePtr &)> &&callback);
21591882 void embedding (const HttpRequestPtr &req,
21601883 std::function<void (const HttpResponsePtr &)> &&callback);
21611884 void loadModel (const HttpRequestPtr &req,
0 commit comments