Commit 0fd1657

slaren authored and Nexesenex committed
llama : add --n-cpu-moe option (ggml-org#15077)
* llama : add --n-cpu-moe option

  Keeps the MoE weights of the first N layers in the CPU
1 parent 4f1ee70 commit 0fd1657
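
Usage note (illustrative, not part of the commit): combined with full GPU offload, an invocation such as `llama-server -m model.gguf -ngl 99 --n-cpu-moe 10` would keep the expert (MoE) weights of the first 10 layers in host memory while the rest of the model stays on the GPU; `--n-cpu-moe 0` adds no overrides at all.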

File tree

1 file changed: +23 -7 lines changed


common/arg.cpp

Lines changed: 23 additions & 7 deletions
@@ -26,6 +26,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2374,20 +2375,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
