
Commit f430916

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   docs/backend/CANN.md
#   docs/multimodal/minicpmo2.6.md
#   docs/multimodal/minicpmv2.5.md
#   docs/multimodal/minicpmv2.6.md
#   examples/speculative-simple/speculative-simple.cpp
#   ggml/cmake/ggml-config.cmake.in
#   ggml/src/ggml-cann/aclnn_ops.cpp
#   ggml/src/ggml-cann/ggml-cann.cpp
#   ggml/src/ggml-cpu/repack.cpp
#   ggml/src/ggml-opencl/CMakeLists.txt
#   ggml/src/ggml-opencl/ggml-opencl.cpp
#   ggml/src/ggml-opencl/kernels/add.cl
#   ggml/src/ggml-opencl/kernels/mul.cl
#   scripts/compare-commits.sh
#   scripts/compare-llama-bench.py
#   scripts/sync-ggml.last
#   tools/server/README.md
2 parents: b04362f + 9c35706


57 files changed: +5954, -657 lines

.devops/cann.Dockerfile

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
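
For context, the Dockerfile above defines three final targets (full, light, server) and takes the SoC model through the ASCEND_SOC_TYPE build argument. A minimal sketch of building and running the server target follows; the image tag, model path, and NPU device/driver mappings are illustrative assumptions, not part of this commit, and must be adjusted to the host's Ascend setup:

docker build -f .devops/cann.Dockerfile --target server \
    --build-arg ASCEND_SOC_TYPE=Ascend910B3 -t llama-cpp-cann:server .

docker run --rm -p 8080:8080 \
    --device /dev/davinci0 --device /dev/davinci_manager \
    --device /dev/devmm_svm --device /dev/hisi_hdc \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
    -v /path/to/models:/models \
    llama-cpp-cann:server -m /models/model.gguf

The container listens on 0.0.0.0:8080 (see LLAMA_ARG_HOST and the HEALTHCHECK above), so the health endpoint is reachable at http://localhost:8080/health once the model is loaded.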

common/arg.cpp

Lines changed: 51 additions & 7 deletions
@@ -979,6 +979,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
    }

    if (!params.kv_overrides.empty()) {
@@ -2093,6 +2097,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.no_kv_offload = true;
        }
    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
        string_format(
@@ -2371,6 +2382,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
@@ -3251,6 +3271,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
@@ -3440,34 +3467,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

-    // diffusion parameters
    add_opt(common_arg(
        { "--diffusion-steps" }, "N",
        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
        [](common_params & params, int value) { params.diffusion.steps = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
    add_opt(common_arg(
        { "--diffusion-eps" }, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                      params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
    add_opt(common_arg(
-        { "--diffusion-visual" },
-        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion.visual_mode ? "true" : "false"),
-        [](common_params & params) { params.diffusion.visual_mode = true; }
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+

    return ctx_arg;
}
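
A rough sketch of how the new options combine with the existing tools; the model paths and the replacement strings below are placeholders:

# keep MoE expert weights in host memory and skip weight repacking
llama-server -m /models/moe-model.gguf -ngl 99 --cpu-moe --no-repack

# rewrite target-model strings into their draft-model equivalents during speculative decoding
llama-server -m /models/target.gguf -md /models/draft.gguf \
    --spec-replace "<target-string>" "<draft-string>"

The repack and MoE switches are also exposed as environment variables (LLAMA_ARG_NO_REPACK, LLAMA_ARG_CPU_MOE), per the .set_env() calls above.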

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -1130,6 +1130,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;

common/common.h

Lines changed: 13 additions & 5 deletions
@@ -197,6 +197,7 @@ struct common_params_speculative {
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -216,11 +217,17 @@ struct common_params_vocoder {
};

struct common_params_diffusion {
-    int32_t steps = 64; // number of diffusion steps
-    float eps = 1e-3f; // epsilon for timesteps
-    int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
-    float alg_temp = 0.0f; // algorithm temperature
-    bool visual_mode = false; // show progressive diffusion on screen
+    int32_t steps = 128;
+    bool visual_mode = false;
+
+    float eps = 0; // epsilon for timesteps
+    int32_t block_length = 0; // block length for generation
+
+    int32_t algorithm = 4; // default algorithm: low-confidence
+    float alg_temp = 0.0f; // algorithm temperature
+
+    float cfg_scale = 0; // classifier-free guidance scale
+    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};

enum common_reasoning_format {
@@ -348,6 +355,7 @@ struct common_params {
    bool warmup = true; // warmup run
    bool check_tensors = false; // validate tensor data
    bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

    bool single_turn = false; // single turn chat conversation

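The reworked diffusion defaults (128 steps, low-confidence algorithm) pair with the new flags registered in common/arg.cpp. An illustrative invocation, assuming the diffusion example binary is named llama-diffusion-cli and using a placeholder model path and values:

llama-diffusion-cli -m /models/llada-8b.gguf -p "Write a haiku about autumn." \
    --diffusion-steps 128 --diffusion-block-length 32 \
    --diffusion-algorithm 4 --diffusion-cfg-scale 0.0 --diffusion-visual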