@@ -979,6 +979,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
979
979
// Pass every DRY sequence breaker through string_process_escapes. The loop
// binds by reference, so the helper operates on the strings stored in params
// rather than on copies.
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
    string_process_escapes(seq_breaker);
}
982
+ for (auto & pair : params.speculative .replacements ) {
983
+ string_process_escapes (pair.first );
984
+ string_process_escapes (pair.second );
985
+ }
982
986
}
983
987
984
988
if (!params.kv_overrides .empty ()) {
@@ -2093,6 +2097,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2093
2097
params.no_kv_offload = true ;
2094
2098
}
2095
2099
).set_env (" LLAMA_ARG_NO_KV_OFFLOAD" ));
2100
+ add_opt (common_arg (
2101
+ {" -nr" , " --no-repack" },
2102
+ " disable weight repacking" ,
2103
+ [](common_params & params) {
2104
+ params.no_extra_bufts = true ;
2105
+ }
2106
+ ).set_env (" LLAMA_ARG_NO_REPACK" ));
2096
2107
add_opt (common_arg (
2097
2108
{" -ctk" , " --cache-type-k" }, " TYPE" ,
2098
2109
string_format (
@@ -2371,6 +2382,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2371
2382
}
2372
2383
}
2373
2384
));
2385
+ add_opt (common_arg (
2386
+ {" --cpu-moe" },
2387
+ " use CPU for Mixture of Experts (MoE) weights" ,
2388
+ [](common_params & params) {
2389
+ params.tensor_buft_overrides .push_back ({" \\ .ffn_up_exps\\ .weight$" , ggml_backend_cpu_buffer_type ()});
2390
+ params.tensor_buft_overrides .push_back ({" \\ .ffn_down_exps\\ .weight$" , ggml_backend_cpu_buffer_type ()});
2391
+ params.tensor_buft_overrides .push_back ({" \\ .ffn_gate_exps\\ .weight$" , ggml_backend_cpu_buffer_type ()});
2392
+ }
2393
+ ).set_env (" LLAMA_ARG_CPU_MOE" ));
2374
2394
add_opt (common_arg (
2375
2395
{" -ngl" , " --gpu-layers" , " --n-gpu-layers" }, " N" ,
2376
2396
" number of layers to store in VRAM" ,
@@ -3251,6 +3271,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3251
3271
params.speculative .model .path = value;
3252
3272
}
3253
3273
).set_examples ({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env (" LLAMA_ARG_MODEL_DRAFT" ));
3274
+ add_opt (common_arg (
3275
+ {" --spec-replace" }, " TARGET" , " DRAFT" ,
3276
+ " translate the string in TARGET into DRAFT if the draft model and main model are not compatible" ,
3277
+ [](common_params & params, const std::string & tgt, const std::string & dft) {
3278
+ params.speculative .replacements .push_back ({ tgt, dft });
3279
+ }
3280
+ ).set_examples ({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
3254
3281
add_opt (common_arg (
3255
3282
{" -ctkd" , " --cache-type-k-draft" }, " TYPE" ,
3256
3283
string_format (
@@ -3440,34 +3467,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3440
3467
}
3441
3468
).set_examples ({LLAMA_EXAMPLE_SERVER}));
3442
3469
3443
- // diffusion parameters
3444
3470
// --diffusion-steps N: stores the integer argument in params.diffusion.steps.
// The help text embeds the value the field holds when this option is built.
add_opt(common_arg(
    { "--diffusion-steps" }, "N",
    string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
    [](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3475
+ add_opt (common_arg (
3476
+ { " --diffusion-visual" },
3477
+ string_format (" enable visual diffusion mode (show progressive generation) (default: %s)" ,
3478
+ params.diffusion .visual_mode ? " true" : " false" ),
3479
+ [](common_params & params) { params.diffusion .visual_mode = true ; }
3480
+ ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3481
+
3449
3482
// --diffusion-eps F: parses the argument with std::stof and stores it in
// params.diffusion.eps. std::stof throws std::invalid_argument when the text
// does not start with a number; no handling for that is visible here.
add_opt(common_arg(
    { "--diffusion-eps" }, "F",
    string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
    [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3454
3487
// --diffusion-algorithm N: stores the integer argument in
// params.diffusion.algorithm without range checking in this handler. The help
// text enumerates the accepted codes 0..4.
add_opt(common_arg(
    { "--diffusion-algorithm" }, "N",
    string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                  params.diffusion.algorithm),
    [](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3460
3493
// --diffusion-alg-temp F: parses the argument with std::stof and stores it in
// params.diffusion.alg_temp. std::stof throws std::invalid_argument when the
// text does not start with a number; no handling for that is visible here.
add_opt(common_arg(
    { "--diffusion-alg-temp" }, "F",
    string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
    [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3498
+
3465
3499
// --diffusion-block-length N: stores the integer argument in
// params.diffusion.block_length.
add_opt(common_arg(
    { "--diffusion-block-length" }, "N",
    string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
    [](common_params & params, int value) { params.diffusion.block_length = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
// --diffusion-cfg-scale F: parses the argument with std::stof and stores it in
// params.diffusion.cfg_scale. std::stof throws std::invalid_argument when the
// text does not start with a number; no handling for that is visible here.
add_opt(common_arg(
    { "--diffusion-cfg-scale" }, "F",
    string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
    [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3509
+ add_opt (common_arg (
3510
+ { " --diffusion-add-gumbel-noise" }, " F" ,
3511
+ string_format (" add gumbel noise to the logits if temp > 0.0 (default: %s)" , params.diffusion .add_gumbel_noise ? " true" : " false" ),
3512
+ [](common_params & params, const std::string & value) { params.diffusion .add_gumbel_noise = std::stof (value); }
3513
+ ).set_examples ({ LLAMA_EXAMPLE_DIFFUSION }));
3514
+
3471
3515
3472
3516
return ctx_arg;
3473
3517
}
0 commit comments