From 4fc6c2c1bc58338f6d152009b9d4eb3bc49ad405 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 13 Nov 2025 15:36:55 +0100
Subject: [PATCH 1/3] llama: add attn temperature tuning for llama arch (non-iswa)

---
 convert_hf_to_gguf.py       | 13 +++++++++++++
 gguf-py/gguf/constants.py   |  1 +
 gguf-py/gguf/gguf_writer.py |  3 +++
 src/llama-arch.cpp          |  1 +
 src/llama-arch.h            |  1 +
 src/llama-graph.cpp         |  3 +++
 src/llama-hparams.h         |  4 ++--
 src/llama-model.cpp         | 15 +++++++++++++--
 src/models/llama.cpp        | 12 ++++++++++++
 9 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index cc77a3db273e4..3a9f12ba89aac 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2735,6 +2735,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        rope_params = self.hparams.get("rope_parameters")
+        if rope_params is not None and rope_params.get("rope_type") == "yarn":
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # TODO: is this correct?
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+            if "llama_4_scaling_beta" in rope_params:
+                self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         name = name.replace("language_model.", "")
         if "multi_modal_projector" in name or "vision_tower" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6b4b6c5ab075d..f19f68bdd4049 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -161,6 +161,7 @@ class Attention:
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        TEMPERATURE_SCALE      = "{arch}.attention.temperature_scale"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a051daeeb1341..bbffc939286b4 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -857,6 +857,9 @@ def add_attn_output_scale(self, value: float) -> None:
     def add_attn_temperature_length(self, value: int) -> None:
         self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)
 
+    def add_attn_temperature_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b7642b568dffb..c17e374017db5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -189,6 +189,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,              "%s.attention.scale"              },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,       "%s.attention.output_scale"       },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE,  "%s.attention.temperature_scale"  },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,     "%s.attention.key_length_mla"     },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,   "%s.attention.value_length_mla"   },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a769dd1e85741..cd91b6520996b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -193,6 +193,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 650e40ec6ffce..3138424867aa5 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 9203af83b2e32..270de346d2792 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -162,8 +162,8 @@ struct llama_hparams {
     // llama4 smallthinker
     uint32_t n_moe_layer_step        = 0;
     uint32_t n_no_rope_layer_step    = 4;
-    uint32_t n_attn_temp_floor_scale = 8192;
-    float    f_attn_temp_scale       = 0.1;
+    uint32_t n_attn_temp_floor_scale = 0;
+    float    f_attn_temp_scale       = 0.0f;
 
     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 829f1e3c14f82..2d319ecfb291d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -627,6 +627,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_LLAMA:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
 
                 if (hparams.n_expert == 8) {
                     switch (hparams.n_layer) {
@@ -663,8 +672,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                 }
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index ab7fd5d050866..e66ad1acdfef5 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -14,6 +14,12 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     auto * inp_attn = build_attn_inp_kv();
 
     const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -73,6 +79,12 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             if (hparams.use_kq_norm) {
                 // Llama4TextL2Norm
                 Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);

From 13369dda7cc03198d16594ba747695cf2489b14b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 17 Nov 2025 21:55:14 +0100
Subject: [PATCH 2/3] update conversion script

---
 convert_hf_to_gguf.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 224f6895f174e..f86a1c3f60469 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1581,10 +1581,27 @@ def __init__(self, *args, **kwargs):
 
         # load preprocessor config
         self.preprocessor_config = {}
-        if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+
+        # prefer preprocessor_config.json if possible
+        preprocessor_config_path = self.dir_model / "preprocessor_config.json"
+        if preprocessor_config_path.is_file():
+            with open(preprocessor_config_path, "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
 
+        # prefer processor_config.json if possible
+        processor_config_path = self.dir_model / "processor_config.json"
+        if processor_config_path.is_file():
+            with open(processor_config_path, "r", encoding="utf-8") as f:
+                cfg = json.load(f)
+                # move image_processor to root level for compat
+                if "image_processor" in cfg:
+                    cfg = {
+                        **cfg,
+                        **cfg["image_processor"],
+                    }
+                # merge configs
+                self.preprocessor_config = {**self.preprocessor_config, **cfg}
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)

From bf4ef6deefd7c8566d172ed0877ff32e5343b3c0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 17 Nov 2025 22:56:08 +0100
Subject: [PATCH 3/3] make sure to use rope_yarn_log_mul

---
 convert_hf_to_gguf.py |  2 +-
 src/llama-model.cpp   |  5 +++++
 src/models/llama.cpp  | 13 +++++++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f86a1c3f60469..6ddf3f44cc24b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2839,7 +2839,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
             self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
             self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # TODO: is this correct?
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # copied from deepseekv2
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
             if "llama_4_scaling_beta" in rope_params:
                 self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 54968fefefc2b..a0420ea1a16a2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -630,6 +630,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
 
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {
                     hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index e66ad1acdfef5..cf96c49b82098 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -3,6 +3,17 @@
 llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
+    float attn_factor = this->attn_factor;
+    float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    // copied from deepseekv2
+    // TODO: clean it up later
+    if (hparams.rope_yarn_log_mul != 0.0f) {
+        float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+        kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+        attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    }
+
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -22,8 +33,6 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
     auto * inp_attn = build_attn_inp_kv();
 
-    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {