From 4fc6c2c1bc58338f6d152009b9d4eb3bc49ad405 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 13 Nov 2025 15:36:55 +0100
Subject: [PATCH 1/3] llama: add attn temperature tuning for llama arch (non-iswa)

---
 convert_hf_to_gguf.py       | 13 +++++++++++++
 gguf-py/gguf/constants.py   |  1 +
 gguf-py/gguf/gguf_writer.py |  3 +++
 src/llama-arch.cpp          |  1 +
 src/llama-arch.h            |  1 +
 src/llama-graph.cpp         |  3 +++
 src/llama-hparams.h         |  4 ++--
 src/llama-model.cpp         | 15 +++++++++++++--
 src/models/llama.cpp        | 12 ++++++++++++
 9 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index cc77a3db273e4..3a9f12ba89aac 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2735,6 +2735,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        rope_params = self.hparams.get("rope_parameters")
+        if rope_params is not None and rope_params.get("rope_type") == "yarn":
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # TODO: is this correct?
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+            if "llama_4_scaling_beta" in rope_params:
+                self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         name = name.replace("language_model.", "")
         if "multi_modal_projector" in name or "vision_tower" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 6b4b6c5ab075d..f19f68bdd4049 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -161,6 +161,7 @@ class Attention:
         VALUE_LENGTH_MLA       = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS       = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        TEMPERATURE_SCALE      = "{arch}.attention.temperature_scale"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a051daeeb1341..bbffc939286b4 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -857,6 +857,9 @@ def add_attn_output_scale(self, value: float) -> None:
     def add_attn_temperature_length(self, value: int) -> None:
         self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)
 
+    def add_attn_temperature_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b7642b568dffb..c17e374017db5 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -189,6 +189,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,              "%s.attention.scale"              },
     { LLM_KV_ATTENTION_OUTPUT_SCALE,       "%s.attention.output_scale"       },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE,  "%s.attention.temperature_scale"  },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,     "%s.attention.key_length_mla"     },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,   "%s.attention.value_length_mla"   },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a769dd1e85741..cd91b6520996b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -193,6 +193,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 650e40ec6ffce..3138424867aa5 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
 
+        GGML_ASSERT(f_attn_temp_scale != 0.0f);
+        GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
         std::vector<float> attn_scale_data(n_tokens, 0.0f);
         for (int i = 0; i < n_tokens; ++i) {
             const float pos = ubatch->pos[i];
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 9203af83b2e32..270de346d2792 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -162,8 +162,8 @@ struct llama_hparams {
     // llama4 smallthinker
     uint32_t n_moe_layer_step        = 0;
     uint32_t n_no_rope_layer_step    = 4;
-    uint32_t n_attn_temp_floor_scale = 8192;
-    float    f_attn_temp_scale       = 0.1;
+    uint32_t n_attn_temp_floor_scale = 0;
+    float    f_attn_temp_scale       = 0.0f;
 
     // gemma3n altup
     uint32_t n_altup = 4; // altup_num_inputs
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 829f1e3c14f82..2d319ecfb291d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -627,6 +627,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_LLAMA:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
 
                 if (hparams.n_expert == 8) {
                     switch (hparams.n_layer) {
@@ -663,8 +672,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                 }
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index ab7fd5d050866..e66ad1acdfef5 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -14,6 +14,12 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     auto * inp_attn = build_attn_inp_kv();
 
     const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -73,6 +79,12 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             if (hparams.use_kq_norm) {
                 // Llama4TextL2Norm
                 Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);

From 13369dda7cc03198d16594ba747695cf2489b14b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 17 Nov 2025 21:55:14 +0100
Subject: [PATCH 2/3] update conversion script

---
 convert_hf_to_gguf.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 224f6895f174e..f86a1c3f60469 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1581,10 +1581,27 @@ def __init__(self, *args, **kwargs):
 
         # load preprocessor config
         self.preprocessor_config = {}
-        if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+
+        # prefer preprocessor_config.json if possible
+        preprocessor_config_path = self.dir_model / "preprocessor_config.json"
+        if preprocessor_config_path.is_file():
+            with open(preprocessor_config_path, "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
 
+        # prefer processor_config.json if possible
+        processor_config_path = self.dir_model / "processor_config.json"
+        if processor_config_path.is_file():
+            with open(processor_config_path, "r", encoding="utf-8") as f:
+                cfg = json.load(f)
+                # move image_processor to root level for compat
+                if "image_processor" in cfg:
+                    cfg = {
+                        **cfg,
+                        **cfg["image_processor"],
+                    }
+                # merge configs
+                self.preprocessor_config = {**self.preprocessor_config, **cfg}
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)

From bf4ef6deefd7c8566d172ed0877ff32e5343b3c0 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 17 Nov 2025 22:56:08 +0100
Subject: [PATCH 3/3] make sure to use rope_yarn_log_mul

---
 convert_hf_to_gguf.py |  2 +-
 src/llama-model.cpp   |  5 +++++
 src/models/llama.cpp  | 13 +++++++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f86a1c3f60469..6ddf3f44cc24b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2839,7 +2839,7 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
             self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
             self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # TODO: is this correct?
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # copied from deepseekv2
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
             if "llama_4_scaling_beta" in rope_params:
                 self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 54968fefefc2b..a0420ea1a16a2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -630,6 +630,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
 
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
                 // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                 if (hparams.f_attn_temp_scale != 0.0f) {
                     hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index e66ad1acdfef5..cf96c49b82098 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -3,6 +3,17 @@
 llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_v;
 
+    float attn_factor = this->attn_factor;
+    float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    // copied from deepseekv2
+    // TODO: clean it up later
+    if (hparams.rope_yarn_log_mul != 0.0f) {
+        float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+        kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+        attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    }
+
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
     GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -22,8 +33,6 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
     auto * inp_attn = build_attn_inp_kv();
 
-    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {