34 changes: 32 additions & 2 deletions convert_hf_to_gguf.py
@@ -1581,10 +1581,27 @@ def __init__(self, *args, **kwargs):

# load preprocessor config
self.preprocessor_config = {}
if not self.is_mistral_format:
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:

# prefer preprocessor_config.json if possible
preprocessor_config_path = self.dir_model / "preprocessor_config.json"
if preprocessor_config_path.is_file():
with open(preprocessor_config_path, "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)

# prefer processor_config.json if possible
processor_config_path = self.dir_model / "processor_config.json"
if processor_config_path.is_file():
with open(processor_config_path, "r", encoding="utf-8") as f:
cfg = json.load(f)
# move image_processor to root level for compat
if "image_processor" in cfg:
cfg = {
**cfg,
**cfg["image_processor"],
}
# merge configs
self.preprocessor_config = {**self.preprocessor_config, **cfg}

def get_vision_config(self) -> dict[str, Any] | None:
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
return self.global_config.get(config_name)
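For reference, the merge above gives processor_config.json precedence: its image_processor sub-dict is hoisted to the root level and then merged over whatever preprocessor_config.json provided. A minimal Python sketch with hypothetical file contents (the key names below are illustrative only, not taken from a real checkpoint):

preprocessor_config = {"image_mean": [0.5, 0.5, 0.5], "size": {"longest_edge": 1024}}
processor_config = {"patch_size": 14, "image_processor": {"size": {"longest_edge": 1540}}}

# hoist image_processor to the root level, as in the diff
cfg = {**processor_config, **processor_config["image_processor"]}

# later keys win, so processor_config.json values override preprocessor_config.json ones
merged = {**preprocessor_config, **cfg}
assert merged["size"] == {"longest_edge": 1540}   # taken from processor_config.json
assert merged["image_mean"] == [0.5, 0.5, 0.5]    # kept from preprocessor_config.json
assert merged["patch_size"] == 14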
@@ -2814,6 +2831,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA

def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_params = self.hparams.get("rope_parameters")
if rope_params is not None and rope_params.get("rope_type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_params["mscale_all_dim"]) # copied from deepseekv2
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
if "llama_4_scaling_beta" in rope_params:
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -161,6 +161,7 @@ class Attention:
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"

class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -857,6 +857,9 @@ def add_attn_output_scale(self, value: float) -> None:
def add_attn_temperature_length(self, value: int) -> None:
self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)

def add_attn_temperature_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)

def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

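A quick sketch of how the new key expands, assuming an arch of "llama" and the TEMPERATURE_SCALE template added in constants.py above:

key = "{arch}.attention.temperature_scale".format(arch="llama")
# -> "llama.attention.temperature_scale", written as a float32 by add_attn_temperature_scale()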
1 change: 1 addition & 0 deletions src/llama-arch.cpp
@@ -190,6 +190,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -194,6 +194,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

3 changes: 3 additions & 0 deletions src/llama-graph.cpp
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;

GGML_ASSERT(f_attn_temp_scale != 0.0f);
GGML_ASSERT(n_attn_temp_floor_scale != 0);

std::vector<float> attn_scale_data(n_tokens, 0.0f);
for (int i = 0; i < n_tokens; ++i) {
const float pos = ubatch->pos[i];
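The loop body is truncated here; as I read the surrounding code, the per-position scale it fills in follows the Llama 4 floor-based temperature schedule, and the two new asserts guard it against a division by zero. A Python sketch of that schedule (a reconstruction for illustration, not part of the diff):

import math

def attn_temp_scale(pos: float, floor_scale: int, temp_scale: float) -> float:
    # positions inside the same floor_scale window share a scale;
    # the scale grows logarithmically with the window index
    return math.log(math.floor((pos + 1.0) / floor_scale) + 1.0) * temp_scale + 1.0

# with the Llama 4 defaults set in llama-model.cpp below (8192, 0.1):
#   attn_temp_scale(0,     8192, 0.1) -> 1.0
#   attn_temp_scale(20000, 8192, 0.1) -> 1.0 + 0.1 * ln(3) ≈ 1.11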
4 changes: 2 additions & 2 deletions src/llama-hparams.h
@@ -162,8 +162,8 @@ struct llama_hparams {
// llama4 smallthinker
uint32_t n_moe_layer_step = 0;
uint32_t n_no_rope_layer_step = 4;
uint32_t n_attn_temp_floor_scale = 8192;
float f_attn_temp_scale = 0.1;
uint32_t n_attn_temp_floor_scale = 0;
float f_attn_temp_scale = 0.0f;

// gemma3n altup
uint32_t n_altup = 4; // altup_num_inputs
20 changes: 18 additions & 2 deletions src/llama-model.cpp
@@ -628,6 +628,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case LLM_ARCH_LLAMA:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
if (hparams.f_attn_temp_scale != 0.0f) {
hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
if (hparams.n_attn_temp_floor_scale == 0) {
throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
}
}

if (hparams.n_expert == 8) {
switch (hparams.n_layer) {
Expand Down Expand Up @@ -664,8 +678,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
} else {
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
hparams.n_swa = 8192;
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
hparams.n_swa = 8192;
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
}

25 changes: 23 additions & 2 deletions src/models/llama.cpp
@@ -3,6 +3,17 @@
llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;

float attn_factor = this->attn_factor;
float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

// copied from deepseekv2
// TODO: clean it up later
if (hparams.rope_yarn_log_mul != 0.0f) {
float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
}

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);

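Since the DeepSeek-style block above rescales both quantities at once, here is a worked Python sketch of the arithmetic with illustrative values (n_embd_head_k = 128, attn_factor = 1.0, rope_yarn_log_mul = 0.1, freq_scale = 1/16, i.e. a YARN factor of 16; these numbers are assumptions for the example only):

import math

n_embd_head_k = 128
attn_factor = 1.0
rope_yarn_log_mul = 0.1
freq_scale = 1.0 / 16.0

mscale = attn_factor * (1.0 + rope_yarn_log_mul * math.log(1.0 / freq_scale))  # ≈ 1.277
kq_scale = 1.0 * mscale * mscale / math.sqrt(n_embd_head_k)                    # ≈ 0.144 (vs 1/sqrt(128) ≈ 0.088)
attn_factor = 1.0 / (1.0 + 0.1 * math.log(1.0 / freq_scale))                   # ≈ 0.783, used as the attention factor further down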
@@ -14,9 +25,13 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

auto * inp_attn = build_attn_inp_kv();
// (optional) temperature tuning
ggml_tensor * inp_attn_scale = nullptr;
if (hparams.f_attn_temp_scale != 0.0f) {
inp_attn_scale = build_inp_attn_scale();
}

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -73,6 +88,12 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

if (inp_attn_scale) {
// apply llama 4 temperature scaling
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
cb(Qcur, "Qcur_attn_temp_scaled", il);
}

if (hparams.use_kq_norm) {
// Llama4TextL2Norm
Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);