1 change: 1 addition & 0 deletions .gitignore
@@ -82,6 +82,7 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
!models/templates

# Zig
zig-out/
76 changes: 73 additions & 3 deletions convert_hf_to_gguf.py
@@ -2260,6 +2260,63 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("VoxtralForConditionalGeneration")
class VoxtralModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA

def set_vocab(self):
vocab = gguf.vocab.MistralVocab(self.dir_model)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)

tokens = []
scores = []
toktypes = []

for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)

assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)

if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)

logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)

self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)

self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)

self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)

script_dir = Path(__file__).parent
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
with open(template_path, "r", encoding="utf-8") as f:
template = f.read()
self.gguf_writer.add_chat_template(template)
@ngxson (Collaborator, Author) commented on Jul 24, 2025:
The fact that Mistral wants to delegate chat formatting to their Python library mistral-common makes this part a bit tricky... chat templates are no longer stored inside the HF repo, even for transformers models. Therefore, we have no better way but to store a Jinja version somewhere inside llama.cpp for compatibility reasons.

Contributor commented:

A quick look at the tests in mistral-common seems to show that they've changed the chat template a bit for Voxtral to accommodate audio, including some new tokens that wrap the input audio (not sure if those are being added here or not? Not super familiar with mtmd).

It also seems like Mistral adds a special token for transcription tasks: based on the tests, it's an optional language marker followed by a [TRANSCRIBE] token.

@ngxson (Collaborator, Author) replied:

Hmm, yeah, we're missing the [BEGIN_AUDIO] token. I have no idea if [TRANSCRIBE] is required though; it seems like the model is also able to summarize the audio, not just transcribe it.

@ngxson (Collaborator, Author) commented on Jul 25, 2025:

Fun thing I've just found out: without [BEGIN_AUDIO], the model is still able to "understand" the audio, but it sometimes thinks the audio is a bunch of text.

Contributor replied:

> it seems like the model is also able to summarize the audio, not just transcribe it.

From what I understand from the HF page, it basically has two modes:

> Dedicated transcription mode: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly.

Without [TRANSCRIBE] it can understand and talk about the audio, like you've shown. In [TRANSCRIBE] mode it only outputs the transcription, similar to Whisper, I believe.
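
To summarize the two layouts under discussion, here is a rough sketch. The exact token ordering and the `lang:en` marker are assumptions pieced together from the comments above and the mistral-common tests they mention, not something verified in this PR:

```python
# Assumed prompt layouts (illustrative; "<audio>" stands in for the audio embeddings).
# Chat/understanding mode: the audio is introduced by [BEGIN_AUDIO]:
chat_prompt = "<s>[INST][BEGIN_AUDIO]<audio>\nWhat is said in this clip?[/INST]"

# Dedicated transcription mode: optional language marker, then [TRANSCRIBE]:
transcribe_prompt = "<s>[INST][BEGIN_AUDIO]<audio>lang:en[TRANSCRIBE][/INST]"
```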


def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "audio_tower" in name:
return []
return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("DeciLMForCausalLM")
class DeciModel(TextModel):
model_arch = gguf.MODEL_ARCH.DECI
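
As a side note on VoxtralModel.modify_tensors above, a minimal, self-contained sketch of its effect on a few tensor names (the sample names are hypothetical, not taken from the actual checkpoint): the language-model prefix is stripped, while projector and audio-tower tensors are left for the mmproj conversion further down.

```python
# Hypothetical tensor names, for illustration only.
examples = [
    "language_model.model.layers.0.self_attn.q_proj.weight",
    "audio_tower.layers.0.fc1.weight",
    "multi_modal_projector.linear_1.weight",
]

for name in examples:
    name = name.replace("language_model.", "")
    if "multi_modal_projector" in name or "audio_tower" in name:
        print(f"skip  {name}")  # handled by the mmproj conversion instead
    else:
        print(f"keep  {name}")  # goes into the text-model GGUF
```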
@@ -7231,9 +7288,10 @@ class WhisperEncoderModel(MmprojModel):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]

def set_gguf_parameters(self):
super().set_gguf_parameters()
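
The new guard in WhisperEncoderModel.__init__ matters because Whisper-style configs expose d_model / encoder_ffn_dim / encoder_attention_heads, while other audio configs may already carry the generic keys. A minimal sketch of the intended behavior, with assumed illustrative values:

```python
# Two assumed audio-config shapes (values are illustrative, not from a real checkpoint).
whisper_style = {"d_model": 1280, "encoder_ffn_dim": 5120, "encoder_attention_heads": 20}
already_generic = {"hidden_size": 1280, "intermediate_size": 5120, "num_attention_heads": 20}

def normalize(hparams: dict) -> dict:
    # Mirrors the guard above: remap only when the generic keys are absent.
    if "hidden_size" not in hparams and "intermediate_size" not in hparams:
        hparams["hidden_size"] = hparams["d_model"]
        hparams["intermediate_size"] = hparams["encoder_ffn_dim"]
        hparams["num_attention_heads"] = hparams["encoder_attention_heads"]
    return hparams

assert normalize(dict(whisper_style))["hidden_size"] == 1280
assert normalize(dict(already_generic))["num_attention_heads"] == 20
```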
@@ -7272,9 +7330,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


@ModelBase.register("VoxtralForConditionalGeneration")
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size


@ModelBase.register("FalconH1ForCausalLM")
class FalconH1Model(Mamba2Model):
model_arch = gguf.MODEL_ARCH.FALCON_H1
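
On the hardcoded stack factor in VoxtralWhisperEncoderModel above: the inline comment ties it to intermediate_size // hidden_size, which checks out if the audio tower uses Whisper-large-style dimensions (the exact values here are an assumption):

```python
# Assumed Whisper-large-style encoder dims (illustrative).
hidden_size = 1280        # d_model
intermediate_size = 5120  # encoder_ffn_dim
assert intermediate_size // hidden_size == 4  # matches the hardcoded audio stack factor
```

Unlike the Ultravox path, which reads stack_factor from the model's config, Voxtral's config apparently doesn't expose one, hence the constant.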
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
@@ -2704,6 +2704,7 @@ class VisionProjectorType:
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio
QWEN25O = "qwen2.5o" # omni
VOXTRAL = "voxtral"


# Items here are (block size, type size)