@@ -1900,6 +1900,7 @@ def prepare_tensors(self):
     "MixtralForCausalLM",
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
+    "VoxtralForConditionalGeneration",
     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -1912,6 +1913,11 @@ def __init__(self, *args, **kwargs):
         self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def set_vocab(self):
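+        # prefer Mistral's tekken tokenizer only when tekken.json is present and no HF tokenizer.json exists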
+        path_tekken_json = self.dir_model / "tekken.json"
+        path_tokenizer_json = self.dir_model / "tokenizer.json"
+        if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
+            return self.set_vocab_tekken()
+
         try:
             self._set_vocab_sentencepiece()
         except FileNotFoundError:
@@ -1944,6 +1950,52 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)

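+    # build the GGUF vocab (tokens, scores, token types, special-token IDs) from a Mistral tokenizer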
+    def set_vocab_tekken(self):
+        vocab = gguf.vocab.MistralVocab(self.dir_model)
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
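+        # embed the chat template from the Jinja file bundled with the conversion script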
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
+        with open(template_path, "r", encoding="utf-8") as f:
+            template = f.read()
+        self.gguf_writer.add_chat_template(template)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -1971,12 +2023,13 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name \
+        is_multimodal_tensor = "vision_tower" in name \
             or "vision_model" in name \
+            or "audio_tower" in name \
             or "model.connector" in name \
             or "multi_modal_projector" in name

-        if is_vision_tensor:
+        if is_multimodal_tensor:
             return [] # skip vision tensors
         elif self.hf_arch == "LlamaModel":
             name = "model." + name
@@ -7231,9 +7284,10 @@ class WhisperEncoderModel(MmprojModel):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.hparams["hidden_size"] = self.hparams["d_model"]
-        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
-        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
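+        # map Whisper's d_model/encoder_ffn_dim names onto the generic keys only when the config does not already provide them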
+ if "hidden_size" not in self .hparams and "intermediate_size" not in self .hparams :
7288
+ self .hparams ["hidden_size" ] = self .hparams ["d_model" ]
7289
+ self .hparams ["intermediate_size" ] = self .hparams ["encoder_ffn_dim" ]
7290
+ self .hparams ["num_attention_heads" ] = self .hparams ["encoder_attention_heads" ]
7237
7291
7238
7292
def set_gguf_parameters (self ):
7239
7293
super ().set_gguf_parameters ()
@@ -7272,9 +7326,21 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


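+# audio-only variant of the Whisper encoder: Voxtral has no vision tower, only an audio encoder plus projector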
+@ModelBase.register("VoxtralForConditionalGeneration")
+class VoxtralWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.VOXTRAL)
+        self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1