@@ -1216,6 +1216,55 @@ def _try_set_pooling_type(self) -> None:
             raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
         self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
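+        # Export the Intern-S1 tokenizer as a GPT-2-style BPE vocab:
+        # collect every token id, classify each token's type, then write the result to GGUF.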
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
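+        # Walk the full id range; ids missing from the vocab become [PAD{i}]
+        # placeholders so the GGUF token list stays dense.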
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues, we make sure to normalize non-normalized tokens.
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
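+        # register the vocab as a GPT-2-style BPE tokenizer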
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
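+        # BOS is pinned to id 151643 (Qwen's <|endoftext|>)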
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
@@ -2932,7 +2981,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "language_model." in name:
             name = name.replace("language_model.", "")  # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
-                or name.startswith("vision_model") or name.startswith("audio_tower"):
+                or name.startswith("vision_model") or name.startswith("audio_tower") \
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
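+            # Intern-S1 checkpoints store their multimodal tensors under a "model." prefix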
             # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
@@ -3604,6 +3654,19 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
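+        # re-read the raw config.json so the original HF architecture is visible
+        # even when self.hparams has been remapped (e.g. to a text_config)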
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # Intern-S1-mini: same Qwen3 text architecture, but a custom tokenizer
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
@@ -3620,73 +3683,7 @@ def set_vocab(self):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding='utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding='utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-        token2ids_map = {data['content']: int(token) for token, data in added_tokens_decoder.items() if data['special']}
-        for token in additional_special_tokens:
-            if token in token2ids_map:
-                special_vocab._set_special_token(token, token2ids_map[token])
-        special_vocab._set_special_token('eos', 151645)
-        special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
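+        # otherwise fall back to the parent class's vocab handling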
+        super().set_vocab()
 
 
 @ModelBase.register("GPT2LMHeadModel")