@@ -6578,6 +6578,117 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+
+        # MoE parameters
+        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(2)  # LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+    _experts: list[dict[str, Tensor]] | None = None
+    _shared_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle routed experts
+        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        # Handle shared experts - map to shared expert tensors
+        if "shared_experts" in name:
+            if "gate_proj" in name:
+                new_name = name.replace("shared_experts.gate_proj.weight", "ffn_gate_shexp.weight")
+            elif "up_proj" in name:
+                new_name = name.replace("shared_experts.up_proj.weight", "ffn_up_shexp.weight")
+            elif "down_proj" in name:
+                new_name = name.replace("shared_experts.down_proj.weight", "ffn_down_shexp.weight")
+            else:
+                new_name = name
+            return [(self.map_tensor_name(new_name), data_torch)]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CHATGLM