@@ -6578,6 +6578,177 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
        return super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Glm4MoeForCausalLM")
+class Glm4MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
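+        # Map GLM's added chat tokens onto GGUF special tokens; generation
+        # stops when the model emits <|user|>, so it is registered as eot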
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(
+            int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
+        )
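+        # GLM4_MOE uses partial RoPE; when partial_rotary_factor is absent,
+        # assume rotation of half of each attention head's dimensions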
+
+        # MoE parameters
+        if (n_experts := self.hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        # Note: expert_used_count is already set by the parent class using num_experts_per_tok
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
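+        # The first `first_k_dense_replace` blocks use a dense FFN instead of
+        # MoE experts (hybrid dense/MoE layout, as in DeepSeek-style models)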
+        if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+
+        # Expert gating function (sigmoid for GLM4_MOE)
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        # Routed scaling factor
+        if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        # Normalise topk probabilities
+        if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
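+    # Buffers that collect per-layer expert tensors until a complete set is
+    # available to stack into a single 3D tensor (see modify_tensors below)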
+    _experts: list[dict[str, Tensor]] | None = None
+    _shared_experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(
+        self, data_torch: Tensor, name: str, bid: int | None
+    ) -> Iterable[tuple[str, Tensor]]:
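+        # NOTE: the NextN layer index below is hardcoded; it presumably
+        # corresponds to num_hidden_layers for the converted checkpoint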
+        # Layer 46 is the nextn prediction layer - skip all of its tensors
+        if bid == 46:
+            return []
+
+        if name.startswith("model.visual."):  # ignore visual part
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for multimodal variants
+
+        # Handle main token embedding
+        if name == "model.embed_tokens.weight":
+            return [(self.map_tensor_name("token_embd.weight"), data_torch)]
+
+        # Handle routed experts
+        if "mlp.experts" in name and "shared_experts" not in name:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
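+            # Each expert has three tensors (gate/up/down), so the layer is
+            # complete once n_experts * 3 tensors have been buffered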
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    # Generate GGUF tensor names for merged experts
+                    if w_name == "down_proj":
+                        new_name = f"blk.{bid}.ffn_down_exps.weight"
+                    elif w_name == "gate_proj":
+                        new_name = f"blk.{bid}.ffn_gate_exps.weight"
+                    elif w_name == "up_proj":
+                        new_name = f"blk.{bid}.ffn_up_exps.weight"
+                    else:
+                        merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                        new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        # Handle expert gating input (routing gate)
+        if ".mlp.gate.e_score_correction_bias" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias"
+            )
+            return [(self.map_tensor_name(new_name), data_torch)]
+
+        # Handle shared expert tensors
+        if ".mlp.ffn_" in name and "_shexp" in name:
+            new_name = name.replace("model.layers.", "blk.")
+            return [(new_name, data_torch)]
+
+        # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
+        if ".mlp." in name and "experts" not in name and "_shexp" not in name:
+            if "gate_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.gate_proj.weight", ".ffn_gate.weight"
+                )
+            elif "up_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.up_proj.weight", ".ffn_up.weight"
+                )
+            elif "down_proj" in name:
+                new_name = name.replace("model.layers.", "blk.").replace(
+                    ".mlp.down_proj.weight", ".ffn_down.weight"
+                )
+            else:
+                new_name = name
+            return [(self.map_tensor_name(new_name), data_torch)]
+
+        # Handle other special GLM4_MOE tensors (nextn prediction)
+        if (
+            ".embed_tokens." in name
+            or ".shared_head." in name
+            or ".eh_proj." in name
+            or ".enorm." in name
+            or ".hnorm." in name
+        ):
+            # Skip these special tensors - they are for nextn prediction
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
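+    # prepare_tensors runs after all tensors have been processed; any expert
+    # weights still buffered at that point indicate a conversion bug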
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
    model_arch = gguf.MODEL_ARCH.CHATGLM