@@ -6647,12 +6647,6 @@ def set_gguf_parameters(self):
     def modify_tensors(
         self, data_torch: Tensor, name: str, bid: int | None
     ) -> Iterable[tuple[str, Tensor]]:
-        # Handle layer 46 tensors - preserve all for future MTP support
-        if bid is not None and bid == 46:
-            # Convert layer 46 tensors to GGUF naming but don't try to map them
-            new_name = name.replace("model.layers.", "blk.")
-            return [(new_name, data_torch)]
-
         if name.startswith("model.visual."):  # ignore visual part
             return []
         elif name.startswith("model.language_model."):
@@ -6662,14 +6656,18 @@ def modify_tensors(
         if name == "model.embed_tokens.weight":
             return [(self.map_tensor_name("token_embd.weight"), data_torch)]
 
-        # Handle routed experts (skip for NextN layer 46)
-        if name.find("mlp.experts") != -1 and "shared_experts" not in name and bid != 46:
+        # Handle routed experts
+        if name.find("mlp.experts") != -1 and "shared_experts" not in name:
             n_experts = self.hparams["n_routed_experts"]
             assert bid is not None
 
             if self._experts is None:
                 self._experts = [{} for _ in range(self.block_count)]
 
+            # Extend experts array if needed (for models where actual layers > num_hidden_layers)
+            while len(self._experts) <= bid:
+                self._experts.append({})
+
             self._experts[bid][name] = data_torch
 
             if len(self._experts[bid]) >= n_experts * 3:
@@ -6705,11 +6703,22 @@ def modify_tensors(
             new_name = name.replace("model.layers.", "blk.").replace(
                 ".mlp.gate.e_score_correction_bias", ".ffn_gate_inp.bias"
             )
-            return [(self.map_tensor_name(new_name), data_torch)]
+            return [(new_name, data_torch)]
+        elif ".mlp.gate.weight" in name:
+            new_name = name.replace("model.layers.", "blk.").replace(
+                ".mlp.gate.weight", ".ffn_gate_inp.weight"
+            )
+            return [(new_name, data_torch)]
 
         # Handle shared expert tensors
-        if ".mlp.ffn_" in name and "_shexp" in name:
-            new_name = name.replace("model.layers.", "blk.")
+        if ".mlp.shared_experts." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
+            if "gate_proj" in new_name:
+                new_name = new_name.replace("gate_proj", "gate_shexp")
+            elif "down_proj" in new_name:
+                new_name = new_name.replace("down_proj", "down_shexp")
+            elif "up_proj" in new_name:
+                new_name = new_name.replace("up_proj", "up_shexp")
             return [(new_name, data_torch)]
 
         # Handle regular dense FFN layers (for hybrid dense/MoE architecture)
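For reference, the shared-expert branch in the hunk above reduces to a small standalone renaming rule. The sketch below is illustrative only and not part of the patch; map_shared_expert_name is a hypothetical helper showing the mapping the diff performs:

def map_shared_expert_name(name: str) -> str:
    # e.g. "model.layers.5.mlp.shared_experts.up_proj.weight" -> "blk.5.ffn_up_shexp.weight"
    new_name = name.replace("model.layers.", "blk.").replace(".mlp.shared_experts.", ".ffn_")
    for hf_proj, gguf_suffix in (("gate_proj", "gate_shexp"),
                                 ("down_proj", "down_shexp"),
                                 ("up_proj", "up_shexp")):
        if hf_proj in new_name:
            return new_name.replace(hf_proj, gguf_suffix)
    return new_name

assert map_shared_expert_name(
    "model.layers.5.mlp.shared_experts.up_proj.weight"
) == "blk.5.ffn_up_shexp.weight"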
@@ -6738,8 +6747,27 @@ def modify_tensors(
             or ".enorm." in name
             or ".hnorm." in name
         ):
-            # For NextN tensors, convert to GGUF naming convention
-            new_name = name.replace("model.layers.", "blk.").replace("model.", "")
+            new_name = name.replace("model.layers.", "blk.").replace("model.", "").replace(".weight", "")
+            return [(new_name, data_torch)]
+
+        # GLM tensor mapping - handle directly without map_tensor_name
+        if ".input_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".input_layernorm.", ".attn_norm.")
+            return [(new_name, data_torch)]
+        elif ".post_attention_layernorm." in name:
+            new_name = name.replace("model.layers.", "blk.").replace(".post_attention_layernorm.", ".ffn_norm.")
+            return [(new_name, data_torch)]
+        elif ".self_attn." in name:
+            # Map GLM self_attn to standard attention naming
+            new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
+            if "q_proj" in new_name:
+                new_name = new_name.replace("q_proj", "q")
+            elif "k_proj" in new_name:
+                new_name = new_name.replace("k_proj", "k")
+            elif "v_proj" in new_name:
+                new_name = new_name.replace("v_proj", "v")
+            elif "o_proj" in new_name:
+                new_name = new_name.replace("o_proj", "output")
             return [(new_name, data_torch)]
 
         return super().modify_tensors(data_torch, name, bid)
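The direct attention mapping added in the last hunk follows the same pattern. A minimal sketch of the rule, assuming GLM-style HF tensor names; map_glm_attn_name is a hypothetical name, not part of the patch:

def map_glm_attn_name(name: str) -> str:
    # e.g. "model.layers.3.self_attn.o_proj.weight" -> "blk.3.attn_output.weight"
    new_name = name.replace("model.layers.", "blk.").replace(".self_attn.", ".attn_")
    for hf_proj, gguf_suffix in (("q_proj", "q"), ("k_proj", "k"),
                                 ("v_proj", "v"), ("o_proj", "output")):
        if hf_proj in new_name:
            return new_name.replace(hf_proj, gguf_suffix)
    return new_name

assert map_glm_attn_name("model.layers.3.self_attn.o_proj.weight") == "blk.3.attn_output.weight"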