Skip to content

Commit 5eba3e3

Browse files
committed
put interns1 in tensor mapping
1 parent 483ffef commit 5eba3e3

File tree

2 files changed: +22 −25 lines changed

convert_hf_to_gguf.py

Lines changed: 6 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3028,49 +3028,30 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
30283028
return gguf.GGMLQuantizationType.F32
30293029
return False
30303030

3031-
def _mapping_name_interns1(self, name):
3031+
def _mapping_interns1_name(self, name):
30323032
names_map = {
30333033
"model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
30343034
"model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
30353035
"model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
30363036
"model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
30373037
"model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
30383038
"model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
3039-
"model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
3040-
"model.vision_tower.embeddings.patch_embeddings.projection.bias": "vision_model.embeddings.patch_embedding.bias",
3041-
"model.vision_tower.embeddings.patch_embeddings.projection.weight": "vision_model.embeddings.patch_embedding.weight",
3042-
"model.vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
30433039
}
30443040
if name in names_map:
30453041
name = names_map[name]
3046-
elif name.startswith("model.language_model."):
3047-
name = "language_model.model." + name[len("model.language_model.") :]
3048-
elif name.startswith("model.vision_tower."):
3049-
name = "vision_model." + name[len("model.vision_tower.") :]
3050-
3051-
if name.startswith("vision_model.encoder.layer"):
3052-
name = name.replace(r".layer.", r".layers.")
3053-
name = name.replace(r".attention.", r".attn.")
3054-
name = name.replace(r".attn.q_proj", r".self_attn.q_proj")
3055-
name = name.replace(r".attn.k_proj", r".self_attn.k_proj")
3056-
name = name.replace(r".attn.v_proj", r".self_attn.v_proj")
3057-
name = name.replace(r".projection_layer.", r".proj.")
3058-
name = name.replace(r".lambda_1", r".ls1")
3059-
name = name.replace(r".lambda_2", r".ls2")
3060-
name = name.replace(r".layernorm_before.", r".norm1.")
3061-
name = name.replace(r".layernorm_after.", r".norm2.")
30623042
return name
30633043

30643044
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
30653045
del bid # unused
3066-
name = self._mapping_name_interns1(name)
3067-
# support interns1
3068-
if name.startswith("vision_model") or name.startswith("mlp"):
3046+
vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
3047+
# deal with intern-s1 special case
3048+
name = self._mapping_interns1_name(name)
3049+
if any([name.startswith(prefix) for prefix in vision_prefix]):
30693050
# process visual tensors
30703051
# correct name
30713052
if name.startswith("vision_model"):
30723053
name = "vision_tower." + name
3073-
if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
3054+
if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
30743055
name += ".weight"
30753056
# split QKV tensors if needed
30763057
if ".qkv." in name:

gguf-py/gguf/tensor_mapping.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,11 +1054,13 @@ class TensorNameMap:
10541054

10551055
MODEL_TENSOR.V_ENC_EMBD_CLS: (
10561056
"vision_tower.vision_model.embeddings.class_embedding",
1057+
"model.vision_tower.embeddings.cls_token", # Intern-S1
10571058
"vision_model.class_embedding", # llama 4
10581059
),
10591060

10601061
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
10611062
"vision_tower.vision_model.embeddings.patch_embedding",
1063+
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
10621064
"vpm.embeddings.patch_embedding",
10631065
"model.vision_model.embeddings.patch_embedding", # SmolVLM
10641066
"vision_tower.patch_conv", # pixtral
@@ -1068,13 +1070,15 @@ class TensorNameMap:
10681070

10691071
MODEL_TENSOR.V_ENC_EMBD_POS: (
10701072
"vision_tower.vision_model.embeddings.position_embedding",
1073+
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
10711074
"vpm.embeddings.position_embedding",
10721075
"model.vision_model.embeddings.position_embedding", # SmolVLM
10731076
"vision_model.positional_embedding_vlm", # llama 4
10741077
),
10751078

10761079
MODEL_TENSOR.V_ENC_ATTN_Q: (
10771080
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
1081+
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
10781082
"vpm.encoder.layers.{bid}.self_attn.q_proj",
10791083
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
10801084
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
@@ -1084,10 +1088,12 @@ class TensorNameMap:
10841088

10851089
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
10861090
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
1091+
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
10871092
),
10881093

10891094
MODEL_TENSOR.V_ENC_ATTN_K: (
10901095
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
1096+
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
10911097
"vpm.encoder.layers.{bid}.self_attn.k_proj",
10921098
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
10931099
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
@@ -1097,10 +1103,12 @@ class TensorNameMap:
10971103

10981104
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
10991105
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
1106+
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
11001107
),
11011108

11021109
MODEL_TENSOR.V_ENC_ATTN_V: (
11031110
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
1111+
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
11041112
"vpm.encoder.layers.{bid}.self_attn.v_proj",
11051113
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
11061114
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
@@ -1111,6 +1119,7 @@ class TensorNameMap:
11111119
MODEL_TENSOR.V_ENC_INPUT_NORM: (
11121120
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
11131121
"vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
1122+
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
11141123
"vpm.encoder.layers.{bid}.layer_norm1",
11151124
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
11161125
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
@@ -1121,6 +1130,7 @@ class TensorNameMap:
11211130
MODEL_TENSOR.V_ENC_ATTN_O: (
11221131
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
11231132
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
1133+
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
11241134
"vpm.encoder.layers.{bid}.self_attn.out_proj",
11251135
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
11261136
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
@@ -1131,6 +1141,7 @@ class TensorNameMap:
11311141
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
11321142
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
11331143
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
1144+
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
11341145
"vpm.encoder.layers.{bid}.layer_norm2",
11351146
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
11361147
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
@@ -1140,6 +1151,7 @@ class TensorNameMap:
11401151

11411152
MODEL_TENSOR.V_ENC_FFN_UP: (
11421153
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
1154+
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
11431155
"vpm.encoder.layers.{bid}.mlp.fc1",
11441156
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
11451157
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
@@ -1155,6 +1167,7 @@ class TensorNameMap:
11551167

11561168
MODEL_TENSOR.V_ENC_FFN_DOWN: (
11571169
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
1170+
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
11581171
"vpm.encoder.layers.{bid}.mlp.fc2",
11591172
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
11601173
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
@@ -1165,10 +1178,12 @@ class TensorNameMap:
11651178

11661179
MODEL_TENSOR.V_LAYER_SCALE_1: (
11671180
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
1181+
"model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
11681182
),
11691183

11701184
MODEL_TENSOR.V_LAYER_SCALE_2: (
11711185
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
1186+
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
11721187
),
11731188

11741189
MODEL_TENSOR.V_PRE_NORM: (
@@ -1190,6 +1205,7 @@ class TensorNameMap:
11901205

11911206
MODEL_TENSOR.V_MM_INP_NORM: (
11921207
"multi_modal_projector.norm",
1208+
"model.multi_modal_projector.layer_norm", # Intern-S1
11931209
),
11941210

11951211
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (

0 commit comments

Comments (0)