Commit fdf3da0

feat: support GLM 4.5 family of models
1 parent 4397ccb commit fdf3da0

2 files changed: +9 −5 lines changed


src/llama-model.cpp

Lines changed: 7 additions & 5 deletions
@@ -111,6 +111,8 @@ const char * llm_type_name(llm_type type) {
     case LLM_TYPE_30B_A3B: return "30B.A3B";
     case LLM_TYPE_235B_A22B: return "235B.A22B";
     case LLM_TYPE_300B_A47B: return "300B.A47B";
+    case LLM_TYPE_9B_A2B: return "9B.A2B";
+    case LLM_TYPE_32B_A7B: return "32B.A7B";
     case LLM_TYPE_E2B: return "E2B";
     case LLM_TYPE_E4B: return "E4B";
     default: return "?B";
@@ -1435,8 +1437,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             }

             switch (hparams.n_layer) {
-                case 46: type = LLM_TYPE_12B; break; // GLM-4.5-Air
-                case 93: type = LLM_TYPE_32B; break; // GLM-4.5
+                case 46: type = LLM_TYPE_9B_A2B; break; // GLM-4.5-Air
+                case 93: type = LLM_TYPE_32B_A7B; break; // GLM-4.5
                 default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
@@ -4393,9 +4395,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
     layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
     layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED);
-    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
-    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_gqa }, TENSOR_NOT_REQUIRED);
+    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, TENSOR_NOT_REQUIRED);
+    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED);
+    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED);

     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
     layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);

src/llama-model.h

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ enum llm_type {
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_9B_A2B,    // GLM-4.5-Air (9B total, ~2B active)
+    LLM_TYPE_32B_A7B,   // GLM-4.5 (32B total, ~7B active)
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };
