Commit bcd7b97

Add flex attention support to AFMoE model
1 parent 1314162 commit bcd7b97

3 files changed: 63 additions, 1 deletion

docs/source/en/model_doc/afmoe.md

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2025-11-14.*
+*This model was released on {release_date} and added to Hugging Face Transformers on 2025-11-18.*
 
 <div style="float: right;">
 <div class="flex flex-wrap space-x-1">

src/transformers/models/afmoe/modular_afmoe.py

Lines changed: 1 addition & 0 deletions
@@ -411,6 +411,7 @@ class AfmoePreTrainedModel(LlamaPreTrainedModel):
     ]
     _supports_sdpa = True
     _supports_flash_attn_2 = True
+    _supports_flex_attn = True
     _supports_attention_backend = True
     supports_gradient_checkpointing = True
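With _supports_flex_attn set, the flex attention backend can be requested at load time. A minimal sketch, assuming a transformers build that includes this commit and a PyTorch release with flex attention support (2.5 or newer):

    import torch
    from transformers import AutoModelForCausalLM

    # Request the flex attention backend enabled by this commit; the checkpoint
    # name reuses the one from the test script below and is only an example.
    model = AutoModelForCausalLM.from_pretrained(
        "arcee-train/afmoe-nano-sft-v3-pocketRL-v0.1.4-2",
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flex_attention",
    )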

test_afmoe_load.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
#!/usr/bin/env python3
"""Quick test script to load AFMoE checkpoint weights."""

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Path to your checkpoint
CHECKPOINT_PATH = "arcee-train/afmoe-nano-sft-v3-pocketRL-v0.1.4-2"  # HuggingFace Hub checkpoint


def main():
    print("Loading AFMoE checkpoint...")

    # Load config
    config = AutoConfig.from_pretrained(CHECKPOINT_PATH, trust_remote_code=False)
    print(f"Config loaded: {config.model_type}")
    print(f" - Hidden size: {config.hidden_size}")
    print(f" - Num layers: {config.num_hidden_layers}")
    print(f" - Num experts: {config.num_experts}")
    print(f" - Num shared experts: {config.num_shared_experts}")
    print(f" - Top-k: {config.num_experts_per_tok}")

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        CHECKPOINT_PATH,
        config=config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=False,
    )
    print("\nModel loaded successfully!")
    print(f" - Model class: {model.__class__.__name__}")
    print(f" - Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")

    # Test forward pass
    print("\nTesting forward pass...")
    input_ids = torch.randint(0, config.vocab_size, (1, 10)).to(model.device)

    with torch.no_grad():
        outputs = model(input_ids)
    print(f" - Output logits shape: {outputs.logits.shape}")
    print(f" - Output dtype: {outputs.logits.dtype}")

    # Test generation (optional)
    try:
        tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)
        prompt = "Hello, how are you?"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        print(f"\nTesting generation with prompt: '{prompt}'")
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated: {generated}")
    except Exception as e:
        print(f"\nSkipping tokenizer test: {e}")

    print("\n✅ All checks passed!")


if __name__ == "__main__":
    main()
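Note that this script loads the model with the default attention backend; to exercise the newly enabled flex attention path, the from_pretrained call could additionally pass attn_implementation="flex_attention", as in the sketch following the modular_afmoe.py diff.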
