Commit 1314162

Address review feedback for AFMoE implementation
1 parent 3a4280c commit 1314162

File tree: 5 files changed (+207, -182 lines)


docs/source/en/model_doc/afmoe.md

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
+*This model was released on {release_date} and added to Hugging Face Transformers on 2025-11-14.*
 
 <div style="float: right;">
 <div class="flex flex-wrap space-x-1">

src/transformers/models/afmoe/configuration_afmoe.py

Lines changed: 2 additions & 11 deletions

@@ -88,19 +88,15 @@ class AfmoeConfig(PreTrainedConfig):
             Number of experts to route each token to. This is the top-k value for the token-choice routing.
         num_shared_experts (`int`, *optional*, defaults to 2):
             Number of shared experts that are always activated for all tokens.
-        score_func (`str`, *optional*, defaults to `"sigmoid"`):
-            The scoring function for routing decisions. Can be either "sigmoid" or "softmax".
         route_norm (`bool`, *optional*, defaults to `True`):
-            Whether to normalize routing weights when using sigmoid scoring.
+            Whether to normalize routing weights.
         route_scale (`float`, *optional*, defaults to 1.0):
             Scaling factor applied to routing weights.
         global_attn_every_n_layers (`int`, *optional*, defaults to 4):
             The frequency of full attention layers. Every Nth layer will use full attention, while others use sliding
             window attention.
         sliding_window (`int`, *optional*, defaults to 1024):
             Sliding window size for local attention layers.
-        mup_enabled (`bool`, *optional*, defaults to `False`):
-            Whether to enable muP (Maximal Update Parametrization) scaling for training stability.
         layer_types (`list[str]`, *optional*):
             A list that explicitly maps each layer index with its attention type. Each element should be either
             "sliding_attention" or "full_attention". If not provided, it will be automatically generated based on

@@ -155,12 +151,10 @@ def __init__(
         num_experts: Optional[int] = 64,
         num_experts_per_tok: Optional[int] = 6,
         num_shared_experts: Optional[int] = 2,
-        score_func: Optional[str] = "sigmoid",
         route_norm: Optional[bool] = True,
         route_scale: Optional[float] = 1.0,
         global_attn_every_n_layers: Optional[int] = 4,
         sliding_window: Optional[int] = 1024,
-        mup_enabled: Optional[bool] = False,
         layer_types: Optional[list] = None,
         attention_dropout: Optional[float] = 0.0,
         **kwargs,

@@ -185,9 +179,9 @@ def __init__(
         self.num_experts_per_tok = num_experts_per_tok
         self.num_experts = num_experts
         self.num_shared_experts = num_shared_experts
-        self.score_func = score_func
         self.route_norm = route_norm
         self.route_scale = route_scale
+        self.attention_bias = False
 
         # Attention specific
         self.attention_dropout = attention_dropout

@@ -201,9 +195,6 @@ def __init__(
         ]
         layer_type_validation(self.layer_types)
 
-        # muP specific
-        self.mup_enabled = mup_enabled
-
         if num_key_value_heads is None:
             num_key_value_heads = num_attention_heads
 
src/transformers/models/afmoe/modeling_afmoe.py

Lines changed: 108 additions & 78 deletions

@@ -24,7 +24,6 @@
 from typing import Optional, Union
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 
 from ...activations import ACT2FN

@@ -164,47 +163,99 @@ class AfmoeTokenChoiceRouter(nn.Module):
     """
     Token-choice top-K router for MoE routing.
 
-    This router assigns each token to the top-K experts based on learned routing scores.
-    It supports both sigmoid and softmax scoring functions.
+    This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
     """
 
     def __init__(self, config):
         super().__init__()
         self.config = config
         self.top_k = config.num_experts_per_tok
         self.num_experts = config.num_experts
-        self.score_func = config.score_func
         self.route_norm = config.route_norm
         self.route_scale = config.route_scale
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
 
-    def forward(self, hidden_states, expert_bias: torch.Tensor | None):
+    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor | None = None):
+        # Keep expert_bias argument for checkpoint/backwards compatibility (it is always zero in released models).
+        del expert_bias
         _, _, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
-        scores = self.gate(hidden_states)
+        scores = torch.sigmoid(self.gate(hidden_states).to(torch.float32))
+        top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
 
-        # Apply scoring function in float32 for stability
-        if self.score_func == "sigmoid":
-            scores = torch.sigmoid(scores.to(torch.float32))
-        else:
-            scores = F.softmax(scores.to(torch.float32), dim=-1)
-
-        if expert_bias is not None:
-            _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
-            top_scores = scores.gather(dim=1, index=selected_experts)
-        else:
-            top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
-
-        # Normalize weights if using sigmoid
-        if self.score_func == "sigmoid" and self.route_norm:
+        if self.route_norm:
             denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
             top_scores = top_scores / denominator
 
         top_scores = top_scores * self.route_scale
         return top_scores, selected_experts
 
 
+class AfmoeExperts(nn.ModuleList):
+    """
+    Container holding the routed experts.
+
+    This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
+    """
+
+    _checkpoint_conversion_mapping = {"experts": "experts"}
+
+    def __init__(self, config: AfmoeConfig):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_experts
+        for _ in range(self.num_experts):
+            self.append(AfmoeMLP(config, intermediate_size=config.moe_intermediate_size))
+
+    def forward(
+        self, hidden_states: torch.Tensor, selected_experts: torch.Tensor, routing_weights: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: (batch, seq, hidden)
+            selected_experts: (batch, seq, top_k)
+            routing_weights: (batch, seq, top_k)
+        """
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+        if seq_len == 0:
+            return hidden_states.new_zeros(batch_size, 0, hidden_dim)
+        hidden_states_flat = hidden_states.view(-1, hidden_dim)
+        top_k = selected_experts.shape[-1]
+
+        # Map every token routing decision to a unique position so we can process expert by expert.
+        token_indices = torch.arange(
+            hidden_states_flat.shape[0], device=hidden_states.device, dtype=torch.long
+        ).repeat_interleave(top_k)
+        expert_indices = selected_experts.reshape(-1)
+        routing_weights = routing_weights.reshape(-1)
+
+        sorting = torch.argsort(expert_indices, stable=True)
+        token_indices = token_indices[sorting]
+        expert_indices = expert_indices[sorting]
+        routing_weights = routing_weights[sorting]
+
+        dispatched_tokens = hidden_states_flat.index_select(0, token_indices)
+        expert_outputs = torch.zeros_like(dispatched_tokens)
+
+        unique_experts, counts = torch.unique_consecutive(expert_indices, return_counts=True)
+        start = 0
+        for expert_id, count in zip(unique_experts.tolist(), counts.tolist()):
+            if count == 0:
+                continue
+            end = start + count
+            expert_input = dispatched_tokens[start:end]
+            expert_output = self[expert_id](expert_input)
+            expert_outputs[start:end] = expert_output
+            start = end
+
+        weighted_outputs = (expert_outputs.to(torch.float32) * routing_weights.unsqueeze(-1)).to(hidden_states.dtype)
+        aggregated = torch.zeros_like(hidden_states_flat)
+        scatter_indices = token_indices.unsqueeze(-1).expand_as(weighted_outputs)
+        aggregated.scatter_add_(0, scatter_indices, weighted_outputs)
+        return aggregated.view(batch_size, seq_len, hidden_dim)
+
+
 class AfmoeMoE(nn.Module):
     """
     Mixture of Experts (MoE) module for AFMoE.
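A standalone sketch of the sigmoid top-k routing math introduced above. It mirrors the diff but is a hedged re-implementation for illustration, not the module itself; the toy sizes are arbitrary.

```python
import torch


def sigmoid_topk_route(logits: torch.Tensor, top_k: int, route_norm: bool = True, route_scale: float = 1.0):
    """Sketch of the routing math: logits is (num_tokens, num_experts) from the gating linear layer."""
    scores = torch.sigmoid(logits.to(torch.float32))            # score in float32 for stability
    top_scores, selected_experts = torch.topk(scores, k=top_k, dim=1)
    if route_norm:
        top_scores = top_scores / (top_scores.sum(dim=-1, keepdim=True) + 1e-20)
    return top_scores * route_scale, selected_experts


logits = torch.randn(5, 8)          # 5 tokens, 8 experts (toy sizes)
weights, experts = sigmoid_topk_route(logits, top_k=2)
print(weights.sum(dim=-1))          # ~1.0 per token when route_norm=True and route_scale=1.0
```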

@@ -221,9 +272,7 @@ def __init__(self, config):
         self.shared_experts = None
         if config.num_shared_experts > 0:
             self.shared_experts = AfmoeMLP(config, config.moe_intermediate_size * config.num_shared_experts)
-        self.experts = nn.ModuleList(
-            [AfmoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.num_experts)]
-        )
+        self.experts = AfmoeExperts(config)
         self.expert_bias = nn.Parameter(torch.zeros(config.num_experts, dtype=torch.float32), requires_grad=False)
 
     def forward(self, hidden_states):

@@ -232,37 +281,17 @@ def forward(self, hidden_states):
 
         # Get routing decisions
         top_scores, selected_experts = self.router(hidden_states, self.expert_bias)
+        top_scores = top_scores.view(batch_size, seq_len, self.config.num_experts_per_tok)
+        selected_experts = selected_experts.view(batch_size, seq_len, self.config.num_experts_per_tok)
 
         # Process through shared experts
         if self.shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states_flat)
+            shared_output = self.shared_experts(hidden_states_flat).view(batch_size, seq_len, hidden_dim)
         else:
-            shared_output = torch.zeros_like(hidden_states_flat)
-
-        # Reorder tokens by expert for efficient processing
-        token_indices_sorted = torch.argsort(selected_experts.view(-1), stable=True)
-        top_scores_sorted = top_scores.view(-1)[token_indices_sorted]
-        token_to_expert = selected_experts.view(-1)[token_indices_sorted]
-        token_indices_sorted = token_indices_sorted // self.config.num_experts_per_tok
+            shared_output = hidden_states.new_zeros(batch_size, seq_len, hidden_dim)
 
-        # Gather input tokens
-        token_indices_expanded = token_indices_sorted.unsqueeze(-1).expand(-1, hidden_dim)
-        routed_input = torch.gather(hidden_states_flat, dim=0, index=token_indices_expanded)
-
-        routed_output = torch.zeros_like(routed_input)
-        for expert_id in range(self.config.num_experts):
-            mask = token_to_expert == expert_id
-            if mask.any():
-                expert_input = routed_input[mask]
-                expert_out = self.experts[expert_id](expert_input)
-                routed_output[mask] = expert_out
-
-        routed_output = (routed_output.to(torch.float32) * top_scores_sorted.unsqueeze(-1)).to(hidden_states.dtype)
-
-        # Scatter back to original positions
-        output = shared_output.scatter_add(dim=0, index=token_indices_expanded, src=routed_output)
-
-        return output.view(batch_size, seq_len, hidden_dim)
+        routed_output = self.experts(hidden_states, selected_experts, top_scores)
+        return shared_output + routed_output
 
 
 def rotate_half(x: torch.Tensor) -> torch.Tensor:
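To see why the grouped dispatch in `AfmoeExperts` matches the per-expert masking loop it replaces, here is a hedged toy equivalence check. The experts are plain `nn.Linear` stand-ins rather than `AfmoeMLP`, and all names and sizes are illustrative.

```python
import torch
from torch import nn

torch.manual_seed(0)
num_tokens, hidden, num_experts, top_k = 6, 4, 3, 2
experts = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(num_experts))

x = torch.randn(num_tokens, hidden)
selected = torch.randint(0, num_experts, (num_tokens, top_k))
weights = torch.rand(num_tokens, top_k)

# Grouped dispatch: sort routing decisions by expert, run each expert on a
# contiguous slice, then scatter-add the weighted outputs back per token.
token_idx = torch.arange(num_tokens).repeat_interleave(top_k)
expert_idx = selected.reshape(-1)
w = weights.reshape(-1)
order = torch.argsort(expert_idx, stable=True)
token_idx, expert_idx, w = token_idx[order], expert_idx[order], w[order]
dispatched = x.index_select(0, token_idx)
out = torch.zeros_like(dispatched)
uniq, counts = torch.unique_consecutive(expert_idx, return_counts=True)
start = 0
for eid, cnt in zip(uniq.tolist(), counts.tolist()):
    out[start : start + cnt] = experts[eid](dispatched[start : start + cnt])
    start += cnt
grouped = torch.zeros_like(x).scatter_add_(0, token_idx.unsqueeze(-1).expand_as(out), out * w.unsqueeze(-1))

# Naive reference: loop over every token and each of its top-k experts.
naive = torch.zeros_like(x)
for t in range(num_tokens):
    for k in range(top_k):
        naive[t] += weights[t, k] * experts[int(selected[t, k])](x[t])

print(torch.allclose(grouped, naive, atol=1e-5))  # True
```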

@@ -318,32 +347,40 @@ class AfmoeAttention(nn.Module):
     Multi-headed attention module with optional sliding window and gating.
 
     This attention mechanism supports both full attention and sliding window attention,
-    and includes Q/K normalization and gating of the output.
+    and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
+    of custom logic we need to maintain.
     """
 
     def __init__(self, config: AfmoeConfig, layer_idx: int):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_heads = config.num_attention_heads
-        self.num_key_value_heads = config.num_key_value_heads
-        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
         self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        # Parent LlamaAttention already sets: layer_idx, num_heads, num_key_value_heads, num_key_value_groups, head_dim
+        # We only add AFMoE-specific attributes
         self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention"
         self.sliding_window = config.sliding_window if self.is_local_attention else None
 
-        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
-        self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-        self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
-
         self.q_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
-
-        self.gate_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.gate_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
 
     def forward(
         self,
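The refactor keeps per-head Q/K normalization over `head_dim` before the head transpose. A hedged shape sketch of that ordering, using `nn.RMSNorm` (PyTorch 2.4+) as a stand-in for `AfmoeRMSNorm`; the sizes are toy values.

```python
import torch
from torch import nn

batch, seq, num_heads, head_dim = 2, 5, 4, 8
hidden = num_heads * head_dim

q_proj = nn.Linear(hidden, num_heads * head_dim, bias=False)
q_norm = nn.RMSNorm(head_dim, eps=1e-6)   # stand-in for AfmoeRMSNorm(head_dim)

hidden_states = torch.randn(batch, seq, hidden)
# Project, split into heads, normalize each head over head_dim, then move heads forward.
query_states = q_proj(hidden_states).view(batch, seq, num_heads, head_dim)
query_states = q_norm(query_states).transpose(1, 2)   # (batch, num_heads, seq, head_dim)
print(query_states.shape)  # torch.Size([2, 4, 5, 8])
```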

@@ -362,11 +399,8 @@ def forward(
         value_states = self.v_proj(hidden_states).view(hidden_shape)
         gate_states = self.gate_proj(hidden_states)
 
-        query_states = self.q_norm(query_states)
-        key_states = self.k_norm(key_states)
-
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
+        query_states = self.q_norm(query_states).transpose(1, 2)
+        key_states = self.k_norm(key_states).transpose(1, 2)
         value_states = value_states.transpose(1, 2)
 
         if self.is_local_attention:

@@ -394,7 +428,7 @@ def forward(
         )
 
         output = output.view(*input_shape, -1).contiguous()
-        output = output * F.sigmoid(gate_states)
+        output = output * torch.sigmoid(gate_states)
         return self.o_proj(output)
 
 
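The functional content of this hunk is only the `F.sigmoid` to `torch.sigmoid` swap; the output gating itself is unchanged. A minimal sketch of that gating with toy shapes:

```python
import torch

attn_output = torch.randn(2, 5, 32)     # (batch, seq, num_heads * head_dim), toy sizes
gate_states = torch.randn(2, 5, 32)     # output of gate_proj(hidden_states)

# Elementwise sigmoid gate on the attention output before o_proj, as in the diff.
gated = attn_output * torch.sigmoid(gate_states)
print(gated.shape)  # torch.Size([2, 5, 32])
```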

@@ -505,15 +539,15 @@ class AfmoePreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """Initialize the weights"""
         if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
-                module.bias.data.zero_()
+                module.bias.zero_()
         elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
             if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
+                module.weight[module.padding_idx].zero_()
         elif isinstance(module, AfmoeRMSNorm):
-            module.weight.data.fill_(1.0)
+            module.weight.fill_(1.0)
 
 
 @auto_docstring
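Dropping the `.data` indirection assumes the in-place initializers run without gradient tracking, which is how Transformers invokes `_init_weights`. A hedged standalone sketch of the idiom, with an explicit `no_grad` guard for clarity:

```python
import torch
from torch import nn

initializer_range = 0.02
module = nn.Linear(16, 16)

# In-place init on the parameters themselves, no `.data`, guarded by no_grad.
with torch.no_grad():
    module.weight.normal_(mean=0.0, std=initializer_range)
    if module.bias is not None:
        module.bias.zero_()

print(module.weight.std().item())  # roughly 0.02
```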

@@ -591,11 +625,7 @@ def forward(
             "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
         }
 
-        hidden_states = inputs_embeds
-
-        # Apply muP input scaling if enabled
-        if self.config.mup_enabled:
-            hidden_states = hidden_states * (self.config.hidden_size**0.5)
+        hidden_states = inputs_embeds * (self.config.hidden_size**0.5)
 
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
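With the muP flag gone, the sqrt(hidden_size) scaling of the input embeddings is unconditional. A hedged sketch of what the model entry point now computes, with toy sizes:

```python
import torch
from torch import nn

hidden_size, vocab_size = 64, 100
embed_tokens = nn.Embedding(vocab_size, hidden_size)

input_ids = torch.randint(0, vocab_size, (1, 8))
inputs_embeds = embed_tokens(input_ids)

# Unconditional input scaling, as in the diff (previously gated on config.mup_enabled).
hidden_states = inputs_embeds * (hidden_size**0.5)
print(hidden_states.shape)  # torch.Size([1, 8, 64])
```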

@@ -633,7 +663,7 @@ class AfmoeForCausalLM(AfmoePreTrainedModel, GenerationMixin):
     [`~PreTrainedModel.from_pretrained`] method to load the model weights.
     """
 
-    _tied_weights_keys = ["lm_head.weight"]
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
 
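The tied-weights mapping now names both ends explicitly. As a hedged, framework-free illustration of what tying `lm_head.weight` to `model.embed_tokens.weight` means at the tensor level:

```python
import torch
from torch import nn

hidden_size, vocab_size = 64, 100
embed_tokens = nn.Embedding(vocab_size, hidden_size)
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

# Tie: the output projection reuses the embedding matrix (same storage, one parameter).
lm_head.weight = embed_tokens.weight
print(lm_head.weight.data_ptr() == embed_tokens.weight.data_ptr())  # True
```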