
Commit 8d78a29

Address PR review feedback for AFMoE model
1 parent 46ca8d5 commit 8d78a29

4 files changed: +13 −979 lines

src/transformers/models/afmoe/configuration_afmoe.py

Lines changed: 5 additions & 0 deletions
@@ -103,6 +103,9 @@ class AfmoeConfig(PreTrainedConfig):
             `global_attn_every_n_layers`.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
+        mup_enabled (`bool`, *optional*, defaults to `False`):
+            Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
+            are scaled by `sqrt(hidden_size)`.
 
     Example:
     ```python
@@ -157,6 +160,7 @@ def __init__(
         sliding_window: Optional[int] = 1024,
         layer_types: Optional[list] = None,
         attention_dropout: Optional[float] = 0.0,
+        mup_enabled: Optional[bool] = False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -187,6 +191,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.global_attn_every_n_layers = global_attn_every_n_layers
         self.sliding_window = sliding_window
+        self.mup_enabled = mup_enabled
         self.layer_types = layer_types
         if self.layer_types is None:
             self.layer_types = [
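
For context, a minimal sketch of how the new flag surfaces on the config object. This is not taken from the PR; it assumes the afmoe module added by this PR is importable and that the remaining constructor arguments can be left at their defaults:

```python
# Hedged sketch: assumes the AFMoE code from this PR is on the path and that
# AfmoeConfig builds cleanly from defaults; only `mup_enabled` is shown here.
from transformers.models.afmoe.configuration_afmoe import AfmoeConfig

config = AfmoeConfig(mup_enabled=True)
print(config.mup_enabled)  # True; omitting the argument keeps the default, False
```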

src/transformers/models/afmoe/modular_afmoe.py

Lines changed: 8 additions & 9 deletions
@@ -132,17 +132,14 @@ def __init__(self, config):
         self.route_scale = config.route_scale
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
 
-    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor | None = None):
+    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor):
         _, _, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
         scores = torch.sigmoid(self.gate(hidden_states).to(torch.float32))
 
-        if expert_bias is not None:
-            _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
-            top_scores = scores.gather(dim=1, index=selected_experts)
-        else:
-            top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
+        _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
+        top_scores = scores.gather(dim=1, index=selected_experts)
 
         if self.route_norm:
             denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
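
This hunk makes `expert_bias` a required argument: selection always runs on the biased scores, while the routing weights are still gathered from the raw sigmoid scores. A standalone sketch of that selection step with made-up shapes and a toy bias, not the model's own tensors:

```python
import torch

top_k = 2
num_experts = 8

scores = torch.sigmoid(torch.randn(4, num_experts))  # (tokens, num_experts), as after the gate
expert_bias = torch.zeros(num_experts)
expert_bias[3] = 10.0  # toy bias that strongly favors expert 3 during selection

# Selection uses the biased scores...
_, selected_experts = torch.topk(scores + expert_bias, k=top_k, dim=1)
# ...but the weights applied to the experts come from the unbiased scores.
top_scores = scores.gather(dim=1, index=selected_experts)
```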
@@ -159,8 +156,6 @@ class AfmoeExperts(nn.ModuleList):
     This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
     """
 
-    _checkpoint_conversion_mapping = {"experts": "experts"}
-
     def __init__(self, config: AfmoeConfig):
         super().__init__()
         self.top_k = config.num_experts_per_tok
@@ -507,7 +502,11 @@ def forward(
             "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
         }
 
-        hidden_states = inputs_embeds * (self.config.hidden_size**0.5)
+        hidden_states = inputs_embeds
+
+        # Apply muP input scaling if enabled
+        if self.config.mup_enabled:
+            hidden_states = hidden_states * (self.config.hidden_size**0.5)
 
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
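
The forward change gates the `sqrt(hidden_size)` embedding scaling behind `config.mup_enabled` instead of applying it unconditionally. A toy illustration of the same arithmetic outside the model, with a plain bool standing in for the real config:

```python
import torch

hidden_size = 64
mup_enabled = True  # stand-in for config.mup_enabled

inputs_embeds = torch.randn(1, 5, hidden_size)
hidden_states = inputs_embeds
if mup_enabled:
    # muP input scaling: multiply embeddings by sqrt(hidden_size)
    hidden_states = hidden_states * (hidden_size ** 0.5)
```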

tests/models/afmoe/__init__.py

Lines changed: 0 additions & 13 deletions
@@ -1,14 +1 @@
-# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
