
Commit 30c3a20

Address PR review feedback for AFMoE model
1 parent 46ca8d5 commit 30c3a20

File tree

5 files changed (+34, -1006 lines)


src/transformers/models/afmoe/configuration_afmoe.py

Lines changed: 5 additions & 8 deletions
@@ -17,7 +17,6 @@
 from typing import Optional

 from ...configuration_utils import PreTrainedConfig, layer_type_validation
-from ...modeling_rope_utils import rope_config_validation, standardize_rope_params
 from ...utils import logging


@@ -103,6 +102,9 @@ class AfmoeConfig(PreTrainedConfig):
             `global_attn_every_n_layers`.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
+        mup_enabled (`bool`, *optional*, defaults to `False`):
+            Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
+            are scaled by `sqrt(hidden_size)`.

     Example:
     ```python
@@ -157,6 +159,7 @@ def __init__(
         sliding_window: Optional[int] = 1024,
         layer_types: Optional[list] = None,
         attention_dropout: Optional[float] = 0.0,
+        mup_enabled: Optional[bool] = False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -187,6 +190,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.global_attn_every_n_layers = global_attn_every_n_layers
         self.sliding_window = sliding_window
+        self.mup_enabled = mup_enabled
         self.layer_types = layer_types
         if self.layer_types is None:
             self.layer_types = [
@@ -200,13 +204,6 @@ def __init__(

         self.num_key_value_heads = num_key_value_heads

-        # Setup and validate rope configs
-        self.rope_parameters = rope_scaling
-        standardize_rope_params(self, rope_theta=rope_theta)
-        if self.rope_scaling is not None and "type" in self.rope_scaling:
-            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-        rope_config_validation(self)
-
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
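
For reference, a minimal sketch of how the new `mup_enabled` flag is meant to be used. This is illustrative only: it assumes `AfmoeConfig` is exported from `transformers` once the AFMoE model is merged, and the scaling shown just restates the docstring above.

```python
# Illustrative sketch, not part of this diff: assumes AfmoeConfig is importable
# from transformers after the AFMoE model lands.
from transformers import AfmoeConfig

config = AfmoeConfig(mup_enabled=True)  # defaults to False

# When enabled, the model multiplies the input embeddings by sqrt(hidden_size)
# before the first decoder layer (see the modeling change below).
embedding_scale = config.hidden_size ** 0.5
```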

src/transformers/models/afmoe/modeling_afmoe.py

Lines changed: 15 additions & 14 deletions
@@ -175,18 +175,16 @@ def __init__(self, config):
         self.route_scale = config.route_scale
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)

-    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor | None = None):
+    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor):
         _, _, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)

         scores = torch.sigmoid(self.gate(hidden_states).to(torch.float32))

-        if expert_bias is not None:
-            _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
-            top_scores = scores.gather(dim=1, index=selected_experts)
-        else:
-            top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
+        _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
+        top_scores = scores.gather(dim=1, index=selected_experts)

+        # Normalize routing weights (default: True for sigmoid scoring)
         if self.route_norm:
             denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
             top_scores = top_scores / denominator
@@ -202,8 +200,6 @@ class AfmoeExperts(nn.ModuleList):
     This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
     """

-    _checkpoint_conversion_mapping = {"experts": "experts"}
-
     def __init__(self, config: AfmoeConfig):
         super().__init__()
         self.top_k = config.num_experts_per_tok
@@ -376,6 +372,7 @@ def __init__(self, config: AfmoeConfig, layer_idx: int):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
+        self.rotary_fn = apply_rotary_pos_emb
         # Parent LlamaAttention already sets: layer_idx, num_heads, num_key_value_heads, num_key_value_groups, head_dim
         # We only add AFMoE-specific attributes
         self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention"
@@ -542,15 +539,15 @@ class AfmoePreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """Initialize the weights"""
         if isinstance(module, nn.Linear):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
-                module.bias.zero_()
+                nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.padding_idx is not None:
-                module.weight[module.padding_idx].zero_()
+                nn.init.zeros_(module.weight[module.padding_idx])
         elif isinstance(module, AfmoeRMSNorm):
-            module.weight.fill_(1.0)
+            nn.init.ones_(module.weight)


 @auto_docstring
@@ -628,7 +625,11 @@ def forward(
                 "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
             }

-        hidden_states = inputs_embeds * (self.config.hidden_size**0.5)
+        hidden_states = inputs_embeds
+
+        # Apply muP input scaling if enabled
+        if self.config.mup_enabled:
+            hidden_states = hidden_states * (self.config.hidden_size**0.5)

         position_embeddings = self.rotary_emb(hidden_states, position_ids)

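
The router now always expects an `expert_bias` tensor rather than treating it as optional. A standalone sketch of the selection logic above, with illustrative shapes and names (this is not the actual AFMoE router module):

```python
# Standalone sketch of the bias-adjusted top-k routing; shapes and values are illustrative.
import torch

num_tokens, num_experts, top_k = 4, 8, 2
scores = torch.sigmoid(torch.randn(num_tokens, num_experts))  # sigmoid scoring in float32
expert_bias = torch.zeros(num_experts)                        # load-balancing bias, used only for selection

# Experts are selected on the biased scores, but the routing weights come from the raw scores.
_, selected_experts = torch.topk(scores + expert_bias, k=top_k, dim=1)
top_scores = scores.gather(dim=1, index=selected_experts)

# With route_norm enabled, the per-token routing weights are normalized to sum to 1.
top_scores = top_scores / (top_scores.sum(dim=-1, keepdim=True) + 1e-20)
```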

src/transformers/models/afmoe/modular_afmoe.py

Lines changed: 14 additions & 14 deletions
@@ -132,18 +132,16 @@ def __init__(self, config):
         self.route_scale = config.route_scale
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)

-    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor | None = None):
+    def forward(self, hidden_states: torch.Tensor, expert_bias: torch.Tensor):
         _, _, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)

         scores = torch.sigmoid(self.gate(hidden_states).to(torch.float32))

-        if expert_bias is not None:
-            _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
-            top_scores = scores.gather(dim=1, index=selected_experts)
-        else:
-            top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
+        _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
+        top_scores = scores.gather(dim=1, index=selected_experts)

+        # Normalize routing weights (default: True for sigmoid scoring)
         if self.route_norm:
             denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
             top_scores = top_scores / denominator
@@ -159,8 +157,6 @@ class AfmoeExperts(nn.ModuleList):
     This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
     """

-    _checkpoint_conversion_mapping = {"experts": "experts"}
-
     def __init__(self, config: AfmoeConfig):
         super().__init__()
         self.top_k = config.num_experts_per_tok
@@ -421,15 +417,15 @@ class AfmoePreTrainedModel(LlamaPreTrainedModel):
     def _init_weights(self, module):
         """Initialize the weights"""
         if isinstance(module, nn.Linear):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.bias is not None:
-                module.bias.zero_()
+                nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
-            module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
             if module.padding_idx is not None:
-                module.weight[module.padding_idx].zero_()
+                nn.init.zeros_(module.weight[module.padding_idx])
         elif isinstance(module, AfmoeRMSNorm):
-            module.weight.fill_(1.0)
+            nn.init.ones_(module.weight)


 @auto_docstring
@@ -507,7 +503,11 @@ def forward(
                 "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
             }

-        hidden_states = inputs_embeds * (self.config.hidden_size**0.5)
+        hidden_states = inputs_embeds
+
+        # Apply muP input scaling if enabled
+        if self.config.mup_enabled:
+            hidden_states = hidden_states * (self.config.hidden_size**0.5)

         position_embeddings = self.rotary_emb(hidden_states, position_ids)

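
Both `modeling_afmoe.py` and `modular_afmoe.py` switch `_init_weights` from in-place tensor methods to the functional `torch.nn.init` API. A minimal illustration of the equivalent calls (the `initializer_range` value here is a placeholder, not the model's actual default):

```python
# Illustrative comparison of the old in-place calls and the torch.nn.init equivalents.
import torch
import torch.nn as nn

initializer_range = 0.02  # placeholder; the real value comes from the model config

linear = nn.Linear(16, 16)
nn.init.normal_(linear.weight, mean=0.0, std=initializer_range)  # was: linear.weight.normal_(...)
if linear.bias is not None:
    nn.init.zeros_(linear.bias)                                  # was: linear.bias.zero_()

norm_weight = nn.Parameter(torch.empty(16))
nn.init.ones_(norm_weight)                                       # was: norm_weight.fill_(1.0)
```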

tests/models/afmoe/__init__.py

Lines changed: 0 additions & 13 deletions
@@ -1,14 +1 @@
-# Copyright 2025 Arcee AI and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

