Commit 220a7f8

Add impl of Muon optimizer. Fix #2580
1 parent: 68bc434

6 files changed: +611, -15 lines


tests/test_optim.py

Lines changed: 8 additions & 0 deletions
@@ -394,6 +394,14 @@ def test_kron(optimizer):
     _test_model(optimizer, dict(lr=1e-3))


+@pytest.mark.parametrize('optimizer', ['muon'])
+def test_muon(optimizer):
+    _test_rosenbrock(
+        lambda params: create_optimizer_v2(params, optimizer, lr=1e-3)
+    )
+    _test_model(optimizer, dict(lr=1e-3))
+
+
 @pytest.mark.parametrize('optimizer', ['adopt', 'adoptw'])
 def test_adopt(optimizer):
     _test_rosenbrock(
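
For reference, a minimal sketch of the factory call the new test exercises, usable outside pytest; the model here is a stand-in, not part of this commit:

import torch.nn as nn
from timm.optim import create_optimizer_v2

model = nn.Linear(10, 10)  # stand-in model, not part of the commit
# 'muon' resolves through the registry entry added in _optim_factory.py below
opt = create_optimizer_v2(model, 'muon', lr=1e-3, weight_decay=0.05)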

timm/optim/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 from .lookahead import Lookahead
 from .madgrad import MADGRAD
 from .mars import Mars
+from .muon import Muon
 from .nadam import NAdamLegacy
 from .nadamw import NAdamW
 from .nvnovograd import NvNovoGrad

timm/optim/_optim_factory.py

Lines changed: 9 additions & 0 deletions
@@ -31,6 +31,7 @@
 from .lookahead import Lookahead
 from .madgrad import MADGRAD
 from .mars import Mars
+from .muon import Muon
 from .nadam import NAdamLegacy
 from .nadamw import NAdamW
 from .nvnovograd import NvNovoGrad
@@ -871,6 +872,14 @@ def _register_other_optimizers(registry: OptimizerRegistry) -> None:
             description='Unleashing the Power of Variance Reduction for Training Large Models',
             has_betas=True,
         ),
+        OptimInfo(
+            name='muon',
+            opt_class=Muon,
+            description='MomentUm Orthogonalized by Newton-schulz with AdamW fallback for 1D params',
+            has_momentum=True,
+            has_eps=True,
+            has_betas=True,
+        ),
         OptimInfo(
             name='novograd',
             opt_class=NvNovoGrad,
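
The new timm/optim/muon.py is among the six changed files but is not shown in this excerpt. For context, below is a minimal sketch of the Newton-Schulz orthogonalization step that gives Muon its name, assuming the quintic iteration and coefficients from Keller Jordan's reference Muon implementation; the function name and details are illustrative and may differ from the committed file. This also explains the registry flags: 1D parameters (biases, norm scales) cannot be usefully orthogonalized, so Muon falls back to AdamW for them, and has_eps/has_betas expose that fallback's hyperparameters.

import torch

def zeropower_via_newtonschulz(G: torch.Tensor, steps: int = 5) -> torch.Tensor:
    """Approximately orthogonalize G (G = U S V^T -> ~U V^T) using only matmuls."""
    a, b, c = 3.4445, -4.7750, 2.0315  # quintic coefficients from the reference impl
    X = G.bfloat16()
    transposed = X.size(-2) > X.size(-1)
    if transposed:
        X = X.mT  # iterate on the wide orientation for stability
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)  # bound spectral norm near 1
    for _ in range(steps):
        A = X @ X.mT
        X = a * X + (b * A + c * A @ A) @ X
    if transposed:
        X = X.mT
    return X.to(G.dtype)

# Illustrative use inside an update step:
#   update = zeropower_via_newtonschulz(momentum_buffer)
#   p.add_(update, alpha=-lr)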

timm/optim/_param_groups.py

Lines changed: 50 additions & 14 deletions
@@ -1,3 +1,4 @@
+import fnmatch
 import logging
 from itertools import islice
 from typing import Collection, Optional
@@ -10,27 +11,54 @@
 _logger = logging.getLogger(__name__)


+def _matches_pattern(name: str, patterns: Collection[str]) -> bool:
+    """Check if parameter name matches any pattern (supports wildcards)."""
+    return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)
+
+
 def param_groups_weight_decay(
         model: nn.Module,
         weight_decay: float = 1e-5,
         no_weight_decay_list: Collection[str] = (),
+        simple_params_list: Collection[str] = (),
 ):
-    no_weight_decay_list = set(no_weight_decay_list)
     decay = []
+    decay_simple = []
     no_decay = []
+    no_decay_simple = []
     for name, param in model.named_parameters():
         if not param.requires_grad:
             continue

-        if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list:
-            no_decay.append(param)
+        # Determine if this is a "simple" parameter for fallback optimizer (if available)
+        is_simple = _matches_pattern(name, simple_params_list)
+
+        # Determine weight decay
+        matches_pattern = _matches_pattern(name, no_weight_decay_list)
+        if param.ndim <= 1 or name.endswith(".bias") or matches_pattern:
+            # No weight decay
+            if is_simple:
+                no_decay_simple.append(param)
+            else:
+                no_decay.append(param)
         else:
-            decay.append(param)
-
-    return [
-        {'params': no_decay, 'weight_decay': 0.},
-        {'params': decay, 'weight_decay': weight_decay}]
-
+            # With weight decay
+            if is_simple:
+                decay_simple.append(param)
+            else:
+                decay.append(param)
+
+    groups = []
+    if decay:
+        groups.append({'params': decay, 'weight_decay': weight_decay})
+    if decay_simple:
+        groups.append({'params': decay_simple, 'weight_decay': weight_decay, 'simple': True})
+    if no_decay:
+        groups.append({'params': no_decay, 'weight_decay': 0.})
+    if no_decay_simple:
+        groups.append({'params': no_decay_simple, 'weight_decay': 0., 'simple': True})
+
+    return groups

 def _group(it, size):
     it = iter(it)
@@ -70,9 +98,9 @@ def param_groups_layer_decay(
         model: nn.Module,
         weight_decay: float = 0.05,
         no_weight_decay_list: Collection[str] = (),
+        simple_params_list: Collection[str] = (),
         weight_decay_exclude_1d: bool = True,
         layer_decay: float = .75,
-        end_layer_decay: Optional[float] = None,
         min_scale: float = 0.,
         no_opt_scale: Optional[float] = None,
         verbose: bool = False,
@@ -81,7 +109,6 @@ def param_groups_layer_decay(
     Parameter groups for layer-wise lr decay & weight decay
     Based on BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
     """
-    no_weight_decay_list = set(no_weight_decay_list)
     param_group_names = {}  # NOTE for debugging
     param_groups = {}

@@ -99,8 +126,12 @@ def param_groups_layer_decay(
         if not param.requires_grad:
             continue

-        # no decay: all 1D parameters and model specific ones
-        if (weight_decay_exclude_1d and param.ndim <= 1) or name in no_weight_decay_list:
+        # Determine if this is a "simple" parameter for fallback optimizer (if available)
+        is_simple = _matches_pattern(name, simple_params_list)
+
+        # Determine weight decay
+        if (weight_decay_exclude_1d and param.ndim <= 1) or _matches_pattern(name, no_weight_decay_list):
+            # no weight decay for 1D parameters and model specific ones
             g_decay = "no_decay"
             this_decay = 0.
         else:
@@ -114,18 +145,23 @@ def param_groups_layer_decay(
             param.requires_grad = False
             continue

-        group_name = "layer_%d_%s" % (layer_id, g_decay)
+        simple_suffix = "_simple" if is_simple else ""
+        group_name = "layer_%d_%s%s" % (layer_id, g_decay, simple_suffix)
+
         if group_name not in param_groups:
             param_group_names[group_name] = {
                 "lr_scale": this_scale,
                 "weight_decay": this_decay,
+                "simple": is_simple,
                 "param_names": [],
             }
             param_groups[group_name] = {
                 "lr_scale": this_scale,
                 "weight_decay": this_decay,
                 "params": [],
             }
+            if is_simple:
+                param_groups[group_name]["simple"] = True

         param_group_names[group_name]["param_names"].append(name)
         param_groups[group_name]["params"].append(param)
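
To see what the reworked grouping produces, a small usage sketch (the model and the '1.*' pattern are hypothetical): parameters matching simple_params_list land in groups tagged 'simple': True, which a hybrid optimizer like Muon can route to its AdamW fallback, and patterns may now contain fnmatch wildcards.

import torch.nn as nn
from timm.optim._param_groups import param_groups_weight_decay

# Hypothetical model: one 2D weight (Muon-eligible) plus 1D bias/norm params
model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16))

# '1.*' marks every LayerNorm parameter ('1.weight', '1.bias') as "simple",
# i.e. destined for the fallback optimizer
groups = param_groups_weight_decay(
    model,
    weight_decay=0.05,
    simple_params_list=['1.*'],
)
for g in groups:
    print(len(g['params']), g['weight_decay'], g.get('simple', False))
# -> 1 0.05 False   (Linear weight: decay group)
#    1 0.0 False    (Linear bias: no-decay group)
#    2 0.0 True     (LayerNorm weight & bias: no-decay, simple=True)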
