
Commit 2fbd25c

SunMarc authored and Cyrilvallez committed

Fix bnb fsdp loading for pre-quantized checkpoint (#41415)

* fix
* fix
* get_param_name
* fix device name
1 parent a92b1e8 commit 2fbd25c

File tree

5 files changed (+32, -25 lines)
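
For context, the scenario this commit fixes is loading a checkpoint that was already quantized with bitsandbytes 4-bit while the model is sharded with FSDP. Below is a minimal sketch of that load path, not part of the diff: the repo id is hypothetical, and FSDP is assumed to be enabled outside the script (for example via accelerate launch with an FSDP config), which is what makes is_fsdp_enabled() return True inside _load_state_dict_into_meta_model.

# Hypothetical example of the affected load path (not part of the diff).
# Run under an FSDP-enabled launcher, e.g.:
#   accelerate launch --config_file fsdp_config.yaml load.py
from transformers import AutoModelForCausalLM

model_id = "my-org/llama-3-8b-bnb-4bit"  # hypothetical pre-quantized bnb checkpoint
# The quantization config ships with the checkpoint, so no BitsAndBytesConfig is passed here.
model = AutoModelForCausalLM.from_pretrained(model_id)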

src/transformers/modeling_utils.py

Lines changed: 8 additions & 12 deletions

@@ -777,21 +777,17 @@ def _load_state_dict_into_meta_model(
         # and then cast it to CPU to avoid excessive memory usage on each GPU
         # in comparison to the sharded model across GPUs.
         if is_fsdp_enabled() or is_deepspeed_zero3_enabled():
-            param_name = hf_quantizer.update_param_name(param_name)
+            param_name = hf_quantizer.get_param_name(param_name)
             module, param_type = get_module_from_name(model, param_name)
             value = getattr(module, param_type)
-            # special case for gpt_oss model, we wait for the param to be leave the meta device before casting it to cpu
-            if model.config.model_type == "gpt_oss" and value.device.type == "meta":
+            # We need to wait until the quantized value is created
+            if value.device.type == "meta":
                 continue
-            param_to = "cpu"
-            if is_fsdp_enabled() and not is_local_dist_rank_0():
-                param_to = "meta"
-            val_kwargs = {}
-            if (hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params") or (
-                value.dtype == torch.uint8 or value.dtype == torch.int8
-            ):
+            val_kwargs = value.__dict__
+            if not value.is_floating_point():
                 val_kwargs["requires_grad"] = False
-            value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__)
+            device = "meta" if is_fsdp_enabled() and not is_local_dist_rank_0() else "cpu"
+            value = type(value)(value.data.to(device), **val_kwargs)
             setattr(module, param_type, value)

         # Remove the param from the state dict if it was not loaded on the fly to avoid wasting memory

@@ -6070,7 +6066,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: dict,
         # For example in the case of MXFP4 quantization, we need to update the param name to the original param name
         # because the checkpoint contains blocks, and scales, but since we are dequantizing, we need to use the original param name
         if hf_quantizer is not None:
-            param_name = hf_quantizer.update_param_name(param_name)
+            param_name = hf_quantizer.get_param_name(param_name)

         try:
             param = model.get_parameter_or_buffer(param_name)
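
To make the new re-wrapping pattern concrete, here is a small, self-contained sketch. Assumption: a plain torch.nn.Parameter stands in for the bitsandbytes Params4bit/Int8Params objects, whose extra attributes (quant_state and friends) would travel through __dict__ the same way.

import torch

# Stand-in for the quantized parameter: an integer tensor wrapped as a Parameter.
value = torch.nn.Parameter(
    torch.randint(0, 255, (4, 4), dtype=torch.uint8), requires_grad=False
)

# Carry over any extra instance attributes (empty for a plain Parameter,
# but holds quant_state & co. for the bitsandbytes parameter classes).
val_kwargs = dict(value.__dict__)
# Integer tensors cannot require grad, so force requires_grad=False.
if not value.is_floating_point():
    val_kwargs["requires_grad"] = False

# Non-rank-0 FSDP processes keep the value on "meta" to avoid duplicating it in RAM.
is_rank_0 = True  # placeholder for the is_fsdp_enabled()/is_local_dist_rank_0() check
device = "cpu" if is_rank_0 else "meta"
new_value = type(value)(value.data.to(device), **val_kwargs)
print(type(new_value), new_value.device, new_value.requires_grad)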

src/transformers/quantizers/base.py

Lines changed: 1 addition & 1 deletion

@@ -283,7 +283,7 @@ def _dequantize(self, model):
             f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub."
         )

-    def update_param_name(self, param_name: str) -> str:
+    def get_param_name(self, param_name: str) -> str:
        """
        Override this method if you want to adjust the `param_name`.
        """

src/transformers/quantizers/quantizer_bnb_4bit.py

Lines changed: 16 additions & 5 deletions

@@ -160,6 +160,19 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **
         module, name = get_module_from_name(model, param_name)
         return isinstance(module, bnb.nn.Linear4bit) and name != "bias"

+    def get_param_name(self, param_name: str) -> str:
+        """
+        Get the right param_name in order to get the module associated with the param.
+        This is useful for quantized stats like absmax or quant_map, as we need to update the param_name to get the module since they are stored in ...weight.absmax.
+        """
+        if self.pre_quantized:
+            # We need to get the param name of quantized weights and not its components. Otherwise, we won't be able to get the nn.Module associated.
+            if any(param_name.endswith(x) for x in self.bnb_keys):
+                param_name = (
+                    param_name.rsplit(".", 1)[0] if "quant_state." not in param_name else param_name.rsplit(".", 2)[0]
+                )
+        return param_name
+
     def create_quantized_param(
         self,
         model: "PreTrainedModel",

@@ -170,12 +183,10 @@ def create_quantized_param(
     ):
         import bitsandbytes as bnb

-        is_quant_stat = any(param_name.endswith(x) for x in self.bnb_keys)
         full_name = param_name
-        if is_quant_stat:
-            param_name = (
-                param_name.rsplit(".", 1)[0] if "quant_state." not in param_name else param_name.rsplit(".", 2)[0]
-            )
+
+        # update param name to get the weights instead of the quantized stats
+        param_name = self.get_param_name(param_name)
         module, tensor_name = get_module_from_name(model, param_name)

         # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
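
As an illustration of the renaming logic added above, here is a standalone sketch. The bnb_keys list is an assumption mirroring typical bitsandbytes 4-bit state-dict suffixes (absmax, quant_map, serialized quant_state entries); the real values live on the quantizer instance.

# Hypothetical, standalone version of the suffix-stripping logic above.
bnb_keys = [
    "absmax",
    "quant_map",
    "nested_absmax",
    "nested_quant_map",
    "quant_state.bitsandbytes__nf4",
]

def get_param_name(param_name: str) -> str:
    # Map a quantization-stat key back to the weight it belongs to, so that
    # get_module_from_name() can resolve the owning nn.Module.
    if any(param_name.endswith(x) for x in bnb_keys):
        return param_name.rsplit(".", 1)[0] if "quant_state." not in param_name else param_name.rsplit(".", 2)[0]
    return param_name

print(get_param_name("model.layers.0.self_attn.q_proj.weight.absmax"))
# -> model.layers.0.self_attn.q_proj.weight
print(get_param_name("model.layers.0.self_attn.q_proj.weight.quant_state.bitsandbytes__nf4"))
# -> model.layers.0.self_attn.q_proj.weight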

src/transformers/quantizers/quantizer_mxfp4.py

Lines changed: 1 addition & 1 deletion

@@ -365,7 +365,7 @@ def update_ep_plan(self, config):
         )
         return config

-    def update_param_name(self, param_name: str) -> str:
+    def get_param_name(self, param_name: str) -> str:
         if self.quantization_config.dequantize:
             if "_blocks" in param_name:
                 return param_name.replace("_blocks", "")

tests/quantization/mxfp4/test_mxfp4.py

Lines changed: 6 additions & 6 deletions

@@ -265,7 +265,7 @@ def test_update_expected_keys(self):

         self.assertEqual(set(updated_keys), set(expected_updated))

-    def test_update_param_name_dequantize(self):
+    def test_get_param_name_dequantize(self):
         """Test parameter name updating when dequantizing"""
         from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

@@ -274,28 +274,28 @@ def test_update_param_name_dequantize(self):

         # Should remove _blocks suffix
         param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
-        updated_name = quantizer.update_param_name(param_name)
+        updated_name = quantizer.get_param_name(param_name)
         self.assertEqual(updated_name, "model.layers.0.mlp.experts.gate_up_proj")

         # Should remove _scales suffix
         param_name = "model.layers.0.mlp.experts.down_proj_scales"
-        updated_name = quantizer.update_param_name(param_name)
+        updated_name = quantizer.get_param_name(param_name)
         self.assertEqual(updated_name, "model.layers.0.mlp.experts.down_proj")

         # Should not change other names
         param_name = "model.embed_tokens.weight"
-        updated_name = quantizer.update_param_name(param_name)
+        updated_name = quantizer.get_param_name(param_name)
         self.assertEqual(updated_name, "model.embed_tokens.weight")

-    def test_update_param_name_no_dequantize(self):
+    def test_get_param_name_no_dequantize(self):
         """Test parameter name updating when not dequantizing"""
         from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

         config = Mxfp4Config(dequantize=False)
         quantizer = Mxfp4HfQuantizer(config)

         param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
-        updated_name = quantizer.update_param_name(param_name)
+        updated_name = quantizer.get_param_name(param_name)
         self.assertEqual(updated_name, param_name)

     def test_is_trainable(self):

0 commit comments