Skip to content

Commit ffdd10f

Browse files
shanjiaz and kylesayrs authored
Allow compression on meta device (#39039)
* disable gradient calculation for int weights

  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* Update src/transformers/quantizers/quantizer_compressed_tensors.py

  Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>

* updated model procession before/after weight loading

  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* fix style

  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* reformat

  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

* fix style

  Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>

---------

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent f0e7781 commit ffdd10f

File tree

1 file changed

+8
-60
lines changed

1 file changed

+8
-60
lines changed

src/transformers/quantizers/quantizer_compressed_tensors.py

Lines changed: 8 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313
# limitations under the License.
1414

1515

16-
import os
17-
import re
18-
1916
from ..utils import is_compressed_tensors_available, is_torch_available, logging
2017
from ..utils.quantization_config import CompressedTensorsConfig
2118
from .base import HfQuantizer
@@ -55,45 +52,6 @@ def __init__(self, quantization_config: CompressedTensorsConfig, **kwargs):
5552
self.run_compressed = quantization_config.run_compressed
5653
self.quantization_config = quantization_config
5754

58-
def update_missing_keys_after_loading(self, model, missing_keys: list[str], prefix: str) -> list[str]:
59-
"""
60-
Update missing keys after loading the model. This is necessary for compressed tensors
61-
to load the model correctly. We expect weights to be present in missing keys.
62-
The weight's are re-constructed by ModelCompressor in _process_model_after_weight_loading
63-
64-
This function cleans up expected missing keys and returns the remaining missing keys
65-
"""
66-
67-
if self.run_compressed:
68-
return missing_keys
69-
70-
# We expect some keys to be missing for
71-
# compressed models
72-
# This is fine as the weights are reconstructed by ModelCompressor
73-
# in _process_model_after_weight_loading
74-
75-
expected_missing_keys = self.compressor.get_missing_module_keys(model)
76-
return [
77-
key for key in missing_keys if not any(re.match(f".*{pattern}", key) for pattern in expected_missing_keys)
78-
]
79-
80-
def update_unexpected_keys(self, model, unexpected_keys: list[str], prefix: str) -> list[str]:
81-
"""
82-
Override this method if you want to adjust the `unexpected_keys`.
83-
84-
Args:
85-
unexpected_keys (`list[str]`, *optional*):
86-
The list of unexpected keys in the checkpoint compared to the state dict of the model
87-
"""
88-
89-
if self.run_compressed:
90-
return unexpected_keys
91-
92-
# We expect some unexpected keys in model
93-
# safetensors file for compressed models
94-
keys_to_ignore = self.compressor.get_unexpected_file_keys(model)
95-
return [key for key in unexpected_keys if not any(re.match(f".*{pattern}", key) for pattern in keys_to_ignore)]
96-
9755
def validate_environment(self, *args, **kwargs):
9856
if not is_compressed_tensors_available():
9957
raise ImportError(
@@ -117,31 +75,21 @@ def _process_model_before_weight_loading(self, model, **kwargs):
11775

11876
ct_quantization_config = self.compressor.quantization_config
11977

120-
if self.run_compressed:
121-
apply_quantization_config(model, ct_quantization_config, run_compressed=True)
122-
elif not self.quantization_config.is_quantization_compressed:
123-
apply_quantization_config(model, ct_quantization_config)
78+
# Always initialize compressed wrappers to match the checkpoint
79+
apply_quantization_config(model, ct_quantization_config, self.run_compressed)
80+
if (
81+
self.quantization_config.is_quantization_compressed
82+
or self.quantization_config.is_sparsification_compressed
83+
):
84+
self.compressor.compress_model(model=model)
12485

12586
def _process_model_after_weight_loading(self, model, **kwargs):
12687
"""Decompress loaded model if necessary - need for qat"""
12788

12889
if (
12990
self.quantization_config.is_quantization_compressed and not self.run_compressed
13091
) or self.quantization_config.is_sparsification_compressed:
131-
config = kwargs.get("config")
132-
cache_path = config._name_or_path
133-
134-
if not os.path.exists(cache_path):
135-
from transformers.utils import cached_file
136-
137-
config_file_path = cached_file(cache_path, "config.json")
138-
cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
139-
140-
if self.quantization_config.is_quantization_compressed and not self.run_compressed:
141-
from compressed_tensors.quantization import QuantizationStatus
142-
143-
self.compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN
144-
self.compressor.decompress(model_path=cache_path, model=model)
92+
self.compressor.decompress_model(model=model)
14593

14694
def update_tp_plan(self, config):
14795
additional_plan = {

0 commit comments

Comments (0)