     require_accelerate,
     require_fp_quant,
     require_qutlass,
-    require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
     slow,
     torch_device,
 )


-@require_torch_gpu
+@require_torch_accelerator
 class FPQuantConfigTest(unittest.TestCase):
     def test_to_dict(self):
         """
@@ -53,7 +53,7 @@ def test_from_dict(self):


 @slow
-@require_torch_gpu
+@require_torch_accelerator
 @require_fp_quant
 @require_accelerate
 class FPQuantBaseTest(unittest.TestCase):
@@ -64,7 +64,7 @@ class FPQuantBaseTest(unittest.TestCase):

     EXPECTED_OUTPUT = "1 2 3 4 5 6"

-    device_map = "cuda"
+    device_map = torch_device

     @classmethod
     def getQuantizationConfig(cls):
@@ -77,10 +77,10 @@ def setUpClass(cls):
         Setup quantized model
         """

-        quantization_config = cls.getQuantizationConfig()
+        cls.quantization_config = cls.getQuantizationConfig()
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
         cls.quantized_model = AutoModelForCausalLM.from_pretrained(
-            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
+            cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
         )

     def tearDown(self):
@@ -111,24 +111,25 @@ def test_save_pretrained(self):
         output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
         self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

-    @require_torch_multi_gpu
-    def test_quantized_model_multi_gpu(self):
+    @require_torch_multi_accelerator
+    def test_quantized_model_multi_accelerator(self):
         """
-        Simple test that checks if the quantized model is working properly with multiple GPUs
-        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
+        Simple test that checks if the quantized model is working properly with multiple accelerators.
+        Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs, or set ZE_AFFINITY_MASK=0,1
+        if you have more than 2 Intel XPUs.
         """
         input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
-        quantization_config = FPQuantConfig()
+
         quantized_model = AutoModelForCausalLM.from_pretrained(
-            self.model_name, device_map="auto", quantization_config=quantization_config
+            self.model_name, device_map="auto", quantization_config=self.quantization_config
         )
         self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

         output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
         self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

-    @require_torch_multi_gpu
-    def test_save_pretrained_multi_gpu(self):
+    @require_torch_multi_accelerator
+    def test_save_pretrained_multi_accelerator(self):
         """
         Simple test that checks if the quantized model is working properly after being saved and loaded
         """
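
For context, the change replaces the CUDA-only test gates (`require_torch_gpu`, a hard-coded `"cuda"` device map) with the device-agnostic helpers from `transformers.testing_utils`. Below is a minimal sketch of that pattern, assuming those helpers behave as in current transformers releases; the `facebook/opt-125m` checkpoint and the test class name are illustrative stand-ins, not part of this PR.

```python
# Minimal sketch (not part of this PR): a test gated on any accelerator rather
# than on CUDA specifically. Assumes transformers.testing_utils exposes
# require_torch_accelerator and torch_device; the checkpoint is a stand-in.
import unittest

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import require_torch_accelerator, torch_device


@require_torch_accelerator  # skipped when no accelerator backend is available
class DeviceAgnosticGenerationTest(unittest.TestCase):
    def test_generate(self):
        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
        # torch_device resolves to "cuda", "xpu", etc. on the running machine,
        # so device placement needs no per-backend branches in the test body.
        model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map=torch_device)
        inputs = tokenizer("1 2 3 4", return_tensors="pt").to(torch_device)
        output = model.generate(**inputs, max_new_tokens=2)
        self.assertGreater(output.shape[-1], inputs["input_ids"].shape[-1])
```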