
Commit 7aa59a0

Inference speed test (#1159)
* inference speed
* code review
* code review
* add comment
* code review
* code review
* code clean up
* fix name
* code updated
1 parent 73cd87f commit 7aa59a0

File tree

3 files changed: +190 -0 lines changed


tests/inference_speed.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
# Copyright 2025 ModelCloud
# Contact: qubitium@modelcloud.ai, x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


import unittest
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.utils.progress import ProgressBar


class InferenceSpeed(unittest.TestCase):
    NATIVE_MODEL_ID = "/monster/data/model/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2"
    BITBLAS_NATIVE_MODEL_ID = "/monster/data/model/opt-125M-autoround-lm_head-false-symTrue"
    MAX_NEW_TOKENS = 10
    NUM_RUNS = 20
    PROMPTS = [
        "I am in Paris and I",
        "The capital of the United Kingdom is",
        "The largest ocean on Earth is",
        "The world’s longest river is",
        "The tallest mountain in the world is",
        "The currency used in Japan is",
        "How to consult a dictionary?",
        "What is the boiling point of water in degrees Celsius?",
        "Which is the most widely used Internet search engine in the world?",
        "What is the official language of France?",
    ]
    MAX_DELTA_FLOOR_PERCENT = 0.25
    MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25

    def inference(self, model_path, backend, tokens_per_second, assert_result=True):
        model = GPTQModel.from_quantized(
            model_path,
            backend=backend,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token_id = tokenizer.eos_token_id
        # Left-pad the batched prompts so generated tokens are appended at the end of each sequence
        inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left').to(
            model.device)

        times = []
        tokens = []

        pb = ProgressBar(range(self.NUM_RUNS))
        for i in pb:
            pb.set_description(f"run index {i} of {self.NUM_RUNS - 1}")
            start_time = time.time()
            result = model.generate(**inp, max_new_tokens=self.MAX_NEW_TOKENS, pad_token_id=tokenizer.pad_token_id)
            end_time = time.time()
            elapsed_time = end_time - start_time
            times.append(elapsed_time)

            # Count only the newly generated tokens for each prompt in the batch
            for j in range(result.shape[0]):
                new_tokens = result[j][inp['input_ids'].shape[1]:]
                new_token_count = len(new_tokens)
                tokens.append(new_token_count)

        sum_time = sum(times)
        sum_tokens = sum(tokens)

        avg_tokens_per_second = round(sum_tokens / sum_time, 2)

        print(f"\n**************** {backend} Result Info ****************")
        print(f"Times: {times}")
        print(f"New Tokens: {tokens}")
        print(f"Sum Times: {sum_time}")
        print(f"Sum New Tokens: {sum_tokens}")
        print(f"New Tokens Per Second: {avg_tokens_per_second} token/s")
        print(f"**************** {backend} Result Info End ****************")

        if not assert_result:
            return

        # The measured throughput must fall within the allowed band around the expected tokens_per_second
        diff_pct = (avg_tokens_per_second / tokens_per_second) * 100
        negative_pct = 100 * (1 - self.MAX_DELTA_FLOOR_PERCENT)
        positive_pct = 100 * (1 + self.MAX_POSITIVE_DELTA_CEIL_PERCENT)

        self.assertTrue(negative_pct <= diff_pct <= positive_pct,
                        f"Tokens Per Second: {avg_tokens_per_second} diff {diff_pct:.2f}% is out of the expected range [{negative_pct}%-{positive_pct}%]")

tests/test_inference_speed.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# Copyright 2025 ModelCloud
# Contact: qubitium@modelcloud.ai, x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from parameterized import parameterized

from gptqmodel.utils import BACKEND
from inference_speed import InferenceSpeed

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

'''
Reference tokens/s per backend:

NATIVE_MODEL_ID = /monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1
BITBLAS_NATIVE_MODEL_ID = /monster/data/model/opt-125M-autoround-lm_head-false-symTrue
GPU: 4090

(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 748),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.CUDA, 493),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 717),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 775),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 296),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 295),
(InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 1474),
(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.IPEX, 48),
'''

class TestInferenceSpeed(InferenceSpeed):

    @parameterized.expand(
        [
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 262),
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.CUDA, 48),
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 186),
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 188),
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 141),
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 48),
            (InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 1474),  # second BitBLAS run, which benefits from its kernel cache
        ]
    )
    def test_inference_speed(self, model_path, backend, tokens_per_second):
        # The first and second BitBLAS runs differ because the second run hits a compiled-kernel cache,
        # so only the second run is asserted; the first run only prints its timing info.
        if backend == BACKEND.BITBLAS:
            self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second, assert_result=False)

        self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second)
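
As an aside, the test above leans on parameterized.expand to fan the tuple list out into independent unittest cases, so a regression in one backend does not hide the others. Below is a minimal, self-contained toy sketch of that decorator's behavior; the class name and numbers are illustrative only and not part of this commit:

# Toy illustration of parameterized.expand: each tuple becomes its own test method,
# named with an index and a slug of the first string argument (e.g. ..._0_marlin).
import unittest
from parameterized import parameterized

class ToyParamTest(unittest.TestCase):
    @parameterized.expand([
        ("marlin", 262),
        ("torch", 48),
    ])
    def test_tokens_per_second_is_positive(self, name, tokens_per_second):
        self.assertGreater(tokens_per_second, 0)

if __name__ == "__main__":
    unittest.main()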

tests/test_inference_speed_ipex.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Copyright 2025 ModelCloud
# Contact: qubitium@modelcloud.ai, x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

from gptqmodel.utils import BACKEND
from parameterized import parameterized
from inference_speed import InferenceSpeed


class TestInferenceSpeedIpex(InferenceSpeed):
    @parameterized.expand(
        [
            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.IPEX, 12),
        ]
    )
    def test_inference_speed_ipex(self, model_path, backend, tokens_per_second):
        self.inference(model_path=model_path, backend=backend, tokens_per_second=tokens_per_second)

0 commit comments