# Copyright 2025 ModelCloud
# Contact: qubitium@modelcloud.ai, x.com/qubitium
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time


# Set before any CUDA-aware import so GPU device indices follow PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


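# unittest, transformers, and gptqmodel can touch CUDA state on import, which is
# presumably why they are imported only after CUDA_DEVICE_ORDER is set above.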
import unittest
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.utils.progress import ProgressBar


class InferenceSpeed(unittest.TestCase):
    NATIVE_MODEL_ID = "/monster/data/model/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2"
    BITBLAS_NATIVE_MODEL_ID = "/monster/data/model/opt-125M-autoround-lm_head-false-symTrue"
    MAX_NEW_TOKENS = 10
    NUM_RUNS = 20
    PROMPTS = [
        "I am in Paris and I",
        "The capital of the United Kingdom is",
        "The largest ocean on Earth is",
        "The world’s longest river is",
        "The tallest mountain in the world is",
        "The currency used in Japan is",
        "How to consult a dictionary?",
        "What is the boiling point of water in degrees Celsius?",
        "Which is the most widely used Internet search engine in the world?",
        "What is the official language of France?",
    ]
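    # Measured tokens/sec must land within [baseline * (1 - floor), baseline * (1 + ceil)].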
    MAX_DELTA_FLOOR_PERCENT = 0.25
    MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25

    def inference(self, model_path, backend, tokens_per_second, assert_result=True):
        model = GPTQModel.from_quantized(
            model_path,
            backend=backend,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        tokenizer.pad_token_id = tokenizer.eos_token_id
        # Left padding keeps prompt tokens adjacent to the generated tokens in a batch.
        inp = tokenizer(
            self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side="left"
        ).to(model.device)

        times = []
        tokens = []

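        # Time NUM_RUNS batched generate() calls and count only the newly generated tokens.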
        pb = ProgressBar(range(self.NUM_RUNS))
        for i in pb:
            pb.set_description(f"run index {i} of {self.NUM_RUNS - 1}")
            start_time = time.time()
            result = model.generate(**inp, max_new_tokens=self.MAX_NEW_TOKENS, pad_token_id=tokenizer.pad_token_id)
            end_time = time.time()
            elapsed_time = end_time - start_time
            times.append(elapsed_time)

            for j in range(result.shape[0]):
                # Everything past the padded prompt length is newly generated.
                new_tokens = result[j][inp["input_ids"].shape[1]:]
                new_token_count = len(new_tokens)
                tokens.append(new_token_count)

        sum_time = sum(times)
        sum_tokens = sum(tokens)

        avg_tokens_per_second = round(sum_tokens / sum_time, 2)

        print(f"\n**************** {backend} Result Info ****************")
        print(f"Times: {times}")
        print(f"New Tokens: {tokens}")
        print(f"Sum Times: {sum_time}")
        print(f"Sum New Tokens: {sum_tokens}")
        print(f"New Tokens Per Second: {avg_tokens_per_second} token/s")
        print(f"**************** {backend} Result Info End ****************")

        if not assert_result:
            return

        # Express the measured speed as a percentage of the expected baseline and
        # require it to stay within the configured floor/ceiling band.
        diff_pct = (avg_tokens_per_second / tokens_per_second) * 100
        negative_pct = 100 * (1 - self.MAX_DELTA_FLOOR_PERCENT)
        positive_pct = 100 * (1 + self.MAX_POSITIVE_DELTA_CEIL_PERCENT)

        self.assertTrue(
            negative_pct <= diff_pct <= positive_pct,
            f"Tokens Per Second: {avg_tokens_per_second} is {diff_pct:.2f}% of the expected baseline, "
            f"outside the allowed range [{negative_pct}%, {positive_pct}%]",
        )
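

# Hypothetical usage sketch: concrete speed tests presumably subclass InferenceSpeed and
# pass a backend plus a measured tokens/sec baseline. The BACKEND import, the subclass
# name, and the report-only call below are illustrative assumptions, not part of this file.
#
# from gptqmodel import BACKEND
#
# class TestInferenceSpeedTorch(InferenceSpeed):
#     def test_torch_backend(self):
#         # assert_result=False only prints the speed report, so no baseline is required.
#         self.inference(self.NATIVE_MODEL_ID, BACKEND.TORCH, tokens_per_second=0, assert_result=False)


if __name__ == "__main__":
    unittest.main()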