8 changes: 8 additions & 0 deletions examples/inference/api_server_openai/query_openai_sdk.py
@@ -40,6 +40,12 @@
help="If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to`Top p` or higher are kept for generation",
)

parser.add_argument(
"--debug_mode",
action="store_true",
help="If debug mode is enabled, debug logs will be printed",
)

args = parser.parse_args()

if "OPENAI_API_KEY" in os.environ:
@@ -65,6 +71,7 @@ def stream_chat():
max_tokens=args.max_new_tokens,
temperature=args.temperature,
top_p=args.top_p,
debug_mode=args.debug_mode,
):
content = chunk.choices[0].delta.content
if content is not None:
@@ -81,6 +88,7 @@ def chunk_chat():
max_tokens=args.max_new_tokens,
temperature=args.temperature,
top_p=args.top_p,
debug_mode=args.debug_mode,
)
for chunk in [output]:
try:
10 changes: 6 additions & 4 deletions llm_on_ray/inference/api_openai_backend/query_client.py
@@ -54,10 +54,12 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
top_p = request_config.get("top_p", 1.0)
max_new_tokens = request_config.get("max_tokens", None)
gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
if temperature != 1.0 or top_p != 1.0:
gen_config.update({"do_sample": True})
if request_config.get("ignore_eos", False):
gen_config.update({"ignore_eos": True})
gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0})
gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)})

if request_config.get("debug_mode", False):
print("DEBUG: print request_config:", request_config)
# TODO: debug_mode is currently read from request_config; also add it to gen_config, since gen_config is the config that gets passed down.

async for x in handle_request(
model=model,
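One possible way to address the TODO above, sketched here only (this PR does not implement it): copy the flag from request_config into gen_config, since gen_config is what travels down to the predictor.

# Standalone sketch: how debug_mode could ride along in gen_config.
request_config = {"temperature": 0.7, "top_p": 0.9, "debug_mode": True}
gen_config = {
    "max_new_tokens": request_config.get("max_tokens"),
    "temperature": request_config.get("temperature", 1.0),
    "top_p": request_config.get("top_p", 1.0),
}
gen_config["debug_mode"] = request_config.get("debug_mode", False)
if gen_config["debug_mode"]:
    print("DEBUG: gen_config to be passed down:", gen_config)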
1 change: 1 addition & 0 deletions llm_on_ray/inference/inference_config.py
@@ -171,6 +171,7 @@ class InferenceConfig(BaseModel):
ipex: Ipex = Ipex()
hpu_model_config: HpuModelConfig = HpuModelConfig()
model_description: ModelDescription = ModelDescription()
debug_mode: bool = False

# prevent warning of protected namespaces
# DO NOT TOUCH
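For reference, the new debug_mode field can also come from a model YAML. A minimal sketch, assuming the remaining InferenceConfig fields keep their defaults and using the same pydantic_yaml helper that serve.py already imports; the YAML below is hypothetical, not a file shipped by this repo:

from pydantic_yaml import parse_yaml_raw_as

from llm_on_ray.inference.inference_config import InferenceConfig

# Hypothetical minimal YAML; real configs under llm_on_ray/inference/models/ set many more fields.
raw_yaml = """
name: my-model
debug_mode: true
"""

infer_conf = parse_yaml_raw_as(InferenceConfig, raw_yaml)
print(infer_conf.name, infer_conf.debug_mode)  # -> my-model True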
17 changes: 14 additions & 3 deletions llm_on_ray/inference/predictor_deployment.py
@@ -396,8 +396,13 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
content="Empty prompt is not supported.",
)
config = json_request["config"] if "config" in json_request else {}
if config.get("debug_mode", False):
print("DEBUG:predictor_deployment.py:print config received from json:", config)
print("DEBUG:predictor_deployment.py::print inputs for prompts:", input)
# return prompt or list of prompts preprocessed
prompts = self.preprocess_prompts(input)
if config.get("debug_mode", False):
print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts)

# Handle streaming response
if streaming_response:
@@ -416,12 +421,18 @@ async def openai_call(
):
self.use_openai = True

# TODO: Pass down config into preprocess_prompts for more logs.
if config.get("debug_mode", False):
print("DEBUG:predictor_deployment.py:print config received from query_client:", config)
print("DEBUG:predictor_deployment.py::print inputs for prompts:", input)
# return prompt or list of prompts preprocessed
input = self.preprocess_prompts(input, tools, tool_choice)
prompts = self.preprocess_prompts(input, tools, tool_choice)
if config.get("debug_mode", False):
print("DEBUG:predictor_deployment.py::print prompts from inputs:", prompts)

# Handle streaming response
if streaming_response:
async for result in self.handle_streaming(input, config):
async for result in self.handle_streaming(prompts, config):
yield result
else:
yield await self.handle_non_streaming(input, config)
yield await self.handle_non_streaming(prompts, config)
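On the client side of the simple endpoint, the flag travels in the request body's config dict that __call__ reads above. A hedged sketch; the URL, the prompt field name, and the other config keys are assumptions for illustration, not verified against the full request schema:

import requests

# Assumed payload shape: __call__ above reads json_request["config"];
# the "text" key and the route are placeholders for illustration.
payload = {
    "text": "What is Ray?",
    "config": {"debug_mode": True, "max_new_tokens": 64},
}
resp = requests.post("http://127.0.0.1:8000/my-model", json=payload, timeout=60)
print(resp.status_code, resp.text)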
26 changes: 15 additions & 11 deletions llm_on_ray/inference/predictors/hpu_predictor.py
@@ -79,9 +79,19 @@ def __init__(self, infer_conf: InferenceConfig):
# decide correct torch dtype for loading HF model
decide_torch_dtype(infer_conf)

debug_mode = infer_conf.debug_mode

if debug_mode:
print("DEBUG:hpu_predictor:print inference config:", infer_conf)

self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile
self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs

# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()

Comment on lines +90 to +94
Contributor: Why move this function here?

Contributor (Author): Moved this function out of the if block: it is executed both with and without DeepSpeed.

As I understand from the PR title, this PR is meant to add a debug mode, so why touch other code? Could you submit a separate PR to address the other issues?

if infer_conf.deepspeed:
# DeepSpeed is enabled, start worker group
# Prepare placement group
@@ -105,13 +115,6 @@

htcore.hpu_set_env()

# Tweak transformer to optimize performance on Gaudi
from optimum.habana.transformers.modeling_utils import (
adapt_transformers_to_gaudi,
)

adapt_transformers_to_gaudi()

self.device = torch.device("hpu")
model = AutoModelForCausalLM.from_pretrained(
model_desc.model_id_or_path, **model_desc.config.dict()
@@ -181,6 +184,7 @@ def _process_config(self, config):

def get_streamer(self):
if self.infer_conf.deepspeed:
# Q2: Why always use the first worker?
return ray.get(self.deepspeed_workers[0].get_streamer.remote())
Contributor: Hi @xwu99, could you please help explain this question? I think it is the same idea as in deepspeed_predictor.py.

@xwu-intel (Jul 2, 2024): This is distributed inference involving a group of worker processes. Worker 0 is assigned rank 0 (following the standard MPI model), which is the main rank that returns the result; the other ranks only take part in the computation and do not return results. In fact, all ranks hold the same result in this case. You can think of rank 0 as the head process of the distributed worker group, which is conventionally used to return the result.

else:
return TextIteratorStreamer(
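To make the rank-0 pattern the reviewer describes above concrete, here is a tiny illustrative Ray sketch (not this project's code): every worker runs the same call, and only the first worker's result is consumed.

# Illustrative sketch of "all ranks compute, rank 0 returns the result".
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class Worker:
    def __init__(self, rank: int):
        self.rank = rank

    def generate(self, prompt: str) -> str:
        # With tensor parallelism every rank ends up holding the same output.
        return f"output for {prompt!r}"


workers = [Worker.remote(rank) for rank in range(4)]
results = ray.get([w.generate.remote("hello") for w in workers])  # all ranks participate
print(results[0])  # rank 0 (the head worker) is the one whose result is returned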
@@ -196,6 +200,8 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput:

self._process_config(config)

# TODO: Maybe we should get real-time load info for all cards, set a healthy usage ratio, and pick the usable cards for serving.
# That way errors like OOM can be prevented and the server will be more robust.
if self.infer_conf.deepspeed:
return ray.get(
[worker.generate.remote(prompt, **config) for worker in self.deepspeed_workers]
Contributor (Author): Instead of using a fixed worker, maybe we should spread the load across all cards when DeepSpeed is enabled.

@xwu-intel (Jul 2, 2024): Currently tensor parallelism is used to process a single request; one process per card is the industry best practice, given that the load is balanced. You might think each request is sent to a different card, but that is not the case.

@@ -219,7 +225,9 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput:

def streaming_generate(self, prompt, streamer, **config):
self._process_config(config)
# Q1: Why is this handled here when using both deepspeed and hpu?
if self.infer_conf.deepspeed:
Contributor (Author): Here in hpu_predictor.py it is a little confusing, since we have another predictor called deepspeed_predictor. The two predictors are for HPU and CPU; maybe we can rename deepspeed_predictor to something like a cpu or base predictor.

There is a TODO comment to consolidate these two predictors.

# Q2: Why always use the first worker?
self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config)
Contributor: @xwu99 Same here.

for worker in self.deepspeed_workers[1:]:
worker.streaming_generate.remote(prompt, self._create_dummy_streamer(), **config)
@@ -284,10 +292,6 @@ def load_model_and_tokenizer(self):
self.world_size = int(os.environ["WORLD_SIZE"])
self.local_rank = int(os.environ["LOCAL_RANK"])
self.device = torch.device("hpu")
# optimize transformers for gaudi
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()
Comment on lines -287 to -290
Contributor: If this function is not executed in every worker, will it work as expected?

Contributor (Author): Same as above; this function will be executed earlier.

self.load_model()
model_desc = self.infer_conf.model_description
self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path)
29 changes: 22 additions & 7 deletions llm_on_ray/inference/serve.py
@@ -41,14 +41,23 @@ def get_deployed_models(args):
set(all_models_name)
), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}."
model_list = {model: all_models[model] for model in models}
if args.debug_mode:
print(
"DEBUG:serve.py: --config_file is not set while --models is set, serving model(s):",
model_list,
)
else:
model_list = all_models
if args.debug_mode:
print(
"DEBUG:serve.py: --config_file and --models is not set, serving all models:",
model_list,
)
else:
# config_file has precedence over others
if args.config_file:
print("Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
if args.debug_mode:
print("DEBUG:serve.py: Reading from config file, " + args.config_file)
with open(args.config_file, "r") as f:
infer_conf = parse_yaml_raw_as(InferenceConfig, f)
model_list = {}
model_list[infer_conf.name] = infer_conf

Expand Down Expand Up @@ -131,6 +140,11 @@ def main(argv=None):
parser.add_argument(
"--max_batch_size", default=None, type=int, help="The max batch size for dynamic batching."
)
parser.add_argument(
"--debug_mode",
action="store_true",
help="If debug mode is enabled, debug logs will be printed",
)

# Print help if no arguments were provided
if len(sys.argv) == 1:
Expand All @@ -147,6 +161,9 @@ def main(argv=None):

ray.init(address="auto")
deployments, model_list = get_deployed_models(args)
if args.debug_mode:
print("DEBUG:serve.py: Service is running with deployments:" + str(deployments))
print("DEBUG:serve.py: Service is running models:" + str(model_list))
if args.simple:
# provide simple model endpoint
# models can be served at custom URLs according to the configuration files.
Expand All @@ -156,8 +173,6 @@ def main(argv=None):
# all models are served under the same URL and then accessed
# through model_id, so a unified URL needs to be passed in.
host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
print("Service is running with deployments:" + str(deployments))
print("Service is running models:" + str(model_list))
openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)

msg = "Service is deployed successfully."
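Finally, a hedged end-to-end sketch of exercising the new flag. It assumes serve.py is runnable as a module and that a model YAML exists under llm_on_ray/inference/models/; the path below is a placeholder to substitute, not a file added by this PR.

import subprocess
import sys

# Placeholder YAML path: substitute one of the files under llm_on_ray/inference/models/.
subprocess.run(
    [
        sys.executable,
        "-m",
        "llm_on_ray.inference.serve",
        "--config_file",
        "llm_on_ray/inference/models/<your_model>.yaml",
        "--debug_mode",
    ],
    check=True,
)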