
Commit 198e522

[UI] Adapt to latest code & add logo (intel#124)
* Adapt to latest code & add logo
* update
* merge
* add comment
1 parent f7b6e86 commit 198e522

File tree

4 files changed: 29 additions & 14 deletions

README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -126,11 +126,17 @@ Llm-ray provides two ways to serve models, GUI or terminal.
 This method will launch a UI interface and deploy an online inference service.
 - (Optional) If customed models need to be added, please update `inference/config.py`.
 ```bash
-python inference/start_ui.py
+# Start ray head
+RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address $node_ip --dashboard-host 0.0.0.0 --resources='{"queue_hardware": 5}'
+# Start ray worker (optional)
+RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='$node_ip:6379'
+# Start ui (Please make sure passwordless SSH login is set up for $user on node $node_ip)
+python -u inference/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
+# Get urls from the log
 # Running on local URL: http://0.0.0.0:8080
 # Running on public URL: https://180cd5f7c31a1cfd3c.gradio.live
 ```
-Access url and deploy service in a few simple clicks.
+Then you can access url and deploy service in a few simple clicks.
 
 #### Terminal
 Ray serve is used to deploy models. First the model is exposed over HTTP by using Deployment, then test it over HTTP request.
````
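The new GUI instructions bring up a Ray head node, optionally attach extra workers, and only then launch the UI. Before running `start_ui.py`, a quick sanity check that the head is reachable can save a failed launch; a minimal sketch, where `10.0.0.1:6379` stands in for `$node_ip:6379`:

```python
import ray

# Connect to the already-running head node started above;
# "10.0.0.1:6379" is a placeholder for "$node_ip:6379".
ray.init(address="10.0.0.1:6379", ignore_reinit_error=True)
# The custom resource passed via --resources should appear here.
print(ray.available_resources())  # expect a "queue_hardware" entry
ray.shutdown()
```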

inference/start_ui.py

Lines changed: 19 additions & 11 deletions
```diff
@@ -210,6 +210,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         origin_model_path = self._base_models[model_name]["model_id_or_path"]
         tokenizer_path = self._base_models[model_name]["tokenizer_name_or_path"]
         gpt_base_model = self._base_models[model_name].get("gpt_base_model")
+        last_gpt_base_model = False
         finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name)
         finetuned_checkpoint_path = os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) if self.finetuned_checkpoint_path != "" else None
 
@@ -221,7 +222,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         ray_resources = ray.available_resources()
         if "CPU" not in ray_resources or cpus_per_worker * worker_num + 1 > int(ray.available_resources()["CPU"]):
             raise gr.Error("Resources are not meeting the demand")
-        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or gpt_base_model:
+        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or not (gpt_base_model and last_gpt_base_model):
             ray.shutdown()
             new_ray_init_config = {
                 "runtime_env": {
@@ -239,6 +240,9 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             }
             if gpt_base_model:
                 new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"]
+            else:
+                new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"]
+            last_gpt_base_model = gpt_base_model
             finetune_config["Training"]["num_training_workers"] = int(worker_num)
             finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker)
 
```
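The hunks above make the transformers pin in the Ray runtime environment depend on the model family: GPT-style base models keep `transformers==4.26.0`, while every other model now gets `4.31.0`. Since `runtime_env` packages are fixed at `ray.init` time, switching pins forces a full shutdown and re-init; the new `last_gpt_base_model` flag skips that restart when consecutive GPT-family jobs reuse the same pin. A condensed sketch of the pattern (`ensure_ray` is a hypothetical helper, not code from this commit):

```python
import ray

def ensure_ray(gpt_base_model: bool, last_gpt_base_model: bool, layout_changed: bool) -> bool:
    """Re-initialize Ray only when the worker layout changed or the
    transformers pin baked into the runtime_env may no longer match."""
    if layout_changed or not (gpt_base_model and last_gpt_base_model):
        ray.shutdown()
        pip = ["transformers==4.26.0"] if gpt_base_model else ["transformers==4.31.0"]
        ray.init(runtime_env={"pip": pip})
    return gpt_base_model  # becomes last_gpt_base_model for the next call
```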
```diff
@@ -296,7 +300,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             "port": "8000",
             "name": new_model_name,
             "route_prefix": "/" + new_model_name,
-            "chat_model": self._base_models[model_name]["chat_model"],
+            "chat_processor": self._base_models[model_name]["chat_processor"],
             "prompt": {
                 "intro": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
                 "human_id": "\n### Instruction",
```
```diff
@@ -342,15 +346,18 @@ def deploy_func(self, model_name: str, replica_num: int, cpus_per_worker: int):
         model_config = self._all_models[model_name]
         print("model path: ", model_config["model_id_or_path"])
 
-        chat_model = getattr(sys.modules[__name__], model_config["chat_model"], None)
-        if chat_model is None:
-            return model_name + " deployment failed. " + model_config["chat_model"] + " does not exist."
-        self.process_tool = chat_model(**model_config["prompt"])
+        chat_processor = getattr(sys.modules[__name__], model_config["chat_processor"], None)
+        if chat_processor is None:
+            return model_name + " deployment failed. " + model_config["chat_processor"] + " does not exist."
+        self.process_tool = chat_processor(**model_config["prompt"])
 
         model_load_config = model_config.get("config", {})
         device_name = "cpu"
-        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.28.0"]}})\
-            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config, device_name, amp_enabled, amp_dtype, stop_words=stop_words, cpus_per_worker=cpus_per_worker)
+        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.31.0"]}})\
+            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config,
+                  device_name, amp_enabled, amp_dtype,
+                  chat_processor_name=model_config["chat_processor"], prompt=model_config["prompt"],
+                  cpus_per_worker=cpus_per_worker)
         handle = serve.run(deployment, _blocking=True, port=model_config["port"], name=model_config["name"], route_prefix=model_config["route_prefix"])
         return self.ip_port + model_config["route_prefix"]
 
```
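`deploy_func` turns the configured `chat_processor` string into an actual class via `getattr` on the current module, failing gracefully when the name is unknown. A self-contained sketch of that lookup; `ChatProcessorExample` is a stand-in, not a class from the repository:

```python
import sys

class ChatProcessorExample:
    """Stand-in for a chat processor class defined in start_ui.py's namespace."""
    def __init__(self, intro="", human_id="", **kwargs):
        self.intro, self.human_id = intro, human_id

def resolve_processor(name: str, prompt: dict):
    # Mirrors: getattr(sys.modules[__name__], model_config["chat_processor"], None)
    cls = getattr(sys.modules[__name__], name, None)
    if cls is None:
        return None  # deploy_func reports "... does not exist." in this case
    return cls(**prompt)

tool = resolve_processor("ChatProcessorExample", {"intro": "...", "human_id": "\n### Instruction"})
assert tool is not None
```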
```diff
@@ -379,8 +386,8 @@ def get_cpu_memory(self, index):
         out = stdout.read().decode('utf-8')
         out_words = out.split(" ")
         cpu_value = 100 - float(out_words[7])
-        total_memory = int(out_words[20].split('+')[0])
-        free_memory = int(out_words[21].split('+')[0])
+        total_memory = float(out_words[20].split('+')[0])
+        free_memory = float(out_words[21].split('+')[0])
         used_memory = 1 - free_memory/total_memory
         return cpu_memory_html.format(str(round(cpu_value, 1)), str(round(used_memory*100, 1)))
 
```
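The `int` → `float` change in `get_cpu_memory` matters because `top` can report memory in fractional units such as `3823.6`, which `int()` refuses to parse. A minimal illustration with a made-up `top` token:

```python
word = "3823.6+buff"        # hypothetical token from top output
value = word.split('+')[0]  # -> "3823.6"
print(float(value))         # 3823.6, as the fixed code parses it
try:
    int(value)              # the old code path
except ValueError as err:
    print("int() fails on fractional memory values:", err)
```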
```diff
@@ -395,7 +402,7 @@ def kill_node(self, btn_txt, index):
             return "Start", ""
         elif btn_txt=="Start":
             index = int(index)
-            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 4}'"""
+            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 2}'"""
             self.ssh_connect[index].exec_command(command)
             self.ray_nodes[index]["Alive"] = "True"
             time.sleep(2)
```
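The `ssh_connect` handles used here come from paramiko, which this commit pins in `requirements.txt`. A minimal sketch of how such a connection might be opened and the restart command issued; the host, user, and env name are placeholders, and passwordless key-based SSH is assumed (as the README change notes):

```python
import paramiko

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect("10.0.0.2", username="user")  # passwordless key-based login assumed

command = ("conda activate my_env; "
           "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 "
           "ray start --address=10.0.0.1:6379 --resources='{\"special_hardware\": 2}'")
stdin, stdout, stderr = client.exec_command(command)
print(stdout.read().decode("utf-8"))
client.close()
```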
```diff
@@ -428,6 +435,7 @@ def _init_ui(self):
         with gr.Blocks(css=custom_css,title=title) as gr_chat:
             head_content = """
                 <div style="color: #fff;text-align: center;">
+                    <div style="position:absolute; left:15px; top:15px; "><img src="/file=inference/ui_images/logo.png" width="50" height="50"/></div>
                     <p style="color: #fff; font-size: 1.0rem;">LLM on Ray Workflow as a Service Demo</p>
                     <p style="color: #fff; font-size: 0.8rem;">Build your own LLM models with proprietary data, deploy an online inference service in production, all in a few simple clicks.</p>
                 </div>
```
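The new logo is embedded through Gradio's `/file=` route, which resolves paths relative to the directory the app is launched from. A minimal sketch, assuming `gradio==3.36.1` (as pinned below) and that the UI is started from the repository root:

```python
import gradio as gr

head_content = """
<div style="color: #fff;text-align: center;">
    <div style="position:absolute; left:15px; top:15px;"><img src="/file=inference/ui_images/logo.png" width="50" height="50"/></div>
    <p style="color: #fff; font-size: 1.0rem;">LLM on Ray Workflow as a Service Demo</p>
</div>
"""

with gr.Blocks(title="LLM on Ray") as demo:
    gr.HTML(head_content)

demo.launch()  # run from the repo root so /file=inference/... resolves
```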

inference/ui_images/logo.png

10.6 KB

requirements.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -7,14 +7,15 @@ typing==3.7.4.3
 tabulate
 ray[tune]
 ray[serve]
-gradio
+gradio==3.36.1
 gymnasium
 dm-tree
 tensorboard
 einops
 peft==0.4.0
 deltatuner==1.1.9
 py-cpuinfo
+paramiko==3.2.0
 torch==2.1.0+cpu
 oneccl_bind_pt==2.1.0
 intel_extension_for_pytorch==2.1.0+cpu
```
