
Commit 198e522

[UI] Adapt to latest code & add logo (intel#124)
* Adapt to latest code & add logo
* update
* merge
* add comment
1 parent f7b6e86 commit 198e522

File tree

4 files changed: 29 additions & 14 deletions

README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -126,11 +126,17 @@ Llm-ray provides two ways to serve models, GUI or terminal.
 This method will launch a UI interface and deploy an online inference service.
 - (Optional) If customed models need to be added, please update `inference/config.py`.
 ```bash
-python inference/start_ui.py
+# Start ray head
+RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address $node_ip --dashboard-host 0.0.0.0 --resources='{"queue_hardware": 5}'
+# Start ray worker (optional)
+RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='$node_ip:6379'
+# Start ui (Please make sure passwordless SSH login is set up for $user on node $node_ip)
+python -u inference/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
+# Get urls from the log
 # Running on local URL: http://0.0.0.0:8080
 # Running on public URL: https://180cd5f7c31a1cfd3c.gradio.live
 ```
-Access url and deploy service in a few simple clicks.
+Then you can access url and deploy service in a few simple clicks.
 
 #### Terminal
 Ray serve is used to deploy models. First the model is exposed over HTTP by using Deployment, then test it over HTTP request.
````
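The new GUI instructions bring up a Ray head node, optionally attach extra workers, and only then launch the UI. Before running `start_ui.py`, a quick sanity check that the head is reachable can save a failed launch; a minimal sketch, where `10.0.0.1:6379` stands in for `$node_ip:6379`:

```python
import ray

# Connect to the already-running head node started above;
# "10.0.0.1:6379" is a placeholder for "$node_ip:6379".
ray.init(address="10.0.0.1:6379", ignore_reinit_error=True)
# The custom resource passed via --resources should appear here.
print(ray.available_resources())  # expect a "queue_hardware" entry
ray.shutdown()
```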

inference/start_ui.py

Lines changed: 19 additions & 11 deletions
```diff
@@ -210,6 +210,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         origin_model_path = self._base_models[model_name]["model_id_or_path"]
         tokenizer_path = self._base_models[model_name]["tokenizer_name_or_path"]
         gpt_base_model = self._base_models[model_name].get("gpt_base_model")
+        last_gpt_base_model = False
         finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name)
         finetuned_checkpoint_path = os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) if self.finetuned_checkpoint_path != "" else None
 
@@ -221,7 +222,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
         ray_resources = ray.available_resources()
         if "CPU" not in ray_resources or cpus_per_worker * worker_num + 1 > int(ray.available_resources()["CPU"]):
             raise gr.Error("Resources are not meeting the demand")
-        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or gpt_base_model:
+        if worker_num != exist_worker or cpus_per_worker != exist_cpus_per_worker or not (gpt_base_model and last_gpt_base_model):
             ray.shutdown()
             new_ray_init_config = {
                 "runtime_env": {
@@ -239,6 +240,9 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             }
             if gpt_base_model:
                 new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"]
+            else:
+                new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"]
+            last_gpt_base_model = gpt_base_model
             finetune_config["Training"]["num_training_workers"] = int(worker_num)
             finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker)
 
```
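The hunks above make the transformers pin in the Ray runtime environment depend on the model family: GPT-style base models keep `transformers==4.26.0`, while every other model now gets `4.31.0`. Since `runtime_env` packages are fixed at `ray.init` time, switching pins forces a full shutdown and re-init; the new `last_gpt_base_model` flag skips that restart when consecutive GPT-family jobs reuse the same pin. A condensed sketch of the pattern (`ensure_ray` is a hypothetical helper, not code from this commit):

```python
import ray

def ensure_ray(gpt_base_model: bool, last_gpt_base_model: bool, layout_changed: bool) -> bool:
    """Re-initialize Ray only when the worker layout changed or the
    transformers pin baked into the runtime_env may no longer match."""
    if layout_changed or not (gpt_base_model and last_gpt_base_model):
        ray.shutdown()
        pip = ["transformers==4.26.0"] if gpt_base_model else ["transformers==4.31.0"]
        ray.init(runtime_env={"pip": pip})
    return gpt_base_model  # becomes last_gpt_base_model for the next call
```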
```diff
@@ -296,7 +300,7 @@ def finetune(self, model_name, dataset, new_model_name, batch_size, num_epochs,
             "port": "8000",
             "name": new_model_name,
             "route_prefix": "/" + new_model_name,
-            "chat_model": self._base_models[model_name]["chat_model"],
+            "chat_processor": self._base_models[model_name]["chat_processor"],
             "prompt": {
                 "intro": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n",
                 "human_id": "\n### Instruction",
```
```diff
@@ -342,15 +346,18 @@ def deploy_func(self, model_name: str, replica_num: int, cpus_per_worker: int):
         model_config = self._all_models[model_name]
         print("model path: ", model_config["model_id_or_path"])
 
-        chat_model = getattr(sys.modules[__name__], model_config["chat_model"], None)
-        if chat_model is None:
-            return model_name + " deployment failed. " + model_config["chat_model"] + " does not exist."
-        self.process_tool = chat_model(**model_config["prompt"])
+        chat_processor = getattr(sys.modules[__name__], model_config["chat_processor"], None)
+        if chat_processor is None:
+            return model_name + " deployment failed. " + model_config["chat_processor"] + " does not exist."
+        self.process_tool = chat_processor(**model_config["prompt"])
 
         model_load_config = model_config.get("config", {})
         device_name = "cpu"
-        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.28.0"]}})\
-            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config, device_name, amp_enabled, amp_dtype, stop_words=stop_words, cpus_per_worker=cpus_per_worker)
+        deployment = PredictDeployment.options(num_replicas=replica_num, ray_actor_options={"num_cpus": cpus_per_worker, "runtime_env": {"pip": ["transformers==4.31.0"]}})\
+            .bind(model_config["model_id_or_path"], model_config["tokenizer_name_or_path"], model_load_config,
+                  device_name, amp_enabled, amp_dtype,
+                  chat_processor_name=model_config["chat_processor"], prompt=model_config["prompt"],
+                  cpus_per_worker=cpus_per_worker)
         handle = serve.run(deployment, _blocking=True, port=model_config["port"], name=model_config["name"], route_prefix=model_config["route_prefix"])
         return self.ip_port + model_config["route_prefix"]
 
```
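`deploy_func` turns the configured `chat_processor` string into an actual class via `getattr` on the current module, failing gracefully when the name is unknown. A self-contained sketch of that lookup; `ChatProcessorExample` is a stand-in, not a class from the repository:

```python
import sys

class ChatProcessorExample:
    """Stand-in for a chat processor class defined in start_ui.py's namespace."""
    def __init__(self, intro="", human_id="", **kwargs):
        self.intro, self.human_id = intro, human_id

def resolve_processor(name: str, prompt: dict):
    # Mirrors: getattr(sys.modules[__name__], model_config["chat_processor"], None)
    cls = getattr(sys.modules[__name__], name, None)
    if cls is None:
        return None  # deploy_func reports "... does not exist." in this case
    return cls(**prompt)

tool = resolve_processor("ChatProcessorExample", {"intro": "...", "human_id": "\n### Instruction"})
assert tool is not None
```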
```diff
@@ -379,8 +386,8 @@ def get_cpu_memory(self, index):
         out = stdout.read().decode('utf-8')
         out_words = out.split(" ")
         cpu_value = 100 - float(out_words[7])
-        total_memory = int(out_words[20].split('+')[0])
-        free_memory = int(out_words[21].split('+')[0])
+        total_memory = float(out_words[20].split('+')[0])
+        free_memory = float(out_words[21].split('+')[0])
         used_memory = 1 - free_memory/total_memory
         return cpu_memory_html.format(str(round(cpu_value, 1)), str(round(used_memory*100, 1)))
 
```
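The `int` → `float` change in `get_cpu_memory` matters because `top` can report memory in fractional units such as `3823.6`, which `int()` refuses to parse. A minimal illustration with a made-up `top` token:

```python
word = "3823.6+buff"        # hypothetical token from top output
value = word.split('+')[0]  # -> "3823.6"
print(float(value))         # 3823.6, as the fixed code parses it
try:
    int(value)              # the old code path
except ValueError as err:
    print("int() fails on fractional memory values:", err)
```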
```diff
@@ -395,7 +402,7 @@ def kill_node(self, btn_txt, index):
             return "Start", ""
         elif btn_txt=="Start":
             index = int(index)
-            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 4}'"""
+            command = "conda activate " + self.conda_env_name + "; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + self.master_ip_port + r""" --resources='{"special_hardware": 2}'"""
             self.ssh_connect[index].exec_command(command)
             self.ray_nodes[index]["Alive"] = "True"
             time.sleep(2)
```
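The `ssh_connect` handles used here come from paramiko, which this commit pins in `requirements.txt`. A minimal sketch of how such a connection might be opened and the restart command issued; the host, user, and env name are placeholders, and passwordless key-based SSH is assumed (as the README change notes):

```python
import paramiko

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect("10.0.0.2", username="user")  # passwordless key-based login assumed

command = ("conda activate my_env; "
           "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 "
           "ray start --address=10.0.0.1:6379 --resources='{\"special_hardware\": 2}'")
stdin, stdout, stderr = client.exec_command(command)
print(stdout.read().decode("utf-8"))
client.close()
```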
```diff
@@ -428,6 +435,7 @@ def _init_ui(self):
         with gr.Blocks(css=custom_css,title=title) as gr_chat:
             head_content = """
                 <div style="color: #fff;text-align: center;">
+                    <div style="position:absolute; left:15px; top:15px; "><img src="/file=inference/ui_images/logo.png" width="50" height="50"/></div>
                     <p style="color: #fff; font-size: 1.0rem;">LLM on Ray Workflow as a Service Demo</p>
                     <p style="color: #fff; font-size: 0.8rem;">Build your own LLM models with proprietary data, deploy an online inference service in production, all in a few simple clicks.</p>
                 </div>
```
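The new logo is embedded through Gradio's `/file=` route, which resolves paths relative to the directory the app is launched from. A minimal sketch, assuming `gradio==3.36.1` (as pinned below) and that the UI is started from the repository root:

```python
import gradio as gr

head_content = """
<div style="color: #fff;text-align: center;">
    <div style="position:absolute; left:15px; top:15px;"><img src="/file=inference/ui_images/logo.png" width="50" height="50"/></div>
    <p style="color: #fff; font-size: 1.0rem;">LLM on Ray Workflow as a Service Demo</p>
</div>
"""

with gr.Blocks(title="LLM on Ray") as demo:
    gr.HTML(head_content)

demo.launch()  # run from the repo root so /file=inference/... resolves
```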

inference/ui_images/logo.png

10.6 KB

requirements.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -7,14 +7,15 @@ typing==3.7.4.3
 tabulate
 ray[tune]
 ray[serve]
-gradio
+gradio==3.36.1
 gymnasium
 dm-tree
 tensorboard
 einops
 peft==0.4.0
 deltatuner==1.1.9
 py-cpuinfo
+paramiko==3.2.0
 torch==2.1.0+cpu
 oneccl_bind_pt==2.1.0
 intel_extension_for_pytorch==2.1.0+cpu
```
