This repository was archived by the owner on Sep 23, 2025. It is now read-only.

Commit 60d9f15 (merge commit)

Commit message: update
Parents: b092b01 + cc356f6

23 files changed: +536 / -128 lines
.github/workflows/scripts/test.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+build_and_prune() {
+    # Set TARGET and DF-SUFFIX using the passed in parameters
+    local TARGET=$TARGET
+    local DF_SUFFIX=$DF_SUFFIX
+    local PYTHON_V=$PYTHON_V ## same name
+    local USE_PROXY=$USE_PROXY
+
+    echo "defe is $TARGET"
+    echo "df-suffix is $DF_SUFFIX"
+    echo "python version is $PYTHON_V"
+    echo "use proxy is $USE_PROXY"
+}

.github/workflows/workflow_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ jobs:
 
     - name: Build Docker Image
       run: |
-        DF_SUFFIX=".tests_cpu_and_deepspeed"
+        DF_SUFFIX=".tests_cpu"
         TARGET=${{steps.target.outputs.target}}
         docker build ./ --build-arg CACHEBUST=1 --build-arg python_v=${{matrix.python-version}} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
         docker container prune -f

dev/docker/Dockerfile.tests_cpu

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+# syntax=docker/dockerfile:1
+FROM ubuntu:22.04
+
+ARG python_v
+
+ENV LANG C.UTF-8
+
+WORKDIR /root/llm-on-ray
+
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
+    && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV CONDA_DIR /opt/conda
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
+    /bin/bash ~/miniconda.sh -b -p /opt/conda
+ENV PATH $CONDA_DIR/bin:$PATH
+
+# setup env
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
+    unset -f conda && \
+    export PATH=$CONDA_DIR/bin/:${PATH} && \
+    conda config --add channels intel && \
+    conda install python==${python_v}
+
+COPY ./pyproject.toml .
+COPY ./MANIFEST.in .
+
+# create llm_on_ray package directory to bypass the following 'pip install -e' command
+RUN mkdir ./llm_on_ray
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+
+# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
+ARG CACHEBUST=1
+COPY ./dev/scripts/install-oneapi.sh /tmp
+RUN /tmp/install-oneapi.sh

examples/inference/api_server_openai/query_http_requests_tool.py

Lines changed: 1 addition & 2 deletions
@@ -73,15 +73,14 @@
 
 messages = [
     [
-        {"role": "user", "content": "You are a helpful assistant"},
         {"role": "user", "content": "What's the weather like in Boston today?"},
     ],
 ]
 
 proxies = {"http": None, "https": None}
 
 for message in messages:
-    print(f"User: {message[1]['content']}")
+    print(f"User: {message[0]['content']}")
     print("Assistant:", end=" ", flush=True)
 
     body = {
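After this fix each conversation in `messages` holds a single user turn, so the script reads the user text from index 0. A minimal standalone sketch of the corrected loop (trimmed from the full example script):

    messages = [
        [
            {"role": "user", "content": "What's the weather like in Boston today?"},
        ],
    ]

    for message in messages:
        # Each `message` is a one-element conversation, so index 0 is the user turn.
        print(f"User: {message[0]['content']}")
        print("Assistant:", end=" ", flush=True)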

llm_on_ray/common/tokenizer/huggingface_tokenizer.py

Lines changed: 2 additions & 0 deletions
@@ -23,5 +23,7 @@ class HuggingFaceTokenizer(Tokenizer):
     def __call__(self, config):
         name = config.get("name")
         load_config = config.get("config", {})
+        print(name)
         tokenizer = transformers.AutoTokenizer.from_pretrained(name, **load_config)
+        print(tokenizer)
         return tokenizer

llm_on_ray/finetune/finetune_config.py

Lines changed: 1 addition & 1 deletion
@@ -180,4 +180,4 @@ class FinetuneConfig(BaseModel):
         m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f)
         _models[m.General.base_model] = m
 
-all_models = _models.copy()
+base_models = _models.copy()
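
A hedged sketch of how the renamed mapping could be consumed by callers; the import assumes `base_models` remains a module-level name in finetune_config.py, which this hunk suggests but does not show in full:

    from llm_on_ray.finetune.finetune_config import FinetuneConfig, base_models

    # base_models maps each General.base_model name to its parsed FinetuneConfig.
    for name, conf in base_models.items():
        assert isinstance(conf, FinetuneConfig)
        print(name, "->", conf.General.base_model)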

llm_on_ray/inference/chat_template_process.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+#
+# Copyright 2023 The LLM-on-Ray Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import List, Union
+
+from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage
+
+
+class ChatTemplatePreprocess:
+    def __init__(self, predictor) -> None:
+        self.predictor = predictor
+
+    def get_prompt(self, input: List, is_mllm=False):
+        """Generate response based on input."""
+        if self.predictor.infer_conf.model_description.chat_template is not None:
+            self.predictor.tokenizer.chat_template = (
+                self.predictor.infer_conf.model_description.chat_template
+            )
+        elif self.predictor.tokenizer.chat_template is None:
+            self.predictor.tokenizer.chat_template = (
+                self.predictor.infer_conf.model_description.default_chat_template
+            )
+
+        if is_mllm:
+            if isinstance(input, List):
+                if isinstance(input, list) and input and isinstance(input[0], ChatMessage):
+                    messages = []
+                    for chat_message in input:
+                        message = {
+                            "role": chat_message.role,
+                            "content": chat_message.content,
+                        }
+                        messages.append(message)
+                    texts, images = self._extract_messages(messages)
+                elif isinstance(input, list) and input and isinstance(input[0], dict):
+                    texts, images = self._extract_messages(input)
+                elif isinstance(input, list) and input and isinstance(input[0], list):
+                    texts, images = [self._extract_messages(p) for p in input]
+
+                image = self._prepare_image(images)
+                prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False)
+                return prompt, image
+        else:
+            if isinstance(input, list) and input and isinstance(input[0], dict):
+                prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False)
+            elif isinstance(input, list) and input and isinstance(input[0], list):
+                prompt = [
+                    self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input
+                ]
+            elif isinstance(input, list) and input and isinstance(input[0], ChatMessage):
+                messages = []
+                for chat_message in input:
+                    message = {"role": chat_message.role, "content": chat_message.content}
+                    messages.append(message)
+                prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False)
+            elif isinstance(input, list) and input and isinstance(input[0], str):
+                prompt = input
+            elif isinstance(input, str):
+                prompt = input
+            else:
+                raise TypeError(
+                    f"Unsupported type {type(input)} for text. Expected dict or list of dicts."
+                )
+            return prompt
+
+    def _extract_messages(self, messages):
+        texts, images = [], []
+        for message in messages:
+            if message["role"] == "user" and isinstance(message["content"], list):
+                texts.append({"role": "user", "content": message["content"][0]["text"]})
+                images.append(
+                    {"role": "user", "content": message["content"][1]["image_url"]["url"]}
+                )
+            else:
+                texts.append(message)
+        return texts, images
+
+    def _prepare_image(self, messages: list):
+        """Prepare image from history messages."""
+        from PIL import Image
+        import requests
+        from io import BytesIO
+        import base64
+        import re
+
+        # prepare images
+        images: List = []
+        if isinstance(messages[0], List):
+            for i in range(len(messages)):
+                for msg in messages[i]:
+                    msg = dict(msg)
+                    content = msg["content"]
+                    if "url" not in content:
+                        continue
+                    is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0
+                    if is_data:
+                        encoded_str = re.sub("^data:image/.+;base64,", "", content["url"])
+                        images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str))))
+                    else:
+                        images[i].append(Image.open(requests.get(content["url"], stream=True).raw))
+        elif isinstance(messages[0], dict):
+            for msg in messages:
+                msg = dict(msg)
+                content = msg["content"]
+                if "url" not in content:
+                    continue
+                is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0
+                if is_data:
+                    encoded_str = re.sub("^data:image/.+;base64,", "", content["url"])
+                    images.append(Image.open(BytesIO(base64.b64decode(encoded_str))))
+                else:
+                    images.append(Image.open(requests.get(content["url"], stream=True).raw))
+
+        return images
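
A hedged usage sketch of the new class. The stub predictor below provides only the attributes `get_prompt()` touches; the tokenizer name and the Jinja chat-template string are illustrative placeholders, not values used by this commit:

    from types import SimpleNamespace
    from transformers import AutoTokenizer

    from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess

    # Minimal stand-in for a real predictor: a tokenizer plus the model_description fields.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    predictor = SimpleNamespace(
        tokenizer=tokenizer,
        infer_conf=SimpleNamespace(
            model_description=SimpleNamespace(
                chat_template=None,
                default_chat_template=(
                    "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
                ),
            )
        ),
    )

    preprocess = ChatTemplatePreprocess(predictor)
    # Text-only path: a list of role/content dicts is rendered into a single prompt string.
    prompt = preprocess.get_prompt(
        [{"role": "user", "content": "What's the weather like in Boston today?"}]
    )
    print(prompt)

The multimodal path (`is_mllm=True`) instead returns a `(prompt, images)` pair, with the images pulled out of the message contents by `_extract_messages` and `_prepare_image`.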

llm_on_ray/inference/models/gpt-j-6b.yaml

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@ ipex:
 model_description:
   model_id_or_path: EleutherAI/gpt-j-6b
   tokenizer_name_or_path: EleutherAI/gpt-j-6b
-  gpt_base_model: true
+  gpt_base_model: true

llm_on_ray/inference/predictor_deployment.py

Lines changed: 18 additions & 30 deletions
@@ -26,6 +26,8 @@
 from starlette.requests import Request
 from starlette.responses import StreamingResponse, JSONResponse
 from fastapi import HTTPException
+
+from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess
 from llm_on_ray.inference.inference_config import InferenceConfig
 from llm_on_ray.inference.api_openai_backend.openai_protocol import (
     ChatMessage,
@@ -82,6 +84,7 @@ def __init__(
         self.predictor = TransformerPredictor(infer_conf)
 
         self.loop = asyncio.get_running_loop()
+        self.process_tool = ChatTemplatePreprocess(self.predictor)
 
     def consume_streamer(self, streamer):
         for text in streamer:
@@ -308,9 +311,13 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No
         Raises:
             HTTPException: If the input prompt format is invalid or not supported.
         """
+
         if isinstance(input, str):
             return input
         elif isinstance(input, list):
+            prompts = []
+            images = []
+
             prompt_format = get_prompt_format(input)
             if prompt_format == PromptFormat.CHAT_FORMAT:
                 # Process the input prompts with tools
@@ -327,35 +334,16 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No
                         m.content = self.openai_tools_prompter.content_from_assistant(m)  # type: ignore
                     elif m.tool_call_id is not None:  # type: ignore
                         m.content = self.openai_tools_prompter.content_from_tool(m)  # type: ignore
-
-                if self.predictor.infer_conf.model_description.chat_template is not None:
-                    self.predictor.tokenizer.chat_template = (
-                        self.predictor.infer_conf.model_description.chat_template
-                    )
-                elif self.predictor.tokenizer.chat_template is None:
-                    self.predictor.tokenizer.chat_template = (
-                        self.predictor.infer_conf.model_description.default_chat_template
-                    )
-
-                if self.is_mllm:
-                    if isinstance(input, list):
-                        if isinstance(input, list) and input and isinstance(input[0], ChatMessage):
-                            messages = []
-                            for chat_message in input:
-                                message = {
-                                    "role": chat_message.role,
-                                    "content": chat_message.content,
-                                }
-                                messages.append(message)
-                            texts, images = self._extract_messages(messages)
-                        elif isinstance(input, list) and input and isinstance(input[0], dict):
-                            texts, images = self._extract_messages(input)
-                        elif isinstance(input, list) and input and isinstance(input[0], list):
-                            texts, images = [self._extract_messages(p) for p in input]
-
-                        image = self._prepare_image(images)
-                        prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False)
-                        return prompt, image
+                # Process the input prompts with MLLM tool
+                if self.process_tool is not None:
+                    if self.is_mllm:
+                        input, image = self.process_tool.get_prompt(input, self.is_mllm)
+                        prompts.append(input)
+                        images.extend(image)
+                        return prompts, images
+                    else:
+                        prompt = self.process_tool.get_prompt(input)
+                        return prompt
             else:
                 if isinstance(input, list) and input and isinstance(input[0], dict):
                     prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False)
@@ -400,13 +388,13 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
             )
         streaming_response = json_request["stream"] if "stream" in json_request else False
         input = json_request["text"] if "text" in json_request else ""
+
         if input == "":
             return JSONResponse(
                 status_code=400,
                 content="Empty prompt is not supported.",
             )
         config = json_request["config"] if "config" in json_request else {}
-        logger.info(input)
         # return prompt or list of prompts preprocessed
         prompts = self.preprocess_prompts(input)
 
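For reference, a hedged sketch of a client request against this deployment: the `text`, `config`, and `stream` keys mirror what `__call__` reads from the JSON body above, while the URL and the generation option are placeholders:

    import requests

    url = "http://localhost:8000/my-deployment"  # placeholder; depends on how the app is served

    body = {
        "text": [{"role": "user", "content": "What's the weather like in Boston today?"}],
        "config": {"max_new_tokens": 64},  # illustrative generation option
        "stream": False,
    }

    # An empty "text" field would be rejected with a 400 per the check above.
    resp = requests.post(url, json=body, proxies={"http": None, "https": None})
    print(resp.text)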

llm_on_ray/inference/utils.py

Lines changed: 11 additions & 1 deletion
@@ -19,7 +19,13 @@
 import torch
 from typing import Dict, Any, List, Optional, Union
 from enum import Enum
-from llm_on_ray.inference.inference_config import InferenceConfig, DEVICE_CPU, DEVICE_HPU
+from llm_on_ray.inference.inference_config import (
+    InferenceConfig,
+    DEVICE_CPU,
+    DEVICE_HPU,
+    PRECISION_BF16,
+    PRECISION_FP32,
+)
 from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage
 
 
@@ -127,6 +133,10 @@ def decide_torch_dtype(infer_conf: InferenceConfig, hf_config=None):
 
     if infer_conf.model_description.config.torch_dtype:
         # respect user config
+        if infer_conf.model_description.config.torch_dtype == PRECISION_BF16:
+            infer_conf.model_description.config.torch_dtype = torch.bfloat16
+        elif infer_conf.model_description.config.torch_dtype == PRECISION_FP32:
+            infer_conf.model_description.config.torch_dtype = torch.float32
         return
     elif hf_config is None:
         # default to float32 if hf_config is not supplied
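
A minimal sketch of the string-to-dtype translation this hunk adds. `normalize_torch_dtype` is a hypothetical helper, and the constant values assume PRECISION_BF16 and PRECISION_FP32 are the strings "bf16" and "fp32" (they are defined in inference_config.py, not shown in this diff):

    import torch

    PRECISION_BF16 = "bf16"  # assumed value of the constant from inference_config
    PRECISION_FP32 = "fp32"  # assumed value of the constant from inference_config

    def normalize_torch_dtype(user_value):
        """Map a user-facing precision string onto the torch dtype the model loader expects."""
        if user_value == PRECISION_BF16:
            return torch.bfloat16
        if user_value == PRECISION_FP32:
            return torch.float32
        return user_value  # anything else (e.g. an explicit torch dtype) passes through

    print(normalize_torch_dtype("bf16"))  # torch.bfloat16
    print(normalize_torch_dtype("fp32"))  # torch.float32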
