This repository was archived by the owner on Sep 23, 2025. It is now read-only.

Commit 60d9f15 (merge commit)

Commit message: update
Parents: b092b01 + cc356f6

23 files changed: +536 / -128 lines
.github/workflows/scripts/test.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+build_and_prune() {
+    # Set TARGET and DF-SUFFIX using the passed in parameters
+    local TARGET=$TARGET
+    local DF_SUFFIX=$DF_SUFFIX
+    local PYTHON_V=$PYTHON_V ## same name
+    local USE_PROXY=$USE_PROXY
+
+    echo "defe is $TARGET"
+    echo "df-suffix is $DF_SUFFIX"
+    echo "python version is $PYTHON_V"
+    echo "use proxy is $USE_PROXY"
+}

.github/workflows/workflow_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ jobs:
 
     - name: Build Docker Image
       run: |
-        DF_SUFFIX=".tests_cpu_and_deepspeed"
+        DF_SUFFIX=".tests_cpu"
         TARGET=${{steps.target.outputs.target}}
         docker build ./ --build-arg CACHEBUST=1 --build-arg python_v=${{matrix.python-version}} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
         docker container prune -f

dev/docker/Dockerfile.tests_cpu

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+# syntax=docker/dockerfile:1
+FROM ubuntu:22.04
+
+ARG python_v
+
+ENV LANG C.UTF-8
+
+WORKDIR /root/llm-on-ray
+
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
+    && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV CONDA_DIR /opt/conda
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
+    /bin/bash ~/miniconda.sh -b -p /opt/conda
+ENV PATH $CONDA_DIR/bin:$PATH
+
+# setup env
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
+    unset -f conda && \
+    export PATH=$CONDA_DIR/bin/:${PATH} && \
+    conda config --add channels intel && \
+    conda install python==${python_v}
+
+COPY ./pyproject.toml .
+COPY ./MANIFEST.in .
+
+# create llm_on_ray package directory to bypass the following 'pip install -e' command
+RUN mkdir ./llm_on_ray
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+
+# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
+ARG CACHEBUST=1
+COPY ./dev/scripts/install-oneapi.sh /tmp
+RUN /tmp/install-oneapi.sh

examples/inference/api_server_openai/query_http_requests_tool.py

Lines changed: 1 addition & 2 deletions
@@ -73,15 +73,14 @@
 
 messages = [
     [
-        {"role": "user", "content": "You are a helpful assistant"},
         {"role": "user", "content": "What's the weather like in Boston today?"},
     ],
 ]
 
 proxies = {"http": None, "https": None}
 
 for message in messages:
-    print(f"User: {message[1]['content']}")
+    print(f"User: {message[0]['content']}")
     print("Assistant:", end=" ", flush=True)
 
     body = {
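After this fix each conversation in `messages` holds a single user turn, so the script reads the user text from index 0. A minimal standalone sketch of the corrected loop (trimmed from the full example script):

    messages = [
        [
            {"role": "user", "content": "What's the weather like in Boston today?"},
        ],
    ]

    for message in messages:
        # Each `message` is a one-element conversation, so index 0 is the user turn.
        print(f"User: {message[0]['content']}")
        print("Assistant:", end=" ", flush=True)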

llm_on_ray/common/tokenizer/huggingface_tokenizer.py

Lines changed: 2 additions & 0 deletions
@@ -23,5 +23,7 @@ class HuggingFaceTokenizer(Tokenizer):
     def __call__(self, config):
         name = config.get("name")
         load_config = config.get("config", {})
+        print(name)
         tokenizer = transformers.AutoTokenizer.from_pretrained(name, **load_config)
+        print(tokenizer)
         return tokenizer

llm_on_ray/finetune/finetune_config.py

Lines changed: 1 addition & 1 deletion
@@ -180,4 +180,4 @@ class FinetuneConfig(BaseModel):
         m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f)
         _models[m.General.base_model] = m
 
-all_models = _models.copy()
+base_models = _models.copy()
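
A hedged sketch of how the renamed mapping could be consumed by callers; the import assumes `base_models` remains a module-level name in finetune_config.py, which this hunk suggests but does not show in full:

    from llm_on_ray.finetune.finetune_config import FinetuneConfig, base_models

    # base_models maps each General.base_model name to its parsed FinetuneConfig.
    for name, conf in base_models.items():
        assert isinstance(conf, FinetuneConfig)
        print(name, "->", conf.General.base_model)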

llm_on_ray/inference/chat_template_process.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+#
+# Copyright 2023 The LLM-on-Ray Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import List, Union
+
+from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage
+
+
+class ChatTemplatePreprocess:
+    def __init__(self, predictor) -> None:
+        self.predictor = predictor
+
+    def get_prompt(self, input: List, is_mllm=False):
+        """Generate response based on input."""
+        if self.predictor.infer_conf.model_description.chat_template is not None:
+            self.predictor.tokenizer.chat_template = (
+                self.predictor.infer_conf.model_description.chat_template
+            )
+        elif self.predictor.tokenizer.chat_template is None:
+            self.predictor.tokenizer.chat_template = (
+                self.predictor.infer_conf.model_description.default_chat_template
+            )
+
+        if is_mllm:
+            if isinstance(input, List):
+                if isinstance(input, list) and input and isinstance(input[0], ChatMessage):
+                    messages = []
+                    for chat_message in input:
+                        message = {
+                            "role": chat_message.role,
+                            "content": chat_message.content,
+                        }
+                        messages.append(message)
+                    texts, images = self._extract_messages(messages)
+                elif isinstance(input, list) and input and isinstance(input[0], dict):
+                    texts, images = self._extract_messages(input)
+                elif isinstance(input, list) and input and isinstance(input[0], list):
+                    texts, images = [self._extract_messages(p) for p in input]
+
+                image = self._prepare_image(images)
+                prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False)
+                return prompt, image
+        else:
+            if isinstance(input, list) and input and isinstance(input[0], dict):
+                prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False)
+            elif isinstance(input, list) and input and isinstance(input[0], list):
+                prompt = [
+                    self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input
+                ]
+            elif isinstance(input, list) and input and isinstance(input[0], ChatMessage):
+                messages = []
+                for chat_message in input:
+                    message = {"role": chat_message.role, "content": chat_message.content}
+                    messages.append(message)
+                prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False)
+            elif isinstance(input, list) and input and isinstance(input[0], str):
+                prompt = input
+            elif isinstance(input, str):
+                prompt = input
+            else:
+                raise TypeError(
+                    f"Unsupported type {type(input)} for text. Expected dict or list of dicts."
+                )
+            return prompt
+
+    def _extract_messages(self, messages):
+        texts, images = [], []
+        for message in messages:
+            if message["role"] == "user" and isinstance(message["content"], list):
+                texts.append({"role": "user", "content": message["content"][0]["text"]})
+                images.append(
+                    {"role": "user", "content": message["content"][1]["image_url"]["url"]}
+                )
+            else:
+                texts.append(message)
+        return texts, images
+
+    def _prepare_image(self, messages: list):
+        """Prepare image from history messages."""
+        from PIL import Image
+        import requests
+        from io import BytesIO
+        import base64
+        import re
+
+        # prepare images
+        images: List = []
+        if isinstance(messages[0], List):
+            for i in range(len(messages)):
+                for msg in messages[i]:
+                    msg = dict(msg)
+                    content = msg["content"]
+                    if "url" not in content:
+                        continue
+                    is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0
+                    if is_data:
+                        encoded_str = re.sub("^data:image/.+;base64,", "", content["url"])
+                        images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str))))
+                    else:
+                        images[i].append(Image.open(requests.get(content["url"], stream=True).raw))
+        elif isinstance(messages[0], dict):
+            for msg in messages:
+                msg = dict(msg)
+                content = msg["content"]
+                if "url" not in content:
+                    continue
+                is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0
+                if is_data:
+                    encoded_str = re.sub("^data:image/.+;base64,", "", content["url"])
+                    images.append(Image.open(BytesIO(base64.b64decode(encoded_str))))
+                else:
+                    images.append(Image.open(requests.get(content["url"], stream=True).raw))
+
+        return images
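
A hedged usage sketch of the new class. The stub predictor below provides only the attributes `get_prompt()` touches; the tokenizer name and the Jinja chat-template string are illustrative placeholders, not values used by this commit:

    from types import SimpleNamespace
    from transformers import AutoTokenizer

    from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess

    # Minimal stand-in for a real predictor: a tokenizer plus the model_description fields.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
    predictor = SimpleNamespace(
        tokenizer=tokenizer,
        infer_conf=SimpleNamespace(
            model_description=SimpleNamespace(
                chat_template=None,
                default_chat_template=(
                    "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
                ),
            )
        ),
    )

    preprocess = ChatTemplatePreprocess(predictor)
    # Text-only path: a list of role/content dicts is rendered into a single prompt string.
    prompt = preprocess.get_prompt(
        [{"role": "user", "content": "What's the weather like in Boston today?"}]
    )
    print(prompt)

The multimodal path (`is_mllm=True`) instead returns a `(prompt, images)` pair, with the images pulled out of the message contents by `_extract_messages` and `_prepare_image`.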

llm_on_ray/inference/models/gpt-j-6b.yaml

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@ ipex:
 model_description:
   model_id_or_path: EleutherAI/gpt-j-6b
   tokenizer_name_or_path: EleutherAI/gpt-j-6b
-  gpt_base_model: true
+  gpt_base_model: true

llm_on_ray/inference/predictor_deployment.py

Lines changed: 18 additions & 30 deletions
@@ -26,6 +26,8 @@
 from starlette.requests import Request
 from starlette.responses import StreamingResponse, JSONResponse
 from fastapi import HTTPException
+
+from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess
 from llm_on_ray.inference.inference_config import InferenceConfig
 from llm_on_ray.inference.api_openai_backend.openai_protocol import (
     ChatMessage,
@@ -82,6 +84,7 @@ def __init__(
         self.predictor = TransformerPredictor(infer_conf)
 
         self.loop = asyncio.get_running_loop()
+        self.process_tool = ChatTemplatePreprocess(self.predictor)
 
     def consume_streamer(self, streamer):
         for text in streamer:
@@ -308,9 +311,13 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No
         Raises:
             HTTPException: If the input prompt format is invalid or not supported.
         """
+
         if isinstance(input, str):
             return input
         elif isinstance(input, list):
+            prompts = []
+            images = []
+
             prompt_format = get_prompt_format(input)
             if prompt_format == PromptFormat.CHAT_FORMAT:
                 # Process the input prompts with tools
@@ -327,35 +334,16 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No
                         m.content = self.openai_tools_prompter.content_from_assistant(m)  # type: ignore
                     elif m.tool_call_id is not None:  # type: ignore
                         m.content = self.openai_tools_prompter.content_from_tool(m)  # type: ignore
-
-                if self.predictor.infer_conf.model_description.chat_template is not None:
-                    self.predictor.tokenizer.chat_template = (
-                        self.predictor.infer_conf.model_description.chat_template
-                    )
-                elif self.predictor.tokenizer.chat_template is None:
-                    self.predictor.tokenizer.chat_template = (
-                        self.predictor.infer_conf.model_description.default_chat_template
-                    )
-
-                if self.is_mllm:
-                    if isinstance(input, list):
-                        if isinstance(input, list) and input and isinstance(input[0], ChatMessage):
-                            messages = []
-                            for chat_message in input:
-                                message = {
-                                    "role": chat_message.role,
-                                    "content": chat_message.content,
-                                }
-                                messages.append(message)
-                            texts, images = self._extract_messages(messages)
-                        elif isinstance(input, list) and input and isinstance(input[0], dict):
-                            texts, images = self._extract_messages(input)
-                        elif isinstance(input, list) and input and isinstance(input[0], list):
-                            texts, images = [self._extract_messages(p) for p in input]
-
-                        image = self._prepare_image(images)
-                        prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False)
-                        return prompt, image
+                # Process the input prompts with MLLM tool
+                if self.process_tool is not None:
+                    if self.is_mllm:
+                        input, image = self.process_tool.get_prompt(input, self.is_mllm)
+                        prompts.append(input)
+                        images.extend(image)
+                        return prompts, images
+                    else:
+                        prompt = self.process_tool.get_prompt(input)
+                        return prompt
             else:
                 if isinstance(input, list) and input and isinstance(input[0], dict):
                     prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False)
@@ -400,13 +388,13 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
             )
         streaming_response = json_request["stream"] if "stream" in json_request else False
         input = json_request["text"] if "text" in json_request else ""
+
         if input == "":
             return JSONResponse(
                 status_code=400,
                 content="Empty prompt is not supported.",
             )
         config = json_request["config"] if "config" in json_request else {}
-        logger.info(input)
         # return prompt or list of prompts preprocessed
         prompts = self.preprocess_prompts(input)
 
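For reference, a hedged sketch of a client request against this deployment: the `text`, `config`, and `stream` keys mirror what `__call__` reads from the JSON body above, while the URL and the generation option are placeholders:

    import requests

    url = "http://localhost:8000/my-deployment"  # placeholder; depends on how the app is served

    body = {
        "text": [{"role": "user", "content": "What's the weather like in Boston today?"}],
        "config": {"max_new_tokens": 64},  # illustrative generation option
        "stream": False,
    }

    # An empty "text" field would be rejected with a 400 per the check above.
    resp = requests.post(url, json=body, proxies={"http": None, "https": None})
    print(resp.text)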

llm_on_ray/inference/utils.py

Lines changed: 11 additions & 1 deletion
@@ -19,7 +19,13 @@
 import torch
 from typing import Dict, Any, List, Optional, Union
 from enum import Enum
-from llm_on_ray.inference.inference_config import InferenceConfig, DEVICE_CPU, DEVICE_HPU
+from llm_on_ray.inference.inference_config import (
+    InferenceConfig,
+    DEVICE_CPU,
+    DEVICE_HPU,
+    PRECISION_BF16,
+    PRECISION_FP32,
+)
 from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage
 
 
@@ -127,6 +133,10 @@ def decide_torch_dtype(infer_conf: InferenceConfig, hf_config=None):
 
     if infer_conf.model_description.config.torch_dtype:
         # respect user config
+        if infer_conf.model_description.config.torch_dtype == PRECISION_BF16:
+            infer_conf.model_description.config.torch_dtype = torch.bfloat16
+        elif infer_conf.model_description.config.torch_dtype == PRECISION_FP32:
+            infer_conf.model_description.config.torch_dtype = torch.float32
         return
     elif hf_config is None:
         # default to float32 if hf_config is not supplied
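
A minimal sketch of the string-to-dtype translation this hunk adds. `normalize_torch_dtype` is a hypothetical helper, and the constant values assume PRECISION_BF16 and PRECISION_FP32 are the strings "bf16" and "fp32" (they are defined in inference_config.py, not shown in this diff):

    import torch

    PRECISION_BF16 = "bf16"  # assumed value of the constant from inference_config
    PRECISION_FP32 = "fp32"  # assumed value of the constant from inference_config

    def normalize_torch_dtype(user_value):
        """Map a user-facing precision string onto the torch dtype the model loader expects."""
        if user_value == PRECISION_BF16:
            return torch.bfloat16
        if user_value == PRECISION_FP32:
            return torch.float32
        return user_value  # anything else (e.g. an explicit torch dtype) passes through

    print(normalize_torch_dtype("bf16"))  # torch.bfloat16
    print(normalize_torch_dtype("fp32"))  # torch.float32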
