# Vector Inference: Easy inference on Slurm clusters
This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment.** To adapt them to other environments, update the config files in the `models` folder and the environment variables in the model launching scripts accordingly.
## Installation
If you are using the Vector cluster environment and you don't need any customization to the inference server environment, you can go to the next section, as a default container environment is already in place. Otherwise, you might need up to 10GB of storage to set up your own virtual environment. The following steps need to be run only once per user.
1. Set up the virtual environment for running inference servers, run
```bash
# ... (remaining virtual environment setup steps omitted here)
pip install vllm-flash-attn
```
## Launch an inference server
We will use the Llama 3 model as an example. To launch an inference server for Llama 3 8B, run
```bash
bash src/launch_server.sh --model-family llama3
```
You should see an output like the following:
> Job Name: vLLM/Meta-Llama-3-8B
If you want to use your own virtual environment, you can run this instead:
```bash
bash src/launch_server.sh --model-family llama3 --venv $(poetry env info --path)
```
By default, the `launch_server.sh` script uses the 8B variant of Llama 3, based on the config file in the `models/llama3` folder. You can switch to other variants with the `--model-variant` argument; make sure to change the requested resources accordingly. More information about the flags and customizations can be found in the [`models`](models) folder. The inference server is compatible with the OpenAI `Completion` and `ChatCompletion` API. You can inspect the Slurm output files to check the inference server status.
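For instance, a hypothetical switch to the 70B variant might look like the sketch below; the exact variant string must match an entry in the `models/llama3` config file.

```bash
# Hypothetical variant name: check the models/llama3 config for the exact string.
bash src/launch_server.sh --model-family llama3 --model-variant 70B
```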
Here is a more complicated example that launches a model variant using multiple nodes. Say we want to launch Mixtral 8x22B; the launch command would look something like the sketch below.
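This is a minimal sketch, assuming `src/launch_server.sh` accepts `--num-nodes` and `--num-gpus` flags (names inferred from the output below) and that the model family is named `mixtral` in the `models` folder; check the script for the exact options.

```bash
# Sketch only: the mixtral family name and the --num-nodes/--num-gpus flag
# names are assumptions; see src/launch_server.sh for the exact options.
bash src/launch_server.sh \
    --model-family mixtral \
    --model-variant 8x22B-v0.1 \
    --num-nodes 2 \
    --num-gpus 4
```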
The default partition for Mixtral models is a40, and we need 8 a40 GPUs to load Mixtral 8x22B, so we requested 2 a40 nodes with 4 GPUs per node. You should see an output like the following:
> Number of nodes set to: 2
>
> Number of GPUs set to: 4
>
> Model variant set to: 8x22B-v0.1
>
> Job Name: vLLM/Mixtral-8x22B-v0.1
>
> Partition: a40
>
> Generic Resource Scheduling: gpu:8
>
> Data Type: auto
>
> Submitted batch job 12430232

## Send inference requests
Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
> {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
**NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
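As a sketch of what that looks like through the OpenAI-compatible chat endpoint (host, port, model path, and image URL are all placeholders), a single image is attached alongside the text prompt:

```bash
# Placeholders throughout: server host/port, vision model weights path, image URL.
# Only one image is attached, matching the current single-image limitation.
curl http://<server-host>:<port>/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/model-weights/<vision-model>",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://<image-url>"}}
            ]
        }]
    }'
```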
## SSH tunnel from your local device
If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
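A minimal sketch of such a tunnel, with placeholder values for the login node, the GPU node running the server, and the ports:

```bash
# Placeholders: replace with your cluster login node, the GPU node hosting
# the inference server, and the local/remote ports you want to use.
ssh -L <local-port>:<gpu-node>:<server-port> <username>@<cluster-login-node> -N
```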
The [`examples`](examples) folder contains the following:

- [`inference`](inference): Examples for sending inference requests
  - [`llm/chat_completions.py`](inference/llm/chat_completions.py): Python example of sending chat completion requests to an OpenAI-compatible server
  - [`llm/completions.py`](inference/llm/completions.py): Python example of sending completion requests to an OpenAI-compatible server
  - [`llm/completions.sh`](inference/llm/completions.sh): Bash example of sending completion requests to an OpenAI-compatible server, supports JSON mode
  - [`vlm/vision_completions.py`](inference/vlm/vision_completions.py): Python example of sending chat completion requests with an image attached to the prompt to an OpenAI-compatible server for vision language models
- [`logits`](logits): Example for logits generation
  - [`logits.py`](logits/logits.py): Python example of getting logits from a hosted model.