Commit d10758d

Merge pull request #15 from VectorInstitute/develop

Develop

2 parents 3641ef2 + 59e7622

9 files changed: +20 additions, -27 deletions


README.md

Lines changed: 1 addition & 1 deletion

@@ -61,7 +61,7 @@ vec-inf list Meta-Llama-3.1-70B-Instruct
 
 ## Send inference requests
 Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
-> {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
+> {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
 
 
 **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
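Since the note above says multimodal models currently support only `ChatCompletion` with a single image per prompt, here is a minimal sketch of such a request. It is an illustration only, assuming the OpenAI-compatible endpoint used throughout these examples; the server URL, image URL, prompt text, and max_tokens value are placeholders, not values taken from this commit.

from openai import OpenAI

# Placeholder endpoint; vLLM's OpenAI-compatible server accepts any API key.
client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="llava-1.5-13b-hf",
    messages=[
        {
            "role": "user",
            # One text part plus exactly one image, per the note above.
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.jpg"},  # placeholder image
                },
            ],
        }
    ],
    max_tokens=50,
)
print(completion.choices[0].message.content)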

examples/inference/llm/chat_completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/Meta-Llama-3-8B-Instruct",
+    model="Meta-Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",

examples/inference/llm/completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3.1-8B-Instruct",
     prompt="Where is the capital of Canada?",
     max_tokens=20,
 )

examples/inference/llm/completions.sh

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ export API_BASE_URL=http://gpuXXX:XXXX/v1
 curl ${API_BASE_URL}/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "/model-weights/Meta-Llama-3-8B",
+        "model": "Meta-Llama-3.1-8B-Instruct",
         "prompt": "What is the capital of Canada?",
         "max_tokens": 20
     }'

examples/inference/vlm/vision_completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/llava-1.5-13b-hf",
+    model="llava-1.5-13b-hf",
     messages=[
         {
             "role": "user",

examples/logits/logits.py

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
 client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")
 
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3.1-8B-Instruct",
     prompt="Where is the capital of Canada?",
     max_tokens=1,
     logprobs=32000,  # Set to model vocab size to get logits
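As a follow-up to the `logprobs=32000` line above, a hedged sketch of reading the values back: it assumes the server returns OpenAI-style completion logprobs, and the field names come from the OpenAI Python client rather than from anything in this commit.

# top_logprobs is a list with one dict per generated token position,
# mapping candidate tokens to their log-probabilities.
logprobs = completion.choices[0].logprobs
if logprobs is not None and logprobs.top_logprobs:
    first_position = logprobs.top_logprobs[0]
    top5 = sorted(first_position.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print(top5)  # five most likely first tokens with their logprobs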

poetry.lock

Lines changed: 9 additions & 14 deletions
Some generated files are not rendered by default.

profile/gen.py

Lines changed: 3 additions & 6 deletions

@@ -1,6 +1,7 @@
-import requests
 import time
 
+import requests
+
 # Change the ENDPOINT and MODEL_PATH to match your setup
 ENDPOINT = "http://gpuXXX:XXXX/v1"
 MODEL_PATH = "Meta-Llama-3-70B"
@@ -71,11 +72,7 @@
 
 
 def send_request(prompt):
-    data = {
-        "model": f"/model-weights/{MODEL_PATH}",
-        "prompt": prompt,
-        "max_tokens": 100,
-    }
+    data = {"model": f"{MODEL_PATH}", "prompt": prompt, "max_tokens": 100}
     start_time = time.time()
     response = requests.post(f"{ENDPOINT}/completions", headers=HEADERS, json=data)
    duration = time.time() - start_time
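For context, a hedged sketch of how a timing helper like `send_request` might be driven: the prompt list and request count are invented for illustration, and it assumes the function returns the duration it measures (the diff cuts off before its return statement).

# Hypothetical driver loop; send_request is assumed to return seconds elapsed.
prompts = ["Where is the capital of Canada?"] * 5
durations = [send_request(p) for p in prompts]
print(f"mean latency: {sum(durations) / len(durations):.2f}s")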

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vec-inf"
-version = "0.3.2"
+version = "0.3.3"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
 license = "MIT license"
@@ -11,6 +11,7 @@ python = "^3.10"
 requests = "^2.31.0"
 click = "^8.1.0"
 rich = "^13.7.0"
+pandas = "^2.2.2"
 vllm = { version = "^0.5.0", optional = true }
 vllm-nccl-cu12 = { version = ">=2.18,<2.19", optional = true }
 ray = { version = "^2.9.3", optional = true }
