Commit d10758d

Merge pull request #15 from VectorInstitute/develop

Develop

2 parents 3641ef2 + 59e7622

9 files changed: +20 additions, -27 deletions


README.md

Lines changed: 1 addition & 1 deletion

@@ -61,7 +61,7 @@ vec-inf list Meta-Llama-3.1-70B-Instruct
 
 ## Send inference requests
 Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
-> {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
+> {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
 
 
 **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
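Since the note above says multimodal models currently support only `ChatCompletion` with a single image per prompt, here is a minimal sketch of such a request. It is an illustration only, assuming the OpenAI-compatible endpoint used throughout these examples; the server URL, image URL, prompt text, and max_tokens value are placeholders, not values taken from this commit.

from openai import OpenAI

# Placeholder endpoint; vLLM's OpenAI-compatible server accepts any API key.
client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="llava-1.5-13b-hf",
    messages=[
        {
            "role": "user",
            # One text part plus exactly one image, per the note above.
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.jpg"},  # placeholder image
                },
            ],
        }
    ],
    max_tokens=50,
)
print(completion.choices[0].message.content)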

examples/inference/llm/chat_completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/Meta-Llama-3-8B-Instruct",
+    model="Meta-Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",

examples/inference/llm/completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3.1-8B-Instruct",
     prompt="Where is the capital of Canada?",
     max_tokens=20,
 )

examples/inference/llm/completions.sh

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ export API_BASE_URL=http://gpuXXX:XXXX/v1
 curl ${API_BASE_URL}/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "/model-weights/Meta-Llama-3-8B",
+        "model": "Meta-Llama-3.1-8B-Instruct",
         "prompt": "What is the capital of Canada?",
         "max_tokens": 20
     }'

examples/inference/vlm/vision_completions.py

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/llava-1.5-13b-hf",
+    model="llava-1.5-13b-hf",
     messages=[
         {
             "role": "user",

examples/logits/logits.py

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
 client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")
 
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3.1-8B-Instruct",
     prompt="Where is the capital of Canada?",
     max_tokens=1,
     logprobs=32000,  # Set to model vocab size to get logits
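As a follow-up to the `logprobs=32000` line above, a hedged sketch of reading the values back: it assumes the server returns OpenAI-style completion logprobs, and the field names come from the OpenAI Python client rather than from anything in this commit.

# top_logprobs is a list with one dict per generated token position,
# mapping candidate tokens to their log-probabilities.
logprobs = completion.choices[0].logprobs
if logprobs is not None and logprobs.top_logprobs:
    first_position = logprobs.top_logprobs[0]
    top5 = sorted(first_position.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print(top5)  # five most likely first tokens with their logprobs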

poetry.lock

Lines changed: 9 additions & 14 deletions
Some generated files are not rendered by default.

profile/gen.py

Lines changed: 3 additions & 6 deletions

@@ -1,6 +1,7 @@
-import requests
 import time
 
+import requests
+
 # Change the ENDPOINT and MODEL_PATH to match your setup
 ENDPOINT = "http://gpuXXX:XXXX/v1"
 MODEL_PATH = "Meta-Llama-3-70B"
@@ -71,11 +72,7 @@
 
 
 def send_request(prompt):
-    data = {
-        "model": f"/model-weights/{MODEL_PATH}",
-        "prompt": prompt,
-        "max_tokens": 100,
-    }
+    data = {"model": f"{MODEL_PATH}", "prompt": prompt, "max_tokens": 100}
     start_time = time.time()
     response = requests.post(f"{ENDPOINT}/completions", headers=HEADERS, json=data)
    duration = time.time() - start_time
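For context, a hedged sketch of how a timing helper like `send_request` might be driven: the prompt list and request count are invented for illustration, and it assumes the function returns the duration it measures (the diff cuts off before its return statement).

# Hypothetical driver loop; send_request is assumed to return seconds elapsed.
prompts = ["Where is the capital of Canada?"] * 5
durations = [send_request(p) for p in prompts]
print(f"mean latency: {sum(durations) / len(durations):.2f}s")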

pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vec-inf"
-version = "0.3.2"
+version = "0.3.3"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
 license = "MIT license"
@@ -11,6 +11,7 @@ python = "^3.10"
 requests = "^2.31.0"
 click = "^8.1.0"
 rich = "^13.7.0"
+pandas = "^2.2.2"
 vllm = { version = "^0.5.0", optional = true }
 vllm-nccl-cu12 = { version = ">=2.18,<2.19", optional = true }
 ray = { version = "^2.9.3", optional = true }
