Skip to content

Commit fbb8527

Browse files
authored
Merge pull request #146 from VectorInstitute/bugfix/mpi-client-error
Bugfix for multi-node server dying on first request:
* Updated RDMA packages in Dockerfile
* Updated NCCL env vars in slurm script
* Updated bind paths for RDMA drivers

Other updates:
* Added flash-infer backend, added sglang to dependencies for future sglang support
* Fix for slurm script generation when using virtual env instead of container
2 parents 3a66593 + 9ace730 commit fbb8527

File tree

15 files changed

+3573
-2404
lines changed

15 files changed

+3573
-2404
lines changed

.github/workflows/code_checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ jobs:
4040
with:
4141
python-version-file: ".python-version"
4242
- name: Install the project
43-
run: uv sync --dev
43+
run: uv sync --dev --prerelease=allow
4444
- name: Install dependencies and check code
4545
run: |
4646
source .venv/bin/activate

.github/workflows/docs.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@ jobs:
6767
python-version-file: ".python-version"
6868

6969
- name: Install the project
70-
run: uv sync --all-extras --group docs
70+
run: uv sync --all-extras --group docs --prerelease=allow
7171

7272
- name: Build docs
73-
run: uv run mkdocs build
73+
run: uv run --frozen mkdocs build
7474

7575
- name: Create .nojekyll file
7676
run: touch site/.nojekyll
@@ -104,7 +104,7 @@ jobs:
104104
python-version-file: ".python-version"
105105

106106
- name: Install the project
107-
run: uv sync --all-extras --group docs
107+
run: uv sync --all-extras --group docs --frozen
108108

109109
- name: Configure Git Credentials
110110
run: |

.github/workflows/unit_tests.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,18 @@ jobs:
5858
python-version: ${{ matrix.python-version }}
5959

6060
- name: Install the project
61-
run: uv sync --dev
61+
run: uv sync --dev --prerelease=allow
6262

6363
- name: Install dependencies and check code
6464
run: |
65-
uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
65+
uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
6666
6767
- name: Install the core package only
6868
run: uv sync --no-dev
6969

7070
- name: Run package import tests
7171
run: |
72-
uv run pytest tests/test_imports.py
72+
uv run --frozen pytest tests/test_imports.py
7373
7474
- name: Import Codecov GPG public key
7575
run: |
@@ -79,7 +79,7 @@ jobs:
7979
uses: codecov/codecov-action@v5.5.1
8080
with:
8181
token: ${{ secrets.CODECOV_TOKEN }}
82-
file: ./coverage.xml
82+
files: ./coverage.xml
8383
name: codecov-umbrella
8484
fail_ci_if_error: true
8585
verbose: true

Dockerfile

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,29 +35,33 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
3535
rm get-pip.py && \
3636
python3.10 -m pip install --upgrade pip setuptools wheel uv
3737

38-
# Install Infiniband/RDMA support
38+
# Install RDMA support
3939
RUN apt-get update && apt-get install -y \
4040
libibverbs1 libibverbs-dev ibverbs-utils \
4141
librdmacm1 librdmacm-dev rdmacm-utils \
42+
rdma-core ibverbs-providers infiniband-diags perftest \
4243
&& rm -rf /var/lib/apt/lists/*
4344

4445
# Set up RDMA environment (these will persist in the final container)
4546
ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
46-
ENV UCX_NET_DEVICES=all
4747
ENV NCCL_IB_DISABLE=0
48+
ENV NCCL_SOCKET_IFNAME="^lo,docker0"
49+
ENV NCCL_NET_GDR_LEVEL=PHB
50+
ENV NCCL_IB_TIMEOUT=22
51+
ENV NCCL_IB_RETRY_CNT=7
52+
ENV NCCL_DEBUG=INFO
4853

4954
# Set up project
5055
WORKDIR /vec-inf
5156
COPY . /vec-inf
5257

5358
# Install project dependencies with build requirements
54-
RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu128" uv pip install --system -e .[dev]
59+
RUN uv pip install --system -e .[dev] --prerelease=allow
5560

56-
# Final configuration
57-
RUN mkdir -p /vec-inf/nccl && \
58-
mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /vec-inf/nccl/libnccl.so.2.18.1
59-
ENV VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
60-
ENV NCCL_DEBUG=INFO
61+
# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
62+
RUN apt-get update && apt-get install -y --allow-change-held-packages\
63+
libnccl2 libnccl-dev \
64+
&& rm -rf /var/lib/apt/lists/*
6165

6266
# Set the default command to start an interactive shell
6367
CMD ["bash"]

MODEL_TRACKING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ This document tracks all model weights available in the `/model-weights` directo
4040
| `gemma-2b-it` ||
4141
| `gemma-7b` ||
4242
| `gemma-7b-it` ||
43+
| `gemma-2-2b-it` ||
4344
| `gemma-2-9b` ||
4445
| `gemma-2-9b-it` ||
4546
| `gemma-2-27b` ||

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,10 @@ dev = [
4242
"xgrammar>=0.1.11",
4343
"torch>=2.7.0",
4444
"vllm>=0.10.0",
45-
"vllm-nccl-cu12>=2.18,<2.19",
46-
"ray>=2.40.0",
47-
"cupy-cuda12x==12.1.0"
45+
"ray[default]>=2.50.0",
46+
"cupy-cuda12x==12.1.0",
47+
"flashinfer-python>=0.4.0",
48+
"sglang>=0.5.0",
4849
]
4950

5051
[project.scripts]

tests/vec_inf/cli/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def test_launch_command_success(runner):
3939
"mem_per_node": "32G",
4040
"model_weights_parent_dir": "/model-weights",
4141
"vocab_size": "128000",
42+
"venv": "/path/to/venv",
4243
"vllm_args": {"max_model_len": 8192},
4344
"env": {"CACHE": "/cache"},
4445
}

tests/vec_inf/cli/test_helper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def test_format_table_output(self):
3535
"mem_per_node": "32G",
3636
"model_weights_parent_dir": "/model-weights",
3737
"log_dir": "/tmp/logs",
38+
"venv": "/path/to/venv",
3839
"vllm_args": {"max_model_len": 8192, "enable_prefix_caching": True},
3940
"env": {"CACHE": "/cache"},
4041
}
@@ -63,6 +64,7 @@ def test_format_table_output_with_minimal_params(self):
6364
"mem_per_node": "16G",
6465
"model_weights_parent_dir": "/weights",
6566
"log_dir": "/logs",
67+
"venv": "/path/to/venv",
6668
"vllm_args": {},
6769
"env": {},
6870
}

tests/vec_inf/client/test_slurm_script_generator.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def singularity_params(self, basic_params):
5353
singularity = basic_params.copy()
5454
singularity.update(
5555
{
56-
"venv": "singularity",
56+
"venv": "apptainer",
5757
"bind": "/scratch:/scratch,/data:/data",
5858
"env": {
5959
"CACHE_DIR": "/cache",
@@ -109,7 +109,7 @@ def test_init_singularity(self, singularity_params):
109109
def test_init_singularity_no_bind(self, basic_params):
110110
"""Test Singularity initialization without additional binds."""
111111
params = basic_params.copy()
112-
params["venv"] = "singularity"
112+
params["venv"] = "apptainer"
113113
generator = SlurmScriptGenerator(params)
114114

115115
assert generator.params == params
@@ -173,7 +173,6 @@ def test_generate_launch_cmd_venv(self, basic_params):
173173
generator = SlurmScriptGenerator(basic_params)
174174
launch_cmd = generator._generate_launch_cmd()
175175

176-
assert "source /path/to/venv/bin/activate" in launch_cmd
177176
assert "vllm serve /path/to/model_weights/test-model" in launch_cmd
178177
assert "--served-model-name test-model" in launch_cmd
179178
assert "--tensor-parallel-size 4" in launch_cmd
@@ -185,7 +184,7 @@ def test_generate_launch_cmd_singularity(self, singularity_params):
185184
generator = SlurmScriptGenerator(singularity_params)
186185
launch_cmd = generator._generate_launch_cmd()
187186

188-
assert "exec --nv" in launch_cmd
187+
assert "apptainer exec --nv" in launch_cmd
189188
assert "--bind /path/to/model_weights/test-model" in launch_cmd
190189
assert "--bind /scratch:/scratch,/data:/data" in launch_cmd
191190
assert "source" not in launch_cmd
@@ -306,9 +305,9 @@ def batch_params(self):
306305
def batch_singularity_params(self, batch_params):
307306
"""Generate batch SLURM configuration parameters with Singularity."""
308307
singularity_params = batch_params.copy()
309-
singularity_params["venv"] = "singularity" # Set top-level venv to singularity
308+
singularity_params["venv"] = "apptainer" # Set top-level venv to apptainer
310309
for model_name in singularity_params["models"]:
311-
singularity_params["models"][model_name]["venv"] = "singularity"
310+
singularity_params["models"][model_name]["venv"] = "apptainer"
312311
singularity_params["models"][model_name]["bind"] = (
313312
"/scratch:/scratch,/data:/data"
314313
)
@@ -341,9 +340,9 @@ def test_init_singularity(self, batch_singularity_params):
341340
def test_init_singularity_no_bind(self, batch_params):
342341
"""Test Singularity initialization without additional binds."""
343342
params = batch_params.copy()
344-
params["venv"] = "singularity" # Set top-level venv to singularity
343+
params["venv"] = "apptainer" # Set top-level venv to apptainer
345344
for model_name in params["models"]:
346-
params["models"][model_name]["venv"] = "singularity"
345+
params["models"][model_name]["venv"] = "apptainer"
347346

348347
generator = BatchSlurmScriptGenerator(params)
349348

0 commit comments

Comments
 (0)