diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 99857cc6..04ebbebd 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -11,6 +11,7 @@ on: permissions: id-token: write # Required for OIDC contents: read # Required for checkout + packages: write # Required for pushing cache layers to GHCR jobs: test: @@ -19,13 +20,13 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: astral-sh/setup-uv@v4 + - uses: astral-sh/setup-uv@v7 with: enable-cache: true cache-dependency-glob: "**/pyproject.toml" - name: Cache dependencies - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ env.UV_CACHE_DIR }} key: ${{ runner.os }}-uv-${{ hashFiles('**/pyproject.toml') }} @@ -110,14 +111,251 @@ jobs: include: - component: api build_args: "--target nilai --platform linux/amd64" + - component: vllm + model_to_cache: "openai/gpt-oss-20b" steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 + + - name: Disable unattended upgrades + run: | + echo "Disabling unattended upgrades to prevent interference with CI builds..." + + # Stop unattended-upgrades service + sudo systemctl stop unattended-upgrades || true + sudo systemctl disable unattended-upgrades || true + + # Kill any running unattended-upgrades processes + sudo pkill -f unattended-upgrade || true + + # Remove or disable the unattended-upgrades configuration + sudo systemctl mask unattended-upgrades || true + + # Wait for any ongoing package operations to complete + while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do + echo "Waiting for package manager lock to be released..." 
+ sleep 5 + done + + # Disable automatic updates in APT configuration + echo 'APT::Periodic::Update-Package-Lists "0";' | sudo tee /etc/apt/apt.conf.d/20auto-upgrades + echo 'APT::Periodic::Unattended-Upgrade "0";' | sudo tee -a /etc/apt/apt.conf.d/20auto-upgrades + + echo "โœ… Unattended upgrades disabled successfully" + + - name: Install Docker Buildx plugin + run: | + set -euo pipefail + BUILDX_VERSION="v0.14.1" + mkdir -p ~/.docker/cli-plugins + curl -sSL "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-amd64" \ + -o ~/.docker/cli-plugins/docker-buildx + chmod +x ~/.docker/cli-plugins/docker-buildx + docker buildx version + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: image=moby/buildkit:latest + buildkitd-flags: --allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + + - name: Check system resources + run: | + echo "=== System Resources ===" + df -h + free -h + echo "=== Docker Info ===" + docker info + echo "=== Docker System Usage ===" + docker system df + + - name: Pre-pull Docker base image (for vllm) + if: matrix.component == 'vllm' + run: | + echo "Pre-pulling vllm base image to avoid rate limiting during build..." 
+ docker pull vllm/vllm-openai:v0.11.2 + + - name: Setup HuggingFace cache directory + if: matrix.component == 'vllm' && matrix.model_to_cache != '' + run: | + mkdir -p /home/ec2-user/.cache/huggingface + echo "Cache directory created at /home/ec2-user/.cache/huggingface" + + - name: Restore model from GHCR + if: matrix.component == 'vllm' && matrix.model_to_cache != '' + id: restore-model + run: | + MODEL_CACHE_DIR="/home/ec2-user/.cache/huggingface" + HF_DIR_NAME="models--$(echo ${{ matrix.model_to_cache }} | sed 's/\//--/g')" + FULL_PATH="$MODEL_CACHE_DIR/$HF_DIR_NAME" + + if [ -d "$FULL_PATH" ]; then + echo "Model found on host filesystem at $FULL_PATH" + echo "Skipping GHCR pull to save I/O." + echo "cache-hit=true" >> $GITHUB_OUTPUT + exit 0 + fi + + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + RAW_TAG="${{ matrix.model_to_cache }}-v1" + SAFE_TAG=$(echo "$RAW_TAG" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9_.-]/-/g') + MODEL_IMAGE="ghcr.io/${REPO_LOWER}/nilai-model-cache:${SAFE_TAG}" + + echo "Attempting to pull model cache image: $MODEL_IMAGE" + + if docker pull "$MODEL_IMAGE"; then + echo "Image found. Copying model files to host..." + mkdir -p "$MODEL_CACHE_DIR" + + CONTAINER_ID=$(docker create "$MODEL_IMAGE") + docker cp "$CONTAINER_ID":/model/. "$MODEL_CACHE_DIR/" + docker rm "$CONTAINER_ID" + echo "Model restored from GHCR." + echo "cache-hit=true" >> $GITHUB_OUTPUT + else + echo "Model cache not found in GHCR." 
+ echo "cache-hit=false" >> $GITHUB_OUTPUT + fi + + - name: DEBUG - Verify Cache Structure + if: matrix.component == 'vllm' + run: | + echo "Listing /home/ec2-user/.cache/huggingface contents:" + ls -F /home/ec2-user/.cache/huggingface/ || echo "Directory not found" + + echo "Checking for specific model folder:" + ls -F /home/ec2-user/.cache/huggingface/models--openai--gpt-oss-20b/ || echo "Model folder not found" + + echo "Checking snapshot content (first few files):" + find /home/ec2-user/.cache/huggingface -maxdepth 4 | head -n 10 + + - name: Setup uv for model download + if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + cache-dependency-glob: "**/pyproject.toml" + + - name: Install dependencies for model download + if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' + run: | + apt-get update && apt-get install curl git pkg-config automake file python3.12-dev -y + export ACLOCAL=aclocal + export AUTOMAKE=automake + uv sync + + - name: Download HuggingFace model + if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + echo "Downloading model ${{ matrix.model_to_cache }} to cache..." + uv run python -c "from huggingface_hub import snapshot_download; import os; os.environ['HF_TOKEN'] = '${{ secrets.HF_TOKEN }}'; snapshot_download('${{ matrix.model_to_cache }}', cache_dir='/home/ec2-user/.cache/huggingface'); print('Model cached successfully')" \ + || { echo "Failed to download model"; exit 1; } + echo "Model download completed successfully" + + - name: Save model to GHCR + if: matrix.component == 'vllm' && matrix.model_to_cache != '' && steps.restore-model.outputs.cache-hit != 'true' + run: | + echo "Saving model to GHCR..." 
+ + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + RAW_TAG="${{ matrix.model_to_cache }}-v1" + SAFE_TAG=$(echo "$RAW_TAG" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9_.-]/-/g') + MODEL_IMAGE="ghcr.io/${REPO_LOWER}/nilai-model-cache:${SAFE_TAG}" + + echo "Using cache image: $MODEL_IMAGE" + + echo "FROM scratch" > Dockerfile.model + echo "COPY . /model" >> Dockerfile.model + + cd /home/ec2-user/.cache/huggingface + + echo "Building cache image..." + docker build -t "$MODEL_IMAGE" -f $GITHUB_WORKSPACE/Dockerfile.model . + + echo "Pushing cache image to GHCR..." + docker push "$MODEL_IMAGE" + echo "Model cached to GHCR." - name: Build ${{ matrix.component }} image run: | echo "Building ${{ matrix.component }} image..." - docker build -t nillion/nilai-${{ matrix.component }}:latest -f docker/${{ matrix.component }}.Dockerfile ${{ matrix.build_args || '' }} . + + # Convert repository name to lowercase for Docker registry compatibility + REPO_LOWER=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + + # Set cache references + CACHE_REF="ghcr.io/${REPO_LOWER}/nilai-${{ matrix.component }}:buildcache" + + # Check if cache exists and is accessible + echo "Checking cache availability..." + CACHE_ARGS="" + if docker manifest inspect ${CACHE_REF} >/dev/null 2>&1; then + echo "โœ… Cache found, using registry cache" + CACHE_ARGS="--cache-from=type=registry,ref=${CACHE_REF} --cache-to=type=registry,ref=${CACHE_REF},mode=max" + else + echo "โš ๏ธ No cache found or cache inaccessible, building without import cache" + CACHE_ARGS="--cache-to=type=registry,ref=${CACHE_REF},mode=max" + fi + + # Function to build with retry logic + build_with_retry() { + local attempt=1 + local max_attempts=3 + + while [ $attempt -le $max_attempts ]; do + echo "๐Ÿ”„ Build attempt $attempt of $max_attempts..." 
+
+            if docker buildx build \
+              -t nillion/nilai-${{ matrix.component }}:latest \
+              -f docker/${{ matrix.component }}.Dockerfile \
+              ${CACHE_ARGS} \
+              --load \
+              ${{ matrix.build_args || '' }} \
+              .; then
+              echo "✅ Build succeeded on attempt $attempt"
+              return 0
+            else
+              echo "❌ Build failed on attempt $attempt"
+              if [ $attempt -lt $max_attempts ]; then
+                echo "⏳ Waiting 30 seconds before retry..."
+                sleep 30
+
+                # Clean up any partial builds
+                echo "🧹 Cleaning up Docker system..."
+                docker system prune -f || true
+
+                # On retry, disable cache export to reduce complexity
+                if [ $attempt -eq 1 ]; then
+                  echo "⚠️ Disabling cache export for retry..."
+                  CACHE_ARGS="--cache-from=type=registry,ref=${CACHE_REF}"
+                fi
+
+                # On final retry, disable all cache
+                if [ $attempt -eq 2 ]; then
+                  echo "⚠️ Disabling all cache for final retry..."
+                  CACHE_ARGS=""
+                fi
+              fi
+              attempt=$((attempt + 1))
+            fi
+          done
+
+          echo "💥 All build attempts failed"
+          return 1
+        }
+
+        # Execute build with retry logic
+        build_with_retry
           echo "✅ ${{ matrix.component }} build completed successfully"

   e2e-tests:
@@ -126,9 +364,35 @@ jobs:
     runs-on: ${{ needs.start-runner.outputs.label }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
+
+      - name: Disable unattended upgrades
+        run: |
+          echo "Disabling unattended upgrades to prevent interference with CI builds..."
+
+          # Stop unattended-upgrades service
+          sudo systemctl stop unattended-upgrades || true
+          sudo systemctl disable unattended-upgrades || true
+
+          # Kill any running unattended-upgrades processes
+          sudo pkill -f unattended-upgrade || true
+
+          # Remove or disable the unattended-upgrades configuration
+          sudo systemctl mask unattended-upgrades || true
+
+          # Wait for any ongoing package operations to complete
+          while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1; do
+            echo "Waiting for package manager lock to be released..."
+ sleep 5 + done + + # Disable automatic updates in APT configuration + echo 'APT::Periodic::Update-Package-Lists "0";' | sudo tee /etc/apt/apt.conf.d/20auto-upgrades + echo 'APT::Periodic::Unattended-Upgrade "0";' | sudo tee -a /etc/apt/apt.conf.d/20auto-upgrades + + echo "โœ… Unattended upgrades disabled successfully" - - uses: astral-sh/setup-uv@v4 + - uses: astral-sh/setup-uv@v7 with: enable-cache: true cache-dependency-glob: "**/pyproject.toml" diff --git a/docker/compose/docker-compose.gemma-4b-gpu.ci.yml b/docker/compose/docker-compose.gemma-4b-gpu.ci.yml index 29423275..f80076a2 100644 --- a/docker/compose/docker-compose.gemma-4b-gpu.ci.yml +++ b/docker/compose/docker-compose.gemma-4b-gpu.ci.yml @@ -36,12 +36,10 @@ services: - VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True volumes: - - hugging_face_models:/root/.cache/huggingface + - /home/ec2-user/.cache/huggingface:/root/.cache/huggingface # Mount runner's HF cache healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s retries: 3 start_period: 60s timeout: 10s -volumes: - hugging_face_models: diff --git a/docker/compose/docker-compose.gpt-20b-gpu.ci.yml b/docker/compose/docker-compose.gpt-20b-gpu.ci.yml index dcfef4cb..9aa1dc47 100644 --- a/docker/compose/docker-compose.gpt-20b-gpu.ci.yml +++ b/docker/compose/docker-compose.gpt-20b-gpu.ci.yml @@ -34,12 +34,10 @@ services: - ETCD_PORT=2379 - TOOL_SUPPORT=true volumes: - - hugging_face_models:/root/.cache/huggingface # cache models + - /home/ec2-user/.cache/huggingface:/root/.cache/huggingface # Mount runner's HF cache healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s retries: 10 start_period: 900s timeout: 15s -volumes: - hugging_face_models: diff --git a/docker/compose/docker-compose.llama-1b-gpu.ci.yml b/docker/compose/docker-compose.llama-1b-gpu.ci.yml index cca105f7..1c82d2a9 100644 --- a/docker/compose/docker-compose.llama-1b-gpu.ci.yml +++ 
b/docker/compose/docker-compose.llama-1b-gpu.ci.yml @@ -37,12 +37,10 @@ services: - TOOL_SUPPORT=true - CUDA_LAUNCH_BLOCKING=1 volumes: - - hugging_face_models:/root/.cache/huggingface # cache models + - /home/ec2-user/.cache/huggingface:/root/.cache/huggingface # Mount runner's HF cache healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s retries: 3 start_period: 60s timeout: 10s -volumes: - hugging_face_models: diff --git a/docker/compose/docker-compose.qwen-2b-gpu.ci.yml b/docker/compose/docker-compose.qwen-2b-gpu.ci.yml index 7d040caf..14a31815 100644 --- a/docker/compose/docker-compose.qwen-2b-gpu.ci.yml +++ b/docker/compose/docker-compose.qwen-2b-gpu.ci.yml @@ -52,13 +52,10 @@ services: VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1" PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" volumes: - - hugging_face_models:/root/.cache/huggingface + - /home/ec2-user/.cache/huggingface:/root/.cache/huggingface # Mount runner's HF cache healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s retries: 3 start_period: 60s timeout: 10s - -volumes: - hugging_face_models: diff --git a/docker/vllm.Dockerfile b/docker/vllm.Dockerfile index eb938667..c7602047 100644 --- a/docker/vllm.Dockerfile +++ b/docker/vllm.Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:v0.10.1 +FROM vllm/vllm-openai:v0.11.2 # # Specify model name and path during build # ARG MODEL_NAME=llama_1b_cpu @@ -9,6 +9,7 @@ FROM vllm/vllm-openai:v0.10.1 # ENV MODEL_PATH=${MODEL_PATH} # ENV EXEC_PATH=nilai_models.models.${MODEL_NAME}:app +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True COPY --link . /daemon/ COPY --link vllm_templates /opt/vllm/templates @@ -27,4 +28,4 @@ EXPOSE 8000 ENTRYPOINT ["bash", "run.sh"] -CMD [""] +CMD [""] \ No newline at end of file