diff --git a/.github/scripts/generate-values-overrides.sh b/.github/scripts/generate-values-overrides.sh new file mode 100755 index 000000000..5a71444c0 --- /dev/null +++ b/.github/scripts/generate-values-overrides.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Usage: generate-values-overrides.sh OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE +# +# Generates a Helm values override file for GPU Operator component images. +# This file can be used with `helm install -f values-overrides.yaml` to +# override default component image versions. + +if [[ $# -ne 4 ]]; then + echo "Usage: $0 OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE" >&2 + echo "" >&2 + echo "Example:" >&2 + echo " $0 values.yaml \\" >&2 + echo " ghcr.io/nvidia/container-toolkit:v1.18.0-ubuntu20.04 \\" >&2 + echo " ghcr.io/nvidia/k8s-device-plugin:v0.17.0-ubi8 \\" >&2 + echo " ghcr.io/nvidia/k8s-mig-manager:v0.10.0-ubuntu20.04" >&2 + exit 1 +fi + +OUTPUT_FILE="$1" +TOOLKIT_IMAGE="$2" +DEVICE_PLUGIN_IMAGE="$3" +MIG_MANAGER_IMAGE="$4" + +# Generate values override file +cat > "${OUTPUT_FILE}" <" >&2 + exit 1 +fi + +# Verify regctl is available +if ! command -v regctl &> /dev/null; then + echo "Error: regctl not found. Please install regctl first." 
>&2
+  exit 1
+fi
+
+# Map component names to GHCR image repositories and GitHub source repositories.
+# IMAGE_REPO is the registry path that is tagged further down; GITHUB_REPO is
+# the repository whose main branch supplies the commit SHA used as that tag.
+case "${COMPONENT}" in
+  toolkit)
+    IMAGE_REPO="ghcr.io/nvidia/container-toolkit"
+    GITHUB_REPO="NVIDIA/container-toolkit"
+    ;;
+  device-plugin)
+    IMAGE_REPO="ghcr.io/nvidia/k8s-device-plugin"
+    GITHUB_REPO="NVIDIA/k8s-device-plugin"
+    ;;
+  mig-manager)
+    IMAGE_REPO="ghcr.io/nvidia/k8s-mig-manager"
+    GITHUB_REPO="NVIDIA/k8s-mig-manager"
+    ;;
+  *)
+    echo "Error: Unknown component '${COMPONENT}'" >&2
+    echo "Valid components: toolkit, device-plugin, mig-manager" >&2
+    exit 1
+    ;;
+esac
+
+# Progress messages go to stderr; stdout is reserved for the final image name.
+echo "Fetching latest commit from ${GITHUB_REPO}..." >&2
+
+# Get the latest commit SHA from the main branch using GitHub API
+GITHUB_API_URL="https://api.github.com/repos/${GITHUB_REPO}/commits/main"
+
+# Build the request headers once instead of duplicating the whole curl call.
+# GITHUB_TOKEN is optional; when set it is sent as a bearer token
+# (authenticated requests get higher rate limits).
+CURL_HEADERS=(-H "Accept: application/vnd.github.v3+json")
+if [[ -n "${GITHUB_TOKEN:-}" ]]; then
+  CURL_HEADERS+=(-H "Authorization: Bearer ${GITHUB_TOKEN}")
+fi
+
+# --fail makes curl return non-zero on an HTTP error status instead of handing
+# the error document to jq. Running the pipeline as an `if` condition keeps a
+# failed request on the explicit error path below even when errexit is active.
+# jq keeps the first 8 characters of the SHA, which is used as the image tag.
+if ! LATEST_COMMIT=$(curl -fsSL "${CURL_HEADERS[@]}" "${GITHUB_API_URL}" \
+  | jq -r '.sha[0:8]'); then
+  LATEST_COMMIT=""
+fi
+
+# Accept only an 8-character hex prefix. This also rejects an empty result and
+# the literal "null" that jq prints when the response has no .sha field.
+if [[ ! "${LATEST_COMMIT}" =~ ^[0-9a-f]{8}$ ]]; then
+  echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
+  exit 1
+fi
+
+echo "Latest commit SHA: ${LATEST_COMMIT}" >&2
+
+# Construct full image path with commit tag
+FULL_IMAGE="${IMAGE_REPO}:${LATEST_COMMIT}"
+
+echo "Verifying image exists: ${FULL_IMAGE}" >&2
+
+# Verify the image exists using regctl
+if ! 
regctl manifest head "${FULL_IMAGE}" &> /dev/null; then + echo "Error: Image ${FULL_IMAGE} does not exist or is not accessible" >&2 + echo "The image may not have been built yet for commit ${LATEST_COMMIT}" >&2 + exit 1 +fi + +echo "Verified ${COMPONENT} image: ${FULL_IMAGE}" >&2 +echo "${FULL_IMAGE}" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6e8848f31..13850188d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,4 @@ -# Copyright 2024 NVIDIA CORPORATION +# Copyright NVIDIA CORPORATION # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,429 +20,48 @@ on: - "pull-request/[0-9]+" - main - release-* + workflow_dispatch: concurrency: group: ${{ github.workflow }}-pr-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - ### Configuration checks ### - helm-lint: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Install Helm - uses: azure/setup-helm@v4.3.1 - id: install - - run: helm lint deployments/gpu-operator/ - validate-csv: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - run: make validate-csv - validate-helm-values: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - run: make validate-helm-values - - ### 
Golang checks and build ### - go-check: - needs: [helm-lint, validate-csv, validate-helm-values] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Checkout code - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - GOLANGCI_LINT_VERSION=$( grep "GOLANGCI_LINT_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - echo "GOLANGCI_LINT_VERSION=${GOLANGCI_LINT_VERSION##GOLANGCI_LINT_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Lint - uses: golangci/golangci-lint-action@v9 - with: - version: ${{ env.GOLANGCI_LINT_VERSION }} - args: -v --timeout 5m - skip-cache: true - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make check - go-test: - needs: [helm-lint, validate-csv, validate-helm-values] - name: unit tests - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make coverage - go-build: - needs: [helm-lint, validate-csv, validate-helm-values] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Checkout code - - run: make docker-build - coverage: - needs: [go-test] 
- runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Set up Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Generate coverage report - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make cov-report - - name: Upload to Coveralls - uses: coverallsapp/github-action@v2 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: lcov.info - - ### Image builds ### - build-gpu-operator-arm64: - needs: [go-check, go-test, go-build] - runs-on: linux-arm64-cpu4 - permissions: - contents: read - id-token: write - packages: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" - echo "${REPO_FULL_NAME}" - echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV - - GENERATE_ARTIFACTS="false" - if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then - GENERATE_ARTIFACTS="false" - elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then - GENERATE_ARTIFACTS="true" - elif [[ "${{ github.event_name }}" == "push" ]]; then - GENERATE_ARTIFACTS="true" - fi - echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - echo "DOCKER_BUILD_PLATFORM_OPTIONS=--platform=linux/arm64" >> $GITHUB_ENV - - name: Login to GitHub 
Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Build image - env: - IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator - VERSION: ${COMMIT_SHORT_SHA}-arm64 - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - echo "${VERSION}" - make build-image - build-gpu-operator-amd64: - needs: [go-check, go-test, go-build] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - packages: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" - echo "${REPO_FULL_NAME}" - echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV - - GENERATE_ARTIFACTS="false" - if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then - GENERATE_ARTIFACTS="false" - elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then - GENERATE_ARTIFACTS="true" - elif [[ "${{ github.event_name }}" == "push" ]]; then - GENERATE_ARTIFACTS="true" - fi - echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - echo "DOCKER_BUILD_PLATFORM_OPTIONS=--platform=linux/amd64" >> $GITHUB_ENV - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Build image - env: - IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator - 
VERSION: ${COMMIT_SHORT_SHA}-amd64 - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - echo "${VERSION}" - make build-image - - build-multi-arch-images: - needs: [build-gpu-operator-arm64, build-gpu-operator-amd64] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build Manifest - env: - OPERATOR_IMAGE_ARM: ghcr.io/${{ env.LOWERCASE_REPO_OWNER }}/gpu-operator:${{ env.COMMIT_SHORT_SHA }}-arm64 - OPERATOR_IMAGE_AMD: ghcr.io/${{ env.LOWERCASE_REPO_OWNER}}/gpu-operator:${{ env.COMMIT_SHORT_SHA }}-amd64 - OPERATOR_MULTIARCH_IMAGE: ghcr.io/${{ env.LOWERCASE_REPO_OWNER }}/gpu-operator:${{ env.COMMIT_SHORT_SHA }} - run: | - docker manifest create \ - ${OPERATOR_MULTIARCH_IMAGE} \ - ${OPERATOR_IMAGE_AMD} \ - ${OPERATOR_IMAGE_ARM} - docker manifest push ${OPERATOR_MULTIARCH_IMAGE} - - ### e2e tests ### - e2e-tests-containerd: - needs: [build-multi-arch-images] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.2.17 - with: - aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} - holodeck_config: "tests/holodeck.yaml" - - name: Get public dns name - id: get_public_dns_name - uses: mikefarah/yq@master - with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - - name: Calculate test vars - id: vars - run: | - 
COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV - echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV - - name: Run e2e tests - env: - GPU_PRODUCT_NAME: "Tesla-T4" - SKIP_LAUNCH: "true" - CONTAINER_RUNTIME: "containerd" - TEST_CASE: "./tests/cases/defaults.sh" - run: | - echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? - ./tests/scripts/pull.sh /tmp/logs logs - exit $rc - - name: Archive test logs - if: ${{ failure() }} - uses: actions/upload-artifact@v5 - with: - name: containerd-e2e-test-logs - path: ./logs/ - retention-days: 15 - - e2e-tests-nvidiadriver: - needs: [build-multi-arch-images] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.2.17 - with: - aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} - holodeck_config: "tests/holodeck.yaml" - - name: Get public dns name - id: get_public_dns_name - uses: mikefarah/yq@master - with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - - name: Calculate test vars - id: vars - run: | - COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV 
- LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV - echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV - - name: Run e2e tests - env: - GPU_PRODUCT_NAME: "Tesla-T4" - SKIP_LAUNCH: "true" - CONTAINER_RUNTIME: "containerd" - TEST_CASE: "./tests/cases/nvidia-driver.sh" - run: | - echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? - ./tests/scripts/pull.sh /tmp/logs logs - exit $rc - - name: Archive test logs - if: ${{ failure() }} - uses: actions/upload-artifact@v5 - with: - name: nvidiadriver-e2e-test-logs - path: ./logs/ - retention-days: 15 - - release-latest-gpu-operator-image: - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} # Runs only if the event is a push to the main branch - needs: [e2e-tests-containerd, e2e-tests-nvidiadriver] - runs-on: linux-amd64-cpu4 - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: set-up regctl - run: | - export REGCTL_VERSION=v0.9.2 - mkdir -p bin - curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 - chmod a+x bin/regctl - echo "$(pwd)/bin" >> $GITHUB_PATH - - name: Set environment variables - id: vars - run: | - COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - name: 
Retag gpu-operator
-        run: |
-          regctl registry login ghcr.io -u $GITHUB_ACTOR -p ${{ secrets.GITHUB_TOKEN }}
-          regctl image copy ${OPERATOR_IMAGE}:${OPERATOR_VERSION} ${OPERATOR_IMAGE}:main-latest
-
-  push-gpu-operator-bundle-image:
-    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} # Runs only if the event is a push to the main branch
-    needs: [release-latest-gpu-operator-image]
-    runs-on: linux-amd64-cpu4
-    steps:
-      - uses: actions/checkout@v5
-        name: Check out code
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Build bundle-image
-        env:
-          BUNDLE_IMAGE: "ghcr.io/nvidia/gpu-operator/gpu-operator-bundle:${{ github.ref_name }}-latest"
-          VERSION: ""
-          DEFAULT_CHANNEL: "stable"
-          CHANNELS: "stable"
-        run: |
-          make push-bundle-image
+  variables:  # its outputs are consumed by image-builds, e2e-tests and release below
+    uses: ./.github/workflows/variables.yaml
+
+  code-scanning:  # no `needs`: starts immediately, independent of the other jobs
+    uses: ./.github/workflows/code-scanning.yaml
+
+  config-checks:  # gate for image-builds
+    uses: ./.github/workflows/config-checks.yaml
+
+  golang-checks:  # gate for image-builds
+    uses: ./.github/workflows/golang-checks.yaml
+
+  image-builds:
+    needs: [variables, config-checks, golang-checks]  # build only after both check workflows succeed
+    uses: ./.github/workflows/image-builds.yaml
+    with:
+      commit_short_sha: ${{ needs.variables.outputs.commit_short_sha }}
+      label_image_source: ${{ needs.variables.outputs.label_image_source }}
+      push_on_build: ${{ needs.variables.outputs.push_on_build }}
+      operator_image_base: ${{ needs.variables.outputs.operator_image_base }}
+
+  e2e-tests:
+    needs: [variables, image-builds]  # tests run against the images produced by image-builds
+    uses: ./.github/workflows/e2e-tests.yaml
+    with:
+      operator_image: ${{ needs.variables.outputs.operator_image }}
+      operator_version: ${{ needs.variables.outputs.operator_version }}
+    secrets: inherit  # e2e-tests.yaml declares the AWS_* secrets it requires
+
+  release:
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'  # only for pushes to the main branch
+    needs: [variables, e2e-tests]  # release only what passed the e2e tests
+    uses: ./.github/workflows/release.yaml
+    with:
+      commit_short_sha: ${{ needs.variables.outputs.commit_short_sha }}
+      operator_version: ${{ needs.variables.outputs.operator_version }}
+      operator_image_base: ${{ needs.variables.outputs.operator_image_base }}
+    secrets: inherit
diff --git a/.github/workflows/code-scanning.yaml b/.github/workflows/code-scanning.yaml
new file mode 100644
index 000000000..bf021380e
--- /dev/null
+++ b/.github/workflows/code-scanning.yaml
@@ -0,0 +1,52 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "CodeQL"
+
+on:
+  workflow_call: {}  # lets ci.yaml run this analysis as a reusable workflow
+  pull_request:  # also runs on its own for pull requests targeting the branches below
+    types:
+      - opened
+      - synchronize
+    branches:
+      - main
+      - release-*
+
+jobs:
+  analyze:
+    name: Analyze Go code with CodeQL
+    runs-on: ubuntu-latest
+    timeout-minutes: 360
+    permissions:  # job-level list: only the scopes named here are requested
+      security-events: write  # presumably needed to upload the analysis results; confirm against the CodeQL action docs
+      packages: read
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v4
+        with:
+          languages: go
+          build-mode: manual  # no autobuild: the explicit `make build` step below is the build
+
+      - shell: bash  # manual build step that goes with build-mode: manual above
+        run: |
+          make build
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v4
+        with:
+          category: "/language:go"
diff --git a/.github/workflows/config-checks.yaml b/.github/workflows/config-checks.yaml
new file mode 100644
index 000000000..b41aa53e8
--- /dev/null
+++ b/.github/workflows/config-checks.yaml
@@ -0,0 +1,68 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Configuration Checks
+
+on:
+  push:  # NOTE(review): ci.yaml also calls this workflow; confirm the standalone push trigger is intended (it may run the checks twice per push)
+    branches:
+      - "pull-request/[0-9]+"
+      - main
+      - release-*
+  workflow_call:  # invoked from ci.yaml as the config-checks job
+  workflow_dispatch:
+
+jobs:
+  helm-lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: Install Helm
+        uses: azure/setup-helm@v4.3.1
+        id: install
+      - run: helm lint deployments/gpu-operator/
+
+  validate-csv:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: Get Golang version
+        id: vars
+        run: |
+          GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk )  # take the "GOLANG_VERSION ?= ..." line from versions.mk
+          echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV  # strip the "GOLANG_VERSION ?= " prefix and export the rest to later steps
+      - name: Install Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GOLANG_VERSION }}  # value exported by the previous step
+      - run: make validate-csv
+
+  validate-helm-values:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: Get Golang version
+        id: vars
+        run: |
+          GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk )  # take the "GOLANG_VERSION ?= ..." line from versions.mk
+          echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV  # strip the "GOLANG_VERSION ?= " prefix and export the rest to later steps
+      - name: Install Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GOLANG_VERSION }}  # value exported by the previous step
+      - run: make validate-helm-values
+
diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml
new file mode 100644
index 000000000..55875d2e0
--- /dev/null
+++ b/.github/workflows/e2e-tests.yaml
@@ -0,0 +1,173 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: E2E Tests
+
+on:
+  push:  # NOTE(review): ci.yaml also calls this workflow after image-builds; confirm a standalone push run (no build dependency) is intended
+    branches:
+      - "pull-request/[0-9]+"
+      - main
+      - release-*
+  workflow_call:  # used by ci.yaml and forward-compatibility.yaml
+    inputs:
+      operator_image:
+        required: true
+        type: string
+      operator_version:
+        required: true
+        type: string
+      use_values_override:
+        required: false
+        type: boolean
+        default: false
+        description: 'Use values-overrides artifact for component image configuration'
+    secrets:
+      AWS_ACCESS_KEY_ID:
+        required: true
+      AWS_SECRET_ACCESS_KEY:
+        required: true
+      AWS_SSH_KEY:
+        required: true
+      SLACK_BOT_TOKEN:
+        required: false
+      SLACK_CHANNEL_ID:
+        required: false
+  workflow_dispatch:
+    inputs:
+      operator_image:
+        description: 'Operator image to test (override)'
+        required: false
+        type: string
+      operator_version:
+        description: 'Operator version to test (override)'
+        required: false
+        type: string
+
+jobs:
+  variables:  # its operator_image / operator_version outputs are what both test jobs run against
+    uses: ./.github/workflows/variables.yaml
+    with:
+      operator_image: ${{ inputs.operator_image }}
+      operator_version: ${{ inputs.operator_version }}
+
+  e2e-tests-containerd:
+    needs: [variables]
+    runs-on: linux-amd64-cpu4
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Download values override file
+        if: ${{ inputs.use_values_override }}  # only when the caller asked for overrides
+        uses: actions/download-artifact@v5
+        with:
+          name: values-overrides  # same artifact name that forward-compatibility.yaml uploads
+          path: ${{ github.workspace }}
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@v0.2.17
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+      - name: Set test environment
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          if [[ "${{ inputs.use_values_override }}" == "true" ]]; then
+            echo "VALUES_FILE=${{ github.workspace }}/values-overrides.yaml" >> $GITHUB_ENV  # file fetched by the download step above
+          fi
+      - name: Run e2e tests
+        env:
+          OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
+          OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
+          GPU_PRODUCT_NAME: "Tesla-T4"
+          SKIP_LAUNCH: "true"
+          CONTAINER_RUNTIME: "containerd"
+          TEST_CASE: "./tests/cases/defaults.sh"
+        run: |
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
+          rc=0; ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$?  # keep the test status so logs are still pulled on failure
+          ./tests/scripts/pull.sh /tmp/logs logs
+          exit "${rc}"  # rc is always set: 0 on success, the test exit code otherwise
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v5
+        with:
+          name: containerd-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
+
+  e2e-tests-nvidiadriver:
+    needs: [variables]
+    runs-on: linux-amd64-cpu4
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Download values override file
+        if: ${{ inputs.use_values_override }}  # only when the caller asked for overrides
+        uses: actions/download-artifact@v5
+        with:
+          name: values-overrides  # same artifact name that forward-compatibility.yaml uploads
+          path: ${{ github.workspace }}
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@v0.2.17
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+      - name: Set test environment
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          if [[ "${{ inputs.use_values_override }}" == "true" ]]; then
+            echo "VALUES_FILE=${{ github.workspace }}/values-overrides.yaml" >> $GITHUB_ENV  # file fetched by the download step above
+          fi
+      - name: Run e2e tests
+        env:
+          OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
+          OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
+          GPU_PRODUCT_NAME: "Tesla-T4"
+          SKIP_LAUNCH: "true"
+          CONTAINER_RUNTIME: "containerd"
+          TEST_CASE: "./tests/cases/nvidia-driver.sh"
+        run: |
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
+          rc=0; ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$?  # keep the test status so logs are still pulled on failure
+          ./tests/scripts/pull.sh /tmp/logs logs
+          exit "${rc}"  # rc is always set: 0 on success, the test exit code otherwise
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v5
+        with:
+          name: nvidiadriver-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
diff --git a/.github/workflows/forward-compatibility.yaml b/.github/workflows/forward-compatibility.yaml
new file mode 100644
index 000000000..d6ecea2f0
--- /dev/null
+++ b/.github/workflows/forward-compatibility.yaml
@@ -0,0 +1,100 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Forward Compatibility
+
+on:
+  schedule:
+    - cron: '0 2 * * 1'  # Weekly on Monday at 2 AM UTC
+  workflow_dispatch:  # Manual trigger
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  fetch-latest-images:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Install regctl
+        run: |
+          REGCTL_VERSION=v0.9.2
+          mkdir -p bin
+          curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"  # -f: fail here on an HTTP error instead of saving the error page as the binary
+          chmod +x bin/regctl
+          echo "$(pwd)/bin" >> $GITHUB_PATH
+
+      - name: Get latest component images and generate values override file
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # read by get-latest-images.sh to authenticate its GitHub API request
+        run: |
+          # Fetch latest images from component repositories
+          echo "::notice::Fetching latest container-toolkit image..."
+          TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit)
+
+          echo "::notice::Fetching latest device-plugin image..."
+          DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin)
+
+          echo "::notice::Fetching latest mig-manager image..."
+          MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager)
+
+          # Generate values override file
+          .github/scripts/generate-values-overrides.sh \
+            values-overrides.yaml \
+            "${TOOLKIT}" \
+            "${DEVICE_PLUGIN}" \
+            "${MIG_MANAGER}"
+
+      - name: Upload values override file
+        uses: actions/upload-artifact@v5
+        with:
+          name: values-overrides  # e2e-tests.yaml downloads the artifact under this name
+          path: values-overrides.yaml
+          retention-days: 30
+
+  run-e2e-tests:
+    needs: [fetch-latest-images]  # the override artifact must exist before the tests start
+    uses: ./.github/workflows/e2e-tests.yaml
+    with:
+      operator_image: ghcr.io/nvidia/gpu-operator
+      operator_version: main-latest
+      use_values_override: true
+    secrets: inherit
+
+  notify-failure:
+    runs-on: ubuntu-latest
+    needs: [fetch-latest-images, run-e2e-tests]
+    if: ${{ failure() }}  # runs only when one of the jobs listed in `needs` failed
+    steps:
+      - name: Send Slack alert notification
+        uses: slackapi/slack-github-action@v2.1.1
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ secrets.SLACK_CHANNEL_ID }}
+            text: |
+              :x: *Forward Compatibility Test Failed for GPU Operator*
+
+              *Workflow:* ${{ github.workflow }}
+              *Repository:* ${{ github.repository }}
+              *Trigger:* ${{ github.event_name }}
+
+              *Tested Components:*
+              Download `values-overrides` artifact to see tested component versions
+
+              *Details:*
+              <@S095E7BNGJU>
diff --git a/.github/workflows/golang-checks.yaml b/.github/workflows/golang-checks.yaml
new file mode 100644
index 000000000..d36a88a5e
--- /dev/null
+++ b/.github/workflows/golang-checks.yaml
@@ -0,0 +1,126 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: Golang Checks + +on: + push: + branches: + - "pull-request/[0-9]+" + - main + - release-* + workflow_call: + workflow_dispatch: + +jobs: + go-check: + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Checkout code + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + GOLANGCI_LINT_VERSION=$( grep "GOLANGCI_LINT_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + echo "GOLANGCI_LINT_VERSION=${GOLANGCI_LINT_VERSION##GOLANGCI_LINT_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - name: Lint + uses: golangci/golangci-lint-action@v8 + with: + version: ${{ env.GOLANGCI_LINT_VERSION }} + args: -v --timeout 5m + skip-cache: true + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - env: + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + run: | + make check + + go-test: + name: unit tests + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - env: + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + run: | + make coverage + + go-build: + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Checkout code + 
- run: make docker-build + + coverage: + needs: [go-test] + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - name: Generate coverage report + env: + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + run: | + make cov-report + - name: Upload to Coveralls + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + path-to-lcov: lcov.info diff --git a/.github/workflows/image-builds.yaml b/.github/workflows/image-builds.yaml new file mode 100644 index 000000000..5882762f2 --- /dev/null +++ b/.github/workflows/image-builds.yaml @@ -0,0 +1,191 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Image Builds + +on: + push: + branches: + - "pull-request/[0-9]+" + - main + - release-* + workflow_call: + inputs: + commit_short_sha: + required: true + type: string + label_image_source: + required: true + type: string + push_on_build: + required: true + type: string + operator_image_base: + required: true + type: string + workflow_dispatch: + +jobs: + variables: + runs-on: ubuntu-latest + outputs: + commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }} + label_image_source: ${{ steps.vars.outputs.label_image_source }} + push_on_build: ${{ steps.vars.outputs.push_on_build }} + operator_image_base: ${{ steps.vars.outputs.operator_image_base }} + operator_image_arm64: ${{ steps.vars.outputs.operator_image_arm64 }} + operator_image_amd64: ${{ steps.vars.outputs.operator_image_amd64 }} + operator_image_multiarch: ${{ steps.vars.outputs.operator_image_multiarch }} + steps: + - name: Checkout code + if: ${{ github.event_name != 'workflow_call' }} + uses: actions/checkout@v5 + - name: Calculate build variables + id: vars + run: | + # Use inputs from workflow_call if available, otherwise calculate + if [[ "${{ github.event_name }}" == "workflow_call" ]]; then + COMMIT_SHORT_SHA="${{ inputs.commit_short_sha }}" + LABEL_IMAGE_SOURCE="${{ inputs.label_image_source }}" + PUSH_ON_BUILD="${{ inputs.push_on_build }}" + OPERATOR_IMAGE_BASE="${{ inputs.operator_image_base }}" + else + # Calculate for standalone runs + COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}" + + REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" + if [[ -z "${REPO_FULL_NAME}" ]]; then + REPO_FULL_NAME="${{ github.repository }}" + fi + LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}" + + PUSH_ON_BUILD="false" + if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" 
== "push" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + PUSH_ON_BUILD="true" + fi + fi + + OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator" + fi + + # Calculate derived image names + OPERATOR_IMAGE_ARM64="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}-arm64" + OPERATOR_IMAGE_AMD64="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}-amd64" + OPERATOR_IMAGE_MULTIARCH="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}" + + # Output all variables + echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT + echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT + echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT + echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT + echo "operator_image_arm64=${OPERATOR_IMAGE_ARM64}" >> $GITHUB_OUTPUT + echo "operator_image_amd64=${OPERATOR_IMAGE_AMD64}" >> $GITHUB_OUTPUT + echo "operator_image_multiarch=${OPERATOR_IMAGE_MULTIARCH}" >> $GITHUB_OUTPUT + + # Display for debugging + echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}" + echo "::notice::Push on build: ${PUSH_ON_BUILD}" + echo "::notice::Multi-arch image: ${OPERATOR_IMAGE_MULTIARCH}" + + build-gpu-operator-arm64: + needs: [variables] + runs-on: linux-arm64-cpu4 + permissions: + contents: read + id-token: write + packages: write + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - name: Build image + env: + IMAGE_NAME: ${{ needs.variables.outputs.operator_image_base }} + VERSION: ${{ needs.variables.outputs.commit_short_sha }}-arm64 + PUSH_ON_BUILD: ${{ needs.variables.outputs.push_on_build }} + DOCKER_BUILD_PLATFORM_OPTIONS: --platform=linux/arm64 + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + 
LABEL_IMAGE_SOURCE: ${{ needs.variables.outputs.label_image_source }} + run: | + echo "${VERSION}" + make build-image + + build-gpu-operator-amd64: + needs: [variables] + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + packages: write + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - name: Build image + env: + IMAGE_NAME: ${{ needs.variables.outputs.operator_image_base }} + VERSION: ${{ needs.variables.outputs.commit_short_sha }}-amd64 + PUSH_ON_BUILD: ${{ needs.variables.outputs.push_on_build }} + DOCKER_BUILD_PLATFORM_OPTIONS: --platform=linux/amd64 + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + LABEL_IMAGE_SOURCE: ${{ needs.variables.outputs.label_image_source }} + run: | + echo "${VERSION}" + make build-image + + build-multi-arch-images: + needs: [variables, build-gpu-operator-arm64, build-gpu-operator-amd64] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build Manifest + env: + OPERATOR_IMAGE_ARM: ${{ needs.variables.outputs.operator_image_arm64 }} + OPERATOR_IMAGE_AMD: ${{ needs.variables.outputs.operator_image_amd64 }} + OPERATOR_MULTIARCH_IMAGE: ${{ needs.variables.outputs.operator_image_multiarch }} + run: | + docker manifest create \ + ${OPERATOR_MULTIARCH_IMAGE} \ + ${OPERATOR_IMAGE_AMD} \ + ${OPERATOR_IMAGE_ARM} + docker manifest push ${OPERATOR_MULTIARCH_IMAGE} + diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 000000000..b46affb97 --- /dev/null 
+++ b/.github/workflows/release.yaml @@ -0,0 +1,115 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Release + +on: + push: + branches: + - main + workflow_call: + inputs: + commit_short_sha: + required: true + type: string + operator_version: + required: true + type: string + operator_image_base: + required: true + type: string + +jobs: + variables: + runs-on: ubuntu-latest + outputs: + commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }} + operator_image_base: ${{ steps.vars.outputs.operator_image_base }} + operator_image_source: ${{ steps.vars.outputs.operator_image_source }} + operator_image_latest: ${{ steps.vars.outputs.operator_image_latest }} + bundle_image: ${{ steps.vars.outputs.bundle_image }} + steps: + - name: Checkout code + if: ${{ github.event_name != 'workflow_call' }} + uses: actions/checkout@v5 + - name: Calculate release variables + id: vars + run: | + # Use inputs from workflow_call if available + if [[ "${{ github.event_name }}" == "workflow_call" ]]; then + COMMIT_SHORT_SHA="${{ inputs.commit_short_sha }}" + OPERATOR_IMAGE_BASE="${{ inputs.operator_image_base }}" + else + # Calculate for standalone runs + COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}" + OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator" + fi + + # Calculate derived values + OPERATOR_IMAGE_SOURCE="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}" + OPERATOR_IMAGE_LATEST="${OPERATOR_IMAGE_BASE}:main-latest" + 
BUNDLE_IMAGE="ghcr.io/nvidia/gpu-operator/gpu-operator-bundle:main-latest" + + # Output all variables + echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT + echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT + echo "operator_image_source=${OPERATOR_IMAGE_SOURCE}" >> $GITHUB_OUTPUT + echo "operator_image_latest=${OPERATOR_IMAGE_LATEST}" >> $GITHUB_OUTPUT + echo "bundle_image=${BUNDLE_IMAGE}" >> $GITHUB_OUTPUT + + # Display for debugging + echo "::notice::Releasing: ${OPERATOR_IMAGE_SOURCE} → ${OPERATOR_IMAGE_LATEST}" + + release-latest-gpu-operator-image: + needs: [variables] + runs-on: linux-amd64-cpu4 + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Set-up regctl + run: | + export REGCTL_VERSION=v0.9.2 + mkdir -p bin + curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 + chmod a+x bin/regctl + echo "$(pwd)/bin" >> $GITHUB_PATH + - name: Retag gpu-operator + env: + OPERATOR_IMAGE_SOURCE: ${{ needs.variables.outputs.operator_image_source }} + OPERATOR_IMAGE_LATEST: ${{ needs.variables.outputs.operator_image_latest }} + run: | + regctl registry login ghcr.io -u $GITHUB_ACTOR -p ${{ secrets.GITHUB_TOKEN }} + regctl image copy ${OPERATOR_IMAGE_SOURCE} ${OPERATOR_IMAGE_LATEST} + + push-gpu-operator-bundle-image: + needs: [variables, release-latest-gpu-operator-image] + runs-on: linux-amd64-cpu4 + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build bundle-image + env: + BUNDLE_IMAGE: ${{ needs.variables.outputs.bundle_image }} + VERSION: "" + DEFAULT_CHANNEL: "stable" + CHANNELS: "stable" + run: | + make push-bundle-image + diff --git a/.github/workflows/variables.yaml b/.github/workflows/variables.yaml new file mode 100644 index 000000000..f04d1237a --- 
/dev/null +++ b/.github/workflows/variables.yaml @@ -0,0 +1,116 @@ +# Copyright 2025 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +on: + workflow_call: + inputs: + operator_image: + description: 'Operator image to use (optional override)' + required: false + type: string + operator_version: + description: 'Operator version to use (optional override)' + required: false + type: string + outputs: + commit_short_sha: + description: "The short SHA to use as a version string" + value: ${{ jobs.variables.outputs.commit_short_sha }} + repo_full_name: + description: "The full repository name" + value: ${{ jobs.variables.outputs.repo_full_name }} + label_image_source: + description: "The image source label URL" + value: ${{ jobs.variables.outputs.label_image_source }} + push_on_build: + description: "Whether to push images on build" + value: ${{ jobs.variables.outputs.push_on_build }} + operator_image_base: + description: "The base operator image name" + value: ${{ jobs.variables.outputs.operator_image_base }} + operator_version: + description: "The operator version" + value: ${{ jobs.variables.outputs.operator_version }} + operator_image: + description: "The operator image (with override support)" + value: ${{ jobs.variables.outputs.operator_image }} + +jobs: + variables: + runs-on: ubuntu-latest + outputs: + commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }} + repo_full_name: ${{ steps.vars.outputs.repo_full_name }} + label_image_source: ${{ 
steps.vars.outputs.label_image_source }} + push_on_build: ${{ steps.vars.outputs.push_on_build }} + operator_image_base: ${{ steps.vars.outputs.operator_image_base }} + operator_version: ${{ steps.vars.outputs.operator_version }} + operator_image: ${{ steps.vars.outputs.operator_image }} + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Calculate all variables + id: vars + run: | + # Basic computed values + COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}" + + # Repository information + REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" + if [[ -z "${REPO_FULL_NAME}" ]]; then + REPO_FULL_NAME="${{ github.repository }}" + fi + LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}" + + # Determine if we should push images + PUSH_ON_BUILD="false" + if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" == "push" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + PUSH_ON_BUILD="true" + fi + fi + + # Image and version information (with override support) + OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator" + if [[ -n "${{ inputs.operator_version }}" ]]; then + OPERATOR_VERSION="${{ inputs.operator_version }}" + else + OPERATOR_VERSION="${COMMIT_SHORT_SHA}" + fi + + if [[ -n "${{ inputs.operator_image }}" ]]; then + OPERATOR_IMAGE="${{ inputs.operator_image }}" + else + OPERATOR_IMAGE="${OPERATOR_IMAGE_BASE}" + fi + + # Output all variables + echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT + echo "repo_full_name=${REPO_FULL_NAME}" >> $GITHUB_OUTPUT + echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT + echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT + echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT + echo 
"operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT + echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT + + # Display for debugging + echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}" + echo "::notice::Push on build: ${PUSH_ON_BUILD}" + echo "::notice::Operator image: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" + diff --git a/tests/scripts/env-to-values.sh b/tests/scripts/env-to-values.sh new file mode 100755 index 000000000..970a966e9 --- /dev/null +++ b/tests/scripts/env-to-values.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash + +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Usage: env-to-values.sh OUTPUT_FILE +# +# Converts environment variables to GPU Operator Helm values YAML format. +# This script reads common test environment variables and generates a +# values file that can be used with `helm install -f values.yaml`. +# +# Supported environment variables: +# - OPERATOR_IMAGE: operator image path (repository will be extracted) +# - OPERATOR_VERSION: operator version +# - TOOLKIT_CONTAINER_IMAGE: container-toolkit image override +# - DEVICE_PLUGIN_IMAGE: device-plugin image override +# - MIG_MANAGER_IMAGE: mig-manager image override +# - CONTAINER_RUNTIME: default runtime (docker, containerd, crio) + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 OUTPUT_FILE" >&2 + echo "" >&2 + echo "Converts environment variables to GPU Operator Helm values format." 
>&2 + exit 1 +fi + +OUTPUT_FILE="$1" + +# Start with header +cat > "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" + echo -e "${OPERATOR_CONFIG}" >> "${OUTPUT_FILE}" +fi + +# Write validator configuration if any +if [[ -n "${VALIDATOR_CONFIG}" ]]; then + echo "validator:" >> "${OUTPUT_FILE}" + echo -e "${VALIDATOR_CONFIG}" >> "${OUTPUT_FILE}" +fi + +# Container Toolkit configuration +if [[ -n "${TOOLKIT_CONTAINER_IMAGE:-}" ]]; then + cat >> "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" <&2 + echo "# No values to override" >> "${OUTPUT_FILE}" +fi + +echo "" +echo "Generated values file: ${OUTPUT_FILE}" +echo "" +echo "=== File Contents ===" +cat "${OUTPUT_FILE}" diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 3fcb55ba2..6e7d173d1 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -10,20 +10,68 @@ source ${SCRIPT_DIR}/.definitions.sh OPERATOR_REPOSITORY=$(dirname ${OPERATOR_IMAGE}) -: ${OPERATOR_OPTIONS:=""} -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" +# Determine if we should use values file approach or --set flags +USE_VALUES_FILE=false +if [[ -n "${VALUES_FILE:-}" ]]; then + USE_VALUES_FILE=true +fi -if [[ -n "${OPERATOR_VERSION}" ]]; then -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" +# Build operator options conditionally +: ${OPERATOR_OPTIONS:=""} +if [[ "${USE_VALUES_FILE}" == "false" ]]; then + # Traditional approach: build --set flags + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" + + if [[ -n "${OPERATOR_VERSION}" ]]; then + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" + fi + + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set 
operator.defaultRuntime=${CONTAINER_RUNTIME}" fi -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.defaultRuntime=${CONTAINER_RUNTIME}" +if [[ "${USE_VALUES_FILE}" == "true" ]]; then + # Generate a temporary values file from environment variables + # and merge it with the provided VALUES_FILE + TEMP_ENV_VALUES=$(mktemp) + ${SCRIPT_DIR}/env-to-values.sh "${TEMP_ENV_VALUES}" + + # If VALUES_FILE exists, merge it with env-generated values + # Otherwise just use the env-generated values + if [[ -f "${VALUES_FILE}" ]]; then + echo "" + echo "Using provided values file: ${VALUES_FILE}" + cat "${VALUES_FILE}" + echo "" + echo "Merged with environment-based values:" + cat "${TEMP_ENV_VALUES}" + # Create a combined values file + COMBINED_VALUES=$(mktemp) + cat "${VALUES_FILE}" "${TEMP_ENV_VALUES}" > "${COMBINED_VALUES}" + VALUES_FILE="${COMBINED_VALUES}" + else + VALUES_FILE="${TEMP_ENV_VALUES}" + fi + + # Clear individual options since we're using values file + TOOLKIT_CONTAINER_OPTIONS="" + DEVICE_PLUGIN_OPTIONS="" + MIG_MANAGER_OPTIONS="" +else + # Traditional approach: use --set flags for backward compatibility + : ${TOOLKIT_CONTAINER_OPTIONS:=""} + if [[ -n "${TOOLKIT_CONTAINER_IMAGE:-}" ]]; then + TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\"" + fi -# We set up the options for the toolkit container -: ${TOOLKIT_CONTAINER_OPTIONS:=""} + : ${DEVICE_PLUGIN_OPTIONS:=""} + if [[ -n "${DEVICE_PLUGIN_IMAGE:-}" ]]; then + DEVICE_PLUGIN_OPTIONS="${DEVICE_PLUGIN_OPTIONS} --set devicePlugin.repository=\"\" --set devicePlugin.version=\"\" --set devicePlugin.image=\"${DEVICE_PLUGIN_IMAGE}\"" + fi -if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then -TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\"" + : ${MIG_MANAGER_OPTIONS:=""} + if [[ -n 
"${MIG_MANAGER_IMAGE:-}" ]]; then + MIG_MANAGER_OPTIONS="${MIG_MANAGER_OPTIONS} --set migManager.repository=\"\" --set migManager.version=\"\" --set migManager.image=\"${MIG_MANAGER_IMAGE}\"" + fi fi # Create the test namespace @@ -44,8 +92,26 @@ if [[ "${GPU_MODE}" == "vgpu" ]]; then fi # Run the helm install command -${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ - -n "${TEST_NAMESPACE}" \ - ${OPERATOR_OPTIONS} \ - ${TOOLKIT_CONTAINER_OPTIONS} \ +echo "" +echo "Installing GPU Operator with Helm..." +echo "Operator image: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" + +if [[ "${USE_VALUES_FILE}" == "true" ]]; then + echo "Using values file approach: ${VALUES_FILE}" + ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ + -n "${TEST_NAMESPACE}" \ + -f "${VALUES_FILE}" \ --wait + + # Cleanup temporary values files + rm -f "${TEMP_ENV_VALUES:-}" "${COMBINED_VALUES:-}" +else + echo "Using --set flags approach" + ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ + -n "${TEST_NAMESPACE}" \ + ${OPERATOR_OPTIONS} \ + ${TOOLKIT_CONTAINER_OPTIONS} \ + ${DEVICE_PLUGIN_OPTIONS} \ + ${MIG_MANAGER_OPTIONS} \ + --wait +fi