From c9d808c4356f06c20356baee4b7590d4062cac26 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 10 Nov 2025 16:29:52 +0100 Subject: [PATCH 1/6] refactor: modularize CI workflows with centralized variables Split monolithic ci.yaml (448 lines) into 6 specialized workflows. Refactored ci.yaml as orchestrator (118 lines) with centralized variable calculation. All workflows support workflow_call, workflow_dispatch, and standalone execution. Eliminates duplicate variable calculations. Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/ci.yaml | 494 +++++---------------------- .github/workflows/code-scanning.yaml | 52 +++ .github/workflows/config-checks.yaml | 68 ++++ .github/workflows/e2e-tests.yaml | 175 ++++++++++ .github/workflows/golang-checks.yaml | 126 +++++++ .github/workflows/image-builds.yaml | 191 +++++++++++ .github/workflows/release.yaml | 115 +++++++ 7 files changed, 809 insertions(+), 412 deletions(-) create mode 100644 .github/workflows/code-scanning.yaml create mode 100644 .github/workflows/config-checks.yaml create mode 100644 .github/workflows/e2e-tests.yaml create mode 100644 .github/workflows/golang-checks.yaml create mode 100644 .github/workflows/image-builds.yaml create mode 100644 .github/workflows/release.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6e8848f31..c795665b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,4 @@ -# Copyright 2024 NVIDIA CORPORATION +# Copyright NVIDIA CORPORATION # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,429 +20,99 @@ on: - "pull-request/[0-9]+" - main - release-* + workflow_dispatch: concurrency: group: ${{ github.workflow }}-pr-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - ### Configuration checks ### - helm-lint: + variables: runs-on: ubuntu-latest + outputs: + commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }} + repo_full_name: ${{ steps.vars.outputs.repo_full_name }} + label_image_source: ${{ steps.vars.outputs.label_image_source }} + push_on_build: ${{ steps.vars.outputs.push_on_build }} + operator_image_base: ${{ steps.vars.outputs.operator_image_base }} + operator_version: ${{ steps.vars.outputs.operator_version }} steps: - name: Checkout code uses: actions/checkout@v5 - - name: Install Helm - uses: azure/setup-helm@v4.3.1 - id: install - - run: helm lint deployments/gpu-operator/ - validate-csv: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - run: make validate-csv - validate-helm-values: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - run: make validate-helm-values - - ### Golang checks and build ### - go-check: - needs: [helm-lint, validate-csv, validate-helm-values] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Checkout code - - name: Get Golang version + - name: Calculate all 
variables id: vars run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - GOLANGCI_LINT_VERSION=$( grep "GOLANGCI_LINT_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - echo "GOLANGCI_LINT_VERSION=${GOLANGCI_LINT_VERSION##GOLANGCI_LINT_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Lint - uses: golangci/golangci-lint-action@v9 - with: - version: ${{ env.GOLANGCI_LINT_VERSION }} - args: -v --timeout 5m - skip-cache: true - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make check - go-test: - needs: [helm-lint, validate-csv, validate-helm-values] - name: unit tests - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Install Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make coverage - go-build: - needs: [helm-lint, validate-csv, validate-helm-values] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Checkout code - - run: make docker-build - coverage: - needs: [go-test] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - name: Checkout code - uses: actions/checkout@v5 - - name: Get Golang version - id: vars - run: | - GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) - echo 
"GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV - - name: Set up Go - uses: actions/setup-go@v6 - with: - go-version: ${{ env.GOLANG_VERSION }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Generate coverage report - env: - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - make cov-report - - name: Upload to Coveralls - uses: coverallsapp/github-action@v2 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: lcov.info - - ### Image builds ### - build-gpu-operator-arm64: - needs: [go-check, go-test, go-build] - runs-on: linux-arm64-cpu4 - permissions: - contents: read - id-token: write - packages: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV + # Basic computed values + COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}" + + # Repository information REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" - echo "${REPO_FULL_NAME}" - echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV - - GENERATE_ARTIFACTS="false" - if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then - GENERATE_ARTIFACTS="false" - elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then - GENERATE_ARTIFACTS="true" - elif [[ "${{ github.event_name }}" == "push" ]]; then - GENERATE_ARTIFACTS="true" + if [[ -z "${REPO_FULL_NAME}" ]]; then + REPO_FULL_NAME="${{ github.repository }}" fi - echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - echo "DOCKER_BUILD_PLATFORM_OPTIONS=--platform=linux/arm64" >> $GITHUB_ENV - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - 
username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Build image - env: - IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator - VERSION: ${COMMIT_SHORT_SHA}-arm64 - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - echo "${VERSION}" - make build-image - build-gpu-operator-amd64: - needs: [go-check, go-test, go-build] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - packages: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" - echo "${REPO_FULL_NAME}" - echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV - - GENERATE_ARTIFACTS="false" - if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then - GENERATE_ARTIFACTS="false" - elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then - GENERATE_ARTIFACTS="true" - elif [[ "${{ github.event_name }}" == "push" ]]; then - GENERATE_ARTIFACTS="true" + LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}" + + # Determine if we should push images + PUSH_ON_BUILD="false" + if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then + if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" == "push" ]]; then + PUSH_ON_BUILD="true" + elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + PUSH_ON_BUILD="true" + fi fi - echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> 
$GITHUB_ENV - echo "DOCKER_BUILD_PLATFORM_OPTIONS=--platform=linux/amd64" >> $GITHUB_ENV - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Setup Go Proxy - id: setup-go-proxy - uses: nv-gha-runners/setup-artifactory-go-proxy@main - - name: Build image - env: - IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator - VERSION: ${COMMIT_SHORT_SHA}-amd64 - GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} - run: | - echo "${VERSION}" - make build-image - - build-multi-arch-images: - needs: [build-gpu-operator-arm64, build-gpu-operator-amd64] - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build Manifest - env: - OPERATOR_IMAGE_ARM: ghcr.io/${{ env.LOWERCASE_REPO_OWNER }}/gpu-operator:${{ env.COMMIT_SHORT_SHA }}-arm64 - OPERATOR_IMAGE_AMD: ghcr.io/${{ env.LOWERCASE_REPO_OWNER}}/gpu-operator:${{ env.COMMIT_SHORT_SHA }}-amd64 - OPERATOR_MULTIARCH_IMAGE: ghcr.io/${{ env.LOWERCASE_REPO_OWNER }}/gpu-operator:${{ env.COMMIT_SHORT_SHA }} - run: | - docker manifest create \ - ${OPERATOR_MULTIARCH_IMAGE} \ - ${OPERATOR_IMAGE_AMD} \ - ${OPERATOR_IMAGE_ARM} - docker manifest push ${OPERATOR_MULTIARCH_IMAGE} - - ### e2e tests ### - e2e-tests-containerd: - needs: [build-multi-arch-images] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.2.17 - with: - 
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} - holodeck_config: "tests/holodeck.yaml" - - name: Get public dns name - id: get_public_dns_name - uses: mikefarah/yq@master - with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - - name: Calculate test vars - id: vars - run: | - COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV - echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV - - name: Run e2e tests - env: - GPU_PRODUCT_NAME: "Tesla-T4" - SKIP_LAUNCH: "true" - CONTAINER_RUNTIME: "containerd" - TEST_CASE: "./tests/cases/defaults.sh" - run: | - echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? 
- ./tests/scripts/pull.sh /tmp/logs logs - exit $rc - - name: Archive test logs - if: ${{ failure() }} - uses: actions/upload-artifact@v5 - with: - name: containerd-e2e-test-logs - path: ./logs/ - retention-days: 15 - - e2e-tests-nvidiadriver: - needs: [build-multi-arch-images] - runs-on: linux-amd64-cpu4 - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.2.17 - with: - aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} - holodeck_config: "tests/holodeck.yaml" - - name: Get public dns name - id: get_public_dns_name - uses: mikefarah/yq@master - with: - cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml - - name: Calculate test vars - id: vars - run: | - COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV - echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV - - name: Run e2e tests - env: - GPU_PRODUCT_NAME: "Tesla-T4" - SKIP_LAUNCH: "true" - CONTAINER_RUNTIME: "containerd" - TEST_CASE: "./tests/cases/nvidia-driver.sh" - run: | - echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? 
- ./tests/scripts/pull.sh /tmp/logs logs - exit $rc - - name: Archive test logs - if: ${{ failure() }} - uses: actions/upload-artifact@v5 - with: - name: nvidiadriver-e2e-test-logs - path: ./logs/ - retention-days: 15 - - release-latest-gpu-operator-image: - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} # Runs only if the event is a push to the main branch - needs: [e2e-tests-containerd, e2e-tests-nvidiadriver] - runs-on: linux-amd64-cpu4 - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: set-up regctl - run: | - export REGCTL_VERSION=v0.9.2 - mkdir -p bin - curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64 - chmod a+x bin/regctl - echo "$(pwd)/bin" >> $GITHUB_PATH - - name: Set environment variables - id: vars - run: | - COMMIT_SHORT_SHA=${GITHUB_SHA:0:8} - LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}') - echo "OPERATOR_VERSION=${COMMIT_SHORT_SHA}" >> $GITHUB_ENV - echo "OPERATOR_IMAGE=ghcr.io/${LOWERCASE_REPO_OWNER}/gpu-operator" >> $GITHUB_ENV - - name: Retag gpu-operator - run: | - regctl registry login ghcr.io -u $GITHUB_ACTOR -p ${{ secrets.GITHUB_TOKEN }} - regctl image copy ${OPERATOR_IMAGE}:${OPERATOR_VERSION} ${OPERATOR_IMAGE}:main-latest - - push-gpu-operator-bundle-image: - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} # Runs only if the event is a push to the main branch - needs: [release-latest-gpu-operator-image] - runs-on: linux-amd64-cpu4 - steps: - - uses: actions/checkout@v5 - name: Check out code - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build bundle-image - env: - BUNDLE_IMAGE: "ghcr.io/nvidia/gpu-operator/gpu-operator-bundle:${{ github.ref_name }}-latest" - VERSION: "" - DEFAULT_CHANNEL: "stable" - CHANNELS: "stable" - run: 
| - make push-bundle-image + + # Image and version information (image paths must be lowercase, hence ,,) + OPERATOR_IMAGE_BASE="ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/gpu-operator" + OPERATOR_VERSION="${COMMIT_SHORT_SHA}" + + # Output all variables + echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT + echo "repo_full_name=${REPO_FULL_NAME}" >> $GITHUB_OUTPUT + echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT + echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT + echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT + echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT + + # Display for debugging + echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}" + echo "::notice::Push on build: ${PUSH_ON_BUILD}" + echo "::notice::Operator image: ${OPERATOR_IMAGE_BASE}:${OPERATOR_VERSION}" + + code-scanning: + uses: ./.github/workflows/code-scanning.yaml + + config-checks: + uses: ./.github/workflows/config-checks.yaml + + golang-checks: + uses: ./.github/workflows/golang-checks.yaml + + image-builds: + needs: [variables, config-checks, golang-checks] + uses: ./.github/workflows/image-builds.yaml + with: + commit_short_sha: ${{ needs.variables.outputs.commit_short_sha }} + label_image_source: ${{ needs.variables.outputs.label_image_source }} + push_on_build: ${{ needs.variables.outputs.push_on_build }} + operator_image_base: ${{ needs.variables.outputs.operator_image_base }} + + e2e-tests: + needs: [variables, image-builds] + uses: ./.github/workflows/e2e-tests.yaml + with: + operator_image: ${{ needs.variables.outputs.operator_image_base }} + operator_version: ${{ needs.variables.outputs.operator_version }} + secrets: inherit + + release: + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + needs: [variables, e2e-tests] + uses: ./.github/workflows/release.yaml + with: + commit_short_sha: ${{ needs.variables.outputs.commit_short_sha }} + operator_version: ${{ needs.variables.outputs.operator_version }} + operator_image_base: ${{ needs.variables.outputs.operator_image_base
}} + secrets: inherit diff --git a/.github/workflows/code-scanning.yaml b/.github/workflows/code-scanning.yaml new file mode 100644 index 000000000..bf021380e --- /dev/null +++ b/.github/workflows/code-scanning.yaml @@ -0,0 +1,52 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "CodeQL" + +on: + workflow_call: {} + pull_request: + types: + - opened + - synchronize + branches: + - main + - release-* + +jobs: + analyze: + name: Analyze Go code with CodeQL + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + security-events: write + packages: read + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: go + build-mode: manual + + - shell: bash + run: | + make build + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: "/language:go" diff --git a/.github/workflows/config-checks.yaml b/.github/workflows/config-checks.yaml new file mode 100644 index 000000000..b41aa53e8 --- /dev/null +++ b/.github/workflows/config-checks.yaml @@ -0,0 +1,68 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Configuration Checks + +on: + push: + branches: + - "pull-request/[0-9]+" + - main + - release-* + workflow_call: + workflow_dispatch: + +jobs: + helm-lint: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Install Helm + uses: azure/setup-helm@v4.3.1 + id: install + - run: helm lint deployments/gpu-operator/ + + validate-csv: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - run: make validate-csv + + validate-helm-values: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - run: make validate-helm-values + diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml new file mode 100644 index 000000000..b6e89fda8 --- /dev/null +++ b/.github/workflows/e2e-tests.yaml @@ -0,0 +1,175 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: E2E Tests + +on: + push: + branches: + - "pull-request/[0-9]+" + - main + - release-* + workflow_call: + inputs: + operator_image: + required: true + type: string + operator_version: + required: true + type: string + secrets: + AWS_ACCESS_KEY_ID: + required: true + AWS_SECRET_ACCESS_KEY: + required: true + AWS_SSH_KEY: + required: true + workflow_dispatch: + inputs: + operator_image: + description: 'Operator image to test (override)' + required: false + type: string + operator_version: + description: 'Operator version to test (override)' + required: false + type: string + +jobs: + variables: + runs-on: ubuntu-latest + outputs: + operator_version: ${{ steps.vars.outputs.operator_version }} + operator_image: ${{ steps.vars.outputs.operator_image }} + steps: + - name: Checkout code + if: ${{ inputs.operator_image == '' || inputs.operator_version == '' }} + uses: actions/checkout@v5 + - name: Calculate test variables + id: vars + run: | + # Inputs arrive via workflow_call or workflow_dispatch. Inside a called workflow + # github.event_name is the caller's event (never "workflow_call"), so test the + # inputs themselves and fall back per input to the standalone defaults. + OPERATOR_IMAGE="${{ inputs.operator_image }}" + OPERATOR_VERSION="${{ inputs.operator_version }}" + # Default image for standalone runs + if [[ -z "${OPERATOR_IMAGE}" ]]; then + OPERATOR_IMAGE="ghcr.io/nvidia/gpu-operator" + fi + if [[ -z "${OPERATOR_VERSION}" ]]; then + # Calculate for standalone runs + COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}" + OPERATOR_VERSION="${COMMIT_SHORT_SHA}" + fi + + # Output all variables +
echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT + echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT + + # Display for debugging + echo "::notice::Testing operator: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" + + e2e-tests-containerd: + needs: [variables] + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Set up Holodeck + uses: NVIDIA/holodeck@v0.2.17 + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} + holodeck_config: "tests/holodeck.yaml" + - name: Get public dns name + id: get_public_dns_name + uses: mikefarah/yq@master + with: + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + - name: Set test environment + run: | + echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV + echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + - name: Run e2e tests + env: + OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }} + OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }} + GPU_PRODUCT_NAME: "Tesla-T4" + SKIP_LAUNCH: "true" + CONTAINER_RUNTIME: "containerd" + TEST_CASE: "./tests/cases/defaults.sh" + run: | + echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} + ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? 
+ ./tests/scripts/pull.sh /tmp/logs logs + exit $rc + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v5 + with: + name: containerd-e2e-test-logs + path: ./logs/ + retention-days: 15 + + e2e-tests-nvidiadriver: + needs: [variables] + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Check out code + - name: Set up Holodeck + uses: NVIDIA/holodeck@v0.2.17 + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} + holodeck_config: "tests/holodeck.yaml" + - name: Get public dns name + id: get_public_dns_name + uses: mikefarah/yq@master + with: + cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml + - name: Set test environment + run: | + echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV + echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + - name: Run e2e tests + env: + OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }} + OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }} + GPU_PRODUCT_NAME: "Tesla-T4" + SKIP_LAUNCH: "true" + CONTAINER_RUNTIME: "containerd" + TEST_CASE: "./tests/cases/nvidia-driver.sh" + run: | + echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} + ./tests/ci-run-e2e.sh ${OPERATOR_IMAGE} ${OPERATOR_VERSION} ${GPU_PRODUCT_NAME} ${TEST_CASE} || rc=$? 
+ ./tests/scripts/pull.sh /tmp/logs logs + exit $rc + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v5 + with: + name: nvidiadriver-e2e-test-logs + path: ./logs/ + retention-days: 15 + diff --git a/.github/workflows/golang-checks.yaml b/.github/workflows/golang-checks.yaml new file mode 100644 index 000000000..d36a88a5e --- /dev/null +++ b/.github/workflows/golang-checks.yaml @@ -0,0 +1,126 @@ +# Copyright NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Golang Checks + +on: + push: + branches: + - "pull-request/[0-9]+" + - main + - release-* + workflow_call: + workflow_dispatch: + +jobs: + go-check: + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Checkout code + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + GOLANGCI_LINT_VERSION=$( grep "GOLANGCI_LINT_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + echo "GOLANGCI_LINT_VERSION=${GOLANGCI_LINT_VERSION##GOLANGCI_LINT_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - name: Lint + uses: golangci/golangci-lint-action@v9 + with: + version: ${{ env.GOLANGCI_LINT_VERSION }} + args: -v --timeout 5m + skip-cache: true + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - env: + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + run: | + make check + + go-test: + name: unit tests + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: Get Golang version + id: vars + run: | + GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk ) + echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV + - name: Install Go + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GOLANG_VERSION }} + - name: Setup Go Proxy + id: setup-go-proxy + uses: nv-gha-runners/setup-artifactory-go-proxy@main + - env: + GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }} + run: | + make coverage + + go-build: + runs-on: linux-amd64-cpu4 + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v5 + name: Checkout code + - run: make docker-build + + coverage: + needs: [go-test] + runs-on: linux-amd64-cpu4 + permissions: +
contents: read
+      id-token: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: Get Golang version
+        id: vars
+        run: |
+          GOLANG_VERSION=$( grep "GOLANG_VERSION ?=" versions.mk )
+          echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION ?= }" >> $GITHUB_ENV
+      - name: Set up Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GOLANG_VERSION }}
+      - name: Setup Go Proxy
+        id: setup-go-proxy
+        uses: nv-gha-runners/setup-artifactory-go-proxy@main
+      - name: Generate coverage report
+        env:
+          GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }}
+        run: |
+          make cov-report
+      - name: Upload to Coveralls
+        uses: coverallsapp/github-action@v2
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          path-to-lcov: lcov.info
diff --git a/.github/workflows/image-builds.yaml b/.github/workflows/image-builds.yaml
new file mode 100644
index 000000000..5882762f2
--- /dev/null
+++ b/.github/workflows/image-builds.yaml
@@ -0,0 +1,209 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Builds the per-architecture operator images and assembles the multi-arch
+# manifest. Runs on its own (push, workflow_dispatch) or as a reusable
+# workflow whose caller passes in the precomputed build variables.
+name: Image Builds
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+      - main
+      - release-*
+  workflow_call:
+    inputs:
+      commit_short_sha:
+        required: true
+        type: string
+      label_image_source:
+        required: true
+        type: string
+      push_on_build:
+        required: true
+        type: string
+      operator_image_base:
+        required: true
+        type: string
+  workflow_dispatch:
+
+jobs:
+  variables:
+    runs-on: ubuntu-latest
+    outputs:
+      commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }}
+      label_image_source: ${{ steps.vars.outputs.label_image_source }}
+      push_on_build: ${{ steps.vars.outputs.push_on_build }}
+      operator_image_base: ${{ steps.vars.outputs.operator_image_base }}
+      operator_image_arm64: ${{ steps.vars.outputs.operator_image_arm64 }}
+      operator_image_amd64: ${{ steps.vars.outputs.operator_image_amd64 }}
+      operator_image_multiarch: ${{ steps.vars.outputs.operator_image_multiarch }}
+    steps:
+      - name: Checkout code
+        # Skipped when a caller has already supplied the build variables.
+        if: ${{ inputs.commit_short_sha == '' }}
+        uses: actions/checkout@v5
+      - name: Calculate build variables
+        id: vars
+        env:
+          # Inputs reach the script through the environment instead of being
+          # interpolated into it, so their contents are never parsed as shell.
+          # All four are empty on push and workflow_dispatch runs.
+          INPUT_COMMIT_SHORT_SHA: ${{ inputs.commit_short_sha }}
+          INPUT_LABEL_IMAGE_SOURCE: ${{ inputs.label_image_source }}
+          INPUT_PUSH_ON_BUILD: ${{ inputs.push_on_build }}
+          INPUT_OPERATOR_IMAGE_BASE: ${{ inputs.operator_image_base }}
+        run: |
+          # Inside a reusable workflow github.event_name is the caller's event,
+          # never "workflow_call", so a call is detected by its required input.
+          if [[ -n "${INPUT_COMMIT_SHORT_SHA}" ]]; then
+            COMMIT_SHORT_SHA="${INPUT_COMMIT_SHORT_SHA}"
+            LABEL_IMAGE_SOURCE="${INPUT_LABEL_IMAGE_SOURCE}"
+            PUSH_ON_BUILD="${INPUT_PUSH_ON_BUILD}"
+            OPERATOR_IMAGE_BASE="${INPUT_OPERATOR_IMAGE_BASE}"
+          else
+            # Calculate for standalone runs
+            COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"
+
+            REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
+            if [[ -z "${REPO_FULL_NAME}" ]]; then
+              REPO_FULL_NAME="${{ github.repository }}"
+            fi
+            LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}"
+
+            PUSH_ON_BUILD="false"
+            if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then
+              if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
+                PUSH_ON_BUILD="true"
+              elif [[ "${{ github.event_name }}" == "push" ]]; then
+                PUSH_ON_BUILD="true"
+              elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+                PUSH_ON_BUILD="true"
+              fi
+            fi
+
+            OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator"
+          fi
+
+          # Calculate derived image names
+          OPERATOR_IMAGE_ARM64="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}-arm64"
+          OPERATOR_IMAGE_AMD64="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}-amd64"
+          OPERATOR_IMAGE_MULTIARCH="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}"
+
+          # Output all variables
+          echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT
+          echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT
+          echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT
+          echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT
+          echo "operator_image_arm64=${OPERATOR_IMAGE_ARM64}" >> $GITHUB_OUTPUT
+          echo "operator_image_amd64=${OPERATOR_IMAGE_AMD64}" >> $GITHUB_OUTPUT
+          echo "operator_image_multiarch=${OPERATOR_IMAGE_MULTIARCH}" >> $GITHUB_OUTPUT
+
+          # Display for debugging
+          echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}"
+          echo "::notice::Push on build: ${PUSH_ON_BUILD}"
+          echo "::notice::Multi-arch image: ${OPERATOR_IMAGE_MULTIARCH}"
+
+  build-gpu-operator-arm64:
+    needs: [variables]
+    runs-on: linux-arm64-cpu4
+    permissions:
+      contents: read
+      id-token: write
+      packages: write
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Setup Go Proxy
+        id: setup-go-proxy
+        uses: nv-gha-runners/setup-artifactory-go-proxy@main
+      - name: Build image
+        env:
+          IMAGE_NAME: ${{ needs.variables.outputs.operator_image_base }}
+          VERSION: ${{ needs.variables.outputs.commit_short_sha }}-arm64
+          PUSH_ON_BUILD: ${{ needs.variables.outputs.push_on_build }}
+          DOCKER_BUILD_PLATFORM_OPTIONS: --platform=linux/arm64
+          GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }}
+          LABEL_IMAGE_SOURCE: ${{ needs.variables.outputs.label_image_source }}
+        run: |
+          echo "${VERSION}"
+          make build-image
+
+  build-gpu-operator-amd64:
+    needs: [variables]
+    runs-on: linux-amd64-cpu4
+    permissions:
+      contents: read
+      id-token: write
+      packages: write
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Setup Go Proxy
+        id: setup-go-proxy
+        uses: nv-gha-runners/setup-artifactory-go-proxy@main
+      - name: Build image
+        env:
+          IMAGE_NAME: ${{ needs.variables.outputs.operator_image_base }}
+          VERSION: ${{ needs.variables.outputs.commit_short_sha }}-amd64
+          PUSH_ON_BUILD: ${{ needs.variables.outputs.push_on_build }}
+          DOCKER_BUILD_PLATFORM_OPTIONS: --platform=linux/amd64
+          GOPROXY: ${{ steps.setup-go-proxy.outputs.goproxy-url }}
+          LABEL_IMAGE_SOURCE: ${{ needs.variables.outputs.label_image_source }}
+        run: |
+          echo "${VERSION}"
+          make build-image
+
+  build-multi-arch-images:
+    needs: [variables, build-gpu-operator-arm64, build-gpu-operator-amd64]
+    runs-on: ubuntu-latest
+    # Pushing the manifest list writes to ghcr.io, so request the same package
+    # scope that the per-architecture build jobs declare.
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build Manifest
+        env:
+          OPERATOR_IMAGE_ARM: ${{ needs.variables.outputs.operator_image_arm64 }}
+          OPERATOR_IMAGE_AMD: ${{ needs.variables.outputs.operator_image_amd64 }}
+          OPERATOR_MULTIARCH_IMAGE: ${{ needs.variables.outputs.operator_image_multiarch }}
+        run: |
+          docker manifest create \
+            ${OPERATOR_MULTIARCH_IMAGE} \
+            ${OPERATOR_IMAGE_AMD} \
+            ${OPERATOR_IMAGE_ARM}
+          docker manifest push ${OPERATOR_MULTIARCH_IMAGE}
+
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
new file mode 100644
index 000000000..b46affb97
--- /dev/null
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,125 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copies the commit-tagged operator image to the main-latest tag and pushes
+# the bundle image. Runs on pushes to main or as a reusable workflow.
+name: Release
+
+on:
+  push:
+    branches:
+      - main
+  workflow_call:
+    inputs:
+      commit_short_sha:
+        required: true
+        type: string
+      # Accepted from the caller but not read by any job in this file.
+      operator_version:
+        required: true
+        type: string
+      operator_image_base:
+        required: true
+        type: string
+
+jobs:
+  variables:
+    runs-on: ubuntu-latest
+    outputs:
+      commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }}
+      operator_image_base: ${{ steps.vars.outputs.operator_image_base }}
+      operator_image_source: ${{ steps.vars.outputs.operator_image_source }}
+      operator_image_latest: ${{ steps.vars.outputs.operator_image_latest }}
+      bundle_image: ${{ steps.vars.outputs.bundle_image }}
+    steps:
+      - name: Checkout code
+        # Skipped when a caller has already supplied the release variables.
+        if: ${{ inputs.commit_short_sha == '' }}
+        uses: actions/checkout@v5
+      - name: Calculate release variables
+        id: vars
+        env:
+          # Passed through the environment so the values are never parsed as
+          # shell code; both are empty on standalone push runs.
+          INPUT_COMMIT_SHORT_SHA: ${{ inputs.commit_short_sha }}
+          INPUT_OPERATOR_IMAGE_BASE: ${{ inputs.operator_image_base }}
+        run: |
+          # Inside a reusable workflow github.event_name is the caller's event,
+          # never "workflow_call", so a call is detected by its required input.
+          if [[ -n "${INPUT_COMMIT_SHORT_SHA}" ]]; then
+            COMMIT_SHORT_SHA="${INPUT_COMMIT_SHORT_SHA}"
+            OPERATOR_IMAGE_BASE="${INPUT_OPERATOR_IMAGE_BASE}"
+          else
+            # Calculate for standalone runs
+            COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"
+            OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator"
+          fi
+
+          # Calculate derived values
+          OPERATOR_IMAGE_SOURCE="${OPERATOR_IMAGE_BASE}:${COMMIT_SHORT_SHA}"
+          OPERATOR_IMAGE_LATEST="${OPERATOR_IMAGE_BASE}:main-latest"
+          BUNDLE_IMAGE="ghcr.io/nvidia/gpu-operator/gpu-operator-bundle:main-latest"
+
+          # Output all variables
+          echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT
+          echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT
+          echo "operator_image_source=${OPERATOR_IMAGE_SOURCE}" >> $GITHUB_OUTPUT
+          echo "operator_image_latest=${OPERATOR_IMAGE_LATEST}" >> $GITHUB_OUTPUT
+          echo "bundle_image=${BUNDLE_IMAGE}" >> $GITHUB_OUTPUT
+
+          # Display for debugging
+          echo "::notice::Releasing: ${OPERATOR_IMAGE_SOURCE} → ${OPERATOR_IMAGE_LATEST}"
+
+  release-latest-gpu-operator-image:
+    needs: [variables]
+    runs-on: linux-amd64-cpu4
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Set-up regctl
+        run: |
+          export REGCTL_VERSION=v0.9.2
+          mkdir -p bin
+          curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
+          chmod a+x bin/regctl
+          echo "$(pwd)/bin" >> $GITHUB_PATH
+      - name: Retag gpu-operator
+        env:
+          OPERATOR_IMAGE_SOURCE: ${{ needs.variables.outputs.operator_image_source }}
+          OPERATOR_IMAGE_LATEST: ${{ needs.variables.outputs.operator_image_latest }}
+        run: |
+          regctl registry login ghcr.io -u $GITHUB_ACTOR -p ${{ secrets.GITHUB_TOKEN }}
+          regctl image copy ${OPERATOR_IMAGE_SOURCE} ${OPERATOR_IMAGE_LATEST}
+
+  push-gpu-operator-bundle-image:
+    needs: [variables, release-latest-gpu-operator-image]
+    runs-on: linux-amd64-cpu4
+    steps:
+      - uses: actions/checkout@v5
+        name: Check out code
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build bundle-image
+        env:
+          BUNDLE_IMAGE: ${{ needs.variables.outputs.bundle_image }}
+          VERSION: ""
+          DEFAULT_CHANNEL: "stable"
+          CHANNELS: "stable"
+        run: |
+          make push-bundle-image
+
From 3221716f1450828203b77d861a3484a5ff9725de Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Arango Gutierrez
Date: Mon, 10 Nov 2025
18:33:41 +0100
Subject: [PATCH 2/6] Move variables to a standalone file variables.yaml

Signed-off-by: Carlos Eduardo Arango Gutierrez
---
 .github/workflows/ci.yaml        | 53 +-----------------
 .github/workflows/variables.yaml | 92 ++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 52 deletions(-)
 create mode 100644 .github/workflows/variables.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index c795665b6..686c3cc7b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -28,58 +28,7 @@ concurrency:
 
 jobs:
   variables:
-    runs-on: ubuntu-latest
-    outputs:
-      commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }}
-      repo_full_name: ${{ steps.vars.outputs.repo_full_name }}
-      label_image_source: ${{ steps.vars.outputs.label_image_source }}
-      push_on_build: ${{ steps.vars.outputs.push_on_build }}
-      operator_image_base: ${{ steps.vars.outputs.operator_image_base }}
-      operator_version: ${{ steps.vars.outputs.operator_version }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-      - name: Calculate all variables
-        id: vars
-        run: |
-          # Basic computed values
-          COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"
-
-          # Repository information
-          REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
-          if [[ -z "${REPO_FULL_NAME}" ]]; then
-            REPO_FULL_NAME="${{ github.repository }}"
-          fi
-          LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}"
-
-          # Determine if we should push images
-          PUSH_ON_BUILD="false"
-          if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then
-            if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
-              PUSH_ON_BUILD="true"
-            elif [[ "${{ github.event_name }}" == "push" ]]; then
-              PUSH_ON_BUILD="true"
-            elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
-              PUSH_ON_BUILD="true"
-            fi
-          fi
-
-          # Image and version information
-          OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator"
-          OPERATOR_VERSION="${COMMIT_SHORT_SHA}"
-
-          # Output all variables
-          echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT
-          echo "repo_full_name=${REPO_FULL_NAME}" >> $GITHUB_OUTPUT
-          echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT
-          echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT
-          echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT
-          echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT
-
-          # Display for debugging
-          echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}"
-          echo "::notice::Push on build: ${PUSH_ON_BUILD}"
-          echo "::notice::Operator image: ${OPERATOR_IMAGE_BASE}:${OPERATOR_VERSION}"
+    uses: ./.github/workflows/variables.yaml
 
   code-scanning:
     uses: ./.github/workflows/code-scanning.yaml
diff --git a/.github/workflows/variables.yaml b/.github/workflows/variables.yaml
new file mode 100644
index 000000000..6924afdda
--- /dev/null
+++ b/.github/workflows/variables.yaml
@@ -0,0 +1,92 @@
+# Copyright 2025 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+on:
+  workflow_call:
+    outputs:
+      commit_short_sha:
+        description: "The short SHA to use as a version string"
+        value: ${{ jobs.variables.outputs.commit_short_sha }}
+      repo_full_name:
+        description: "The full repository name"
+        value: ${{ jobs.variables.outputs.repo_full_name }}
+      label_image_source:
+        description: "The image source label URL"
+        value: ${{ jobs.variables.outputs.label_image_source }}
+      push_on_build:
+        description: "Whether to push images on build"
+        value: ${{ jobs.variables.outputs.push_on_build }}
+      operator_image_base:
+        description: "The base operator image name"
+        value: ${{ jobs.variables.outputs.operator_image_base }}
+      operator_version:
+        description: "The operator version"
+        value: ${{ jobs.variables.outputs.operator_version }}
+
+jobs:
+  variables:
+    runs-on: ubuntu-latest
+    outputs:  # surfaced to callers through on.workflow_call.outputs
+      commit_short_sha: ${{ steps.vars.outputs.commit_short_sha }}
+      repo_full_name: ${{ steps.vars.outputs.repo_full_name }}
+      label_image_source: ${{ steps.vars.outputs.label_image_source }}
+      push_on_build: ${{ steps.vars.outputs.push_on_build }}
+      operator_image_base: ${{ steps.vars.outputs.operator_image_base }}
+      operator_version: ${{ steps.vars.outputs.operator_version }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Calculate all variables
+        id: vars  # the job outputs read steps.vars.outputs.*
+        run: |
+          # Basic computed values
+          COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"
+
+          # Repository information
+          REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
+          if [[ -z "${REPO_FULL_NAME}" ]]; then
+            REPO_FULL_NAME="${{ github.repository }}"
+          fi
+          LABEL_IMAGE_SOURCE="https://github.com/${REPO_FULL_NAME}"
+
+          # Determine if we should push images
+          PUSH_ON_BUILD="false"
+          if [[ "${{ github.actor }}" != "dependabot[bot]" ]]; then
+            if [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
+              PUSH_ON_BUILD="true"
+            elif [[ "${{ github.event_name }}" == "push" ]]; then
+              PUSH_ON_BUILD="true"
+            elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+              PUSH_ON_BUILD="true"
+            fi
+          fi
+
+          # Image and version information
+          OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator"
+          OPERATOR_VERSION="${COMMIT_SHORT_SHA}"
+
+          # Output all variables
+          echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT
+          echo "repo_full_name=${REPO_FULL_NAME}" >> $GITHUB_OUTPUT
+          echo "label_image_source=${LABEL_IMAGE_SOURCE}" >> $GITHUB_OUTPUT
+          echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT
+          echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT
+          echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT
+
+          # Display for debugging
+          echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}"
+          echo "::notice::Push on build: ${PUSH_ON_BUILD}"
+          echo "::notice::Operator image: ${OPERATOR_IMAGE_BASE}:${OPERATOR_VERSION}"
+
From 93a72267eac9e8112e91c4b315a205947d307c1b Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Arango Gutierrez
Date: Mon, 10 Nov 2025 18:35:42 +0100
Subject: [PATCH 3/6] Add weekly forward compatibility testing

Implement automated weekly tests validating GPU Operator against latest
container-toolkit, device-plugin, and mig-manager images from GHCR.

- Add forward-compatibility.yaml workflow with Slack alerts
- Create get-latest-images.sh for fetching latest commit-based tags
- Extend e2e-tests.yaml and install-operator.sh for component overrides
- Add variables.yaml reusable workflow for shared CI variables

Signed-off-by: Carlos Eduardo Arango Gutierrez
---
 .github/scripts/get-latest-images.sh         |  91 +++++++++++++
 .github/workflows/e2e-tests.yaml             |  54 ++++++++
 .github/workflows/forward-compatibility.yaml | 131 +++++++++++++++++++
 tests/scripts/install-operator.sh            |  16 +++
 4 files changed, 292 insertions(+)
 create mode 100755 .github/scripts/get-latest-images.sh
 create mode 100644 .github/workflows/forward-compatibility.yaml

diff --git a/.github/scripts/get-latest-images.sh b/.github/scripts/get-latest-images.sh
new file mode 100755
index 000000000..6399948b4
--- /dev/null
+++ b/.github/scripts/get-latest-images.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Prints the commit-tagged GHCR image reference for one component, built from
+# the newest commit on that component's main branch.
+# Usage: get-latest-images.sh <toolkit|device-plugin|mig-manager>
+# Requires curl, jq and regctl; progress goes to stderr, the image to stdout.
+
+set -euo pipefail
+
+COMPONENT=${1:-}  # defaults to empty so the check below can report usage under "set -u"
+
+if [[ -z "${COMPONENT}" ]]; then
+    echo "Usage: $0 <toolkit|device-plugin|mig-manager>" >&2
+    exit 1
+fi
+
+# Verify regctl is available
+if ! command -v regctl &> /dev/null; then
+    echo "Error: regctl not found. Please install regctl first." >&2
+    exit 1
+fi
+
+# Map component names to GHCR image repositories and GitHub source repositories
+case "${COMPONENT}" in
+    toolkit)
+        IMAGE_REPO="ghcr.io/nvidia/container-toolkit"
+        GITHUB_REPO="NVIDIA/container-toolkit"
+        ;;
+    device-plugin)
+        IMAGE_REPO="ghcr.io/nvidia/k8s-device-plugin"
+        GITHUB_REPO="NVIDIA/k8s-device-plugin"
+        ;;
+    mig-manager)
+        IMAGE_REPO="ghcr.io/nvidia/k8s-mig-manager"
+        GITHUB_REPO="NVIDIA/k8s-mig-manager"
+        ;;
+    *)
+        echo "Error: Unknown component '${COMPONENT}'" >&2
+        echo "Valid components: toolkit, device-plugin, mig-manager" >&2
+        exit 1
+        ;;
+esac
+
+echo "Fetching latest commit from ${GITHUB_REPO}..." >&2
+
+# Get the latest commit SHA from the main branch using GitHub API
+GITHUB_API_URL="https://api.github.com/repos/${GITHUB_REPO}/commits/main"
+
+# Use GITHUB_TOKEN if available for authentication (higher rate limits)
+if [[ -n "${GITHUB_TOKEN:-}" ]]; then
+    LATEST_COMMIT=$(curl -sSL \
+        -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+        -H "Accept: application/vnd.github.v3+json" \
+        "${GITHUB_API_URL}" | \
+        jq -r '.sha[0:8]')  # keep only the first 8 characters of the SHA
+else
+    LATEST_COMMIT=$(curl -sSL \
+        -H "Accept: application/vnd.github.v3+json" \
+        "${GITHUB_API_URL}" | \
+        jq -r '.sha[0:8]')  # keep only the first 8 characters of the SHA
+fi
+
+if [[ -z "${LATEST_COMMIT}" || "${LATEST_COMMIT}" == "null" ]]; then  # jq prints "null" when .sha is absent
+    echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
+    exit 1
+fi
+
+echo "Latest commit SHA: ${LATEST_COMMIT}" >&2
+
+# Construct full image path with commit tag
+FULL_IMAGE="${IMAGE_REPO}:${LATEST_COMMIT}"
+
+echo "Verifying image exists: ${FULL_IMAGE}" >&2
+
+# Verify the image exists using regctl
+if !
regctl manifest head "${FULL_IMAGE}" &> /dev/null; then
+    echo "Error: Image ${FULL_IMAGE} does not exist or is not accessible" >&2
+    echo "The image may not have been built yet for commit ${LATEST_COMMIT}" >&2
+    exit 1
+fi
+
+echo "Verified ${COMPONENT} image: ${FULL_IMAGE}" >&2
+echo "${FULL_IMAGE}"
diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml
index b6e89fda8..b1bc1dcd8 100644
--- a/.github/workflows/e2e-tests.yaml
+++ b/.github/workflows/e2e-tests.yaml
@@ -28,6 +28,18 @@ on:
       operator_version:
         required: true
         type: string
+      toolkit_image:
+        required: false
+        type: string
+        description: 'Full container-toolkit image path (e.g., ghcr.io/nvidia/container-toolkit:v1.18.0)'
+      device_plugin_image:
+        required: false
+        type: string
+        description: 'Full device-plugin image path'
+      mig_manager_image:
+        required: false
+        type: string
+        description: 'Full mig-manager image path'
     secrets:
       AWS_ACCESS_KEY_ID:
         required: true
@@ -35,6 +47,10 @@ on:
         required: true
       AWS_SSH_KEY:
         required: true
+      SLACK_BOT_TOKEN:
+        required: false
+      SLACK_CHANNEL_ID:
+        required: false
   workflow_dispatch:
     inputs:
       operator_image:
@@ -45,6 +61,18 @@ on:
         description: 'Operator version to test (override)'
         required: false
         type: string
+      toolkit_image:
+        description: 'Override container-toolkit image'
+        required: false
+        type: string
+      device_plugin_image:
+        description: 'Override device-plugin image'
+        required: false
+        type: string
+      mig_manager_image:
+        description: 'Override mig-manager image'
+        required: false
+        type: string
 
 jobs:
   variables:
@@ -52,6 +80,9 @@ jobs:
     outputs:
       operator_version: ${{ steps.vars.outputs.operator_version }}
       operator_image: ${{ steps.vars.outputs.operator_image }}
+      toolkit_image: ${{ steps.vars.outputs.toolkit_image }}
+      device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }}
+      mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }}
     steps:
       - name: Checkout code
         if: ${{ github.event_name != 'workflow_call' }}
         uses: actions/checkout@v5
@@ -74,12 +105,29 @@ jobs:
             OPERATOR_IMAGE="ghcr.io/nvidia/gpu-operator"
           fi
 
+          # Component images (optional, use inputs if provided)
+          TOOLKIT_IMAGE="${{ inputs.toolkit_image }}"
+          DEVICE_PLUGIN_IMAGE="${{ inputs.device_plugin_image }}"
+          MIG_MANAGER_IMAGE="${{ inputs.mig_manager_image }}"
+
           # Output all variables
           echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT
           echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT
+          echo "toolkit_image=${TOOLKIT_IMAGE}" >> $GITHUB_OUTPUT
+          echo "device_plugin_image=${DEVICE_PLUGIN_IMAGE}" >> $GITHUB_OUTPUT
+          echo "mig_manager_image=${MIG_MANAGER_IMAGE}" >> $GITHUB_OUTPUT
 
           # Display for debugging
           echo "::notice::Testing operator: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}"
+          if [[ -n "${TOOLKIT_IMAGE}" ]]; then
+            echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}"
+          fi
+          if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
+            echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}"
+          fi
+          if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
+            echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}"
+          fi
 
   e2e-tests-containerd:
     needs: [variables]
@@ -110,6 +158,9 @@ jobs:
         env:
           OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
           OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
+          TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
+          DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
+          MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
           GPU_PRODUCT_NAME: "Tesla-T4"
           SKIP_LAUNCH: "true"
           CONTAINER_RUNTIME: "containerd"
@@ -156,6 +207,9 @@ jobs:
         env:
           OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }}
           OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }}
+          TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }}
+          DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }}
+          MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }}
           GPU_PRODUCT_NAME: "Tesla-T4"
           SKIP_LAUNCH: "true"
           CONTAINER_RUNTIME: "containerd"
diff --git a/.github/workflows/forward-compatibility.yaml b/.github/workflows/forward-compatibility.yaml
new file mode 100644
index 000000000..2ade0eba3
--- /dev/null
+++ b/.github/workflows/forward-compatibility.yaml
@@ -0,0 +1,136 @@
+# Copyright NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Forward Compatibility
+
+on:
+  schedule:
+    - cron: '0 2 * * 1'  # Weekly on Monday at 2 AM UTC
+  workflow_dispatch:
+    inputs:
+      toolkit_image:
+        description: 'Override container-toolkit image'
+        required: false
+        type: string
+      device_plugin_image:
+        description: 'Override device-plugin image'
+        required: false
+        type: string
+      mig_manager_image:
+        description: 'Override mig-manager image'
+        required: false
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  fetch-latest-images:
+    runs-on: ubuntu-latest
+    outputs:
+      toolkit_image: ${{ steps.images.outputs.toolkit_image }}
+      device_plugin_image: ${{ steps.images.outputs.device_plugin_image }}
+      mig_manager_image: ${{ steps.images.outputs.mig_manager_image }}
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Install regctl
+        run: |
+          REGCTL_VERSION=v0.9.2
+          mkdir -p bin
+          curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
+          chmod +x bin/regctl
+          echo "$(pwd)/bin" >> $GITHUB_PATH
+
+      - name: Get latest component images
+        id: images
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Manual overrides travel through the environment so their contents
+          # are never interpolated into the script; empty on scheduled runs.
+          TOOLKIT_IMAGE_OVERRIDE: ${{ inputs.toolkit_image }}
+          DEVICE_PLUGIN_IMAGE_OVERRIDE: ${{ inputs.device_plugin_image }}
+          MIG_MANAGER_IMAGE_OVERRIDE: ${{ inputs.mig_manager_image }}
+        run: |
+          # Use workflow_dispatch inputs if provided, otherwise fetch latest
+          if [[ -n "${TOOLKIT_IMAGE_OVERRIDE}" ]]; then
+            TOOLKIT="${TOOLKIT_IMAGE_OVERRIDE}"
+            echo "::notice::Using provided toolkit image: ${TOOLKIT}"
+          else
+            echo "::notice::Fetching latest container-toolkit image..."
+            TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit)
+          fi
+          echo "toolkit_image=${TOOLKIT}" >> $GITHUB_OUTPUT
+
+          if [[ -n "${DEVICE_PLUGIN_IMAGE_OVERRIDE}" ]]; then
+            DEVICE_PLUGIN="${DEVICE_PLUGIN_IMAGE_OVERRIDE}"
+            echo "::notice::Using provided device-plugin image: ${DEVICE_PLUGIN}"
+          else
+            echo "::notice::Fetching latest device-plugin image..."
+            DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin)
+          fi
+          echo "device_plugin_image=${DEVICE_PLUGIN}" >> $GITHUB_OUTPUT
+
+          if [[ -n "${MIG_MANAGER_IMAGE_OVERRIDE}" ]]; then
+            MIG_MANAGER="${MIG_MANAGER_IMAGE_OVERRIDE}"
+            echo "::notice::Using provided mig-manager image: ${MIG_MANAGER}"
+          else
+            echo "::notice::Fetching latest mig-manager image..."
+            MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager)
+          fi
+          echo "mig_manager_image=${MIG_MANAGER}" >> $GITHUB_OUTPUT
+
+          echo "::notice::=== Forward Compatibility Test Configuration ==="
+          echo "::notice::Container Toolkit: ${TOOLKIT}"
+          echo "::notice::Device Plugin: ${DEVICE_PLUGIN}"
+          echo "::notice::MIG Manager: ${MIG_MANAGER}"
+
+  run-e2e-tests:
+    needs: [fetch-latest-images]
+    uses: ./.github/workflows/e2e-tests.yaml
+    with:
+      operator_image: ghcr.io/nvidia/gpu-operator
+      operator_version: main-latest
+      toolkit_image: ${{ needs.fetch-latest-images.outputs.toolkit_image }}
+      device_plugin_image: ${{ needs.fetch-latest-images.outputs.device_plugin_image }}
+      mig_manager_image: ${{ needs.fetch-latest-images.outputs.mig_manager_image }}
+    secrets: inherit
+
+  notify-failure:
+    runs-on: ubuntu-latest
+    needs: [fetch-latest-images, run-e2e-tests]
+    if: ${{ failure() }}
+    steps:
+      - name: Send Slack alert notification
+        uses: slackapi/slack-github-action@v2.1.1
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ secrets.SLACK_CHANNEL_ID }}
+            text: |
+              :x: *Forward Compatibility Test Failed for GPU Operator*
+
+              *Workflow:* ${{ github.workflow }}
+              *Repository:* ${{ github.repository }}
+              *Trigger:* ${{ github.event_name }}
+
+              *Tested Components:*
+              • Container Toolkit: `${{ needs.fetch-latest-images.outputs.toolkit_image }}`
+              • Device Plugin: `${{ needs.fetch-latest-images.outputs.device_plugin_image }}`
+              • MIG Manager: `${{ needs.fetch-latest-images.outputs.mig_manager_image }}`
+
+              *Details:*
+              <@D044YE2MBAR> | <@D051KR3TAQN> | <@D04D866RKLH> | <@D045R30QRPS>
diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh
index 3fcb55ba2..00700a976 100755
--- a/tests/scripts/install-operator.sh
+++ b/tests/scripts/install-operator.sh
@@ -26,6 +26,20 @@ if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then
 TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set
toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\""
 fi
 
+# We set up the options for the device plugin
+: ${DEVICE_PLUGIN_OPTIONS:=""}
+
+if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
+DEVICE_PLUGIN_OPTIONS="${DEVICE_PLUGIN_OPTIONS} --set devicePlugin.repository=\"\" --set devicePlugin.version=\"\" --set devicePlugin.image=\"${DEVICE_PLUGIN_IMAGE}\""
+fi
+
+# We set up the options for the MIG manager
+: ${MIG_MANAGER_OPTIONS:=""}
+
+if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
+MIG_MANAGER_OPTIONS="${MIG_MANAGER_OPTIONS} --set migManager.repository=\"\" --set migManager.version=\"\" --set migManager.image=\"${MIG_MANAGER_IMAGE}\""
+fi
+
 # Create the test namespace
 kubectl create namespace "${TEST_NAMESPACE}"
 
@@ -48,4 +62,6 @@ ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \
     -n "${TEST_NAMESPACE}" \
     ${OPERATOR_OPTIONS} \
     ${TOOLKIT_CONTAINER_OPTIONS} \
+    ${DEVICE_PLUGIN_OPTIONS} \
+    ${MIG_MANAGER_OPTIONS} \
     --wait
From 7d2aaefaf59c2c3670a356d0ba5c4d3ed0aaa4f6 Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Arango Gutierrez
Date: Tue, 11 Nov 2025 12:21:23 +0100
Subject: [PATCH 4/6] refactor: centralize CI variables and add component image overrides

Signed-off-by: Carlos Eduardo Arango Gutierrez
---
 .github/workflows/e2e-tests.yaml             | 60 ++---------------
 .github/workflows/forward-compatibility.yaml |  2 +-
 .github/workflows/variables.yaml             | 71 +++++++++++++++++++-
 3 files changed, 76 insertions(+), 57 deletions(-)

diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml
index b1bc1dcd8..ccc7bcd2a 100644
--- a/.github/workflows/e2e-tests.yaml
+++ b/.github/workflows/e2e-tests.yaml
@@ -76,58 +76,13 @@ on:
 
 jobs:
   variables:
-    runs-on: ubuntu-latest
-    outputs:
-      operator_version: ${{ steps.vars.outputs.operator_version }}
-      operator_image: ${{ steps.vars.outputs.operator_image }}
-      toolkit_image: ${{ steps.vars.outputs.toolkit_image }}
-      device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }}
-      mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }}
-    steps:
-      - name: Checkout code
-        if: ${{ github.event_name != 'workflow_call' }}
-        uses: actions/checkout@v5
-      - name: Calculate test variables
-        id: vars
-        run: |
-          # Use inputs from workflow_call if available
-          if [[ "${{ github.event_name }}" == "workflow_call" ]]; then
-            OPERATOR_IMAGE="${{ inputs.operator_image }}"
-            OPERATOR_VERSION="${{ inputs.operator_version }}"
-          # Use workflow_dispatch inputs if provided
-          elif [[ -n "${{ inputs.operator_version }}" && -n "${{ inputs.operator_image }}" ]]; then
-            OPERATOR_VERSION="${{ inputs.operator_version }}"
-            OPERATOR_IMAGE="${{ inputs.operator_image }}"
-          else
-            # Calculate for standalone runs
-            COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"
-            OPERATOR_VERSION="${COMMIT_SHORT_SHA}"
-            OPERATOR_IMAGE="ghcr.io/nvidia/gpu-operator"
-          fi
-
-          # Component images (optional, use inputs if provided)
-          TOOLKIT_IMAGE="${{ inputs.toolkit_image }}"
-          DEVICE_PLUGIN_IMAGE="${{ inputs.device_plugin_image }}"
-          MIG_MANAGER_IMAGE="${{ inputs.mig_manager_image }}"
-
-          # Output all variables
-          echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT
-          echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT
-          echo "toolkit_image=${TOOLKIT_IMAGE}" >> $GITHUB_OUTPUT
-          echo "device_plugin_image=${DEVICE_PLUGIN_IMAGE}" >> $GITHUB_OUTPUT
-          echo "mig_manager_image=${MIG_MANAGER_IMAGE}" >> $GITHUB_OUTPUT
-
-          # Display for debugging
-          echo "::notice::Testing operator: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}"
-          if [[ -n "${TOOLKIT_IMAGE}" ]]; then
-            echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}"
-          fi
-          if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then
-            echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}"
-          fi
-          if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then
-            echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}"
-          fi
+    uses: ./.github/workflows/variables.yaml
+    with:
+      operator_image: ${{ inputs.operator_image }}
+      operator_version: ${{ inputs.operator_version }}
+      toolkit_image: ${{ inputs.toolkit_image }}
+      device_plugin_image: ${{ inputs.device_plugin_image }}
+      mig_manager_image: ${{ inputs.mig_manager_image }}
 
   e2e-tests-containerd:
     needs: [variables]
@@ -226,4 +181,3 @@ jobs:
           name: nvidiadriver-e2e-test-logs
           path: ./logs/
           retention-days: 15
-
diff --git a/.github/workflows/forward-compatibility.yaml b/.github/workflows/forward-compatibility.yaml
index 2ade0eba3..76548ab15 100644
--- a/.github/workflows/forward-compatibility.yaml
+++ b/.github/workflows/forward-compatibility.yaml
@@ -128,4 +128,4 @@ jobs:
               • MIG Manager: `${{ needs.fetch-latest-images.outputs.mig_manager_image }}`
 
               *Details:*
-              <@D044YE2MBAR> | <@D051KR3TAQN> | <@D04D866RKLH> | <@D045R30QRPS>
+              <!subteam^S095E7BNGJU>
diff --git a/.github/workflows/variables.yaml b/.github/workflows/variables.yaml
index 6924afdda..ef271fdd8 100644
--- a/.github/workflows/variables.yaml
+++ b/.github/workflows/variables.yaml
@@ -14,6 +14,27 @@
 
 on:
   workflow_call:
+    inputs:
+      operator_image:
+        description: 'Operator image to use (optional override)'
+        required: false
+        type: string
+      operator_version:
+        description: 'Operator version to use (optional override)'
+        required: false
+        type: string
+      toolkit_image:
+        description: 'Full container-toolkit image path (optional)'
+        required: false
+        type: string
+      device_plugin_image:
+        description: 'Full device-plugin image path (optional)'
+        required: false
+        type: string
+      mig_manager_image:
+        description: 'Full mig-manager image path (optional)'
+        required: false
+        type: string
     outputs:
       commit_short_sha:
         description: "The short SHA to use as a version string"
@@ -33,6 +54,18 @@ on:
       operator_version:
         description: "The operator version"
         value: ${{ jobs.variables.outputs.operator_version }}
+      operator_image:
+        description: "The operator image (with override support)"
+        value: ${{ jobs.variables.outputs.operator_image }}
+      toolkit_image:
+        description: "The container-toolkit image override"
+
value: ${{ jobs.variables.outputs.toolkit_image }} + device_plugin_image: + description: "The device-plugin image override" + value: ${{ jobs.variables.outputs.device_plugin_image }} + mig_manager_image: + description: "The mig-manager image override" + value: ${{ jobs.variables.outputs.mig_manager_image }} jobs: variables: @@ -44,6 +77,10 @@ jobs: push_on_build: ${{ steps.vars.outputs.push_on_build }} operator_image_base: ${{ steps.vars.outputs.operator_image_base }} operator_version: ${{ steps.vars.outputs.operator_version }} + operator_image: ${{ steps.vars.outputs.operator_image }} + toolkit_image: ${{ steps.vars.outputs.toolkit_image }} + device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }} + mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -73,9 +110,24 @@ jobs: fi fi - # Image and version information + # Image and version information (with override support) OPERATOR_IMAGE_BASE="ghcr.io/nvidia/gpu-operator" - OPERATOR_VERSION="${COMMIT_SHORT_SHA}" + if [[ -n "${{ inputs.operator_version }}" ]]; then + OPERATOR_VERSION="${{ inputs.operator_version }}" + else + OPERATOR_VERSION="${COMMIT_SHORT_SHA}" + fi + + if [[ -n "${{ inputs.operator_image }}" ]]; then + OPERATOR_IMAGE="${{ inputs.operator_image }}" + else + OPERATOR_IMAGE="${OPERATOR_IMAGE_BASE}" + fi + + # Component images (optional overrides) + TOOLKIT_IMAGE="${{ inputs.toolkit_image }}" + DEVICE_PLUGIN_IMAGE="${{ inputs.device_plugin_image }}" + MIG_MANAGER_IMAGE="${{ inputs.mig_manager_image }}" # Output all variables echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT @@ -84,9 +136,22 @@ jobs: echo "push_on_build=${PUSH_ON_BUILD}" >> $GITHUB_OUTPUT echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT + echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT + echo "toolkit_image=${TOOLKIT_IMAGE}" >> $GITHUB_OUTPUT + echo 
"device_plugin_image=${DEVICE_PLUGIN_IMAGE}" >> $GITHUB_OUTPUT + echo "mig_manager_image=${MIG_MANAGER_IMAGE}" >> $GITHUB_OUTPUT # Display for debugging echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}" echo "::notice::Push on build: ${PUSH_ON_BUILD}" - echo "::notice::Operator image: ${OPERATOR_IMAGE_BASE}:${OPERATOR_VERSION}" + echo "::notice::Operator image: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" + if [[ -n "${TOOLKIT_IMAGE}" ]]; then + echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}" + fi + if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then + echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}" + fi + if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then + echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}" + fi From 31be0c75304c574b639794d462d51cca09089f6c Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 3 Dec 2025 11:39:37 +0100 Subject: [PATCH 5/6] [no-relnote] set workflow_dispatch to use get-latest-images.sh Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/ci.yaml | 2 +- .github/workflows/forward-compatibility.yaml | 44 ++++---------------- 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 686c3cc7b..13850188d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -52,7 +52,7 @@ jobs: needs: [variables, image-builds] uses: ./.github/workflows/e2e-tests.yaml with: - operator_image: ${{ needs.variables.outputs.operator_image_base }} + operator_image: ${{ needs.variables.outputs.operator_image }} operator_version: ${{ needs.variables.outputs.operator_version }} secrets: inherit diff --git a/.github/workflows/forward-compatibility.yaml b/.github/workflows/forward-compatibility.yaml index 76548ab15..73550c9bc 100644 --- a/.github/workflows/forward-compatibility.yaml +++ b/.github/workflows/forward-compatibility.yaml @@ -17,20 +17,7 @@ name: Forward Compatibility on: schedule: - cron: '0 2 * * 1' # Weekly on Monday 
at 2 AM UTC - workflow_dispatch: - inputs: - toolkit_image: - description: 'Override container-toolkit image' - required: false - type: string - device_plugin_image: - description: 'Override device-plugin image' - required: false - type: string - mig_manager_image: - description: 'Override mig-manager image' - required: false - type: string + workflow_dispatch: # Manual trigger concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -59,32 +46,17 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - # Use workflow_dispatch inputs if provided, otherwise fetch latest - if [[ -n "${{ inputs.toolkit_image }}" ]]; then - TOOLKIT="${{ inputs.toolkit_image }}" - echo "::notice::Using provided toolkit image: ${TOOLKIT}" - else - echo "::notice::Fetching latest container-toolkit image..." - TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit) - fi + # Fetch latest images from component repositories + echo "::notice::Fetching latest container-toolkit image..." + TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit) echo "toolkit_image=${TOOLKIT}" >> $GITHUB_OUTPUT - if [[ -n "${{ inputs.device_plugin_image }}" ]]; then - DEVICE_PLUGIN="${{ inputs.device_plugin_image }}" - echo "::notice::Using provided device-plugin image: ${DEVICE_PLUGIN}" - else - echo "::notice::Fetching latest device-plugin image..." - DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin) - fi + echo "::notice::Fetching latest device-plugin image..." + DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin) echo "device_plugin_image=${DEVICE_PLUGIN}" >> $GITHUB_OUTPUT - if [[ -n "${{ inputs.mig_manager_image }}" ]]; then - MIG_MANAGER="${{ inputs.mig_manager_image }}" - echo "::notice::Using provided mig-manager image: ${MIG_MANAGER}" - else - echo "::notice::Fetching latest mig-manager image..." - MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager) - fi + echo "::notice::Fetching latest mig-manager image..." 
+ MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager) echo "mig_manager_image=${MIG_MANAGER}" >> $GITHUB_OUTPUT echo "::notice::=== Forward Compatibility Test Configuration ===" From 203941a80dd3097e486e8f2732974d435940a2f9 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Wed, 3 Dec 2025 12:00:08 +0100 Subject: [PATCH 6/6] replace component image vars with Helm values file Replace individual component image environment variables with a values override file approach. Add helper scripts for generating values files from env vars and component images. Ensure clean separation between Helm values file (-f) and --set flags to avoid conflicts. This reduces workflow boilerplate by ~58% and makes adding new operands trivial (no workflow changes needed). Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/scripts/generate-values-overrides.sh | 74 ++++++++++ .github/workflows/e2e-tests.yaml | 54 +++---- .github/workflows/forward-compatibility.yaml | 35 +++-- .github/workflows/variables.yaml | 41 ------ tests/scripts/env-to-values.sh | 140 +++++++++++++++++++ tests/scripts/install-operator.sh | 110 +++++++++++---- 6 files changed, 332 insertions(+), 122 deletions(-) create mode 100755 .github/scripts/generate-values-overrides.sh create mode 100755 tests/scripts/env-to-values.sh diff --git a/.github/scripts/generate-values-overrides.sh b/.github/scripts/generate-values-overrides.sh new file mode 100755 index 000000000..5a71444c0 --- /dev/null +++ b/.github/scripts/generate-values-overrides.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Usage: generate-values-overrides.sh OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE +# +# Generates a Helm values override file for GPU Operator component images. +# This file can be used with `helm install -f values-overrides.yaml` to +# override default component image versions. + +if [[ $# -ne 4 ]]; then + echo "Usage: $0 OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE" >&2 + echo "" >&2 + echo "Example:" >&2 + echo " $0 values.yaml \\" >&2 + echo " ghcr.io/nvidia/container-toolkit:v1.18.0-ubuntu20.04 \\" >&2 + echo " ghcr.io/nvidia/k8s-device-plugin:v0.17.0-ubi8 \\" >&2 + echo " ghcr.io/nvidia/k8s-mig-manager:v0.10.0-ubuntu20.04" >&2 + exit 1 +fi + +OUTPUT_FILE="$1" +TOOLKIT_IMAGE="$2" +DEVICE_PLUGIN_IMAGE="$3" +MIG_MANAGER_IMAGE="$4" + +# Generate values override file +cat > "${OUTPUT_FILE}" <> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + if [[ "${{ inputs.use_values_override }}" == "true" ]]; then + echo "VALUES_FILE=${{ github.workspace }}/values-overrides.yaml" >> $GITHUB_ENV + fi - name: Run e2e tests env: OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }} OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }} - TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }} - DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }} - MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }} GPU_PRODUCT_NAME: "Tesla-T4" SKIP_LAUNCH: "true" CONTAINER_RUNTIME: "containerd" @@ -142,6 +126,12 @@ jobs: steps: 
- uses: actions/checkout@v5 name: Check out code + - name: Download values override file + if: ${{ inputs.use_values_override }} + uses: actions/download-artifact@v5 + with: + name: values-overrides + path: ${{ github.workspace }} - name: Set up Holodeck uses: NVIDIA/holodeck@v0.2.17 with: @@ -158,13 +148,13 @@ jobs: run: | echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV + if [[ "${{ inputs.use_values_override }}" == "true" ]]; then + echo "VALUES_FILE=${{ github.workspace }}/values-overrides.yaml" >> $GITHUB_ENV + fi - name: Run e2e tests env: OPERATOR_VERSION: ${{ needs.variables.outputs.operator_version }} OPERATOR_IMAGE: ${{ needs.variables.outputs.operator_image }} - TOOLKIT_CONTAINER_IMAGE: ${{ needs.variables.outputs.toolkit_image }} - DEVICE_PLUGIN_IMAGE: ${{ needs.variables.outputs.device_plugin_image }} - MIG_MANAGER_IMAGE: ${{ needs.variables.outputs.mig_manager_image }} GPU_PRODUCT_NAME: "Tesla-T4" SKIP_LAUNCH: "true" CONTAINER_RUNTIME: "containerd" diff --git a/.github/workflows/forward-compatibility.yaml b/.github/workflows/forward-compatibility.yaml index 73550c9bc..d6ecea2f0 100644 --- a/.github/workflows/forward-compatibility.yaml +++ b/.github/workflows/forward-compatibility.yaml @@ -26,10 +26,6 @@ concurrency: jobs: fetch-latest-images: runs-on: ubuntu-latest - outputs: - toolkit_image: ${{ steps.images.outputs.toolkit_image }} - device_plugin_image: ${{ steps.images.outputs.device_plugin_image }} - mig_manager_image: ${{ steps.images.outputs.mig_manager_image }} steps: - uses: actions/checkout@v5 @@ -41,28 +37,33 @@ jobs: chmod +x bin/regctl echo "$(pwd)/bin" >> $GITHUB_PATH - - name: Get latest component images - id: images + - name: Get latest component images and generate values override file env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | # Fetch latest images from component repositories echo "::notice::Fetching latest 
container-toolkit image..." TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit) - echo "toolkit_image=${TOOLKIT}" >> $GITHUB_OUTPUT echo "::notice::Fetching latest device-plugin image..." DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin) - echo "device_plugin_image=${DEVICE_PLUGIN}" >> $GITHUB_OUTPUT echo "::notice::Fetching latest mig-manager image..." MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager) - echo "mig_manager_image=${MIG_MANAGER}" >> $GITHUB_OUTPUT - echo "::notice::=== Forward Compatibility Test Configuration ===" - echo "::notice::Container Toolkit: ${TOOLKIT}" - echo "::notice::Device Plugin: ${DEVICE_PLUGIN}" - echo "::notice::MIG Manager: ${MIG_MANAGER}" + # Generate values override file + .github/scripts/generate-values-overrides.sh \ + values-overrides.yaml \ + "${TOOLKIT}" \ + "${DEVICE_PLUGIN}" \ + "${MIG_MANAGER}" + + - name: Upload values override file + uses: actions/upload-artifact@v5 + with: + name: values-overrides + path: values-overrides.yaml + retention-days: 30 run-e2e-tests: needs: [fetch-latest-images] @@ -70,9 +71,7 @@ jobs: with: operator_image: ghcr.io/nvidia/gpu-operator operator_version: main-latest - toolkit_image: ${{ needs.fetch-latest-images.outputs.toolkit_image }} - device_plugin_image: ${{ needs.fetch-latest-images.outputs.device_plugin_image }} - mig_manager_image: ${{ needs.fetch-latest-images.outputs.mig_manager_image }} + use_values_override: true secrets: inherit notify-failure: @@ -95,9 +94,7 @@ jobs: *Trigger:* ${{ github.event_name }} *Tested Components:* - • Container Toolkit: `${{ needs.fetch-latest-images.outputs.toolkit_image }}` - • Device Plugin: `${{ needs.fetch-latest-images.outputs.device_plugin_image }}` - • MIG Manager: `${{ needs.fetch-latest-images.outputs.mig_manager_image }}` + Download `values-overrides` artifact to see tested component versions *Details:* <@S095E7BNGJU> diff --git a/.github/workflows/variables.yaml b/.github/workflows/variables.yaml index 
ef271fdd8..f04d1237a 100644 --- a/.github/workflows/variables.yaml +++ b/.github/workflows/variables.yaml @@ -23,18 +23,6 @@ on: description: 'Operator version to use (optional override)' required: false type: string - toolkit_image: - description: 'Full container-toolkit image path (optional)' - required: false - type: string - device_plugin_image: - description: 'Full device-plugin image path (optional)' - required: false - type: string - mig_manager_image: - description: 'Full mig-manager image path (optional)' - required: false - type: string outputs: commit_short_sha: description: "The short SHA to use as a version string" @@ -57,15 +45,6 @@ on: operator_image: description: "The operator image (with override support)" value: ${{ jobs.variables.outputs.operator_image }} - toolkit_image: - description: "The container-toolkit image override" - value: ${{ jobs.variables.outputs.toolkit_image }} - device_plugin_image: - description: "The device-plugin image override" - value: ${{ jobs.variables.outputs.device_plugin_image }} - mig_manager_image: - description: "The mig-manager image override" - value: ${{ jobs.variables.outputs.mig_manager_image }} jobs: variables: @@ -78,9 +57,6 @@ jobs: operator_image_base: ${{ steps.vars.outputs.operator_image_base }} operator_version: ${{ steps.vars.outputs.operator_version }} operator_image: ${{ steps.vars.outputs.operator_image }} - toolkit_image: ${{ steps.vars.outputs.toolkit_image }} - device_plugin_image: ${{ steps.vars.outputs.device_plugin_image }} - mig_manager_image: ${{ steps.vars.outputs.mig_manager_image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -124,11 +100,6 @@ jobs: OPERATOR_IMAGE="${OPERATOR_IMAGE_BASE}" fi - # Component images (optional overrides) - TOOLKIT_IMAGE="${{ inputs.toolkit_image }}" - DEVICE_PLUGIN_IMAGE="${{ inputs.device_plugin_image }}" - MIG_MANAGER_IMAGE="${{ inputs.mig_manager_image }}" - # Output all variables echo "commit_short_sha=${COMMIT_SHORT_SHA}" >> $GITHUB_OUTPUT 
echo "repo_full_name=${REPO_FULL_NAME}" >> $GITHUB_OUTPUT @@ -137,21 +108,9 @@ jobs: echo "operator_image_base=${OPERATOR_IMAGE_BASE}" >> $GITHUB_OUTPUT echo "operator_version=${OPERATOR_VERSION}" >> $GITHUB_OUTPUT echo "operator_image=${OPERATOR_IMAGE}" >> $GITHUB_OUTPUT - echo "toolkit_image=${TOOLKIT_IMAGE}" >> $GITHUB_OUTPUT - echo "device_plugin_image=${DEVICE_PLUGIN_IMAGE}" >> $GITHUB_OUTPUT - echo "mig_manager_image=${MIG_MANAGER_IMAGE}" >> $GITHUB_OUTPUT # Display for debugging echo "::notice::Commit SHA: ${COMMIT_SHORT_SHA}" echo "::notice::Push on build: ${PUSH_ON_BUILD}" echo "::notice::Operator image: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" - if [[ -n "${TOOLKIT_IMAGE}" ]]; then - echo "::notice::Using custom toolkit: ${TOOLKIT_IMAGE}" - fi - if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then - echo "::notice::Using custom device-plugin: ${DEVICE_PLUGIN_IMAGE}" - fi - if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then - echo "::notice::Using custom mig-manager: ${MIG_MANAGER_IMAGE}" - fi diff --git a/tests/scripts/env-to-values.sh b/tests/scripts/env-to-values.sh new file mode 100755 index 000000000..970a966e9 --- /dev/null +++ b/tests/scripts/env-to-values.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash + +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Usage: env-to-values.sh OUTPUT_FILE +# +# Converts environment variables to GPU Operator Helm values YAML format. 
+# This script reads common test environment variables and generates a +# values file that can be used with `helm install -f values.yaml`. +# +# Supported environment variables: +# - OPERATOR_IMAGE: operator image path (repository will be extracted) +# - OPERATOR_VERSION: operator version +# - TOOLKIT_CONTAINER_IMAGE: container-toolkit image override +# - DEVICE_PLUGIN_IMAGE: device-plugin image override +# - MIG_MANAGER_IMAGE: mig-manager image override +# - CONTAINER_RUNTIME: default runtime (docker, containerd, crio) + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 OUTPUT_FILE" >&2 + echo "" >&2 + echo "Converts environment variables to GPU Operator Helm values format." >&2 + exit 1 +fi + +OUTPUT_FILE="$1" + +# Start with header +cat > "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" + echo -e "${OPERATOR_CONFIG}" >> "${OUTPUT_FILE}" +fi + +# Write validator configuration if any +if [[ -n "${VALIDATOR_CONFIG}" ]]; then + echo "validator:" >> "${OUTPUT_FILE}" + echo -e "${VALIDATOR_CONFIG}" >> "${OUTPUT_FILE}" +fi + +# Container Toolkit configuration +if [[ -n "${TOOLKIT_CONTAINER_IMAGE:-}" ]]; then + cat >> "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" <> "${OUTPUT_FILE}" <&2 + echo "# No values to override" >> "${OUTPUT_FILE}" +fi + +echo "" +echo "Generated values file: ${OUTPUT_FILE}" +echo "" +echo "=== File Contents ===" +cat "${OUTPUT_FILE}" diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index 00700a976..6e7d173d1 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -10,34 +10,68 @@ source ${SCRIPT_DIR}/.definitions.sh OPERATOR_REPOSITORY=$(dirname ${OPERATOR_IMAGE}) -: ${OPERATOR_OPTIONS:=""} -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" - -if [[ -n "${OPERATOR_VERSION}" ]]; then -OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" -fi - 
-OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.defaultRuntime=${CONTAINER_RUNTIME}" - -# We set up the options for the toolkit container -: ${TOOLKIT_CONTAINER_OPTIONS:=""} - -if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then -TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\"" +# Determine if we should use values file approach or --set flags +USE_VALUES_FILE=false +if [[ -n "${VALUES_FILE:-}" ]]; then + USE_VALUES_FILE=true fi -# We set up the options for the device plugin -: ${DEVICE_PLUGIN_OPTIONS:=""} - -if [[ -n "${DEVICE_PLUGIN_IMAGE}" ]]; then -DEVICE_PLUGIN_OPTIONS="${DEVICE_PLUGIN_OPTIONS} --set devicePlugin.repository=\"\" --set devicePlugin.version=\"\" --set devicePlugin.image=\"${DEVICE_PLUGIN_IMAGE}\"" +# Build operator options conditionally +: ${OPERATOR_OPTIONS:=""} +if [[ "${USE_VALUES_FILE}" == "false" ]]; then + # Traditional approach: build --set flags + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY} --set validator.repository=${OPERATOR_REPOSITORY}" + + if [[ -n "${OPERATOR_VERSION}" ]]; then + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION} --set validator.version=${OPERATOR_VERSION}" + fi + + OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.defaultRuntime=${CONTAINER_RUNTIME}" fi -# We set up the options for the MIG manager -: ${MIG_MANAGER_OPTIONS:=""} - -if [[ -n "${MIG_MANAGER_IMAGE}" ]]; then -MIG_MANAGER_OPTIONS="${MIG_MANAGER_OPTIONS} --set migManager.repository=\"\" --set migManager.version=\"\" --set migManager.image=\"${MIG_MANAGER_IMAGE}\"" +if [[ "${USE_VALUES_FILE}" == "true" ]]; then + # Generate a temporary values file from environment variables + # and merge it with the provided VALUES_FILE + TEMP_ENV_VALUES=$(mktemp) + ${SCRIPT_DIR}/env-to-values.sh "${TEMP_ENV_VALUES}" + + # If VALUES_FILE exists, merge it with env-generated values + # 
Otherwise just use the env-generated values + if [[ -f "${VALUES_FILE}" ]]; then + echo "" + echo "Using provided values file: ${VALUES_FILE}" + cat "${VALUES_FILE}" + echo "" + echo "Merged with environment-based values:" + cat "${TEMP_ENV_VALUES}" + # Create a combined values file + COMBINED_VALUES=$(mktemp) + cat "${VALUES_FILE}" "${TEMP_ENV_VALUES}" > "${COMBINED_VALUES}" + VALUES_FILE="${COMBINED_VALUES}" + else + VALUES_FILE="${TEMP_ENV_VALUES}" + fi + + # Clear individual options since we're using values file + TOOLKIT_CONTAINER_OPTIONS="" + DEVICE_PLUGIN_OPTIONS="" + MIG_MANAGER_OPTIONS="" +else + # Traditional approach: use --set flags for backward compatibility + : ${TOOLKIT_CONTAINER_OPTIONS:=""} + if [[ -n "${TOOLKIT_CONTAINER_IMAGE:-}" ]]; then + TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\"" + fi + + : ${DEVICE_PLUGIN_OPTIONS:=""} + if [[ -n "${DEVICE_PLUGIN_IMAGE:-}" ]]; then + DEVICE_PLUGIN_OPTIONS="${DEVICE_PLUGIN_OPTIONS} --set devicePlugin.repository=\"\" --set devicePlugin.version=\"\" --set devicePlugin.image=\"${DEVICE_PLUGIN_IMAGE}\"" + fi + + : ${MIG_MANAGER_OPTIONS:=""} + if [[ -n "${MIG_MANAGER_IMAGE:-}" ]]; then + MIG_MANAGER_OPTIONS="${MIG_MANAGER_OPTIONS} --set migManager.repository=\"\" --set migManager.version=\"\" --set migManager.image=\"${MIG_MANAGER_IMAGE}\"" + fi fi # Create the test namespace @@ -58,10 +92,26 @@ if [[ "${GPU_MODE}" == "vgpu" ]]; then fi # Run the helm install command -${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ - -n "${TEST_NAMESPACE}" \ - ${OPERATOR_OPTIONS} \ - ${TOOLKIT_CONTAINER_OPTIONS} \ - ${DEVICE_PLUGIN_OPTIONS} \ - ${MIG_MANAGER_OPTIONS} \ +echo "" +echo "Installing GPU Operator with Helm..." 
+echo "Operator image: ${OPERATOR_IMAGE}:${OPERATOR_VERSION}" + +if [[ "${USE_VALUES_FILE}" == "true" ]]; then + echo "Using values file approach: ${VALUES_FILE}" + ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ + -n "${TEST_NAMESPACE}" \ + -f "${VALUES_FILE}" \ + --wait + + # Cleanup temporary values files + rm -f "${TEMP_ENV_VALUES:-}" "${COMBINED_VALUES:-}" +else + echo "Using --set flags approach" + ${HELM} install ${PROJECT_DIR}/deployments/gpu-operator --generate-name \ + -n "${TEST_NAMESPACE}" \ + ${OPERATOR_OPTIONS} \ + ${TOOLKIT_CONTAINER_OPTIONS} \ + ${DEVICE_PLUGIN_OPTIONS} \ + ${MIG_MANAGER_OPTIONS} \ --wait +fi