From fe3f54a605abd515773312741c578db9bbc988e3 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 7 Oct 2025 06:01:35 +0000 Subject: [PATCH] feat(gpu): Add robust proxy support for driver installation This PR introduces comprehensive HTTP/S proxy support for the GPU driver installation script, enabling its use in environments with restricted internet egress, such as those using Secure Web Proxy. The `set_proxy` function, controlled by the `http-proxy` and new `http-proxy-pem-uri` metadata attributes, now configures APT, GPG, Java, pip, and Conda to route traffic through the specified proxy. If a PEM certificate URI is provided, the certificate is installed into the OS, Conda, and Java trust stores. The script now correctly handles the proxy scheme (HTTP vs HTTPS) based on the presence of the `http-proxy-pem-uri` metadata. This change was validated in a development environment where all internet access was routed through an explicit proxy. Additional changes: - `README.md` updated to document the new `http-proxy-pem-uri` metadata option and clarify `http-proxy` usage. - GCS caching for the NVIDIA driver is checked earlier to avoid unnecessary HEAD requests to the NVIDIA CDN. - `configure_dkms_certs` is now more idempotent. - Spark RAPIDS versions and repository URL aligned with `spark-rapids/spark-rapids.sh` as part of a move towards a unified GPU/RAPIDS installation script. - Switched to using `/sys/bus/pci/devices/*/uevent` for GPU detection to remove the dependency on `pciutils`. - Moved `set_proxy` call earlier in `prepare_to_install`. - Refactored `no_proxy` and `nvcc_gencode` list generation. fix(ci): Add retry logic to kubectl logs in presubmit - Wrapped `kubectl logs` command in `run-presubmit-on-k8s.sh` with a retry loop to handle transient "No agent available" errors from GKE.
--- cloudbuild/run-presubmit-on-k8s.sh | 37 +++- gpu/README.md | 12 ++ gpu/install_gpu_driver.sh | 269 +++++++++++++++++++++--- integration_tests/dataproc_test_case.py | 13 +- 4 files changed, 288 insertions(+), 43 deletions(-) diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh index 94793dc9f..f31d0c876 100644 --- a/cloudbuild/run-presubmit-on-k8s.sh +++ b/cloudbuild/run-presubmit-on-k8s.sh @@ -42,19 +42,46 @@ EOF kubectl apply -f $POD_CONFIG # Delete POD on exit and describe it before deletion if exit was unsuccessful -trap '[[ $? != 0 ]] && kubectl describe "pod/${POD_NAME}"; kubectl delete pods "${POD_NAME}"' EXIT +trap 'exit_code=$? +if [[ ${exit_code} != 0 ]]; then + echo "Presubmit failed for ${POD_NAME}. Describing pod..." + kubectl describe "pod/${POD_NAME}" || echo "Failed to describe pod." + + PROJECT_ID=$(gcloud config get-value project 2>/dev/null || echo "unknown-project") + BUCKET="dataproc-init-actions-test-${PROJECT_ID}" + LOG_GCS_PATH="gs://${BUCKET}/${BUILD_ID}/logs/${POD_NAME}.log" + + echo "Attempting to upload logs to ${LOG_GCS_PATH}" + if kubectl logs "${POD_NAME}" | gsutil cp - "${LOG_GCS_PATH}"; then + echo "Logs for failed pod ${POD_NAME} uploaded to: ${LOG_GCS_PATH}" + else + echo "Log upload to ${LOG_GCS_PATH} failed." + fi +fi +echo "Deleting pod ${POD_NAME}..." +kubectl delete pods "${POD_NAME}" --ignore-not-found=true +exit ${exit_code}' EXIT kubectl wait --for=condition=Ready "pod/${POD_NAME}" --timeout=15m +# To mitigate problems with early test failure, retry kubectl logs +sleep 10s while ! 
kubectl describe "pod/${POD_NAME}" | grep -q Terminated; do - kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true + # Try to stream logs, but primary log capture is now in the trap + kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true || true LOGS_SINCE_TIME=$(date --iso-8601=seconds) + sleep 2 # Short sleep to avoid busy waiting if logs -f exits done -EXIT_CODE=$(kubectl get pod "${POD_NAME}" \ - -o go-template="{{range .status.containerStatuses}}{{.state.terminated.exitCode}}{{end}}") +# Final check on the pod exit code +EXIT_CODE=$(kubectl get pod "${POD_NAME}" -o go-template="{{range .status.containerStatuses}}{{.state.terminated.exitCode}}{{end}}" || echo "1") if [[ ${EXIT_CODE} != 0 ]]; then - echo "Presubmit failed!" + echo "Presubmit final state for ${POD_NAME} indicates failure (Exit Code: ${EXIT_CODE})." + # The trap will handle the log upload and cleanup exit 1 fi + +echo "Presubmit for ${POD_NAME} successful." +# Explicitly exit 0 to clear the trap's exit code +exit 0 diff --git a/gpu/README.md b/gpu/README.md index c03f9505a..cb92b40c2 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -225,6 +225,18 @@ sometimes found in the "building from source" sections. modulus md5sum of the files referenced by both the private and public secret names. +- `http-proxy: HOST:PORT` - Optional. The address of an HTTP + proxy to use for internet egress. The script will configure `apt`, + `curl`, `gsutil`, `pip`, `java`, and `gpg` to use this proxy. + +- `http-proxy-pem-uri: GS_PATH` - Optional. A `gs://` path to the + PEM-encoded certificate file used by the proxy specified in + `http-proxy`. This is needed if the proxy uses TLS and its + certificate is not already trusted by the cluster's default trust + store (e.g., if it's a self-signed certificate or signed by an + internal CA). The script will install this certificate into the + system, Conda, and Java trust stores.
+ #### Loading built kernel module For platforms which do not have pre-built binary kernel drivers, the diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 47b7d979b..673df668c 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -108,7 +108,6 @@ function get_metadata_value() { print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} return_code=$? fi - return ${return_code} } @@ -186,6 +185,7 @@ function set_cuda_version() { "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; + "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 @@ -266,10 +266,47 @@ function set_driver_version() { export DRIVER_VERSION DRIVER gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200' ; then - echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" - exit 1 + + # GCS Cache Check Logic + local driver_filename + driver_filename=$(basename "${gpu_driver_url}") + local gcs_cache_path="${pkg_bucket}/nvidia/${driver_filename}" + + echo "Checking for cached NVIDIA driver at: ${gcs_cache_path}" + + if ! gsutil -q stat "${gcs_cache_path}"; then + echo "Driver not found in GCS cache. Validating URL: ${gpu_driver_url}" + # Use curl to check if the URL is valid (HEAD request) + if curl -sSLfI --connect-timeout 10 --max-time 30 "${gpu_driver_url}" 2>/dev/null | grep -E -q 'HTTP.*200'; then + echo "NVIDIA URL is valid. Downloading to cache..." + local temp_driver_file="${tmpdir}/${driver_filename}" + + # Download the file + echo "Downloading from ${gpu_driver_url} to ${temp_driver_file}" + if curl -sSLf -o "${temp_driver_file}" "${gpu_driver_url}"; then + echo "Download complete. 
Uploading to ${gcs_cache_path}" + # Upload to GCS + if gsutil cp "${temp_driver_file}" "${gcs_cache_path}"; then + echo "Successfully cached to GCS." + rm -f "${temp_driver_file}" + else + echo "ERROR: Failed to upload driver to GCS: ${gcs_cache_path}" + rm -f "${temp_driver_file}" + exit 1 + fi + else + echo "ERROR: Failed to download driver from NVIDIA: ${gpu_driver_url}" + rm -f "${temp_driver_file}" # File might not exist if curl failed early + exit 1 + fi + else + echo "ERROR: NVIDIA driver URL is not valid or accessible: ${gpu_driver_url}" + exit 1 + fi + else + echo "Driver found in GCS cache: ${gcs_cache_path}" fi + # End of GCS Cache Check Logic } function set_cudnn_version() { @@ -422,8 +459,11 @@ function set_cuda_runfile_url() { elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" fi + } + + function set_cudnn_tarball_url() { CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" @@ -673,14 +713,19 @@ function install_nvidia_nccl() { # Ada: SM_89, compute_89 # Hopper: SM_90,SM_90a compute_90,compute_90a # Blackwell: SM_100, compute_100 - NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86" + local nvcc_gencode=("-gencode=arch=compute_70,code=sm_70" "-gencode=arch=compute_72,code=sm_72" + "-gencode=arch=compute_80,code=sm_80" "-gencode=arch=compute_86,code=sm_86") + if version_gt "${CUDA_VERSION}" "11.6" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi + nvcc_gencode+=("-gencode=arch=compute_87,code=sm_87") + fi if version_ge "${CUDA_VERSION}" "11.8" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi + 
nvcc_gencode+=("-gencode=arch=compute_89,code=sm_89") + fi if version_ge "${CUDA_VERSION}" "12.0" ; then - NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi + nvcc_gencode+=("-gencode=arch=compute_90,code=sm_90" "-gencode=arch=compute_90a,code=compute_90a") + fi + NVCC_GENCODE="${nvcc_gencode[*]}" if is_debuntu ; then # These packages are required to build .deb packages from source @@ -771,7 +816,7 @@ function install_nvidia_cudnn() { sync else - echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + echo "Unsupported cudnn version: [\"${CUDNN_VERSION}\"]" fi fi else @@ -866,6 +911,7 @@ function configure_dkms_certs() { echo "No signing secret provided. skipping"; return 0 fi + if [[ -f "${mok_der}" ]] ; then return 0; fi mkdir -p "${CA_TMPDIR}" @@ -950,6 +996,7 @@ function add_contrib_component() { elif is_debian ; then sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list fi + return 0 } function add_nonfree_components() { @@ -974,7 +1021,8 @@ function add_repo_nvidia_container_toolkit() { local repo_data if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" - else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi + else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" + fi os_add_repo nvidia-container-toolkit \ "${signing_key_url}" \ @@ -1330,6 +1378,7 @@ function install_cuda(){ add_repo_cuda mark_complete cuda-repo + return 0 } function install_nvidia_container_toolkit() { @@ -1418,6 +1467,11 @@ function install_gpu_agent() { "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" + if [[ -v METADATA_HTTP_PROXY_PEM_URI ]]; then + export REQUESTS_CA_BUNDLE="${trusted_pem_path}" + pip install pip-system-certs + unset REQUESTS_CA_BUNDLE + fi python3 -m pip install --upgrade pip execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" ) @@ -1601,7 +1655,9 @@ EOF # having AQE enabled gives user the 
best performance. #spark.sql.autoBroadcastJoinThreshold=10m #spark.sql.files.maxPartitionBytes=512m + spark.executor.resource.gpu.amount=1 + #spark.executor.cores=${executor_cores} #spark.executor.memory=${executor_memory_gb}G #spark.dynamicAllocation.enabled=false @@ -1725,7 +1781,7 @@ function mark_incomplete() { function install_dependencies() { is_complete install-dependencies && return 0 - pkg_list="pciutils screen" + pkg_list="screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi mark_complete install-dependencies @@ -1818,7 +1874,7 @@ function check_secure_boot() { echo "Secure boot is enabled, but no signing material provided." echo "Please either disable secure boot or provide signing material as per" echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - return 1 + exit 1 fi CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" @@ -1837,7 +1893,7 @@ function main() { configure_yarn_resources # Detect NVIDIA GPU - if (lspci | grep -q NVIDIA); then + if (grep -h -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent); then # if this is called without the MIG script then the drivers are not installed migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi @@ -2154,19 +2210,161 @@ function set_proxy(){ if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi - export http_proxy="${METADATA_HTTP_PROXY}" - export https_proxy="${METADATA_HTTP_PROXY}" - export HTTP_PROXY="${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="${METADATA_HTTP_PROXY}" - no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" - local no_proxy_svc - for no_proxy_svc in compute secretmanager dns servicedirectory logging \ - bigquery composer pubsub bigquerydatatransfer dataflow \ - storage datafusion ; do - 
no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" + no_proxy_list=("localhost" "127.0.0.0/8" "::1" "metadata.google.internal" "169.254.169.254") + + services=( compute secretmanager dns servicedirectory networkmanagement + bigquery composer pubsub bigquerydatatransfer networkservices + storage datafusion dataproc certificatemanager networksecurity + dataflow privateca logging ) + + for svc in "${services[@]}"; do + no_proxy_list+=("${svc}.googleapis.com") done + no_proxy="$( IFS=',' ; echo "${no_proxy_list[*]}" )" + + export http_proxy="http://${METADATA_HTTP_PROXY}" + export https_proxy="http://${METADATA_HTTP_PROXY}" + export no_proxy + export HTTP_PROXY="http://${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="http://${METADATA_HTTP_PROXY}" export NO_PROXY="${no_proxy}" + + # configure gcloud + gcloud config set proxy/type http + gcloud config set proxy/address "${METADATA_HTTP_PROXY%:*}" + gcloud config set proxy/port "${METADATA_HTTP_PROXY#*:}" + + # add proxy environment variables to /etc/environment + grep http_proxy /etc/environment || echo "http_proxy=${http_proxy}" >> /etc/environment + grep https_proxy /etc/environment || echo "https_proxy=${https_proxy}" >> /etc/environment + grep no_proxy /etc/environment || echo "no_proxy=${no_proxy}" >> /etc/environment + grep HTTP_PROXY /etc/environment || echo "HTTP_PROXY=${HTTP_PROXY}" >> /etc/environment + grep HTTPS_PROXY /etc/environment || echo "HTTPS_PROXY=${HTTPS_PROXY}" >> /etc/environment + grep NO_PROXY /etc/environment || echo "NO_PROXY=${NO_PROXY}" >> /etc/environment + + local pkg_proxy_conf_file + if is_debuntu ; then + # configure Apt to use the proxy: + pkg_proxy_conf_file="/etc/apt/apt.conf.d/99proxy" + cat > "${pkg_proxy_conf_file}" < "${TMP_FILE}" + + cat "${TMP_FILE}" "${pkg_proxy_conf_file}" > "${pkg_proxy_conf_file}".new + mv "${pkg_proxy_conf_file}".new "${pkg_proxy_conf_file}" + + rm "${TMP_FILE}" + fi + else + echo "unknown OS" + exit 1 + fi + # configure gpg to use the proxy: + if ! 
grep 'keyserver-options http-proxy' /etc/gnupg/dirmngr.conf ; then + mkdir -p /etc/gnupg + cat >> /etc/gnupg/dirmngr.conf <&1)|| { + echo "curl rejects proxy configuration" + echo "${curl_output}" + exit 1 + } + output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)|| { + echo "curl rejects proxy configuration" + echo "${output}" + exit 1 + } + + # Instruct conda to use the system certificate + echo "Attempting to install pip-system-certs using the proxy certificate..." + export REQUESTS_CA_BUNDLE="${trusted_pem_path}" + pip install pip-system-certs + unset REQUESTS_CA_BUNDLE + + # For the binaries bundled with conda, append our certificate to the bundle + openssl crl2pkcs7 -nocrl -certfile /opt/conda/default/ssl/cacert.pem | openssl pkcs7 -print_certs -noout | grep -Fx "${ca_subject}" || { + cat "${proxy_ca_pem}" >> /opt/conda/default/ssl/cacert.pem + } + + sed -i -e 's|http://|https://|' /etc/gnupg/dirmngr.conf + export http_proxy="https://${METADATA_HTTP_PROXY}" + export https_proxy="https://${METADATA_HTTP_PROXY}" + export HTTP_PROXY="https://${METADATA_HTTP_PROXY}" + export HTTPS_PROXY="https://${METADATA_HTTP_PROXY}" + sed -i -e 's|proxy=http://|proxy=https://|' -e 's|PROXY=http://|PROXY=https://|' /etc/environment + + # Instruct the JRE to trust the certificate + JAVA_HOME="$(awk -F= '/^JAVA_HOME=/ {print $2}' /etc/environment)" + "${JAVA_HOME}/bin/keytool" -import -cacerts -storepass changeit -noprompt -alias swp_ca -file "${proxy_ca_pem}" } function mount_ramdisk(){ @@ -2229,6 +2427,7 @@ function prepare_to_install(){ # Verify OS compatability and Secure boot state check_os check_secure_boot + set_proxy # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` @@ -2241,8 +2440,6 @@ function prepare_to_install(){ fi curl_retry_args="-fsSL 
--retry-connrefused --retry 10 --retry-max-time 30" - prepare_gpu_env - workdir=/opt/install-dpgce tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" @@ -2251,9 +2448,10 @@ function prepare_to_install(){ readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive + prepare_gpu_env + mkdir -p "${workdir}/complete" trap exit_handler EXIT - set_proxy mount_ramdisk readonly install_log="${tmpdir}/install.log" @@ -2391,24 +2589,27 @@ function install_spark_rapids() { # Update SPARK RAPIDS config local DEFAULT_SPARK_RAPIDS_VERSION + local nvidia_repo_url DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then - DEFAULT_SPARK_RAPIDS_VERSION="25.02.1" + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then + DEFAULT_SPARK_RAPIDS_VERSION="25.08.0" + nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia' + elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + DEFAULT_SPARK_RAPIDS_VERSION="25.08.0" + nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia' fi local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu local -r scala_ver="2.12" - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 - fi - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' local -r 
dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' local jar_basename diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py index 8f08472bd..69cdc0dc6 100644 --- a/integration_tests/dataproc_test_case.py +++ b/integration_tests/dataproc_test_case.py @@ -199,10 +199,15 @@ def stage_init_actions(self, project): bucket = "gs://dataproc-init-actions-test-{}".format( re.sub("[.:]", "", project.replace("google", "goog"))) - ret_val, _, _ = self.run_command("gsutil ls -b {}".format(bucket)) - # Create staging bucket if it does not exist - if ret_val != 0: - self.assert_command("gsutil mb {}".format(bucket)) + # Check if the bucket exists by listing it. + ret_val, stdout, _ = self.run_command("gsutil ls -b {}".format(bucket)) + + # If gsutil ls -b succeeds and the bucket name is in the output, it exists. + if ret_val == 0 and bucket in stdout: + print(f"Bucket {bucket} already exists.") + else: + print(f"Bucket {bucket} does not exist or could not be verified, attempting to create.") + self.assert_command("gsutil mb -p {} -l {} {}".format(self.PROJECT, self.REGION, bucket)) staging_dir = "{}/{}-{}".format(bucket, self.datetime_str(), self.random_str())