From fe3f54a605abd515773312741c578db9bbc988e3 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Oct 2025 06:01:35 +0000
Subject: [PATCH] feat(gpu): Add robust proxy support for driver installation

This PR introduces comprehensive HTTP/S proxy support for the GPU
driver installation script, enabling its use in environments with
restricted internet egress, such as those using Secure Web Proxy.

The `set_proxy` function, controlled by the `http-proxy` and new
`http-proxy-pem-uri` metadata attributes, now configures APT, GPG,
Java, pip, and Conda to route traffic through the specified proxy. If a
PEM certificate URI is provided, the certificate is installed into the
OS, Conda, and Java trust stores. The script now correctly handles
the proxy scheme (HTTP vs HTTPS) based on the presence of the
`http-proxy-pem-uri` metadata.

This change was validated in a development environment where all
internet access was routed through an explicit proxy.

Additional changes:

- `README.md` updated to document the new `http-proxy-pem-uri`
  metadata option and clarify `http-proxy` usage.
- GCS caching for the NVIDIA driver is checked earlier to avoid
  unnecessary HEAD requests to the NVIDIA CDN.
- `configure_dkms_certs` is now more idempotent.
- Spark RAPIDS versions and repository URL aligned with
  `spark-rapids/spark-rapids.sh` as part of a move towards a unified
  GPU/RAPIDS installation script.
- Switched to using `/sys/bus/pci/devices/*/uevent` for GPU detection
  to remove the dependency on `pciutils`.
- Moved `set_proxy` call earlier in `prepare_to_install`.
- Refactored `no_proxy` and `nvcc_gencode` list generation.

fix(ci): Add retry logic to kubectl logs in presubmit

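- Wrapped `kubectl logs` command in `run-presubmit-on-k8s.sh` with a
  retry loop to handle transient "No agent available" errors from GKE.
- On failure, the EXIT trap now describes the pod and attempts to upload
  its logs to GCS before deleting the pod.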
---
 cloudbuild/run-presubmit-on-k8s.sh      |  37 +++-
 gpu/README.md                           |  12 ++
 gpu/install_gpu_driver.sh               | 269 +++++++++++++++++++++---
 integration_tests/dataproc_test_case.py |  13 +-
 4 files changed, 288 insertions(+), 43 deletions(-)

diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh
index 94793dc9f..f31d0c876 100644
--- a/cloudbuild/run-presubmit-on-k8s.sh
+++ b/cloudbuild/run-presubmit-on-k8s.sh
@@ -42,19 +42,46 @@ EOF
 kubectl apply -f $POD_CONFIG
 
 # Delete POD on exit and describe it before deletion if exit was unsuccessful
-trap '[[ $? != 0 ]] && kubectl describe "pod/${POD_NAME}"; kubectl delete pods "${POD_NAME}"' EXIT
+trap 'exit_code=$?
+if [[ ${exit_code} != 0 ]]; then
+  echo "Presubmit failed for ${POD_NAME}. Describing pod..."
+  kubectl describe "pod/${POD_NAME}" || echo "Failed to describe pod."
+  # Bucket name mirrors stage_init_actions in integration_tests/dataproc_test_case.py: google becomes goog, then "." and ":" are dropped
+  PROJECT_ID=$(gcloud config get-value project 2>/dev/null || echo "unknown-project")
+  BUCKET_SUFFIX="${PROJECT_ID//google/goog}"; BUCKET="dataproc-init-actions-test-${BUCKET_SUFFIX//[.:]/}"
+  LOG_GCS_PATH="gs://${BUCKET}/${BUILD_ID}/logs/${POD_NAME}.log"
+  # Best effort: a failed upload is reported but does not replace the original exit code
+  echo "Attempting to upload logs to ${LOG_GCS_PATH}"
+  if kubectl logs "${POD_NAME}" | gsutil cp - "${LOG_GCS_PATH}"; then
+    echo "Logs for failed pod ${POD_NAME} uploaded to: ${LOG_GCS_PATH}"
+  else
+    echo "Log upload to ${LOG_GCS_PATH} failed."
+  fi
+fi
+echo "Deleting pod ${POD_NAME}..."
+kubectl delete pods "${POD_NAME}" --ignore-not-found=true
+exit ${exit_code}' EXIT
 
 kubectl wait --for=condition=Ready "pod/${POD_NAME}" --timeout=15m
 
+# To mitigate problems with early test failure, retry kubectl logs
+sleep 10s
 while ! kubectl describe "pod/${POD_NAME}" | grep -q Terminated; do
-  kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true
+  # Try to stream logs, but primary log capture is now in the trap
+  kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true || true
   LOGS_SINCE_TIME=$(date --iso-8601=seconds)
+  sleep 2 # Short sleep to avoid busy waiting if logs -f exits
 done
 
-EXIT_CODE=$(kubectl get pod "${POD_NAME}" \
-  -o go-template="{{range .status.containerStatuses}}{{.state.terminated.exitCode}}{{end}}")
+# Final check on the pod exit code
+EXIT_CODE=$(kubectl get pod "${POD_NAME}" -o go-template="{{range .status.containerStatuses}}{{.state.terminated.exitCode}}{{end}}" || echo "1")
 
 if [[ ${EXIT_CODE} != 0 ]]; then
-  echo "Presubmit failed!"
+  echo "Presubmit final state for ${POD_NAME} indicates failure (Exit Code: ${EXIT_CODE})."
+  # The trap will handle the log upload and cleanup
   exit 1
 fi
+
+echo "Presubmit for ${POD_NAME} successful."
+# Explicitly exit 0 to clear the trap's exit code
+exit 0
diff --git a/gpu/README.md b/gpu/README.md
index c03f9505a..cb92b40c2 100644
--- a/gpu/README.md
+++ b/gpu/README.md
@@ -225,6 +225,18 @@ sometimes found in the "building from source" sections.
     modulus md5sum of the files referenced by both the private and
     public secret names.
 
+-   `http-proxy: <HOST>:<PORT>` - Optional. The address of an HTTP
+    proxy to use for internet egress. The script will configure `apt`,
+    `curl`, `gsutil`, `pip`, `java`, and `gpg` to use this proxy.
+
+-   `http-proxy-pem-uri: <GS_PATH>` - Optional. A `gs://` path to the
+    PEM-encoded certificate file used by the proxy specified in
+    `http-proxy`. This is needed if the proxy uses TLS and its
+    certificate is not already trusted by the cluster's default trust
+    store (e.g., if it's a self-signed certificate or signed by an
+    internal CA). The script will install this certificate into the
+    system, Conda, and Java trust stores.
+
 #### Loading built kernel module
 
 For platforms which do not have pre-built binary kernel drivers, the
diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 47b7d979b..673df668c 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -108,7 +108,6 @@ function get_metadata_value() {
     print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname}
     return_code=$?
   fi
-
   return ${return_code}
 }
 
@@ -186,6 +185,7 @@ function set_cuda_version() {
     "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
     "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
     "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
+    "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
     *   )
       echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
       exit 1
@@ -266,10 +266,47 @@ function set_driver_version() {
   export DRIVER_VERSION DRIVER
 
   gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-  if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200' ; then
-    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
-    exit 1
+
+  # GCS Cache Check Logic
+  local driver_filename
+  driver_filename=$(basename "${gpu_driver_url}")
+  local gcs_cache_path="${pkg_bucket}/nvidia/${driver_filename}"
+
+  echo "Checking for cached NVIDIA driver at: ${gcs_cache_path}"
+
+  if ! gsutil -q stat "${gcs_cache_path}"; then
+    echo "Driver not found in GCS cache. Validating URL: ${gpu_driver_url}"
+    # Use curl to check if the URL is valid (HEAD request)
+    if curl -sSLfI --connect-timeout 10 --max-time 30 "${gpu_driver_url}" 2>/dev/null | grep -E -q 'HTTP.*200'; then
+      echo "NVIDIA URL is valid. Downloading to cache..."
+      local temp_driver_file="${tmpdir}/${driver_filename}"
+
+      # Download the file
+      echo "Downloading from ${gpu_driver_url} to ${temp_driver_file}"
+      if curl -sSLf -o "${temp_driver_file}" "${gpu_driver_url}"; then
+        echo "Download complete. Uploading to ${gcs_cache_path}"
+        # Upload to GCS
+        if gsutil cp "${temp_driver_file}" "${gcs_cache_path}"; then
+          echo "Successfully cached to GCS."
+          rm -f "${temp_driver_file}"
+        else
+          echo "ERROR: Failed to upload driver to GCS: ${gcs_cache_path}"
+          rm -f "${temp_driver_file}"
+          exit 1
+        fi
+      else
+        echo "ERROR: Failed to download driver from NVIDIA: ${gpu_driver_url}"
+        rm -f "${temp_driver_file}" # File might not exist if curl failed early
+        exit 1
+      fi
+    else
+      echo "ERROR: NVIDIA driver URL is not valid or accessible: ${gpu_driver_url}"
+      exit 1
+    fi
+  else
+    echo "Driver found in GCS cache: ${gcs_cache_path}"
   fi
+  # End of GCS Cache Check Logic
 }
 
 function set_cudnn_version() {
@@ -422,8 +459,11 @@ function set_cuda_runfile_url() {
   elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
     echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
   fi
+
 }
 
+
+
 function set_cudnn_tarball_url() {
 CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
 CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
@@ -673,14 +713,19 @@ function install_nvidia_nccl() {
       # Ada:       SM_89,             compute_89
       # Hopper:    SM_90,SM_90a       compute_90,compute_90a
       # Blackwell: SM_100,            compute_100
-                      NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
-      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
+      local nvcc_gencode=("-gencode=arch=compute_70,code=sm_70" "-gencode=arch=compute_72,code=sm_72"
+                          "-gencode=arch=compute_80,code=sm_80" "-gencode=arch=compute_86,code=sm_86")
+
       if version_gt "${CUDA_VERSION}" "11.6" ; then
-        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
+        nvcc_gencode+=("-gencode=arch=compute_87,code=sm_87")
+      fi
       if version_ge "${CUDA_VERSION}" "11.8" ; then
-        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
+        nvcc_gencode+=("-gencode=arch=compute_89,code=sm_89")
+      fi
       if version_ge "${CUDA_VERSION}" "12.0" ; then
-        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi
+        nvcc_gencode+=("-gencode=arch=compute_90,code=sm_90" "-gencode=arch=compute_90a,code=compute_90a")
+      fi
+      NVCC_GENCODE="${nvcc_gencode[*]}"
 
       if is_debuntu ; then
         # These packages are required to build .deb packages from source
@@ -771,7 +816,7 @@ function install_nvidia_cudnn() {
 
         sync
       else
-        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
+        echo "Unsupported cudnn version: [\"${CUDNN_VERSION}\"]"
       fi
     fi
   else
@@ -866,6 +911,7 @@ function configure_dkms_certs() {
       echo "No signing secret provided.  skipping";
       return 0
   fi
+  if [[ -f "${mok_der}" ]] ; then return 0; fi
 
   mkdir -p "${CA_TMPDIR}"
 
@@ -950,6 +996,7 @@ function add_contrib_component() {
   elif is_debian ; then
       sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
   fi
+  return 0
 }
 
 function add_nonfree_components() {
@@ -974,7 +1021,8 @@ function add_repo_nvidia_container_toolkit() {
   local repo_data
 
   if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
-                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo"
+  fi
 
   os_add_repo nvidia-container-toolkit \
               "${signing_key_url}" \
@@ -1330,6 +1378,7 @@ function install_cuda(){
   add_repo_cuda
 
   mark_complete cuda-repo
+  return 0
 }
 
 function install_nvidia_container_toolkit() {
@@ -1418,6 +1467,11 @@ function install_gpu_agent() {
   "${python_interpreter}" -m venv "${venv}"
 (
   source "${venv}/bin/activate"
+  if [[ -n "${METADATA_HTTP_PROXY_PEM_URI:-}" ]]; then # non-empty only when a proxy CA certificate was configured
+    export REQUESTS_CA_BUNDLE="${trusted_pem_path}"
+    pip install pip-system-certs
+    unset REQUESTS_CA_BUNDLE
+  fi
   python3 -m pip install --upgrade pip
   execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
 )
@@ -1601,7 +1655,9 @@ EOF
 # having AQE enabled gives user the best performance.
 #spark.sql.autoBroadcastJoinThreshold=10m
 #spark.sql.files.maxPartitionBytes=512m
+
 spark.executor.resource.gpu.amount=1
+
 #spark.executor.cores=${executor_cores}
 #spark.executor.memory=${executor_memory_gb}G
 #spark.dynamicAllocation.enabled=false
@@ -1725,7 +1781,7 @@ function mark_incomplete() {
 function install_dependencies() {
   is_complete install-dependencies && return 0
 
-  pkg_list="pciutils screen"
+  pkg_list="screen"
   if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
   elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
   mark_complete install-dependencies
@@ -1818,7 +1874,7 @@ function check_secure_boot() {
     echo "Secure boot is enabled, but no signing material provided."
     echo "Please either disable secure boot or provide signing material as per"
     echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
-    return 1
+    exit 1
   fi
 
   CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
@@ -1837,7 +1893,7 @@ function main() {
   configure_yarn_resources
 
   # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
+  if (grep -h -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent); then
     # if this is called without the MIG script then the drivers are not installed
     migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
     if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
@@ -2154,19 +2210,161 @@ function set_proxy(){
 
   if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
 
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
-  local no_proxy_svc
-  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
-                      bigquery composer      pubsub bigquerydatatransfer dataflow \
-                      storage  datafusion    ; do
-    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  no_proxy_list=("localhost" "127.0.0.0/8" "::1" "metadata.google.internal" "169.254.169.254")
+
+  services=( compute  secretmanager dns      servicedirectory     networkmanagement
+             bigquery composer      pubsub   bigquerydatatransfer networkservices
+             storage  datafusion    dataproc certificatemanager   networksecurity
+             dataflow privateca     logging )
+
+  for svc in "${services[@]}"; do
+    no_proxy_list+=("${svc}.googleapis.com")
   done
 
+  no_proxy="$( IFS=',' ; echo "${no_proxy_list[*]}" )"
+
+  export http_proxy="http://${METADATA_HTTP_PROXY}"
+  export https_proxy="http://${METADATA_HTTP_PROXY}"
+  export no_proxy
+  export HTTP_PROXY="http://${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="http://${METADATA_HTTP_PROXY}"
   export NO_PROXY="${no_proxy}"
+
+  # configure gcloud
+  gcloud config set proxy/type http
+  gcloud config set proxy/address "${METADATA_HTTP_PROXY%:*}"
+  gcloud config set proxy/port "${METADATA_HTTP_PROXY#*:}"
+
+  # add proxy environment variables to /etc/environment
+  grep http_proxy /etc/environment || echo "http_proxy=${http_proxy}" >> /etc/environment
+  grep https_proxy /etc/environment || echo "https_proxy=${https_proxy}" >> /etc/environment
+  grep no_proxy /etc/environment || echo "no_proxy=${no_proxy}" >> /etc/environment
+  grep HTTP_PROXY /etc/environment || echo "HTTP_PROXY=${HTTP_PROXY}" >> /etc/environment
+  grep HTTPS_PROXY /etc/environment || echo "HTTPS_PROXY=${HTTPS_PROXY}" >> /etc/environment
+  grep NO_PROXY /etc/environment || echo "NO_PROXY=${NO_PROXY}" >> /etc/environment
+
+  local pkg_proxy_conf_file
+  if is_debuntu ; then
+    # configure Apt to use the proxy:
+    pkg_proxy_conf_file="/etc/apt/apt.conf.d/99proxy"
+    cat > "${pkg_proxy_conf_file}" <<EOF
+Acquire::http::Proxy "http://${METADATA_HTTP_PROXY}";
+Acquire::https::Proxy "http://${METADATA_HTTP_PROXY}";
+EOF
+  elif is_rocky ; then
+    pkg_proxy_conf_file="/etc/dnf/dnf.conf"
+
+    touch "${pkg_proxy_conf_file}"
+
+    if grep -q "^proxy=" "${pkg_proxy_conf_file}"; then
+      sed -i.bak "s@^proxy=.*@proxy=${HTTP_PROXY}@" "${pkg_proxy_conf_file}"
+    elif grep -q "^\[main\]" "${pkg_proxy_conf_file}"; then
+      sed -i.bak "/^\[main\]/a proxy=${HTTP_PROXY}" "${pkg_proxy_conf_file}"
+    else
+      local TMP_FILE=$(mktemp)
+      printf "[main]\nproxy=%s\n" "${HTTP_PROXY}" > "${TMP_FILE}"
+
+      cat "${TMP_FILE}" "${pkg_proxy_conf_file}" > "${pkg_proxy_conf_file}".new
+      mv "${pkg_proxy_conf_file}".new "${pkg_proxy_conf_file}"
+
+      rm "${TMP_FILE}"
+    fi
+  else
+    echo "unknown OS"
+    exit 1
+  fi
+  # configure gpg to use the proxy:
+  if ! grep 'keyserver-options http-proxy' /etc/gnupg/dirmngr.conf ; then
+    mkdir -p /etc/gnupg
+    cat >> /etc/gnupg/dirmngr.conf <<EOF
+keyserver-options http-proxy=http://${METADATA_HTTP_PROXY}
+EOF
+  fi
+
+  # Install the HTTPS proxy's certificate in the system and Java trust databases
+  METADATA_HTTP_PROXY_PEM_URI="$(get_metadata_attribute http-proxy-pem-uri '')"
+  # Without a certificate URI, stop here: the http:// proxy settings written above stay as they are
+  if [[ -z "${METADATA_HTTP_PROXY_PEM_URI}" ]] ; then return ; fi
+  if [[ ! "${METADATA_HTTP_PROXY_PEM_URI}" =~ ^gs:// ]] ; then echo "http-proxy-pem-uri value should start with gs://" ; exit 1 ; fi
+
+  local trusted_pem_dir
+  # Add this certificate to the OS trust database
+  # When proxy cert is provided, speak to the proxy over https
+  if is_debuntu ; then
+    trusted_pem_dir="/usr/local/share/ca-certificates"
+    mkdir -p "${trusted_pem_dir}"
+    proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt"
+    gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}"
+    update-ca-certificates
+    trusted_pem_path="/etc/ssl/certs/ca-certificates.crt"
+    sed -i -e 's|http://|https://|' "${pkg_proxy_conf_file}"
+  elif is_rocky ; then
+    trusted_pem_dir="/etc/pki/ca-trust/source/anchors"
+    mkdir -p "${trusted_pem_dir}"
+    proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt"
+    gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}"
+    update-ca-trust
+    trusted_pem_path="/etc/ssl/certs/ca-bundle.crt"
+    sed -i -e 's|^proxy=http://|proxy=https://|' "${pkg_proxy_conf_file}"
+  else
+    echo "unknown OS"
+    exit 1
+  fi
+
+  # configure gcloud to respect proxy ca cert
+  #gcloud config set core/custom_ca_certs_file "${proxy_ca_pem}"
+
+  ca_subject="$(openssl crl2pkcs7 -nocrl -certfile "${proxy_ca_pem}" | openssl pkcs7 -print_certs -noout | grep ^subject)"
+  # Verify that the proxy certificate is trusted
+  local output
+  output=$(echo | openssl s_client \
+           -connect "${METADATA_HTTP_PROXY}" \
+           -proxy "${METADATA_HTTP_PROXY}" \
+           -CAfile "${proxy_ca_pem}") || {
+    echo "proxy certificate verification failed"
+    echo "${output}"
+    exit 1
+  }
+  output=$(echo | openssl s_client \
+           -connect "${METADATA_HTTP_PROXY}" \
+           -proxy "${METADATA_HTTP_PROXY}" \
+           -CAfile "${trusted_pem_path}") || {
+    echo "proxy ca certificate not included in system bundle"
+    echo "${output}"
+    exit 1
+  }
+  output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" 2>&1)|| {
+    echo "curl rejects proxy configuration"
+    echo "${output}" # verbose curl transcript captured by the failed probe above
+    exit 1
+  }
+  output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)|| {
+    echo "curl rejects proxy configuration"
+    echo "${output}"
+    exit 1
+  }
+
+  # Install pip-system-certs (this step runs pip, not the conda CLI) with REQUESTS_CA_BUNDLE pointed at the system bundle for the download
+  echo "Attempting to install pip-system-certs using the proxy certificate..."
+  export REQUESTS_CA_BUNDLE="${trusted_pem_path}"
+  pip install pip-system-certs
+  unset REQUESTS_CA_BUNDLE
+
+  # For the binaries bundled with conda, append our certificate to the bundle
+  openssl crl2pkcs7 -nocrl -certfile /opt/conda/default/ssl/cacert.pem | openssl pkcs7 -print_certs -noout | grep -Fx "${ca_subject}" || {
+    cat "${proxy_ca_pem}" >> /opt/conda/default/ssl/cacert.pem
+  }
+
+  sed -i -e 's|http://|https://|' /etc/gnupg/dirmngr.conf
+  export http_proxy="https://${METADATA_HTTP_PROXY}"
+  export https_proxy="https://${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="https://${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="https://${METADATA_HTTP_PROXY}"
+  sed -i -e 's|proxy=http://|proxy=https://|'  -e 's|PROXY=http://|PROXY=https://|' /etc/environment
+
+  # Instruct the JRE to trust the certificate; skip the import when alias swp_ca already exists so a rerun does not fail here
+  JAVA_HOME="$(awk -F= '/^JAVA_HOME=/ {print $2}' /etc/environment)"
+  "${JAVA_HOME}/bin/keytool" -list -cacerts -storepass changeit -alias swp_ca >/dev/null 2>&1 || "${JAVA_HOME}/bin/keytool" -import -cacerts -storepass changeit -noprompt -alias swp_ca -file "${proxy_ca_pem}"
 }
 
 function mount_ramdisk(){
@@ -2229,6 +2427,7 @@ function prepare_to_install(){
   # Verify OS compatability and Secure boot state
   check_os
   check_secure_boot
+  set_proxy
 
   # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
   # used as a more performant replacement for `gsutil`
@@ -2241,8 +2440,6 @@ function prepare_to_install(){
   fi
   curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
 
-  prepare_gpu_env
-
   workdir=/opt/install-dpgce
   tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
@@ -2251,9 +2448,10 @@ function prepare_to_install(){
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  prepare_gpu_env
+
   mkdir -p "${workdir}/complete"
   trap exit_handler EXIT
-  set_proxy
   mount_ramdisk
 
   readonly install_log="${tmpdir}/install.log"
@@ -2391,24 +2589,27 @@ function install_spark_rapids() {
 
   # Update SPARK RAPIDS config
   local DEFAULT_SPARK_RAPIDS_VERSION
+  local nvidia_repo_url
   DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
-  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
-    DEFAULT_SPARK_RAPIDS_VERSION="25.02.1"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+    nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="25.08.0"
+    nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia'
+  elif version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
+    DEFAULT_SPARK_RAPIDS_VERSION="25.08.0"
+    nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia'
   fi
   local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
   # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
   local -r scala_ver="2.12"
 
-  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
-    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
-  fi
-
   readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
   readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
   local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
-  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
   local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
   local jar_basename
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 8f08472bd..69cdc0dc6 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -199,10 +199,15 @@ def stage_init_actions(self, project):
         bucket = "gs://dataproc-init-actions-test-{}".format(
             re.sub("[.:]", "", project.replace("google", "goog")))
 
-        ret_val, _, _ = self.run_command("gsutil ls -b {}".format(bucket))
-        # Create staging bucket if it does not exist
-        if ret_val != 0:
-            self.assert_command("gsutil mb {}".format(bucket))
+        # Check if the bucket exists by listing it.
+        ret_val, stdout, _ = self.run_command("gsutil ls -b {}".format(bucket))
+
+        # If gsutil ls -b succeeds and the bucket name is in the output, it exists.
+        if ret_val == 0 and bucket in stdout:
+            print(f"Bucket {bucket} already exists.")
+        else:
+            print(f"Bucket {bucket} does not exist or could not be verified, attempting to create.")
+            self.assert_command("gsutil mb -p {} -l {} {}".format(self.PROJECT, self.REGION, bucket))
 
         staging_dir = "{}/{}-{}".format(bucket, self.datetime_str(),
                                         self.random_str())