From b1be931c151c256eff3f941360ff89b73d94f0bf Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Oct 2025 16:35:40 +0800 Subject: [PATCH 1/8] update for dataproc 3.0 images support --- spark-rapids/spark-rapids.sh | 125 +++++++++++++++++++++++++++++------ 1 file changed, 103 insertions(+), 22 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index f6415e05e..460336c72 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -13,7 +13,8 @@ # limitations under the License. # This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. -# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only +# However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only. +# For Ubuntu 24.04 with kernel 6.14+, uses repository installation to get latest CUDA toolkit and NVIDIA driver 570+ for compatibility. # Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. # Note that the script is designed to work when secure boot is disabled during cluster creation. # It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. @@ -60,6 +61,10 @@ function is_ubuntu22() { is_ubuntu && [[ "$(os_version)" == '22.04'* ]] } +function is_ubuntu24() { + is_ubuntu && [[ "$(os_version)" == '24.04'* ]] +} + function is_rocky() { [[ "$(os_id)" == 'rocky' ]] } @@ -210,6 +215,11 @@ readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[ if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then readonly DEFAULT_XGBOOST_VERSION="1.7.6" readonly SPARK_VERSION="3.0" + readonly SCALA_VERSION="2.12" +elif [[ "${SPARK_VERSION_ENV}" == "4"* ]]; then + readonly DEFAULT_XGBOOST_VERSION="1.7.6" + readonly SPARK_VERSION="4.0" + readonly SCALA_VERSION="2.13" else echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." exit 1 @@ -232,11 +242,32 @@ CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.2 # EXCEPTIONS # Change CUDA version for Ubuntu 18 (Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) +# Change CUDA version for Ubuntu 24 (Cuda 12.4.1 is not available, use 12.6.0) if [[ "${OS_NAME}" == "ubuntu" ]]; then if is_ubuntu18 ; then CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.1.1') #12.1.1 NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '530.30.02') #530.30.02 CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1 + elif is_ubuntu24 ; then + # CUDA 12.4.1 is not available for Ubuntu 24.04, use 12.6.0 instead + # For kernel 6.14+, use NVIDIA driver 570 for compatibility + KERNEL_VERSION=$(uname -r | cut -d'-' -f1) + KERNEL_MAJOR=$(echo "$KERNEL_VERSION" | cut -d'.' -f1) + KERNEL_MINOR=$(echo "$KERNEL_VERSION" | cut -d'.' -f2) + + if [[ "$KERNEL_MAJOR" -eq 6 && "$KERNEL_MINOR" -ge 14 ]]; then + # For kernel 6.14+ (dataproc 3), use repository installation to get latest CUDA and compatible drivers + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' 'latest') #latest from repo + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '570') #570 series + CUDA_VERSION_MAJOR="12" #Will be determined from repository + USE_REPO_INSTALL="true" + else + # Use CUDA 12.6.0 local installer for older kernels + CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.6.0') #12.6.0 + NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '560.28.03') #560.28.03 + CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.6 + USE_REPO_INSTALL="false" + fi fi fi @@ -272,20 +303,27 @@ function execute_with_retries() { function install_spark_rapids() { local -r nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia' local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${nvidia_repo_url}/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + + # For Spark 4.0 with Scala 2.13, use the cuda12 variant + if [[ "${SPARK_VERSION}" == "4.0" ]]; then + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_${SCALA_VERSION}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VERSION}-${SPARK_RAPIDS_VERSION}-cuda12.jar" \ + -P /usr/lib/spark/jars/ + else + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${nvidia_repo_url}/rapids-4-spark_${SCALA_VERSION}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VERSION}-${SPARK_RAPIDS_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + fi } function configure_spark() { - if [[ "${SPARK_VERSION}" == "3"* ]]; then + if [[ "${SPARK_VERSION}" == "3"* ]] || [[ "${SPARK_VERSION}" == "4"* ]]; then cat >>${SPARK_CONF_DIR}/spark-defaults.conf <&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) - if [[ ${spark_version} != 3.* ]]; then + if [[ ${spark_version} != 3.* ]] && [[ ${spark_version} != 4.* ]]; then # include exclusive mode on GPU nvidia-smi -c EXCLUSIVE_PROCESS fi @@ -762,7 +842,7 @@ function check_os_and_secure_boot() { exit 1 fi elif is_ubuntu ; then - if ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ; then + if ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 && ! is_ubuntu24 ; then echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." exit 1 fi @@ -832,3 +912,4 @@ function main() { } main + From f5b2f1af4c89ebcc0fe68a87a6d3913aa7c78776 Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:03:15 +0800 Subject: [PATCH 2/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 460336c72..8929051cd 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -527,7 +527,7 @@ function install_nvidia_gpu_driver() { # Install latest CUDA toolkit and compatible NVIDIA driver execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit" - execute_with_retries "apt-get install -y -q --no-install-recommends nvidia-driver-570-open" + execute_with_retries "apt-get install -y -q --no-install-recommends nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" clear_dkms_key modprobe nvidia From 3e226b46e5cb93ff9d3750e1e5e76e47533f2411 Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:03:31 +0800 Subject: [PATCH 3/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 8929051cd..d457b508a 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -514,7 +514,7 @@ function install_nvidia_gpu_driver() { # Repository-based installation for latest CUDA and kernel 6.14+ compatibility # Install CUDA keyring for repository access - execute_with_retries "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb" + execute_with_retries "wget https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/cuda-keyring_1.1-1_all.deb" execute_with_retries "dpkg -i cuda-keyring_1.1-1_all.deb" rm -f cuda-keyring_1.1-1_all.deb From 7979da1214d5022c9b62b799a9ded0f52fbbcefc Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:03:50 +0800 Subject: [PATCH 4/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index d457b508a..1d17175ee 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -519,6 +519,7 @@ function install_nvidia_gpu_driver() { rm -f cuda-keyring_1.1-1_all.deb # Add graphics-drivers PPA for latest NVIDIA drivers + execute_with_retries "apt-get install -y -q software-properties-common" execute_with_retries "add-apt-repository -y ppa:graphics-drivers/ppa" execute_with_retries "apt-get update" From 6881284ecd5ae49791b2f32dcaa1eac4aa7d2dfc Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:04:07 +0800 Subject: [PATCH 5/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 1d17175ee..7da270ceb 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -557,6 +557,7 @@ function install_nvidia_gpu_driver() { execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}" # Then upgrade to driver 570 from graphics-drivers PPA + execute_with_retries "apt-get install -y -q --no-install-recommends software-properties-common" execute_with_retries "add-apt-repository -y ppa:graphics-drivers/ppa" execute_with_retries "apt-get update" execute_with_retries "apt-get install -y -q --no-install-recommends nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open" From c0cb9f696b35c184f6ac31c75eceb3e348970176 Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:04:28 +0800 Subject: [PATCH 6/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 7da270ceb..c33d00819 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -14,7 +14,7 @@ # This script installs NVIDIA GPU drivers (version 535.104.05) along with CUDA 12.2. # However, Cuda 12.1.1 - Driver v530.30.02 is used for Ubuntu 18 only. -# For Ubuntu 24.04 with kernel 6.14+, uses repository installation to get latest CUDA toolkit and NVIDIA driver 570+ for compatibility. +# For Ubuntu 24.04 with kernel 6.14+, this script uses repository installation to get the latest CUDA toolkit and NVIDIA driver 570+ for compatibility. # Additionally, it installs the RAPIDS Spark plugin, configures Spark and YARN, and is compatible with Debian, Ubuntu, and Rocky Linux distributions. # Note that the script is designed to work when secure boot is disabled during cluster creation. # It also creates a Systemd Service for maintaining up-to-date Kernel Headers on Debian and Ubuntu. From 4da616c80f11bcc1372e0c2a1f1655abcfd612e5 Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:05:01 +0800 Subject: [PATCH 7/8] Update spark-rapids/spark-rapids.sh Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- spark-rapids/spark-rapids.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index c33d00819..578bf5313 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -314,11 +314,11 @@ function install_spark_rapids() { "${nvidia_repo_url}/rapids-4-spark_${SCALA_VERSION}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VERSION}-${SPARK_RAPIDS_VERSION}.jar" \ -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ - -P /usr/lib/spark/jars/ + "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ fi } From 0194597765200f25478764ea96fd6229149a257a Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Oct 2025 17:10:22 +0800 Subject: [PATCH 8/8] address comment to update xgboost jars Signed-off-by: liyuan --- spark-rapids/spark-rapids.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh index 578bf5313..2a45ebde6 100644 --- a/spark-rapids/spark-rapids.sh +++ b/spark-rapids/spark-rapids.sh @@ -217,7 +217,7 @@ if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then readonly SPARK_VERSION="3.0" readonly SCALA_VERSION="2.12" elif [[ "${SPARK_VERSION_ENV}" == "4"* ]]; then - readonly DEFAULT_XGBOOST_VERSION="1.7.6" + readonly DEFAULT_XGBOOST_VERSION="2.1.4" readonly SPARK_VERSION="4.0" readonly SCALA_VERSION="2.13" else @@ -304,20 +304,28 @@ function install_spark_rapids() { local -r nvidia_repo_url='https://edge.urm.nvidia.com/artifactory/sw-spark-maven/com/nvidia' local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - # For Spark 4.0 with Scala 2.13, use the cuda12 variant + # For Spark 4.0 with Scala 2.13, use the cuda12 variant and Scala 2.13 XGBoost JARs if [[ "${SPARK_VERSION}" == "4.0" ]]; then wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${nvidia_repo_url}/rapids-4-spark_${SCALA_VERSION}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VERSION}-${SPARK_RAPIDS_VERSION}-cuda12.jar" \ -P /usr/lib/spark/jars/ + # Download XGBoost JARs for Scala 2.13 (Spark 4.0) + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_${SCALA_VERSION}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${SCALA_VERSION}-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + "${dmlc_repo_url}/xgboost4j-gpu_${SCALA_VERSION}/${XGBOOST_VERSION}/xgboost4j-gpu_${SCALA_VERSION}-${XGBOOST_VERSION}.jar" \ + -P /usr/lib/spark/jars/ else + # For Spark 3.0 with Scala 2.12 wget -nv --timeout=30 --tries=5 --retry-connrefused \ "${nvidia_repo_url}/rapids-4-spark_${SCALA_VERSION}/${SPARK_RAPIDS_VERSION}/rapids-4-spark_${SCALA_VERSION}-${SPARK_RAPIDS_VERSION}.jar" \ -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-spark-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-spark-gpu_2.12-${XGBOOST_VERSION}.jar" \ + "${dmlc_repo_url}/xgboost4j-spark-gpu_${SCALA_VERSION}/${XGBOOST_VERSION}/xgboost4j-spark-gpu_${SCALA_VERSION}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${dmlc_repo_url}/xgboost4j-gpu_2.12/${XGBOOST_VERSION}/xgboost4j-gpu_2.12-${XGBOOST_VERSION}.jar" \ + "${dmlc_repo_url}/xgboost4j-gpu_${SCALA_VERSION}/${XGBOOST_VERSION}/xgboost4j-gpu_${SCALA_VERSION}-${XGBOOST_VERSION}.jar" \ -P /usr/lib/spark/jars/ fi }