From f5f21efcfcb3bba3bcbd9d1e372db49d8cb723b4 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 11:16:24 -0700 Subject: [PATCH 1/8] [gpu] merge install_gpu_driver.sh from custom-images --- gpu/install_gpu_driver.sh | 370 +++++++++++++++++++++++++++----------- 1 file changed, 266 insertions(+), 104 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 10b1aa061..c0129dcb7 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -16,24 +16,26 @@ set -euxo pipefail -function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_version() { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; } -function is_rocky() { [[ "$(os_id)" == 'rocky' ]] ; } -function is_rocky8() { is_rocky && [[ "$(os_version)" == '8'* ]] ; } -function is_rocky9() { is_rocky && [[ "$(os_version)" == '9'* ]] ; } -function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } -function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function is_ubuntu20() { is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; } -function is_ubuntu22() { is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; } -function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } -function is_debian10() { is_debian && [[ "$(os_version)" == '10'* ]] ; } -function is_debian11() { is_debian && [[ "$(os_version)" == '11'* ]] ; } -function is_debian12() { is_debian && [[ "$(os_version)" == '12'* ]] ; } -function os_vercat() { set +x +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) +function is_rocky() ( set +x ; [[ "$(os_id)" == 'rocky' ]] ; ) +function is_rocky8() ( set +x ; is_rocky && [[ "$(os_version)" == '8'* ]] ; ) +function is_rocky9() ( set +x ; is_rocky && [[ "$(os_version)" == '9'* ]] ; ) +function is_ubuntu() ( set +x ; [[ "$(os_id)" == 'ubuntu' ]] ; ) +function is_ubuntu18() ( set +x ; is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; ) +function is_ubuntu20() ( set +x ; is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; ) +function is_ubuntu22() ( set +x ; is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; ) +function is_debian() ( set +x ; [[ "$(os_id)" == 'debian' ]] ; ) +function is_debian10() ( set +x ; is_debian && [[ "$(os_version)" == '10'* ]] ; ) +function is_debian11() ( set +x ; is_debian && [[ "$(os_version)" == '11'* ]] ; ) +function is_debian12() ( set +x ; is_debian && [[ "$(os_version)" == '12'* ]] ; ) +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) + +function os_vercat() ( set +x if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' - else os_version ; fi ; set -x ; } + else os_version ; fi ; ) function remove_old_backports { if is_debian12 ; then return ; fi @@ -56,11 +58,13 @@ function remove_old_backports { done } +# Return true if the first argument is equal to or less than the second argument function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } -function compare_versions_lt() { +# Return true if the first argument is less than the second argument +function compare_versions_lt() ( set +x [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 -} +) function print_metadata_value() { local readonly tmpfile=$(mktemp) @@ -83,7 +87,7 @@ function print_metadata_value_if_exists() { return ${return_code} } -function get_metadata_value() { +function get_metadata_value() ( set +x local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 @@ -95,17 +99,16 @@ function get_metadata_value() { print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} return_code=$? fi - set -x + return ${return_code} -} +) -function get_metadata_attribute() { +function get_metadata_attribute() ( set +x local -r attribute_name="$1" local -r default_value="${2:-}" get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" - set -x -} +) OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') distribution=$(. /etc/os-release;echo $ID$VERSION_ID) @@ -140,8 +143,8 @@ CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") readonly CUDA_VERSION readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" -function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } -function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } +function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) readonly DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") if is_debian11 || is_ubuntu22 || is_ubuntu20 ; then DRIVER_VERSION="560.28.03" ; fi @@ -153,8 +156,8 @@ readonly DRIVER=${DRIVER_VERSION%%.*} # Parameters for NVIDIA-provided CUDNN library readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") -function is_cudnn8() { [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; } -function is_cudnn9() { [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; } +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) if is_rocky \ && (compare_versions_lte "${CUDNN_VERSION}" "8.0.5.39") ; then CUDNN_VERSION="8.0.5.39" @@ -256,16 +259,19 @@ NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 -function execute_with_retries() { +function execute_with_retries() ( set +x local -r cmd="$*" + + if [[ "$cmd" =~ "^apt-get install" ]] ; then + cmd="apt-get -y clean && $cmd" + fi for ((i = 0; i < 3; i++)); do - if eval "$cmd"; then set -x ; return 0 ; fi + if eval "$cmd" ; then return 0 ; fi sleep 5 done - set -x return 1 -} +) CUDA_KEYRING_PKG_INSTALLED="0" function install_cuda_keyring_pkg() { @@ -273,9 +279,9 @@ function install_cuda_keyring_pkg() { local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o /tmp/cuda-keyring.deb - dpkg -i "/tmp/cuda-keyring.deb" - rm -f "/tmp/cuda-keyring.deb" + -o "${download_dir}/cuda-keyring.deb" + dpkg -i "${download_dir}/cuda-keyring.deb" + rm -f "${download_dir}/cuda-keyring.deb" CUDA_KEYRING_PKG_INSTALLED="1" } @@ -295,10 +301,10 @@ function install_local_cuda_repo() { readonly DIST_KEYRING_DIR="/var/${pkgname}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "/tmp/${LOCAL_INSTALLER_DEB}" + "${LOCAL_DEB_URL}" -o "${download_dir}/${LOCAL_INSTALLER_DEB}" - dpkg -i "/tmp/${LOCAL_INSTALLER_DEB}" - rm "/tmp/${LOCAL_INSTALLER_DEB}" + dpkg -i "${download_dir}/${LOCAL_INSTALLER_DEB}" + rm "${download_dir}/${LOCAL_INSTALLER_DEB}" cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then @@ -323,11 +329,11 @@ function install_local_cudnn_repo() { # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o /tmp/local-installer.deb + "${local_deb_url}" -o "${download_dir}/local-installer.deb" - dpkg -i /tmp/local-installer.deb + dpkg -i "${download_dir}/local-installer.deb" - rm -f /tmp/local-installer.deb + rm -f "${download_dir}/local-installer.deb" cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings @@ -354,8 +360,9 @@ function install_local_cudnn8_repo() { pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" CUDNN8_PKG_NAME="${pkgname}" - local_deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${local_deb_fn}" + deb_fn="${pkgname}_1.0-1_amd64.deb" + local_deb_fn="${download_dir}/${deb_fn}" + local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ "${local_deb_url}" -o "${local_deb_fn}" @@ -378,7 +385,9 @@ function install_nvidia_nccl() { if is_rocky ; then time execute_with_retries \ dnf -y -q install \ - "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" + "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync elif is_ubuntu ; then install_cuda_keyring_pkg @@ -387,11 +396,15 @@ function install_nvidia_nccl() { if is_ubuntu18 ; then time execute_with_retries \ apt-get install -q -y \ - libnccl2 libnccl-dev + libnccl2 libnccl-dev \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync else time execute_with_retries \ apt-get install -q -y \ - "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" + "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync fi else echo "Unsupported OS: '${OS_NAME}'" @@ -403,8 +416,8 @@ function install_nvidia_nccl() { fi } -function is_src_nvidia() { [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; } -function is_src_os() { [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; } +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { local major_version @@ -414,17 +427,21 @@ function install_nvidia_cudnn() { if is_rocky ; then if is_cudnn8 ; then - execute_with_retries "dnf -y -q install" \ + time execute_with_retries dnf -y -q install \ "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" + "libcudnn${major_version}-devel" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync elif is_cudnn9 ; then - execute_with_retries "dnf -y -q install" \ + time execute_with_retries dnf -y -q install \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync else echo "Unsupported cudnn version: '${major_version}'" fi - elif is_debian || is_ubuntu; then + elif is_debuntu; then if is_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else @@ -434,20 +451,24 @@ function install_nvidia_cudnn() { apt-get update -qq - execute_with_retries \ + time execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" + "libcudnn8-dev=${cudnn_pkg_version}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync elif is_cudnn9 ; then install_cuda_keyring_pkg apt-get update -qq - execute_with_retries \ + time execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi @@ -457,8 +478,10 @@ function install_nvidia_cudnn() { packages=( "libcudnn${major_version}=${cudnn_pkg_version}" "libcudnn${major_version}-dev=${cudnn_pkg_version}") - execute_with_retries \ - "apt-get install -q -y --no-install-recommends ${packages[*]}" + time execute_with_retries \ + apt-get install -q -y --no-install-recommends "${packages[*]}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync else echo "Unsupported OS: '${OS_NAME}'" exit 1 @@ -579,7 +602,7 @@ function add_nonfree_components() { } function add_repo_nvidia_container_toolkit() { - if is_debian || is_ubuntu ; then + if is_debuntu ; then local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html @@ -595,7 +618,7 @@ function add_repo_nvidia_container_toolkit() { } function add_repo_cuda() { - if is_debian || is_ubuntu ; then + if is_debuntu ; then local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ @@ -624,8 +647,7 @@ function build_driver_from_github() { tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ - -o "${tarball_fn}" - tar xzf "${tarball_fn}" + | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } cd open-gpu-kernel-modules @@ -633,6 +655,7 @@ function build_driver_from_github() { time make -j$(nproc) modules \ > /var/log/open-gpu-kernel-modules-build.log \ 2> /var/log/open-gpu-kernel-modules-build_error.log + sync if [[ -n "${PSN}" ]]; then #configure_dkms_certs @@ -669,38 +692,47 @@ function build_driver_from_packages() { fi add_contrib_component apt-get update -qq - execute_with_retries "apt-get install -y -qq --no-install-recommends dkms" + execute_with_retries apt-get install -y -qq --no-install-recommends dkms \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } #configure_dkms_certs - time execute_with_retries "apt-get install -y -qq --no-install-recommends ${pkglist[@]}" + time execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync elif is_rocky ; then #configure_dkms_certs - if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then + if time execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else - time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } fi + sync fi #clear_dkms_key } function install_nvidia_userspace_runfile() { - if test -d /run/nvidia-userspace ; then return ; fi + if test -f "${download_dir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o userspace.run - time bash "./userspace.run" --no-kernel-modules --silent --install-libglvnd \ - > /dev/null 2>&1 - rm -f userspace.run - mkdir -p /run/nvidia-userspace + "${USERSPACE_URL}" -o "${download_dir}/userspace.run" + time bash "${download_dir}/userspace.run" --no-kernel-modules --silent --install-libglvnd \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + rm -f "${download_dir}/userspace.run" + touch "${download_dir}/userspace-complete" + sync } function install_cuda_runfile() { - if test -d /run/nvidia-cuda ; then return ; fi - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o cuda.run - time bash "./cuda.run" --silent --toolkit --no-opengl-libs - rm -f cuda.run - mkdir -p /run/nvidia-cuda + if test -f "${download_dir}/cuda-complete" ; then return ; fi + time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_CUDA_URL}" -o "${download_dir}/cuda.run" + time bash "${download_dir}/cuda.run" --silent --toolkit --no-opengl-libs \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + rm -f "${download_dir}/cuda.run" + touch "${download_dir}/cuda-complete" + sync } function install_cuda_toolkit() { @@ -712,11 +744,15 @@ function install_cuda_toolkit() { fi cuda_package="cuda=${CUDA_FULL_VERSION}-1" readonly cudatk_package - if is_ubuntu || is_debian ; then + if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - time execute_with_retries "apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}" + time execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} \ + > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + sync elif is_rocky ; then - time execute_with_retries "dnf -y -q install ${cudatk_package}" + time execute_with_retries dnf -y -q install "${cudatk_package}" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync fi } @@ -774,7 +810,7 @@ function install_nvidia_gpu_driver() { load_kernel_module install_cuda_runfile - elif is_debian || is_ubuntu ; then + elif is_debuntu ; then install_cuda_keyring_pkg build_driver_from_packages @@ -796,7 +832,11 @@ function install_nvidia_gpu_driver() { exit 1 fi ldconfig - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + if is_src_os ; then + echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" + else + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + fi } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -812,7 +852,9 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - pip install -r "${install_dir}/requirements.txt" + time execute_with_retries pip install -r "${install_dir}/requirements.txt" \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + sync # Generate GPU service. cat </lib/systemd/system/gpu-utilization-agent.service @@ -837,7 +879,6 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } -readonly bdcfg="/usr/local/bin/bdconfig" function set_hadoop_property() { local -r config_file=$1 local -r property=$2 @@ -991,7 +1032,6 @@ EOF systemctl start dataproc-cgroup-device-permissions } -nvsmi_works="0" function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" if [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2 @@ -1018,24 +1058,28 @@ function main() { remove_old_backports - if is_debian || is_ubuntu ; then + if is_debuntu ; then export DEBIAN_FRONTEND=noninteractive - execute_with_retries "apt-get install -y -qq pciutils linux-headers-${uname_r}" + time execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 elif is_rocky ; then - execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" - execute_with_retries "dnf -y -q install pciutils gcc" + time execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + time execute_with_retries dnf -y -q install pciutils gcc \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" if [[ "${kernel_devel_pkg_out}" =~ 'Unable to find a match: kernel-devel-' ]] ; then # this kernel-devel may have been migrated to the vault local vault="https://download.rockylinux.org/vault/rocky/$(os_version)" - execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + time execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" \ + > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + sync else execute_with_retries "${dnf_cmd}" fi @@ -1213,21 +1257,139 @@ function clean_up_sources_lists() { sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi - if -f /etc/apt/trusted.gpg ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi } -if is_debian ; then - clean_up_sources_lists - apt-get update - if is_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi -fi +function exit_handler() { + echo "Exit handler invoked" + set +ex + # Purge private key material until next grant + clear_dkms_key -configure_dkms_certs + # Free conda cache + /opt/conda/miniconda3/bin/conda clean -a -main + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # remove the tmpfs conda pkgs_dirs + if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi + + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to set global pip cache" + + # Clean up shared memory mounts + for shmdir in /mnt/shm /var/cache/apt/archives /var/cache/dnf ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + sync + + execute_with_retries umount -f ${shmdir} + fi + done + + # Clean up OS package cache ; re-hold systemd package + if is_debuntu ; then + apt-get -y -qq clean + apt-get -y -qq autoremove + if is_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # print disk usage statistics + if is_debuntu ; then + # Rocky doesn't have sort -h and fails when the argument is passed + du --max-depth 3 -hx / | sort -h | tail -10 + fi + + # Process disk usage logs from installation period + rm -f /tmp/keep-running-df + sleep 6s + # compute maximum size of disk during installation + # Log file contains logs like the following (minus the preceeding #): +#Filesystem Size Used Avail Use% Mounted on +#/dev/vda2 6.8G 2.5G 4.0G 39% / + df --si + perl -e '$max=( sort + map { (split)[2] =~ /^(\d+)/ } + grep { m:^/: } )[-1]; +print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log + + echo "exit_handler has completed" + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + dd if=/dev/zero of=/zero ; sync ; rm -f /zero + fi + + return 0 +} + +trap exit_handler EXIT + +function prepare_to_install(){ + nvsmi_works="0" + readonly bdcfg="/usr/local/bin/bdconfig" + download_dir=/tmp/ + free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" + # Write to a ramdisk instead of churning the persistent disk + if [[ ${free_mem} -ge 5250000 ]]; then + download_dir="/mnt/shm" + mkdir -p "${download_dir}" + mount -t tmpfs tmpfs "${download_dir}" + + # Download conda packages to tmpfs + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${download_dir}" + + # Download pip packages to tmpfs + pip config set global.cache-dir "${download_dir}" || echo "unable to set global.cache-dir" + + # Download OS packages to tmpfs + if is_debuntu ; then + mount -t tmpfs tmpfs /var/cache/apt/archives + else + mount -t tmpfs tmpfs /var/cache/dnf + fi + fi + install_log="${download_dir}/install.log" + + if is_debuntu ; then + clean_up_sources_lists + apt-get update -qq + apt-get -y clean + apt-get -y -qq autoremove + if is_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # Clean conda cache + /opt/conda/miniconda3/bin/conda clean -a + + # zero free disk space + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then + set +e + time dd if=/dev/zero of=/zero ; sync ; rm -f /zero + set -e + fi -clear_dkms_key + configure_dkms_certs -df -h + # Monitor disk usage in a screen session + if is_debuntu ; then + apt-get install -y -qq screen > /dev/null 2>&1 + elif is_rocky ; then + dnf -y -q install screen > /dev/null 2>&1 + fi + touch /tmp/keep-running-df + screen -d -m -US keep-running-df \ + bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' +} + +prepare_to_install + +main From 363ea49d8c144eccd1e2641ad39453e9787ca379 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 12:05:23 -0700 Subject: [PATCH 2/8] increase instance size to take advantage of ramdisk --- gpu/test_gpu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 55d8ead85..64dd83e55 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -49,7 +49,7 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -72,7 +72,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -97,7 +97,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -124,7 +124,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, @@ -178,7 +178,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", @@ -214,7 +214,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-standard-2", + machine_type="n1-standard-8", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, From 69714c7619507138d77824ff0b07c2d7605d0baa Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 12:07:21 -0700 Subject: [PATCH 3/8] create ram disk at 10.5G free (increased from 5.25G) --- gpu/install_gpu_driver.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index c0129dcb7..960e92d5c 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1334,9 +1334,10 @@ function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" download_dir=/tmp/ + local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk - if [[ ${free_mem} -ge 5250000 ]]; then + if [[ ${free_mem} -ge 10500000 ]]; then download_dir="/mnt/shm" mkdir -p "${download_dir}" mount -t tmpfs tmpfs "${download_dir}" From 5b1c3e6026b881e2b8376b3108a049096d4619ff Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 17:00:28 -0700 Subject: [PATCH 4/8] renamed download_dir to tmpdir --- gpu/install_gpu_driver.sh | 79 ++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 960e92d5c..12cf274d1 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -279,9 +279,9 @@ function install_cuda_keyring_pkg() { local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ - -o "${download_dir}/cuda-keyring.deb" - dpkg -i "${download_dir}/cuda-keyring.deb" - rm -f "${download_dir}/cuda-keyring.deb" + -o "${tmpdir}/cuda-keyring.deb" + dpkg -i "${tmpdir}/cuda-keyring.deb" + rm -f "${tmpdir}/cuda-keyring.deb" CUDA_KEYRING_PKG_INSTALLED="1" } @@ -301,10 +301,10 @@ function install_local_cuda_repo() { readonly DIST_KEYRING_DIR="/var/${pkgname}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${LOCAL_DEB_URL}" -o "${download_dir}/${LOCAL_INSTALLER_DEB}" + "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" - dpkg -i "${download_dir}/${LOCAL_INSTALLER_DEB}" - rm "${download_dir}/${LOCAL_INSTALLER_DEB}" + dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" + rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then @@ -329,11 +329,11 @@ function install_local_cudnn_repo() { # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ - "${local_deb_url}" -o "${download_dir}/local-installer.deb" + "${local_deb_url}" -o "${tmpdir}/local-installer.deb" - dpkg -i "${download_dir}/local-installer.deb" + dpkg -i "${tmpdir}/local-installer.deb" - rm -f "${download_dir}/local-installer.deb" + rm -f "${tmpdir}/local-installer.deb" cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings @@ -361,7 +361,7 @@ function install_local_cudnn8_repo() { CUDNN8_PKG_NAME="${pkgname}" deb_fn="${pkgname}_1.0-1_amd64.deb" - local_deb_fn="${download_dir}/${deb_fn}" + local_deb_fn="${tmpdir}/${deb_fn}" local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ "${local_deb_url}" -o "${local_deb_fn}" @@ -714,24 +714,24 @@ function build_driver_from_packages() { } function install_nvidia_userspace_runfile() { - if test -f "${download_dir}/userspace-complete" ; then return ; fi + if test -f "${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o "${download_dir}/userspace.run" - time bash "${download_dir}/userspace.run" --no-kernel-modules --silent --install-libglvnd \ + "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" + time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - rm -f "${download_dir}/userspace.run" - touch "${download_dir}/userspace-complete" + rm -f "${tmpdir}/userspace.run" + touch "${tmpdir}/userspace-complete" sync } function install_cuda_runfile() { - if test -f "${download_dir}/cuda-complete" ; then return ; fi + if test -f "${tmpdir}/cuda-complete" ; then return ; fi time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o "${download_dir}/cuda.run" - time bash "${download_dir}/cuda.run" --silent --toolkit --no-opengl-libs \ + "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" + time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - rm -f "${download_dir}/cuda.run" - touch "${download_dir}/cuda-complete" + rm -f "${tmpdir}/cuda.run" + touch "${tmpdir}/cuda-complete" sync } @@ -1241,8 +1241,10 @@ function clean_up_sources_lists() { # cran-r # if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi rm -f /usr/share/keyrings/cran-r.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ gpg --dearmor -o /usr/share/keyrings/cran-r.gpg sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi @@ -1328,25 +1330,24 @@ print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log return 0 } -trap exit_handler EXIT - function prepare_to_install(){ nvsmi_works="0" readonly bdcfg="/usr/local/bin/bdconfig" - download_dir=/tmp/ + tmpdir=/tmp/ local free_mem + trap exit_handler EXIT free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" # Write to a ramdisk instead of churning the persistent disk if [[ ${free_mem} -ge 10500000 ]]; then - download_dir="/mnt/shm" - mkdir -p "${download_dir}" - mount -t tmpfs tmpfs "${download_dir}" + tmpdir="/mnt/shm" + mkdir -p "${tmpdir}" + mount -t tmpfs tmpfs "${tmpdir}" # Download conda packages to tmpfs - /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${download_dir}" + /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" # Download pip packages to tmpfs - pip config set global.cache-dir "${download_dir}" || echo "unable to set global.cache-dir" + pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" # Download OS packages to tmpfs if is_debuntu ; then @@ -1354,8 +1355,10 @@ function prepare_to_install(){ else mount -t tmpfs tmpfs /var/cache/dnf fi + else + tmpdir=/tmp fi - install_log="${download_dir}/install.log" + install_log="${tmpdir}/install.log" if is_debuntu ; then clean_up_sources_lists @@ -1372,23 +1375,23 @@ function prepare_to_install(){ /opt/conda/miniconda3/bin/conda clean -a # zero free disk space - if [[ -n "$(get_metadata_attribute creating-image)" ]]; then - set +e - time dd if=/dev/zero of=/zero ; sync ; rm -f /zero - set -e - fi + if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e + df -h + time dd if=/dev/zero of=/zero status=progress ; sync ; sleep 3s ; rm -f /zero + ) fi configure_dkms_certs # Monitor disk usage in a screen session if is_debuntu ; then apt-get install -y -qq screen > /dev/null 2>&1 - elif is_rocky ; then + else dnf -y -q install screen > /dev/null 2>&1 fi - touch /tmp/keep-running-df + df -h / | tee "${tmpdir}/disk-usage.log" + touch "${tmpdir}/keep-running-df" screen -d -m -US keep-running-df \ - bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done' + bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done" } prepare_to_install From abeffef7f7e822832c412fe82552cc6cd4c38bb0 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 17:00:57 -0700 Subject: [PATCH 5/8] triple timeout - rocky9 I am looking at you --- gpu/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 64dd83e55..4650c03fe 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -128,7 +128,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) From 021b55d2df3efabe276c846e6410cd71286bb161 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 17:46:43 -0700 Subject: [PATCH 6/8] execute_with_retries now times the eval and prints log on failure ; refactoring code accordingly --- gpu/install_gpu_driver.sh | 78 ++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 47 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 12cf274d1..1f73e1af7 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -267,7 +267,8 @@ function execute_with_retries() ( cmd="apt-get -y clean && $cmd" fi for ((i = 0; i < 3; i++)); do - if eval "$cmd" ; then return 0 ; fi + time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } + if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 @@ -383,10 +384,9 @@ function install_nvidia_nccl() { local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" if is_rocky ; then - time execute_with_retries \ + execute_with_retries \ dnf -y -q install \ - "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" sync elif is_ubuntu ; then install_cuda_keyring_pkg @@ -394,16 +394,14 @@ function install_nvidia_nccl() { apt-get update -qq if is_ubuntu18 ; then - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ - libnccl2 libnccl-dev \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + libnccl2 libnccl-dev sync else - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ - "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" sync fi else @@ -427,16 +425,14 @@ function install_nvidia_cudnn() { if is_rocky ; then if is_cudnn8 ; then - time execute_with_retries dnf -y -q install \ + execute_with_retries dnf -y -q install \ "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn${major_version}-devel" sync elif is_cudnn9 ; then - time execute_with_retries dnf -y -q install \ + execute_with_retries dnf -y -q install \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: '${major_version}'" @@ -451,23 +447,21 @@ function install_nvidia_cudnn() { apt-get update -qq - time execute_with_retries \ + execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn8-dev=${cudnn_pkg_version}" sync elif is_cudnn9 ; then install_cuda_keyring_pkg apt-get update -qq - time execute_with_retries \ + execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" @@ -478,9 +472,8 @@ function install_nvidia_cudnn() { packages=( "libcudnn${major_version}=${cudnn_pkg_version}" "libcudnn${major_version}-dev=${cudnn_pkg_version}") - time execute_with_retries \ - apt-get install -q -y --no-install-recommends "${packages[*]}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries \ + apt-get install -q -y --no-install-recommends "${packages[*]}" sync else echo "Unsupported OS: '${OS_NAME}'" @@ -692,21 +685,17 @@ function build_driver_from_packages() { fi add_contrib_component apt-get update -qq - execute_with_retries apt-get install -y -qq --no-install-recommends dkms \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends dkms #configure_dkms_certs - time execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then #configure_dkms_certs - if time execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } ; then + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else - time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' fi sync fi @@ -717,8 +706,7 @@ function install_nvidia_userspace_runfile() { if test -f "${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" - time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + time bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd rm -f "${tmpdir}/userspace.run" touch "${tmpdir}/userspace-complete" sync @@ -728,8 +716,7 @@ function install_cuda_runfile() { if test -f "${tmpdir}/cuda-complete" ; then return ; fi time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" - time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + time bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs rm -f "${tmpdir}/cuda.run" touch "${tmpdir}/cuda-complete" sync @@ -746,12 +733,10 @@ function install_cuda_toolkit() { readonly cudatk_package if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - time execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} \ - > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} sync elif is_rocky ; then - time execute_with_retries dnf -y -q install "${cudatk_package}" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q install "${cudatk_package}" sync fi } @@ -852,8 +837,7 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - time execute_with_retries pip install -r "${install_dir}/requirements.txt" \ - > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries pip install -r "${install_dir}/requirements.txt" sync # Generate GPU service. @@ -1060,11 +1044,11 @@ function main() { if is_debuntu ; then export DEBIAN_FRONTEND=noninteractive - time execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 elif is_rocky ; then - time execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ + execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } - time execute_with_retries dnf -y -q install pciutils gcc \ + execute_with_retries dnf -y -q install pciutils gcc \ > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" @@ -1072,7 +1056,7 @@ function main() { if [[ "${kernel_devel_pkg_out}" =~ 'Unable to find a match: kernel-devel-' ]] ; then # this kernel-devel may have been migrated to the vault local vault="https://download.rockylinux.org/vault/rocky/$(os_version)" - time execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + execute_with_retries dnf -y -q --setopt=localpkg_gpgcheck=1 install \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ From b6cc9d4548a4b1f3a1ab618dab7b32ca600a813d Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Mon, 28 Oct 2024 17:52:59 -0700 Subject: [PATCH 7/8] using correct path to the disk-usage.log --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 1f73e1af7..0f29d0886 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -1302,7 +1302,7 @@ function exit_handler() { perl -e '$max=( sort map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[-1]; -print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log +print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log" echo "exit_handler has completed" From 4f99a53e047db02c7eb9b000ed892d8c95971376 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 29 Oct 2024 08:12:30 -0700 Subject: [PATCH 8/8] retry install of screen ; update versions to include cuda 12.6 --- gpu/install_gpu_driver.sh | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 0f29d0886..6e373913d 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -120,21 +120,20 @@ readonly ROLE # CUDA version and Driver version # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# https://developer.nvidia.com/cuda-downloads readonly -A DRIVER_FOR_CUDA=( - [11.8]="525.147.05" [12.1]="530.30.02" [12.4]="550.54.14" - [12.5]="555.42.06" [12.6]="560.28.03" + [11.8]="525.147.05" [12.4]="550.54.14" [12.6]="560.35.03" ) +# https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( - [11.8]="8.6.0.163" [12.1]="8.9.0" [12.4]="9.1.0.70" - [12.5]="9.2.1.18" + [11.8]="9.5.1.17" [12.4]="9.5.1.17" [12.6]="9.5.1.17" ) +# https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( - [11.8]="2.15.5" [12.1]="2.17.1" [12.4]="2.21.5" - [12.5]="2.22.3" + [11.8]="2.15.5" [12.4]="2.23.4" [12.6]="2.23.4" ) readonly -A CUDA_SUBVER=( - [11.8]="11.8.0" [12.1]="12.1.0" [12.4]="12.4.1" - [12.5]="12.5.1" + [11.8]="11.8.0" [12.4]="12.4.1" [12.6]="12.6.2" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') @@ -216,6 +215,7 @@ readonly -A DEFAULT_NVIDIA_CUDA_URLS=( [11.8]="${NVIDIA_BASE_DL_URL}/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" [12.1]="${NVIDIA_BASE_DL_URL}/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run" [12.4]="${NVIDIA_BASE_DL_URL}/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run" + [12.6]="${NVIDIA_BASE_DL_URL}/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run" ) readonly DEFAULT_NVIDIA_CUDA_URL=${DEFAULT_NVIDIA_CUDA_URLS["${CUDA_VERSION}"]} NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") @@ -233,9 +233,9 @@ if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then fi CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" fi -if ( compare_versions_lte "12.0" "${CUDA_VERSION}" ); then - # When cuda version is greater than 12.0 - CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.2.0.82_cuda12-archive.tar.xz" +if is_cuda12 ; then + # When cuda version is 12 + CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz" fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL @@ -1368,9 +1368,9 @@ function prepare_to_install(){ # Monitor disk usage in a screen session if is_debuntu ; then - apt-get install -y -qq screen > /dev/null 2>&1 + execute_with_retries apt-get install -y -qq screen else - dnf -y -q install screen > /dev/null 2>&1 + execute_with_retries dnf -y -q install screen fi df -h / | tee "${tmpdir}/disk-usage.log" touch "${tmpdir}/keep-running-df"