From 21edb5e4f49057e29845971ad44c7b953e39438b Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 15 Sep 2025 12:09:13 +0300 Subject: [PATCH 1/2] AZP: accept nvidia_peermem and drop legacy service check - Pass GPU peer-memory check if either nvidia_peermem or nv_peer_mem is loaded - Remove checking for old nv_peer_mem systemd service - Keep try_load_cuda_env() behavior: verify via /sys/kernel/mm/memory_peers/nv_mem/version - Remove redundant lsmod grep and stop referencing nv_peer_mem systemd service - Support hosts migrating to nvidia_peermem while preserving backward compatibility Signed-off-by: Alexey Rivkin --- buildlib/az-helpers.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index 6c1e2f217b2..c143e59ceb3 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -163,10 +163,9 @@ check_nv_peer_mem() { return 0 fi - if ! lsmod | grep -q 'nv.*_peer.*mem'; then - lsmod | grep 'nv.*_peer.*mem' - systemctl status nv_peer_mem - azure_log_error "nv_peer_mem module not loaded on $(hostname -s)" + # Accept both legacy nv_peer_mem and new nvidia_peermem + if ! lsmod | egrep -q '^(nvidia_peermem|nv_peer_mem)\b'; then + azure_log_error "NV peer memory module not loaded on $(hostname -s)" exit 1 fi } From 38a7247e451420f990d0d5f5e4fdecaff201a10e Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 15 Sep 2025 15:19:24 +0300 Subject: [PATCH 2/2] AZP: accept nvidia_peermem and drop legacy service check Restore the original 'nv.*_peer.*mem' lsmod check, which already matches both nv_peer_mem and nvidia_peermem, together with its diagnostic lsmod output and error message; the systemctl status nv_peer_mem call stays removed. Signed-off-by: Alexey Rivkin --- buildlib/az-helpers.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index c143e59ceb3..3d62692e904 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -163,9 +163,9 @@ check_nv_peer_mem() { return 0 fi - # Accept both legacy nv_peer_mem and new nvidia_peermem - if ! 
lsmod | egrep -q '^(nvidia_peermem|nv_peer_mem)\b'; then - azure_log_error "NV peer memory module not loaded on $(hostname -s)" + if ! lsmod | grep -q 'nv.*_peer.*mem'; then + lsmod | grep 'nv.*_peer.*mem' + azure_log_error "nv_peer_mem module not loaded on $(hostname -s)" exit 1 fi }