From 203620daf1415f265c755c5d4f6b9362618a144b Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 28 Oct 2025 16:42:17 +0000 Subject: [PATCH 1/6] Enable temporarily disabled tests --- sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp | 4 ---- sycl/test-e2e/Basic/buffer/buffer_create.cpp | 5 ----- 2 files changed, 9 deletions(-) diff --git a/sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp b/sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp index ed3ec9cf72845..373892f2b8e18 100644 --- a/sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp +++ b/sycl/test-e2e/Adapters/level_zero/interop-get-native-mem.cpp @@ -7,10 +7,6 @@ // RUN: %{build} %level_zero_options -o %t.out // RUN: %{run} %t.out -// L0v2 adapter does not support integrated buffers yet -// UNSUPPORTED: level_zero_v2_adapter -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20280 - // Test get_native_mem for the Level Zero backend. // Level-Zero diff --git a/sycl/test-e2e/Basic/buffer/buffer_create.cpp b/sycl/test-e2e/Basic/buffer/buffer_create.cpp index c3b7f2bf85e01..08c7f1049b7ce 100644 --- a/sycl/test-e2e/Basic/buffer/buffer_create.cpp +++ b/sycl/test-e2e/Basic/buffer/buffer_create.cpp @@ -8,11 +8,6 @@ // RUN: %{run} %t.out 2>&1 | FileCheck %s // UNSUPPORTED: ze_debug -// L0v2 adapter doesn't optimize buffer creation based on device type yet -// (integrated buffer implementation needs more work). -// UNSUPPORTED: level_zero_v2_adapter -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20121 - #include #include #include From c3ce32be2851c70e8b54e94c093994c0326960e7 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Mon, 3 Nov 2025 13:55:57 +0000 Subject: [PATCH 2/6] detect and handle host mem in integrated gpu Signed-off-by: Mateusz P. 
Nowak --- sycl/test-e2e/lit.cfg.py | 2 +- .../source/adapters/level_zero/v2/memory.cpp | 102 +++++++++++++----- .../source/adapters/level_zero/v2/memory.hpp | 1 - 3 files changed, 74 insertions(+), 31 deletions(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 36e06223bdb7f..ef69d99e30367 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -706,7 +706,7 @@ def remove_level_zero_suffix(devices): available_devices = { "opencl": ("cpu", "gpu", "fpga"), "cuda": "gpu", - "level_zero": "gpu", + "level_zero": ("gpu", "0", "1"), "hip": "gpu", "native_cpu": "cpu", "offload": "gpu", diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index 941d73be8de3e..3c84048e0d3d4 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -57,16 +57,34 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( ur_context_handle_t hContext, void *hostPtr, size_t size, device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode) { - bool hostPtrImported = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); + if (hostPtr) { + // Host pointer provided - check if it's already USM or needs import + ZeStruct memProps; + auto ret = getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); + + if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { + // Already a USM allocation - just use it directly without import + this->ptr = usm_unique_ptr_t(hostPtr, [](void *) { + // Don't free - we don't own this memory + }); + } else { + // Not USM - try to import it + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + + if (!hostPtrImported) { + // This should not happen if urMemBufferCreate logic is correct + 
throw UR_RESULT_ERROR_INVALID_VALUE; + } - if (hostPtrImported) { - this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { - ZeUSMImport.doZeUSMRelease( - hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); - }); + this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { + ZeUSMImport.doZeUSMRelease( + hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); + }); + } } else { + // No host pointer - allocate new USM host memory void *rawPtr; UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); @@ -77,11 +95,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( UR_LOG(ERR, "Failed to free host memory: {}", ret); } }); - - if (hostPtr) { - std::memcpy(this->ptr.get(), hostPtr, size); - writeBackPtr = hostPtr; - } } } @@ -98,9 +111,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( } ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { - if (writeBackPtr) { - std::memcpy(writeBackPtr, this->ptr.get(), size); - } + // No writeback needed - integrated buffers use zero-copy access } void *ur_integrated_buffer_handle_t::getDevicePtr( @@ -113,18 +124,14 @@ void *ur_integrated_buffer_handle_t::getDevicePtr( void *ur_integrated_buffer_handle_t::mapHostPtr( ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) { - // TODO: if writeBackPtr is set, we should map to that pointer - // because that's what SYCL expects, SYCL will attempt to call free - // on the resulting pointer leading to double free with the current - // implementation. Investigate the SYCL implementation. 
+ // For integrated devices, both device and host access the same memory return ur_cast(ptr.get()) + offset; } void ur_integrated_buffer_handle_t::unmapHostPtr( void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) { - // TODO: if writeBackPtr is set, we should copy the data back - /* nop */ + // No-op: integrated buffers use zero-copy, no synchronization needed } static v2::raii::command_list_unique_handle @@ -410,19 +417,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr( // nop } -static bool useHostBuffer(ur_context_handle_t /* hContext */) { +static bool useHostBuffer(ur_context_handle_t hContext) { // We treat integrated devices (physical memory shared with the CPU) // differently from discrete devices (those with distinct memories). // For integrated devices, allocating the buffer in the host memory // enables automatic access from the device, and makes copying // unnecessary in the map/unmap operations. This improves performance. - // TODO: fix integrated buffer implementation - return false; - - // return hContext->getDevices().size() == 1 && - // hContext->getDevices()[0]->ZeDeviceProperties->flags & - // ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; + return hContext->getDevices().size() == 1 && + hContext->getDevices()[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; } ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset, @@ -566,6 +570,46 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, void *hostPtr = pProperties ? pProperties->pHost : nullptr; auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); + // For integrated devices, we can use zero-copy host buffers when: + // 1. No host pointer is provided (we'll allocate USM host memory) + // 2. Host pointer is already USM memory + // 3. Host pointer can be imported as USM + // Otherwise, fall back to discrete buffer (explicit copies). 
+ if (useHostBuffer(hContext) && hostPtr) { + // Check what type of memory this pointer is + ZeStruct memProps; + auto ret = getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); + + if (ret == UR_RESULT_SUCCESS) { + if (memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { + // Already USM memory (host, device, or shared) - use integrated path + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + return UR_RESULT_SUCCESS; + } + + // Memory type is UNKNOWN - try to import it + bool canImport = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + if (!canImport) { + // Cannot import: fall back to discrete buffer path + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + return UR_RESULT_SUCCESS; + } + // Successfully imported: release it now, constructor will import again + ZeUSMImport.doZeUSMRelease( + hContext->getPlatform()->ZeDriverHandleExpTranslated, hostPtr); + } else { + // Cannot get memory attributes: fall back to discrete buffer + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + return UR_RESULT_SUCCESS; + } + } + + // Use integrated buffer path (no hostPtr, or hostPtr is USM/importable) if (useHostBuffer(hContext)) { *phBuffer = ur_mem_handle_t_::create( hContext, hostPtr, size, accessMode); diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index 813d09bf864c2..dbf89fae845e1 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -111,7 +111,6 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { private: usm_unique_ptr_t ptr; - void *writeBackPtr = nullptr; }; struct host_allocation_desc_t { From f1b62a925b2108cf1ed020ed65e95e35184c15ce Mon Sep 17 00:00:00 2001 From: "Mateusz P. 
Nowak" Date: Tue, 4 Nov 2025 11:58:42 +0000 Subject: [PATCH 3/6] some cleanups Signed-off-by: Mateusz P. Nowak --- .../source/adapters/level_zero/v2/memory.cpp | 25 ++++++++----------- .../source/adapters/level_zero/v2/memory.hpp | 2 -- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index d15c313756817..bad71c08e8542 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#include "memory.hpp" #include "../ur_interface_loader.hpp" #include "context.hpp" +#include "memory.hpp" #include "../helpers/memory_helpers.hpp" #include "../image_common.hpp" @@ -60,21 +60,19 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( if (hostPtr) { // Host pointer provided - check if it's already USM or needs import ZeStruct memProps; - auto ret = getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); - + auto ret = + getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); + if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { // Already a USM allocation - just use it directly without import - this->ptr = usm_unique_ptr_t(hostPtr, [](void *) { - // Don't free - we don't own this memory - }); + this->ptr = usm_unique_ptr_t(hostPtr, [](void *) {}); } else { // Not USM - try to import it bool hostPtrImported = maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, hContext->getZeHandle(), hostPtr, size); - + if (!hostPtrImported) { - // This should not happen if urMemBufferCreate logic is correct throw UR_RESULT_ERROR_INVALID_VALUE; } @@ -110,10 +108,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( }); } -ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { - // No writeback 
needed - integrated buffers use zero-copy access -} - void *ur_integrated_buffer_handle_t::getDevicePtr( ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/, size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, @@ -578,8 +572,9 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, if (useHostBuffer(hContext) && hostPtr) { // Check what type of memory this pointer is ZeStruct memProps; - auto ret = getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); - + auto ret = + getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); + if (ret == UR_RESULT_SUCCESS) { if (memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { // Already USM memory (host, device, or shared) - use integrated path @@ -587,7 +582,7 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, hContext, hostPtr, size, accessMode); return UR_RESULT_SUCCESS; } - + // Memory type is UNKNOWN - try to import it bool canImport = maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index dbf89fae845e1..49953dd77adc9 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -98,8 +98,6 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { size_t size, device_access_mode_t accessMode, bool ownHostPtr); - ~ur_integrated_buffer_handle_t(); - void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, size_t size, ze_command_list_handle_t cmdList, wait_list_view &waitListView) override; From 2a0d2dfdc8b3e8435b528a1e55bc2cbf6d6fb9e2 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 4 Nov 2025 15:22:58 +0000 Subject: [PATCH 4/6] copy-back for non-USM memory Signed-off-by: Mateusz P. 
Nowak --- sycl/test-e2e/lit.cfg.py | 1 + .../source/adapters/level_zero/v2/memory.cpp | 149 ++++++++++-------- .../source/adapters/level_zero/v2/memory.hpp | 26 +-- 3 files changed, 100 insertions(+), 76 deletions(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index ef69d99e30367..b9a0b5421c565 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -1112,6 +1112,7 @@ def get_sycl_ls_verbose(sycl_device, env): features.update(sg_size_features) features.update(architecture_feature) features.update(device_family) + features.update(aspects) be, dev = sycl_device.split(":") features.add(dev.replace("fpga", "accelerator")) diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index bad71c08e8542..6f22b156477fb 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// +#include "memory.hpp" #include "../ur_interface_loader.hpp" #include "context.hpp" -#include "memory.hpp" #include "../helpers/memory_helpers.hpp" #include "../image_common.hpp" @@ -66,33 +66,44 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { // Already a USM allocation - just use it directly without import this->ptr = usm_unique_ptr_t(hostPtr, [](void *) {}); - } else { - // Not USM - try to import it - bool hostPtrImported = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); + return; + } - if (!hostPtrImported) { - throw UR_RESULT_ERROR_INVALID_VALUE; - } + // Not USM - try to import it + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + if (hostPtrImported) { + // Successfully 
imported - use it with release this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { ZeUSMImport.doZeUSMRelease( hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); }); + // No copy-back needed for imported pointers + return; } - } else { - // No host pointer - allocate new USM host memory - void *rawPtr; - UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( - hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); - this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) { - auto ret = hContext->getDefaultUSMPool()->free(ptr); - if (ret != UR_RESULT_SUCCESS) { - UR_LOG(ERR, "Failed to free host memory: {}", ret); - } - }); + // Import failed - allocate backing buffer and set up copy-back + } + + // No host pointer, or import failed - allocate new USM host memory + void *rawPtr; + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); + + this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) { + auto ret = hContext->getDefaultUSMPool()->free(ptr); + if (ret != UR_RESULT_SUCCESS) { + UR_LOG(ERR, "Failed to free host memory: {}", ret); + } + }); + + if (hostPtr) { + // Copy data from user pointer to our backing buffer + std::memcpy(this->ptr.get(), hostPtr, size); + // Remember to copy back on destruction + writeBackPtr = hostPtr; } } @@ -108,6 +119,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( }); } +ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { + if (writeBackPtr) { + std::memcpy(writeBackPtr, ptr.get(), size); + } +} + void *ur_integrated_buffer_handle_t::getDevicePtr( ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/, size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, @@ -116,16 +133,53 @@ void *ur_integrated_buffer_handle_t::getDevicePtr( } void *ur_integrated_buffer_handle_t::mapHostPtr( - ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/, + ur_map_flags_t flags, size_t offset, 
size_t mapSize, ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) { - // For integrated devices, both device and host access the same memory + if (writeBackPtr) { + // Copy-back path: user gets back their original pointer + void *mappedPtr = ur_cast(writeBackPtr) + offset; + + if (flags & UR_MAP_FLAG_READ) { + std::memcpy(mappedPtr, ur_cast(ptr.get()) + offset, mapSize); + } + + // Track this mapping for unmap + mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}), + mapSize, offset, flags); + + return mappedPtr; + } + + // Zero-copy path: for successfully imported or USM pointers return ur_cast(ptr.get()) + offset; } void ur_integrated_buffer_handle_t::unmapHostPtr( - void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/, + void *pMappedPtr, ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) { - // No-op: integrated buffers use zero-copy, no synchronization needed + if (writeBackPtr) { + // Copy-back path: find the mapped region and copy data back if needed + auto mappedRegion = + std::find_if(mappedRegions.begin(), mappedRegions.end(), + [pMappedPtr](const host_allocation_desc_t &desc) { + return desc.ptr.get() == pMappedPtr; + }); + + if (mappedRegion == mappedRegions.end()) { + UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr); + throw UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + if (mappedRegion->flags & + (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) { + std::memcpy(ur_cast(ptr.get()) + mappedRegion->offset, + mappedRegion->ptr.get(), mappedRegion->size); + } + + mappedRegions.erase(mappedRegion); + return; + } + // No op for zero-copy path, memory is synced } static v2::raii::command_list_unique_handle @@ -564,47 +618,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, void *hostPtr = pProperties ? 
pProperties->pHost : nullptr; auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); - // For integrated devices, we can use zero-copy host buffers when: - // 1. No host pointer is provided (we'll allocate USM host memory) - // 2. Host pointer is already USM memory - // 3. Host pointer can be imported as USM - // Otherwise, fall back to discrete buffer (explicit copies). - if (useHostBuffer(hContext) && hostPtr) { - // Check what type of memory this pointer is - ZeStruct memProps; - auto ret = - getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); - - if (ret == UR_RESULT_SUCCESS) { - if (memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { - // Already USM memory (host, device, or shared) - use integrated path - *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode); - return UR_RESULT_SUCCESS; - } - - // Memory type is UNKNOWN - try to import it - bool canImport = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); - if (!canImport) { - // Cannot import: fall back to discrete buffer path - *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode); - return UR_RESULT_SUCCESS; - } - // Successfully imported: release it now, constructor will import again - ZeUSMImport.doZeUSMRelease( - hContext->getPlatform()->ZeDriverHandleExpTranslated, hostPtr); - } else { - // Cannot get memory attributes: fall back to discrete buffer - *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode); - return UR_RESULT_SUCCESS; - } - } - - // Use integrated buffer path (no hostPtr, or hostPtr is USM/importable) + // For integrated devices, use zero-copy host buffers. The integrated buffer + // constructor will handle all cases: + // 1. No host pointer - allocate USM host memory + // 2. Host pointer is already USM - use directly + // 3. Host pointer can be imported - import it + // 4. 
Otherwise - allocate USM and copy-back on destruction if (useHostBuffer(hContext)) { *phBuffer = ur_mem_handle_t_::create( hContext, hostPtr, size, accessMode); diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index 49953dd77adc9..1af30cb242c6e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -87,6 +87,17 @@ struct ur_usm_handle_t : ur_mem_buffer_t { void *ptr; }; +struct host_allocation_desc_t { + host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset, + ur_map_flags_t flags) + : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {} + + usm_unique_ptr_t ptr; + size_t size; + size_t offset; + ur_map_flags_t flags; +}; + // Manages memory buffer for integrated GPU. // For integrated devices the buffer has been allocated in host memory // and can be accessed by the device without copying. @@ -98,6 +109,8 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { size_t size, device_access_mode_t accessMode, bool ownHostPtr); + ~ur_integrated_buffer_handle_t(); + void *getDevicePtr(ur_device_handle_t, device_access_mode_t, size_t offset, size_t size, ze_command_list_handle_t cmdList, wait_list_view &waitListView) override; @@ -109,17 +122,8 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { private: usm_unique_ptr_t ptr; -}; - -struct host_allocation_desc_t { - host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset, - ur_map_flags_t flags) - : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {} - - usm_unique_ptr_t ptr; - size_t size; - size_t offset; - ur_map_flags_t flags; + void *writeBackPtr = nullptr; + std::vector mappedRegions; }; // Manages memory buffer for discrete GPU. From a76d8608defa39794c1a44603715c83ec332f75e Mon Sep 17 00:00:00 2001 From: "Mateusz P. 
Nowak" Date: Thu, 20 Nov 2025 08:46:17 +0000 Subject: [PATCH 5/6] fix lit.cfg for arch selection with l0v2 adapter --- sycl/test-e2e/lit.cfg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index b9a0b5421c565..9d692803807a9 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -917,12 +917,14 @@ def get_sycl_ls_verbose(sycl_device, env): env = copy.copy(llvm_config.config.environment) + backend_for_selector = backend.replace("_v2", "").replace("_v1", "") + # Find all available devices under the backend - env["ONEAPI_DEVICE_SELECTOR"] = backend + ":*" + env["ONEAPI_DEVICE_SELECTOR"] = backend_for_selector + ":*" detected_architectures = [] - platform_devices = remove_level_zero_suffix(backend + ":*") + platform_devices = backend_for_selector + ":*" for line in get_sycl_ls_verbose(platform_devices, env).stdout.splitlines(): if re.match(r" *Architecture:", line): From f731333036b05c05e8802431e6d7c3c98eed8c8e Mon Sep 17 00:00:00 2001 From: "Mateusz P. 
Nowak" Date: Fri, 21 Nov 2025 09:38:40 +0000 Subject: [PATCH 6/6] fix AddressSanitizer sycl-e2e tests --- .../double-free/double-free.cpp | 2 + sycl/test-e2e/format.py | 13 +- .../source/adapters/level_zero/v2/memory.cpp | 152 +++++++++--------- .../sanitizer_common/sanitizer_utils.cpp | 13 ++ 4 files changed, 99 insertions(+), 81 deletions(-) diff --git a/sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp b/sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp index 61137c15c0ec7..5bb36514846c9 100644 --- a/sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp +++ b/sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp @@ -6,6 +6,8 @@ // RUN: %{build} %device_asan_flags -DMALLOC_SHARED -O0 -g -o %t3.out // RUN: %force_device_asan_rt UR_LAYER_ASAN_OPTIONS="quarantine_size_mb:1;detect_kernel_arguments:0" %{run} not %t3.out 2>&1 | FileCheck --check-prefixes CHECK,CHECK-SHARED %s #include +#include +#include constexpr size_t N = 64; diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index b503108f937b1..a1d10d019b3af 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -334,6 +334,17 @@ def get_extra_env(sycl_devices): expanded = "env" extra_env = get_extra_env([parsed_dev_name]) + backend, device = parsed_dev_name.split(":", 1) + device_selector = parsed_dev_name + if backend == "level_zero" and device.isdigit(): + # When filtering to a specific Level Zero GPU via + # ZE_AFFINITY_MASK, the remaining devices are + # renumbered starting from zero. Keep the affinity mask + # aligned with the resolved device index and address the + # only visible device via selector index 0. 
+ extra_env.append(f"ZE_AFFINITY_MASK={device}") + device_selector = f"{backend}:0" + if extra_env: expanded += " {}".format(" ".join(extra_env)) @@ -343,7 +354,7 @@ def get_extra_env(sycl_devices): expanded += " env UR_LOADER_USE_LEVEL_ZERO_V2=0" expanded += " ONEAPI_DEVICE_SELECTOR={} {}".format( - parsed_dev_name, test.config.run_launcher + device_selector, test.config.run_launcher ) cmd = directive.command.replace("%{run}", expanded) # Expand device-specific condtions (%if ... %{ ... %}). diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index 6f22b156477fb..e912e66dc8c9f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#include "memory.hpp" #include "../ur_interface_loader.hpp" #include "context.hpp" +#include "memory.hpp" #include "../helpers/memory_helpers.hpp" #include "../image_common.hpp" @@ -53,57 +53,69 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/, /* nop */ } +static v2::raii::command_list_unique_handle +getSyncCommandListForCopy(ur_context_handle_t hContext, + ur_device_handle_t hDevice) { + v2::command_list_desc_t listDesc; + listDesc.IsInOrder = true; + listDesc.Ordinal = + hDevice + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal; + listDesc.CopyOffloadEnable = true; + return hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, + ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt); +} + +static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, + ur_device_handle_t hDevice, void *dst, + const void *src, size_t size) try { + auto commandList = getSyncCommandListForCopy(hContext, hDevice); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (commandList.get(), dst, src, 
size, nullptr, 0, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( ur_context_handle_t hContext, void *hostPtr, size_t size, device_access_mode_t accessMode) : ur_mem_buffer_t(hContext, size, accessMode) { - if (hostPtr) { - // Host pointer provided - check if it's already USM or needs import - ZeStruct memProps; - auto ret = - getMemoryAttrs(hContext->getZeHandle(), hostPtr, nullptr, &memProps); - - if (ret == UR_RESULT_SUCCESS && memProps.type != ZE_MEMORY_TYPE_UNKNOWN) { - // Already a USM allocation - just use it directly without import - this->ptr = usm_unique_ptr_t(hostPtr, [](void *) {}); - return; - } - - // Not USM - try to import it - bool hostPtrImported = - maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, - hContext->getZeHandle(), hostPtr, size); - - if (hostPtrImported) { - // Successfully imported - use it with release - this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { - ZeUSMImport.doZeUSMRelease( - hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); - }); - // No copy-back needed for imported pointers - return; - } - - // Import failed - allocate backing buffer and set up copy-back - } + bool hostPtrImported = + maybeImportUSM(hContext->getPlatform()->ZeDriverHandleExpTranslated, + hContext->getZeHandle(), hostPtr, size); + + if (hostPtrImported) { + this->ptr = usm_unique_ptr_t(hostPtr, [hContext](void *ptr) { + ZeUSMImport.doZeUSMRelease( + hContext->getPlatform()->ZeDriverHandleExpTranslated, ptr); + }); + } else { + void *rawPtr; + // Use HOST memory for integrated GPUs to enable zero-copy device access + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); - // No host pointer, or import failed - allocate new USM host memory - void *rawPtr; - UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( - hContext, 
nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); + this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) { + auto ret = hContext->getDefaultUSMPool()->free(ptr); + if (ret != UR_RESULT_SUCCESS) { + UR_LOG(ERR, "Failed to free host memory: {}", ret); + } + }); - this->ptr = usm_unique_ptr_t(rawPtr, [hContext](void *ptr) { - auto ret = hContext->getDefaultUSMPool()->free(ptr); - if (ret != UR_RESULT_SUCCESS) { - UR_LOG(ERR, "Failed to free host memory: {}", ret); + if (hostPtr) { + // Initial copy using Level Zero for USM HOST memory + auto hDevice = hContext->getDevices()[0]; + UR_CALL_THROWS( + synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); + // Set writeBackPtr to enable map/unmap copy-back (but NOT destructor + // copy-back) + writeBackPtr = hostPtr; } - }); - - if (hostPtr) { - // Copy data from user pointer to our backing buffer - std::memcpy(this->ptr.get(), hostPtr, size); - // Remember to copy back on destruction - writeBackPtr = hostPtr; } } @@ -119,12 +131,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( }); } -ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { - if (writeBackPtr) { - std::memcpy(writeBackPtr, ptr.get(), size); - } -} - void *ur_integrated_buffer_handle_t::getDevicePtr( ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/, size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, @@ -140,7 +146,11 @@ void *ur_integrated_buffer_handle_t::mapHostPtr( void *mappedPtr = ur_cast(writeBackPtr) + offset; if (flags & UR_MAP_FLAG_READ) { - std::memcpy(mappedPtr, ur_cast(ptr.get()) + offset, mapSize); + // Use Level Zero copy for USM HOST memory to ensure GPU visibility + auto hDevice = hContext->getDevices()[0]; + UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr, + ur_cast(ptr.get()) + offset, + mapSize)); } // Track this mapping for unmap @@ -172,8 +182,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr( if (mappedRegion->flags & 
(UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) { - std::memcpy(ur_cast(ptr.get()) + mappedRegion->offset, - mappedRegion->ptr.get(), mappedRegion->size); + // Use Level Zero copy for USM HOST memory to ensure GPU visibility + auto hDevice = hContext->getDevices()[0]; + UR_CALL_THROWS(synchronousZeCopy( + hContext, hDevice, ur_cast(ptr.get()) + mappedRegion->offset, + mappedRegion->ptr.get(), mappedRegion->size)); } mappedRegions.erase(mappedRegion); @@ -182,32 +195,11 @@ void ur_integrated_buffer_handle_t::unmapHostPtr( // No op for zero-copy path, memory is synced } -static v2::raii::command_list_unique_handle -getSyncCommandListForCopy(ur_context_handle_t hContext, - ur_device_handle_t hDevice) { - v2::command_list_desc_t listDesc; - listDesc.IsInOrder = true; - listDesc.Ordinal = - hDevice - ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] - .ZeOrdinal; - listDesc.CopyOffloadEnable = true; - return hContext->getCommandListCache().getImmediateCommandList( - hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt); -} - -static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, - ur_device_handle_t hDevice, void *dst, - const void *src, size_t size) try { - auto commandList = getSyncCommandListForCopy(hContext, hDevice); - - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (commandList.get(), dst, src, size, nullptr, 0, nullptr)); - - return UR_RESULT_SUCCESS; -} catch (...) { - return exceptionToResult(std::current_exception()); +ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { + // Do NOT do automatic copy-back in destructor - it causes heap corruption + // because writeBackPtr may be freed by SYCL runtime before buffer destructor + // runs. Copy-back happens via explicit map/unmap operations (see + // mapHostPtr/unmapHostPtr). 
} void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice, diff --git a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index edfc4f8a56f3d..27f4bf6a5731a 100644 --- a/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -211,6 +211,19 @@ DeviceType GetDeviceType(ur_context_handle_t Context, // TODO: Check fpga is fpga emulator return DeviceType::CPU; case UR_DEVICE_TYPE_GPU: { + // Check if this is an integrated GPU - they share system memory with CPU + // and should use CPU-style shadow memory layout + ur_bool_t IsIntegrated = false; + [[maybe_unused]] ur_result_t QueryResult = + getContext()->urDdiTable.Device.pfnGetInfo( + Device, UR_DEVICE_INFO_IS_INTEGRATED_GPU, sizeof(ur_bool_t), + &IsIntegrated, nullptr); + if (QueryResult == UR_RESULT_SUCCESS && IsIntegrated) { + UR_LOG_L(getContext()->logger, DEBUG, + "GetDeviceType: Integrated GPU detected, using CPU layout"); + return DeviceType::CPU; + } + uptr Ptr; [[maybe_unused]] ur_result_t Result = getContext()->urDdiTable.USM.pfnDeviceAlloc(Context, Device, nullptr,