@@ -7,10 +7,6 @@
 // RUN: %{build} %level_zero_options -o %t.out
 // RUN: %{run} %t.out
 
-// L0v2 adapter does not support integrated buffers yet
-// UNSUPPORTED: level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20280
-
 // Test get_native_mem for the Level Zero backend.
 
 // Level-Zero
2 changes: 2 additions & 0 deletions sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp
@@ -6,6 +6,8 @@
 // RUN: %{build} %device_asan_flags -DMALLOC_SHARED -O0 -g -o %t3.out
 // RUN: %force_device_asan_rt UR_LAYER_ASAN_OPTIONS="quarantine_size_mb:1;detect_kernel_arguments:0" %{run} not %t3.out 2>&1 | FileCheck --check-prefixes CHECK,CHECK-SHARED %s
 #include <sycl/usm.hpp>
+#include <iostream>
+#include <cstdlib>
 
 constexpr size_t N = 64;
5 changes: 0 additions & 5 deletions sycl/test-e2e/Basic/buffer/buffer_create.cpp
@@ -8,11 +8,6 @@
 // RUN: %{run} %t.out 2>&1 | FileCheck %s
 // UNSUPPORTED: ze_debug
 
-// L0v2 adapter doesn't optimize buffer creation based on device type yet
-// (integrated buffer implementation needs more work).
-// UNSUPPORTED: level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20121
-
 #include <iostream>
 #include <level_zero/ze_api.h>
 #include <sycl/detail/core.hpp>
13 changes: 12 additions & 1 deletion sycl/test-e2e/format.py
@@ -334,6 +334,17 @@ def get_extra_env(sycl_devices):
         expanded = "env"
 
         extra_env = get_extra_env([parsed_dev_name])
+        backend, device = parsed_dev_name.split(":", 1)
+        device_selector = parsed_dev_name
+        if backend == "level_zero" and device.isdigit():
+            # When filtering to a specific Level Zero GPU via
+            # ZE_AFFINITY_MASK, the remaining devices are
+            # renumbered starting from zero. Keep the affinity mask
+            # aligned with the resolved device index and address the
+            # only visible device via selector index 0.
+            extra_env.append(f"ZE_AFFINITY_MASK={device}")
+            device_selector = f"{backend}:0"
+
         if extra_env:
             expanded += " {}".format(" ".join(extra_env))
 
@@ -343,7 +354,7 @@ def get_extra_env(sycl_devices):
             expanded += " env UR_LOADER_USE_LEVEL_ZERO_V2=0"
 
         expanded += " ONEAPI_DEVICE_SELECTOR={} {}".format(
-            parsed_dev_name, test.config.run_launcher
+            device_selector, test.config.run_launcher
        )
         cmd = directive.command.replace("%{run}", expanded)
         # Expand device-specific conditions (%if ... %{ ... %}).
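The renumbering that this format.py change compensates for can be reproduced with a few lines of Level Zero. Below is a minimal sketch (an illustrative standalone tool, not part of this PR, assuming one driver exposing two GPUs): run it once normally and once with ZE_AFFINITY_MASK=1. In the second run only the former device 1 is visible, and it is reported at index 0, which is why the selector is rewritten to level_zero:0.

// list_ze_devices.cpp - illustrative only.
// Build (typical): c++ list_ze_devices.cpp -lze_loader
#include <level_zero/ze_api.h>
#include <cstdio>
#include <vector>

int main() {
  if (zeInit(0) != ZE_RESULT_SUCCESS)
    return 1;
  uint32_t driverCount = 1;
  ze_driver_handle_t driver;
  zeDriverGet(&driverCount, &driver); // first driver is enough here
  uint32_t deviceCount = 0;
  zeDeviceGet(driver, &deviceCount, nullptr); // query the device count
  std::vector<ze_device_handle_t> devices(deviceCount);
  zeDeviceGet(driver, &deviceCount, devices.data());
  for (uint32_t i = 0; i < deviceCount; ++i) {
    ze_device_properties_t props{};
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    zeDeviceGetProperties(devices[i], &props);
    // Under ZE_AFFINITY_MASK=1 the former device 1 prints here as index 0.
    std::printf("device %u: %s\n", i, props.name);
  }
  return 0;
}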
9 changes: 6 additions & 3 deletions sycl/test-e2e/lit.cfg.py
@@ -706,7 +706,7 @@ def remove_level_zero_suffix(devices):
 available_devices = {
     "opencl": ("cpu", "gpu", "fpga"),
     "cuda": "gpu",
-    "level_zero": "gpu",
+    "level_zero": ("gpu", "0", "1"),

[Review thread on this line]
Contributor: ?
Contributor Author: Changed for local tests, but it may be useful in CI too,
depending on the hardware config. It allows pointing out which GPU should be
used for the tests (e.g. integrated/discrete). Example:
llvm-lit --param "sycl_devices=level_zero_v2:1" runs the tests on device
level_zero:1 with the v2 adapter.
Contributor Author: After a closer look, some tests appeared as unsupported,
so an additional one-line fix was necessary. If any reviewer thinks that
change does not belong in this PR, I am also OK with that.
Contributor: I don't think we should be changing the default config. CI and
other dev systems where this is used may have a different configuration where
:0 and :1 mean e.g. different dGPUs. Or there may not be a second GPU at all.

     "hip": "gpu",
     "native_cpu": "cpu",
     "offload": "gpu",
@@ -917,12 +917,14 @@ def get_sycl_ls_verbose(sycl_device, env):
 
     env = copy.copy(llvm_config.config.environment)
 
+    backend_for_selector = backend.replace("_v2", "").replace("_v1", "")
+
     # Find all available devices under the backend
-    env["ONEAPI_DEVICE_SELECTOR"] = backend + ":*"
+    env["ONEAPI_DEVICE_SELECTOR"] = backend_for_selector + ":*"
 
     detected_architectures = []
 
-    platform_devices = remove_level_zero_suffix(backend + ":*")
+    platform_devices = backend_for_selector + ":*"
 
     for line in get_sycl_ls_verbose(platform_devices, env).stdout.splitlines():
         if re.match(r" *Architecture:", line):
@@ -1112,6 +1114,7 @@ def get_sycl_ls_verbose(sycl_device, env):
     features.update(sg_size_features)
     features.update(architecture_feature)
     features.update(device_family)
+    features.update(aspects)
 
     be, dev = sycl_device.split(":")
     features.add(dev.replace("fpga", "accelerator"))
146 changes: 98 additions & 48 deletions unified-runtime/source/adapters/level_zero/v2/memory.cpp
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "memory.hpp"
 #include "../ur_interface_loader.hpp"
 #include "context.hpp"
+#include "memory.hpp"
 
 #include "../helpers/memory_helpers.hpp"
 #include "../image_common.hpp"
@@ -53,6 +53,34 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
   /* nop */
 }
 
+static v2::raii::command_list_unique_handle
+getSyncCommandListForCopy(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice) {
+  v2::command_list_desc_t listDesc;
+  listDesc.IsInOrder = true;
+  listDesc.Ordinal =
+      hDevice
+          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
+          .ZeOrdinal;
+  listDesc.CopyOffloadEnable = true;
+  return hContext->getCommandListCache().getImmediateCommandList(
+      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
+}
+
+static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
+                                     ur_device_handle_t hDevice, void *dst,
+                                     const void *src, size_t size) try {
+  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
+
+  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
+
+  return UR_RESULT_SUCCESS;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     ur_context_handle_t hContext, void *hostPtr, size_t size,
     device_access_mode_t accessMode)
@@ -68,6 +96,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
   } else {
     void *rawPtr;
+    // Use HOST memory for integrated GPUs to enable zero-copy device access
     UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
         hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
 
@@ -79,7 +108,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
 
   if (hostPtr) {
-    std::memcpy(this->ptr.get(), hostPtr, size);
+    // Initial copy using Level Zero for USM HOST memory
+    auto hDevice = hContext->getDevices()[0];
+    UR_CALL_THROWS(
+        synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
+    // Set writeBackPtr to enable map/unmap copy-back (but NOT destructor
+    // copy-back)
     writeBackPtr = hostPtr;
   }
 }
@@ -97,12 +131,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
   });
 }
 
-ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
-  if (writeBackPtr) {
-    std::memcpy(writeBackPtr, this->ptr.get(), size);
-  }
-}
-
 void *ur_integrated_buffer_handle_t::getDevicePtr(
     ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
     size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -111,48 +139,67 @@ void *ur_integrated_buffer_handle_t::getDevicePtr(
 }
 
 void *ur_integrated_buffer_handle_t::mapHostPtr(
-    ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/,
+    ur_map_flags_t flags, size_t offset, size_t mapSize,
     ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should map to that pointer
-  // because that's what SYCL expects, SYCL will attempt to call free
-  // on the resulting pointer leading to double free with the current
-  // implementation. Investigate the SYCL implementation.

[Review thread on lines -116 to -119]
Contributor: was this comment false?
Contributor Author: You are right, the comment was not false as long as the
map/unmap functions lacked full handling of the copy-back path. The last
commit addresses the issue.

+  if (writeBackPtr) {
+    // Copy-back path: user gets back their original pointer
+    void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;
+
+    if (flags & UR_MAP_FLAG_READ) {
+      // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
+                                       ur_cast<char *>(ptr.get()) + offset,
+                                       mapSize));
+    }
+
+    // Track this mapping for unmap
+    mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}),
+                               mapSize, offset, flags);
+
+    return mappedPtr;
+  }
+
+  // Zero-copy path: for successfully imported or USM pointers
   return ur_cast<char *>(ptr.get()) + offset;
 }
 
 void ur_integrated_buffer_handle_t::unmapHostPtr(
-    void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/,
+    void *pMappedPtr, ze_command_list_handle_t /*cmdList*/,
     wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should copy the data back
-  /* nop */
-}
-
-static v2::raii::command_list_unique_handle
-getSyncCommandListForCopy(ur_context_handle_t hContext,
-                          ur_device_handle_t hDevice) {
-  v2::command_list_desc_t listDesc;
-  listDesc.IsInOrder = true;
-  listDesc.Ordinal =
-      hDevice
-          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
-          .ZeOrdinal;
-  listDesc.CopyOffloadEnable = true;
-  return hContext->getCommandListCache().getImmediateCommandList(
-      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
-      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
-}
-
-static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
-                                     ur_device_handle_t hDevice, void *dst,
-                                     const void *src, size_t size) try {
-  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
-
-  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
-             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
-
-  return UR_RESULT_SUCCESS;
-} catch (...) {
-  return exceptionToResult(std::current_exception());
+  if (writeBackPtr) {
+    // Copy-back path: find the mapped region and copy data back if needed
+    auto mappedRegion =
+        std::find_if(mappedRegions.begin(), mappedRegions.end(),
+                     [pMappedPtr](const host_allocation_desc_t &desc) {
+                       return desc.ptr.get() == pMappedPtr;
+                     });
+
+    if (mappedRegion == mappedRegions.end()) {
+      UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr);
+      throw UR_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+
+    if (mappedRegion->flags &
+        (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
+      // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(
+          hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
+          mappedRegion->ptr.get(), mappedRegion->size));
+    }
+
+    mappedRegions.erase(mappedRegion);
+    return;
+  }
+  // No op for zero-copy path, memory is synced
+}
+
+ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
+  // Do NOT do automatic copy-back in the destructor - it causes heap
+  // corruption because writeBackPtr may be freed by the SYCL runtime before
+  // the buffer destructor runs. Copy-back happens via explicit map/unmap
+  // operations (see mapHostPtr/unmapHostPtr).
 }
 
 void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
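The contract the new destructor comment relies on can be seen from the SYCL side. Below is a hedged sketch in plain SYCL 2020 (not adapter code, and the exact runtime call sequence is an assumption): when a buffer adopts a host pointer, the SYCL runtime writes results back to that pointer at buffer destruction, and with this patch that write-back is expected to flow through the adapter's mapHostPtr/unmapHostPtr pair rather than through the UR buffer destructor.

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  std::vector<int> host(64, 1);
  {
    // The buffer adopts host.data(); the runtime owns it until destruction.
    sycl::buffer<int> buf(host.data(), sycl::range<1>(host.size()));
    sycl::queue q;
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor acc(buf, cgh, sycl::read_write);
      cgh.parallel_for(sycl::range<1>(64),
                       [=](sycl::id<1> i) { acc[i] += 1; });
    });
  } // Buffer destruction: the runtime copies results back to host.data()
    // while it is still alive. The UR destructor itself must not touch
    // writeBackPtr, which in other teardown orders may already be freed.
  return host[0] == 2 ? 0 : 1;
}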
@@ -410,19 +457,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr(
   // nop
 }
 
-static bool useHostBuffer(ur_context_handle_t /* hContext */) {
+static bool useHostBuffer(ur_context_handle_t hContext) {
   // We treat integrated devices (physical memory shared with the CPU)
   // differently from discrete devices (those with distinct memories).
   // For integrated devices, allocating the buffer in the host memory
   // enables automatic access from the device, and makes copying
   // unnecessary in the map/unmap operations. This improves performance.
 
-  // TODO: fix integrated buffer implementation
-  return false;
-
-  // return hContext->getDevices().size() == 1 &&
-  //        hContext->getDevices()[0]->ZeDeviceProperties->flags &
-  //        ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
+  return hContext->getDevices().size() == 1 &&
+         hContext->getDevices()[0]->ZeDeviceProperties->flags &
+             ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
 }
 
 ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset,
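For illustration, the property the restored check keys on: on integrated devices, host USM is directly device-accessible, so a host-resident buffer needs no staging copies. A minimal sketch (illustrative only, assuming a valid Level Zero context):

#include <level_zero/ze_api.h>

// Allocate host USM that an integrated GPU can access in place (zero-copy).
// A discrete GPU can generally reach the same allocation too, but over the
// PCIe link, which is why the adapter only takes the host-buffer path when
// ZE_DEVICE_PROPERTY_FLAG_INTEGRATED is set on the sole device.
void *allocSharedWithIntegratedGpu(ze_context_handle_t ctx, size_t size) {
  ze_host_mem_alloc_desc_t hostDesc{};
  hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
  void *ptr = nullptr;
  if (zeMemAllocHost(ctx, &hostDesc, size, /*alignment=*/0, &ptr) !=
      ZE_RESULT_SUCCESS)
    return nullptr;
  return ptr; // CPU and integrated GPU see the same physical pages
}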
@@ -566,6 +610,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
   void *hostPtr = pProperties ? pProperties->pHost : nullptr;
   auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);
 
+  // For integrated devices, use zero-copy host buffers. The integrated buffer
+  // constructor will handle all cases:
+  //   1. No host pointer - allocate USM host memory
+  //   2. Host pointer is already USM - use it directly
+  //   3. Host pointer can be imported - import it
+  //   4. Otherwise - allocate USM host memory and copy back via map/unmap
   if (useHostBuffer(hContext)) {
     *phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
         hContext, hostPtr, size, accessMode);
23 changes: 12 additions & 11 deletions unified-runtime/source/adapters/level_zero/v2/memory.hpp
@@ -87,6 +87,17 @@ struct ur_usm_handle_t : ur_mem_buffer_t {
   void *ptr;
 };
 
+struct host_allocation_desc_t {
+  host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset,
+                         ur_map_flags_t flags)
+      : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {}
+
+  usm_unique_ptr_t ptr;
+  size_t size;
+  size_t offset;
+  ur_map_flags_t flags;
+};
+
 // Manages memory buffer for integrated GPU.
 // For integrated devices the buffer has been allocated in host memory
 // and can be accessed by the device without copying.
@@ -112,17 +123,7 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t {
 private:
   usm_unique_ptr_t ptr;
   void *writeBackPtr = nullptr;
-};
-
-struct host_allocation_desc_t {
-  host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset,
-                         ur_map_flags_t flags)
-      : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {}
-
-  usm_unique_ptr_t ptr;
-  size_t size;
-  size_t offset;
-  ur_map_flags_t flags;
+  std::vector<host_allocation_desc_t> mappedRegions;
 };
 
 // Manages memory buffer for discrete GPU.
@@ -211,6 +211,19 @@ DeviceType GetDeviceType(ur_context_handle_t Context,
     // TODO: Check fpga is fpga emulator
     return DeviceType::CPU;
   case UR_DEVICE_TYPE_GPU: {
+    // Check if this is an integrated GPU - they share system memory with CPU
+    // and should use CPU-style shadow memory layout
+    ur_bool_t IsIntegrated = false;
+    [[maybe_unused]] ur_result_t QueryResult =
+        getContext()->urDdiTable.Device.pfnGetInfo(
+            Device, UR_DEVICE_INFO_IS_INTEGRATED_GPU, sizeof(ur_bool_t),
+            &IsIntegrated, nullptr);
+    if (QueryResult == UR_RESULT_SUCCESS && IsIntegrated) {
+      UR_LOG_L(getContext()->logger, DEBUG,
+               "GetDeviceType: Integrated GPU detected, using CPU layout");
+      return DeviceType::CPU;
+    }
+
     uptr Ptr;
     [[maybe_unused]] ur_result_t Result =
         getContext()->urDdiTable.USM.pfnDeviceAlloc(Context, Device, nullptr,