@@ -7,10 +7,6 @@
 // RUN: %{build} %level_zero_options -o %t.out
 // RUN: %{run} %t.out
 
-// L0v2 adapter does not support integrated buffers yet
-// UNSUPPORTED: level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20280
-
 // Test get_native_mem for the Level Zero backend.
 
 // Level-Zero
2 changes: 2 additions & 0 deletions sycl/test-e2e/AddressSanitizer/double-free/double-free.cpp
@@ -6,6 +6,8 @@
 // RUN: %{build} %device_asan_flags -DMALLOC_SHARED -O0 -g -o %t3.out
 // RUN: %force_device_asan_rt UR_LAYER_ASAN_OPTIONS="quarantine_size_mb:1;detect_kernel_arguments:0" %{run} not %t3.out 2>&1 | FileCheck --check-prefixes CHECK,CHECK-SHARED %s
 #include <sycl/usm.hpp>
+#include <iostream>
+#include <cstdlib>
 
 constexpr size_t N = 64;
5 changes: 0 additions & 5 deletions sycl/test-e2e/Basic/buffer/buffer_create.cpp
@@ -8,11 +8,6 @@
 // RUN: %{run} %t.out 2>&1 | FileCheck %s
 // UNSUPPORTED: ze_debug
 
-// L0v2 adapter doesn't optimize buffer creation based on device type yet
-// (integrated buffer implementation needs more work).
-// UNSUPPORTED: level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20121
-
 #include <iostream>
 #include <level_zero/ze_api.h>
 #include <sycl/detail/core.hpp>
13 changes: 12 additions & 1 deletion sycl/test-e2e/format.py
@@ -334,6 +334,17 @@ def get_extra_env(sycl_devices):
         expanded = "env"
 
         extra_env = get_extra_env([parsed_dev_name])
+        backend, device = parsed_dev_name.split(":", 1)
+        device_selector = parsed_dev_name
+        if backend == "level_zero" and device.isdigit():
+            # When filtering to a specific Level Zero GPU via
+            # ZE_AFFINITY_MASK, the remaining devices are
+            # renumbered starting from zero. Keep the affinity mask
+            # aligned with the resolved device index and address the
+            # only visible device via selector index 0.
+            extra_env.append(f"ZE_AFFINITY_MASK={device}")
+            device_selector = f"{backend}:0"
+
         if extra_env:
             expanded += " {}".format(" ".join(extra_env))
 
@@ -343,7 +354,7 @@ def get_extra_env(sycl_devices):
             expanded += " env UR_LOADER_USE_LEVEL_ZERO_V2=0"
 
         expanded += " ONEAPI_DEVICE_SELECTOR={} {}".format(
-            parsed_dev_name, test.config.run_launcher
+            device_selector, test.config.run_launcher
        )
         cmd = directive.command.replace("%{run}", expanded)
         # Expand device-specific conditions (%if ... %{ ... %}).
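The renumbering that this format.py change compensates for can be reproduced with a few lines of Level Zero. Below is a minimal sketch (an illustrative standalone tool, not part of this PR, assuming one driver exposing two GPUs): run it once normally and once with ZE_AFFINITY_MASK=1. In the second run only the former device 1 is visible, and it is reported at index 0, which is why the selector is rewritten to level_zero:0.

// list_ze_devices.cpp - illustrative only.
// Build (typical): c++ list_ze_devices.cpp -lze_loader
#include <level_zero/ze_api.h>
#include <cstdio>
#include <vector>

int main() {
  if (zeInit(0) != ZE_RESULT_SUCCESS)
    return 1;
  uint32_t driverCount = 1;
  ze_driver_handle_t driver;
  zeDriverGet(&driverCount, &driver); // first driver is enough here
  uint32_t deviceCount = 0;
  zeDeviceGet(driver, &deviceCount, nullptr); // query the device count
  std::vector<ze_device_handle_t> devices(deviceCount);
  zeDeviceGet(driver, &deviceCount, devices.data());
  for (uint32_t i = 0; i < deviceCount; ++i) {
    ze_device_properties_t props{};
    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    zeDeviceGetProperties(devices[i], &props);
    // Under ZE_AFFINITY_MASK=1 the former device 1 prints here as index 0.
    std::printf("device %u: %s\n", i, props.name);
  }
  return 0;
}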
9 changes: 6 additions & 3 deletions sycl/test-e2e/lit.cfg.py
@@ -706,7 +706,7 @@ def remove_level_zero_suffix(devices):
 available_devices = {
     "opencl": ("cpu", "gpu", "fpga"),
     "cuda": "gpu",
-    "level_zero": "gpu",
+    "level_zero": ("gpu", "0", "1"),

[Review thread on this line]
Contributor: ?
Contributor Author: Changed for local tests, but it may be useful in CI too,
depending on the hardware config. It allows pointing out which GPU should be
used for the tests (e.g. integrated/discrete). Example:
llvm-lit --param "sycl_devices=level_zero_v2:1" runs the tests on device
level_zero:1 with the v2 adapter.
Contributor Author: After a closer look, some tests appeared as unsupported,
so an additional one-line fix was necessary. If any reviewer thinks that
change does not belong in this PR, I am also OK with that.
Contributor: I don't think we should be changing the default config. CI and
other dev systems where this is used may have a different configuration where
:0 and :1 mean e.g. different dGPUs. Or there may not be a second GPU at all.

     "hip": "gpu",
     "native_cpu": "cpu",
     "offload": "gpu",
@@ -917,12 +917,14 @@ def get_sycl_ls_verbose(sycl_device, env):
 
     env = copy.copy(llvm_config.config.environment)
 
+    backend_for_selector = backend.replace("_v2", "").replace("_v1", "")
+
     # Find all available devices under the backend
-    env["ONEAPI_DEVICE_SELECTOR"] = backend + ":*"
+    env["ONEAPI_DEVICE_SELECTOR"] = backend_for_selector + ":*"
 
     detected_architectures = []
 
-    platform_devices = remove_level_zero_suffix(backend + ":*")
+    platform_devices = backend_for_selector + ":*"
 
     for line in get_sycl_ls_verbose(platform_devices, env).stdout.splitlines():
         if re.match(r" *Architecture:", line):
@@ -1112,6 +1114,7 @@ def get_sycl_ls_verbose(sycl_device, env):
     features.update(sg_size_features)
     features.update(architecture_feature)
     features.update(device_family)
+    features.update(aspects)
 
     be, dev = sycl_device.split(":")
     features.add(dev.replace("fpga", "accelerator"))
146 changes: 98 additions & 48 deletions unified-runtime/source/adapters/level_zero/v2/memory.cpp
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "memory.hpp"
 #include "../ur_interface_loader.hpp"
 #include "context.hpp"
+#include "memory.hpp"
 
 #include "../helpers/memory_helpers.hpp"
 #include "../image_common.hpp"
@@ -53,6 +53,34 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/,
   /* nop */
 }
 
+static v2::raii::command_list_unique_handle
+getSyncCommandListForCopy(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice) {
+  v2::command_list_desc_t listDesc;
+  listDesc.IsInOrder = true;
+  listDesc.Ordinal =
+      hDevice
+          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
+          .ZeOrdinal;
+  listDesc.CopyOffloadEnable = true;
+  return hContext->getCommandListCache().getImmediateCommandList(
+      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
+      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
+}
+
+static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
+                                     ur_device_handle_t hDevice, void *dst,
+                                     const void *src, size_t size) try {
+  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
+
+  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
+
+  return UR_RESULT_SUCCESS;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     ur_context_handle_t hContext, void *hostPtr, size_t size,
     device_access_mode_t accessMode)
@@ -68,6 +96,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
   } else {
     void *rawPtr;
+    // Use HOST memory for integrated GPUs to enable zero-copy device access
     UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
         hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr));
 
@@ -79,7 +108,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
     });
 
   if (hostPtr) {
-    std::memcpy(this->ptr.get(), hostPtr, size);
+    // Initial copy using Level Zero for USM HOST memory
+    auto hDevice = hContext->getDevices()[0];
+    UR_CALL_THROWS(
+        synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size));
+    // Set writeBackPtr to enable map/unmap copy-back (but NOT destructor
+    // copy-back)
     writeBackPtr = hostPtr;
   }
 }
@@ -97,12 +131,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t(
   });
 }
 
-ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
-  if (writeBackPtr) {
-    std::memcpy(writeBackPtr, this->ptr.get(), size);
-  }
-}
-
 void *ur_integrated_buffer_handle_t::getDevicePtr(
     ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/,
     size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
@@ -111,48 +139,67 @@ void *ur_integrated_buffer_handle_t::getDevicePtr(
 }
 
 void *ur_integrated_buffer_handle_t::mapHostPtr(
-    ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/,
+    ur_map_flags_t flags, size_t offset, size_t mapSize,
     ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should map to that pointer
-  // because that's what SYCL expects, SYCL will attempt to call free
-  // on the resulting pointer leading to double free with the current
-  // implementation. Investigate the SYCL implementation.

[Review thread on lines -116 to -119]
Contributor: was this comment false?
Contributor Author: You are right, the comment was not false as long as the
map/unmap functions lacked full handling of the copy-back path. The last
commit addresses the issue.

+  if (writeBackPtr) {
+    // Copy-back path: user gets back their original pointer
+    void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset;
+
+    if (flags & UR_MAP_FLAG_READ) {
+      // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr,
+                                       ur_cast<char *>(ptr.get()) + offset,
+                                       mapSize));
+    }
+
+    // Track this mapping for unmap
+    mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}),
+                               mapSize, offset, flags);
+
+    return mappedPtr;
+  }
+
+  // Zero-copy path: for successfully imported or USM pointers
   return ur_cast<char *>(ptr.get()) + offset;
 }
 
 void ur_integrated_buffer_handle_t::unmapHostPtr(
-    void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/,
+    void *pMappedPtr, ze_command_list_handle_t /*cmdList*/,
     wait_list_view & /*waitListView*/) {
-  // TODO: if writeBackPtr is set, we should copy the data back
-  /* nop */
-}
-
-static v2::raii::command_list_unique_handle
-getSyncCommandListForCopy(ur_context_handle_t hContext,
-                          ur_device_handle_t hDevice) {
-  v2::command_list_desc_t listDesc;
-  listDesc.IsInOrder = true;
-  listDesc.Ordinal =
-      hDevice
-          ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
-          .ZeOrdinal;
-  listDesc.CopyOffloadEnable = true;
-  return hContext->getCommandListCache().getImmediateCommandList(
-      hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS,
-      ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt);
-}
-
-static ur_result_t synchronousZeCopy(ur_context_handle_t hContext,
-                                     ur_device_handle_t hDevice, void *dst,
-                                     const void *src, size_t size) try {
-  auto commandList = getSyncCommandListForCopy(hContext, hDevice);
-
-  ZE2UR_CALL(zeCommandListAppendMemoryCopy,
-             (commandList.get(), dst, src, size, nullptr, 0, nullptr));
-
-  return UR_RESULT_SUCCESS;
-} catch (...) {
-  return exceptionToResult(std::current_exception());
+  if (writeBackPtr) {
+    // Copy-back path: find the mapped region and copy data back if needed
+    auto mappedRegion =
+        std::find_if(mappedRegions.begin(), mappedRegions.end(),
+                     [pMappedPtr](const host_allocation_desc_t &desc) {
+                       return desc.ptr.get() == pMappedPtr;
+                     });
+
+    if (mappedRegion == mappedRegions.end()) {
+      UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr);
+      throw UR_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+
+    if (mappedRegion->flags &
+        (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) {
+      // Use Level Zero copy for USM HOST memory to ensure GPU visibility
+      auto hDevice = hContext->getDevices()[0];
+      UR_CALL_THROWS(synchronousZeCopy(
+          hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset,
+          mappedRegion->ptr.get(), mappedRegion->size));
+    }
+
+    mappedRegions.erase(mappedRegion);
+    return;
+  }
+  // No op for zero-copy path, memory is synced
+}
+
+ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() {
+  // Do NOT do automatic copy-back in the destructor - it causes heap
+  // corruption because writeBackPtr may be freed by the SYCL runtime before
+  // the buffer destructor runs. Copy-back happens via explicit map/unmap
+  // operations (see mapHostPtr/unmapHostPtr).
 }
 
 void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
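The contract the new destructor comment relies on can be seen from the SYCL side. Below is a hedged sketch in plain SYCL 2020 (not adapter code, and the exact runtime call sequence is an assumption): when a buffer adopts a host pointer, the SYCL runtime writes results back to that pointer at buffer destruction, and with this patch that write-back is expected to flow through the adapter's mapHostPtr/unmapHostPtr pair rather than through the UR buffer destructor.

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  std::vector<int> host(64, 1);
  {
    // The buffer adopts host.data(); the runtime owns it until destruction.
    sycl::buffer<int> buf(host.data(), sycl::range<1>(host.size()));
    sycl::queue q;
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor acc(buf, cgh, sycl::read_write);
      cgh.parallel_for(sycl::range<1>(64),
                       [=](sycl::id<1> i) { acc[i] += 1; });
    });
  } // Buffer destruction: the runtime copies results back to host.data()
    // while it is still alive. The UR destructor itself must not touch
    // writeBackPtr, which in other teardown orders may already be freed.
  return host[0] == 2 ? 0 : 1;
}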
@@ -410,19 +457,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr(
   // nop
 }
 
-static bool useHostBuffer(ur_context_handle_t /* hContext */) {
+static bool useHostBuffer(ur_context_handle_t hContext) {
   // We treat integrated devices (physical memory shared with the CPU)
   // differently from discrete devices (those with distinct memories).
   // For integrated devices, allocating the buffer in the host memory
   // enables automatic access from the device, and makes copying
   // unnecessary in the map/unmap operations. This improves performance.
 
-  // TODO: fix integrated buffer implementation
-  return false;
-
-  // return hContext->getDevices().size() == 1 &&
-  //        hContext->getDevices()[0]->ZeDeviceProperties->flags &
-  //        ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
+  return hContext->getDevices().size() == 1 &&
+         hContext->getDevices()[0]->ZeDeviceProperties->flags &
+             ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
 }
 
 ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset,
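For illustration, the property the restored check keys on: on integrated devices, host USM is directly device-accessible, so a host-resident buffer needs no staging copies. A minimal sketch (illustrative only, assuming a valid Level Zero context):

#include <level_zero/ze_api.h>

// Allocate host USM that an integrated GPU can access in place (zero-copy).
// A discrete GPU can generally reach the same allocation too, but over the
// PCIe link, which is why the adapter only takes the host-buffer path when
// ZE_DEVICE_PROPERTY_FLAG_INTEGRATED is set on the sole device.
void *allocSharedWithIntegratedGpu(ze_context_handle_t ctx, size_t size) {
  ze_host_mem_alloc_desc_t hostDesc{};
  hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
  void *ptr = nullptr;
  if (zeMemAllocHost(ctx, &hostDesc, size, /*alignment=*/0, &ptr) !=
      ZE_RESULT_SUCCESS)
    return nullptr;
  return ptr; // CPU and integrated GPU see the same physical pages
}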
@@ -566,6 +610,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext,
   void *hostPtr = pProperties ? pProperties->pHost : nullptr;
   auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags);
 
+  // For integrated devices, use zero-copy host buffers. The integrated buffer
+  // constructor will handle all cases:
+  //   1. No host pointer - allocate USM host memory
+  //   2. Host pointer is already USM - use it directly
+  //   3. Host pointer can be imported - import it
+  //   4. Otherwise - allocate USM host memory and copy back via map/unmap
   if (useHostBuffer(hContext)) {
     *phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>(
         hContext, hostPtr, size, accessMode);
23 changes: 12 additions & 11 deletions unified-runtime/source/adapters/level_zero/v2/memory.hpp
@@ -87,6 +87,17 @@ struct ur_usm_handle_t : ur_mem_buffer_t {
   void *ptr;
 };
 
+struct host_allocation_desc_t {
+  host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset,
+                         ur_map_flags_t flags)
+      : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {}
+
+  usm_unique_ptr_t ptr;
+  size_t size;
+  size_t offset;
+  ur_map_flags_t flags;
+};
+
 // Manages memory buffer for integrated GPU.
 // For integrated devices the buffer has been allocated in host memory
 // and can be accessed by the device without copying.
@@ -112,17 +123,7 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t {
 private:
   usm_unique_ptr_t ptr;
   void *writeBackPtr = nullptr;
-};
-
-struct host_allocation_desc_t {
-  host_allocation_desc_t(usm_unique_ptr_t ptr, size_t size, size_t offset,
-                         ur_map_flags_t flags)
-      : ptr(std::move(ptr)), size(size), offset(offset), flags(flags) {}
-
-  usm_unique_ptr_t ptr;
-  size_t size;
-  size_t offset;
-  ur_map_flags_t flags;
+  std::vector<host_allocation_desc_t> mappedRegions;
 };
 
 // Manages memory buffer for discrete GPU.
@@ -211,6 +211,19 @@ DeviceType GetDeviceType(ur_context_handle_t Context,
     // TODO: Check fpga is fpga emulator
     return DeviceType::CPU;
   case UR_DEVICE_TYPE_GPU: {
+    // Check if this is an integrated GPU - they share system memory with CPU
+    // and should use CPU-style shadow memory layout
+    ur_bool_t IsIntegrated = false;
+    [[maybe_unused]] ur_result_t QueryResult =
+        getContext()->urDdiTable.Device.pfnGetInfo(
+            Device, UR_DEVICE_INFO_IS_INTEGRATED_GPU, sizeof(ur_bool_t),
+            &IsIntegrated, nullptr);
+    if (QueryResult == UR_RESULT_SUCCESS && IsIntegrated) {
+      UR_LOG_L(getContext()->logger, DEBUG,
+               "GetDeviceType: Integrated GPU detected, using CPU layout");
+      return DeviceType::CPU;
+    }
+
     uptr Ptr;
     [[maybe_unused]] ur_result_t Result =
         getContext()->urDdiTable.USM.pfnDeviceAlloc(Context, Device, nullptr,