-
Notifications
You must be signed in to change notification settings - Fork 798
[UR][L0v2 adapter] Fix buffers for integrated gpu #20495
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
203620d
c3ce32b
33b3f69
f1b62a9
2a0d2df
197201e
5e6f37c
4f43c32
a76d860
a675fa2
f731333
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,9 +8,9 @@ | |
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #include "memory.hpp" | ||
| #include "../ur_interface_loader.hpp" | ||
| #include "context.hpp" | ||
| #include "memory.hpp" | ||
|
|
||
| #include "../helpers/memory_helpers.hpp" | ||
| #include "../image_common.hpp" | ||
|
|
@@ -53,6 +53,34 @@ void ur_usm_handle_t::unmapHostPtr(void * /*pMappedPtr*/, | |
| /* nop */ | ||
| } | ||
|
|
||
| static v2::raii::command_list_unique_handle | ||
| getSyncCommandListForCopy(ur_context_handle_t hContext, | ||
| ur_device_handle_t hDevice) { | ||
| v2::command_list_desc_t listDesc; | ||
| listDesc.IsInOrder = true; | ||
| listDesc.Ordinal = | ||
| hDevice | ||
| ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] | ||
| .ZeOrdinal; | ||
| listDesc.CopyOffloadEnable = true; | ||
| return hContext->getCommandListCache().getImmediateCommandList( | ||
| hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, | ||
| ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt); | ||
| } | ||
|
|
||
| static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, | ||
| ur_device_handle_t hDevice, void *dst, | ||
| const void *src, size_t size) try { | ||
| auto commandList = getSyncCommandListForCopy(hContext, hDevice); | ||
|
|
||
| ZE2UR_CALL(zeCommandListAppendMemoryCopy, | ||
| (commandList.get(), dst, src, size, nullptr, 0, nullptr)); | ||
|
|
||
| return UR_RESULT_SUCCESS; | ||
| } catch (...) { | ||
| return exceptionToResult(std::current_exception()); | ||
| } | ||
|
|
||
| ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( | ||
| ur_context_handle_t hContext, void *hostPtr, size_t size, | ||
| device_access_mode_t accessMode) | ||
|
|
@@ -68,6 +96,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( | |
| }); | ||
| } else { | ||
| void *rawPtr; | ||
| // Use HOST memory for integrated GPUs to enable zero-copy device access | ||
| UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( | ||
| hContext, nullptr, nullptr, UR_USM_TYPE_HOST, size, &rawPtr)); | ||
|
|
||
|
|
@@ -79,7 +108,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( | |
| }); | ||
|
|
||
| if (hostPtr) { | ||
| std::memcpy(this->ptr.get(), hostPtr, size); | ||
| // Initial copy using Level Zero for USM HOST memory | ||
| auto hDevice = hContext->getDevices()[0]; | ||
| UR_CALL_THROWS( | ||
| synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); | ||
| // Set writeBackPtr to enable map/unmap copy-back (but NOT destructor | ||
| // copy-back) | ||
| writeBackPtr = hostPtr; | ||
| } | ||
| } | ||
|
|
@@ -97,12 +131,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( | |
| }); | ||
| } | ||
|
|
||
| ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { | ||
| if (writeBackPtr) { | ||
| std::memcpy(writeBackPtr, this->ptr.get(), size); | ||
| } | ||
| } | ||
|
|
||
| void *ur_integrated_buffer_handle_t::getDevicePtr( | ||
| ur_device_handle_t /*hDevice*/, device_access_mode_t /*access*/, | ||
| size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, | ||
|
|
@@ -111,48 +139,67 @@ void *ur_integrated_buffer_handle_t::getDevicePtr( | |
| } | ||
|
|
||
| void *ur_integrated_buffer_handle_t::mapHostPtr( | ||
| ur_map_flags_t /*flags*/, size_t offset, size_t /*size*/, | ||
| ur_map_flags_t flags, size_t offset, size_t mapSize, | ||
| ze_command_list_handle_t /*cmdList*/, wait_list_view & /*waitListView*/) { | ||
| // TODO: if writeBackPtr is set, we should map to that pointer | ||
| // because that's what SYCL expects, SYCL will attempt to call free | ||
| // on the resulting pointer leading to double free with the current | ||
| // implementation. Investigate the SYCL implementation. | ||
|
Comment on lines
-116
to
-119
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was this comment false?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You are right — the comment was not false as long as the map/unmap functions lacked full handling of the copy-back path. The last commit addresses the issue. |
||
| if (writeBackPtr) { | ||
| // Copy-back path: user gets back their original pointer | ||
| void *mappedPtr = ur_cast<char *>(writeBackPtr) + offset; | ||
|
|
||
| if (flags & UR_MAP_FLAG_READ) { | ||
| // Use Level Zero copy for USM HOST memory to ensure GPU visibility | ||
| auto hDevice = hContext->getDevices()[0]; | ||
| UR_CALL_THROWS(synchronousZeCopy(hContext, hDevice, mappedPtr, | ||
| ur_cast<char *>(ptr.get()) + offset, | ||
| mapSize)); | ||
| } | ||
|
|
||
| // Track this mapping for unmap | ||
| mappedRegions.emplace_back(usm_unique_ptr_t(mappedPtr, [](void *) {}), | ||
| mapSize, offset, flags); | ||
|
|
||
| return mappedPtr; | ||
| } | ||
|
|
||
| // Zero-copy path: for successfully imported or USM pointers | ||
| return ur_cast<char *>(ptr.get()) + offset; | ||
| } | ||
|
|
||
| void ur_integrated_buffer_handle_t::unmapHostPtr( | ||
| void * /*pMappedPtr*/, ze_command_list_handle_t /*cmdList*/, | ||
| void *pMappedPtr, ze_command_list_handle_t /*cmdList*/, | ||
| wait_list_view & /*waitListView*/) { | ||
| // TODO: if writeBackPtr is set, we should copy the data back | ||
| /* nop */ | ||
| } | ||
|
|
||
| static v2::raii::command_list_unique_handle | ||
| getSyncCommandListForCopy(ur_context_handle_t hContext, | ||
| ur_device_handle_t hDevice) { | ||
| v2::command_list_desc_t listDesc; | ||
| listDesc.IsInOrder = true; | ||
| listDesc.Ordinal = | ||
| hDevice | ||
| ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] | ||
| .ZeOrdinal; | ||
| listDesc.CopyOffloadEnable = true; | ||
| return hContext->getCommandListCache().getImmediateCommandList( | ||
| hDevice->ZeDevice, listDesc, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, | ||
| ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt); | ||
| } | ||
| if (writeBackPtr) { | ||
| // Copy-back path: find the mapped region and copy data back if needed | ||
| auto mappedRegion = | ||
| std::find_if(mappedRegions.begin(), mappedRegions.end(), | ||
| [pMappedPtr](const host_allocation_desc_t &desc) { | ||
| return desc.ptr.get() == pMappedPtr; | ||
| }); | ||
|
|
||
| if (mappedRegion == mappedRegions.end()) { | ||
| UR_DFAILURE("could not find pMappedPtr:" << pMappedPtr); | ||
| throw UR_RESULT_ERROR_INVALID_ARGUMENT; | ||
| } | ||
|
|
||
| static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, | ||
| ur_device_handle_t hDevice, void *dst, | ||
| const void *src, size_t size) try { | ||
| auto commandList = getSyncCommandListForCopy(hContext, hDevice); | ||
| if (mappedRegion->flags & | ||
| (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION)) { | ||
| // Use Level Zero copy for USM HOST memory to ensure GPU visibility | ||
| auto hDevice = hContext->getDevices()[0]; | ||
| UR_CALL_THROWS(synchronousZeCopy( | ||
| hContext, hDevice, ur_cast<char *>(ptr.get()) + mappedRegion->offset, | ||
| mappedRegion->ptr.get(), mappedRegion->size)); | ||
| } | ||
|
|
||
| ZE2UR_CALL(zeCommandListAppendMemoryCopy, | ||
| (commandList.get(), dst, src, size, nullptr, 0, nullptr)); | ||
| mappedRegions.erase(mappedRegion); | ||
| return; | ||
| } | ||
| // No op for zero-copy path, memory is synced | ||
| } | ||
|
|
||
| return UR_RESULT_SUCCESS; | ||
| } catch (...) { | ||
| return exceptionToResult(std::current_exception()); | ||
| ur_integrated_buffer_handle_t::~ur_integrated_buffer_handle_t() { | ||
| // Do NOT do automatic copy-back in destructor - it causes heap corruption | ||
| // because writeBackPtr may be freed by SYCL runtime before buffer destructor | ||
| // runs. Copy-back happens via explicit map/unmap operations (see | ||
| // mapHostPtr/unmapHostPtr). | ||
| } | ||
|
|
||
| void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice, | ||
|
|
@@ -410,19 +457,16 @@ void ur_shared_buffer_handle_t::unmapHostPtr( | |
| // nop | ||
| } | ||
|
|
||
| static bool useHostBuffer(ur_context_handle_t /* hContext */) { | ||
| static bool useHostBuffer(ur_context_handle_t hContext) { | ||
| // We treat integrated devices (physical memory shared with the CPU) | ||
| // differently from discrete devices (those with distinct memories). | ||
| // For integrated devices, allocating the buffer in the host memory | ||
| // enables automatic access from the device, and makes copying | ||
| // unnecessary in the map/unmap operations. This improves performance. | ||
|
|
||
| // TODO: fix integrated buffer implementation | ||
| return false; | ||
|
|
||
| // return hContext->getDevices().size() == 1 && | ||
| // hContext->getDevices()[0]->ZeDeviceProperties->flags & | ||
| // ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; | ||
| return hContext->getDevices().size() == 1 && | ||
| hContext->getDevices()[0]->ZeDeviceProperties->flags & | ||
| ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; | ||
| } | ||
|
|
||
| ur_mem_sub_buffer_t::ur_mem_sub_buffer_t(ur_mem_handle_t hParent, size_t offset, | ||
|
|
@@ -566,6 +610,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, | |
| void *hostPtr = pProperties ? pProperties->pHost : nullptr; | ||
| auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); | ||
|
|
||
| // For integrated devices, use zero-copy host buffers. The integrated buffer | ||
| // constructor will handle all cases: | ||
| // 1. No host pointer - allocate USM host memory | ||
| // 2. Host pointer is already USM - use directly | ||
| // 3. Host pointer can be imported - import it | ||
| // 4. Otherwise - allocate USM and copy-back on destruction | ||
| if (useHostBuffer(hContext)) { | ||
| *phBuffer = ur_mem_handle_t_::create<ur_integrated_buffer_handle_t>( | ||
| hContext, hostPtr, size, accessMode); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed for local tests, but may be useful in CI, too, depending on the hardware configuration. It allows specifying which GPU should be used for tests (e.g. integrated/discrete). Example:
llvm-lit --param "sycl_devices=level_zero_v2:1" — the tests are run on device level_zero:1 with the v2 adapter. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After a closer look, as some tests appeared as unsupported, an additional one-line fix was necessary.
So, if any of reviewers thinks the change does not belong to this PR, I am also OK with that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we should be changing the default config. CI and other dev systems where this is used may have a different configuration where :0 and :1 mean e.g., different dGPUs. Or there may not be a second GPU at all.