Skip to content

Commit 4a245b8

Browse files
authored
[SYCL] optimize createSyclObjFromImpl calls to take rvalue-ref to shared_ptr (#20859)
This optimization moves the `shared_ptr` into _createSyclObjFromImpl_ instead of copying it, which saves two atomic operations per call: the reference-count increment made by the copy and the matching decrement when the copy's source is destroyed (see e.g. [this SO thread](https://stackoverflow.com/a/41874953/1654158)). I've applied it in all possible places in the code, leaving only those where copying is indeed needed (mostly for _context_impl_ use).

### Results summary

Overhead over UR is reduced by ~8% in scenarios using events. Other benchmarks also show visible improvements in many cases, including the new PyTorch multiqueue benchmarks, which improved overall by 2.7%.

### Results Examples

The new result is shown by the dots on the right-hand side of each plot.

<img width="1548" height="735" alt="SubmitKernel out of order using events long kernel, CPU count(1)" src="https://github.com/user-attachments/assets/829ab30e-76f3-42a8-b6d9-c17714a5a145" />

old = 134.6, new = 132.8, UR baseline = 113, overhead over UR reduced by 8.3%

<img width="1548" height="735" alt="SubmitKernel out of order with completion using events, CPU count(4)" src="https://github.com/user-attachments/assets/17e9d7cf-c88d-455e-b116-39e2f1b8f04c" />

old = 140, new = 138.2, UR baseline = 118.1, **overhead over UR reduced by 8.1%**

<img width="1548" height="735" alt="SubmitKernel in order, CPU count(5)" src="https://github.com/user-attachments/assets/1e3f784a-efdb-47cc-8dea-bc516bdad33a" />

old = 122.3, new = 121.3, UR baseline = 108.1, **overhead over UR reduced by 7.0%**

<img width="1548" height="810" alt="SubmitKernel in order using events(2)" src="https://github.com/user-attachments/assets/e96770aa-9716-4293-a0b6-29babd25ee5e" />

old time = 13.91, new time = 13.58, **whole stack reduced by 2.4%**

And finally, the new PyTorch microbenchmarks:

<img width="1548" height="735" alt="KernelSubmitMultiQueue small" src="https://github.com/user-attachments/assets/d23b3c40-ae0e-4528-9c5e-f2726e3030ce" />

old time = 1.81, new time = 1.76, L0 baseline = 1.44, whole stack reduced by 2.8%, **overhead over L0 reduced by 13.5%**
1 parent 4fa9f71 commit 4a245b8

File tree

7 files changed

+14
-12
lines changed

7 files changed

+14
-12
lines changed

sycl/include/sycl/ext/oneapi/weak_object.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ class weak_object : public detail::weak_object_base<SYCLObjT> {
7373
auto MObjImplPtr = this->MObjWeakPtr.lock();
7474
if (!MObjImplPtr)
7575
return std::nullopt;
76-
return sycl::detail::createSyclObjFromImpl<SYCLObjT>(MObjImplPtr);
76+
return sycl::detail::createSyclObjFromImpl<SYCLObjT>(
77+
std::move(MObjImplPtr));
7778
}
7879
SYCLObjT lock() const {
7980
std::optional<SYCLObjT> OptionalObj = try_lock();

sycl/include/sycl/kernel_bundle.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,8 @@ template <bundle_state State>
731731
kernel_bundle<State> get_empty_interop_kernel_bundle(const context &Ctx) {
732732
detail::KernelBundleImplPtr Impl =
733733
detail::get_empty_interop_kernel_bundle_impl(Ctx, Ctx.get_devices());
734-
return detail::createSyclObjFromImpl<sycl::kernel_bundle<State>>(Impl);
734+
return detail::createSyclObjFromImpl<sycl::kernel_bundle<State>>(
735+
std::move(Impl));
735736
}
736737
} // namespace detail
737738

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,10 +1723,8 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
17231723
// ... and create a unique kernel ID for the entry
17241724
auto It = m_KernelName2KernelIDs.find(name);
17251725
if (It == m_KernelName2KernelIDs.end()) {
1726-
std::shared_ptr<detail::kernel_id_impl> KernelIDImpl =
1727-
std::make_shared<detail::kernel_id_impl>(name);
1728-
sycl::kernel_id KernelID =
1729-
detail::createSyclObjFromImpl<sycl::kernel_id>(KernelIDImpl);
1726+
sycl::kernel_id KernelID = detail::createSyclObjFromImpl<sycl::kernel_id>(
1727+
std::make_shared<detail::kernel_id_impl>(name));
17301728

17311729
It = m_KernelName2KernelIDs.emplace_hint(It, name, KernelID);
17321730
}

sycl/source/detail/queue_impl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ prepareSYCLEventAssociatedWithQueue(detail::queue_impl &QueueImpl) {
8181
auto EventImpl = detail::event_impl::create_device_event(QueueImpl);
8282
EventImpl->setContextImpl(QueueImpl.getContextImpl());
8383
EventImpl->setStateIncomplete();
84-
return detail::createSyclObjFromImpl<event>(EventImpl);
84+
return detail::createSyclObjFromImpl<event>(std::move(EventImpl));
8585
}
8686

8787
const std::vector<event> &
@@ -103,7 +103,8 @@ queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
103103
if (ExternalEvent)
104104
MutableVec.push_back(*ExternalEvent);
105105
if (ExtraEvent)
106-
MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
106+
MutableVec.push_back(
107+
detail::createSyclObjFromImpl<event>(std::move(ExtraEvent)));
107108
return MutableVec;
108109
}
109110

sycl/source/detail/queue_impl.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
348348

349349
detail::EventImplPtr ResEvent = submit_impl(CGF, /*CallerNeedsEvent=*/true,
350350
Loc, IsTopCodeLoc, SubmitInfo);
351-
return createSyclObjFromImpl<event>(ResEvent);
351+
return createSyclObjFromImpl<event>(std::move(ResEvent));
352352
}
353353

354354
event submit_kernel_direct_with_event(
@@ -361,7 +361,7 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
361361
detail::EventImplPtr EventImpl = submit_kernel_direct_impl(
362362
NDRDescT(RangeView), HostKernel, DeviceKernelInfo,
363363
/*CallerNeedsEvent*/ true, DepEvents, Props, CodeLoc, IsTopCodeLoc);
364-
return createSyclObjFromImpl<event>(EventImpl);
364+
return createSyclObjFromImpl<event>(std::move(EventImpl));
365365
}
366366

367367
void submit_kernel_direct_without_event(

sycl/source/kernel_bundle.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ obj_kb compile_from_source(
516516
kernel_bundle_impl &sourceImpl = *getSyclObjImpl(SourceKB);
517517
std::shared_ptr<kernel_bundle_impl> KBImpl = sourceImpl.compile_from_source(
518518
UniqueDevices, BuildOptions, LogPtr, RegisteredKernelNames);
519-
auto result = sycl::detail::createSyclObjFromImpl<obj_kb>(KBImpl);
519+
auto result = sycl::detail::createSyclObjFromImpl<obj_kb>(std::move(KBImpl));
520520
if (LogView)
521521
*LogView = Log;
522522
return result;

sycl/source/queue.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ queue::ext_oneapi_get_graph() const {
9292

9393
return sycl::detail::createSyclObjFromImpl<
9494
ext::oneapi::experimental::command_graph<
95-
ext::oneapi::experimental::graph_state::modifiable>>(Graph);
95+
ext::oneapi::experimental::graph_state::modifiable>>(
96+
std::move(Graph));
9697
}
9798

9899
void queue::throw_asynchronous() { impl->throw_asynchronous(); }

0 commit comments

Comments
 (0)