From 5d0388b14139a10e87554184faad68dc66deb520 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Mon, 31 Mar 2025 15:24:55 -0700 Subject: [PATCH 1/3] interim --- sycl/source/CMakeLists.txt | 4 +++ sycl/source/detail/scheduler/scheduler.cpp | 30 +++++++++++++--------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 2570921a2d565..8a9db4ba8c2ef 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -3,6 +3,10 @@ #2. Use AddLLVM to modify the build and access config options #cmake_policy(SET CMP0057 NEW) #include(AddLLVM) + +set(CMAKE_BUILD_TYPE Debug) + + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/version.rc.in ${CMAKE_CURRENT_BINARY_DIR}/version.rc diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c36ff2acbb21a..d41c39468b831 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -60,7 +60,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "1- Enqueue process failed."); #ifdef XPTI_ENABLE_INSTRUMENTATION // Capture the dependencies DepCommands.insert(Cmd); @@ -76,7 +76,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "2- Enqueue process failed."); #ifdef XPTI_ENABLE_INSTRUMENTATION DepCommands.insert(Cmd); #endif @@ -89,7 +89,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, Res, ToCleanUp, ReleaseCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "3- Enqueue process failed."); #ifdef XPTI_ENABLE_INSTRUMENTATION // Report these dependencies to the Command so these dependencies can be // reported as edges @@ -156,6 +156,12 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, bool Enqueued; auto CleanUp = [&]() { + // restore the enqueue status + // this fixes the bug where the buffer is not re-usable. + // BUT, ironically, it reintroduces the other scheduler failure I fixed, + // where exceptions lead to memory leaks. if(NewCmd) + // NewCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) { if (NewEvent) { NewEvent->setCommand(nullptr); @@ -189,7 +195,7 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, NewCmd, Lock, Res, ToCleanUp, NewCmd, Blocking); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "4- Enqueue process failed."); } catch (...) { // enqueueCommand() func and if statement above may throw an exception, // so destroy required resources to avoid memory leak @@ -230,7 +236,7 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) { CopyBackCmdsFailed |= Res.MCmd == Cmd; throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "5- Enqueue process failed."); } } @@ -239,7 +245,7 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) { CopyBackCmdsFailed |= Res.MCmd == NewCmd; throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "6- Enqueue process failed."); } } catch (...) { if (CopyBackCmdsFailed) { @@ -323,7 +329,7 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "7- Enqueue process failed."); } if (Command *NewCmd = static_cast(NewCmdEvent->getCommand())) { @@ -331,7 +337,7 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "8- Enqueue process failed."); } } @@ -366,7 +372,7 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "9- Enqueue process failed."); } }; @@ -386,7 +392,7 @@ void Scheduler::enqueueUnblockedCommands( GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "10- Enqueue process failed."); } } @@ -632,7 +638,7 @@ EventImplPtr Scheduler::addCommandGraphUpdate( Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "11- Enqueue process failed."); } if (Command *NewCmd = static_cast(NewCmdEvent->getCommand())) { @@ -640,7 +646,7 @@ EventImplPtr Scheduler::addCommandGraphUpdate( GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + "12- Enqueue process failed."); } } From 7090dac1ef16ac1d7e619ff2bb30cf0c86c84e8b Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Mon, 31 Mar 2025 17:45:44 -0700 Subject: [PATCH 2/3] seems to fix, but needs to be checked --- sycl/source/detail/scheduler/graph_processor.cpp | 5 +++++ sycl/source/detail/scheduler/scheduler.cpp | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 14396d8790968..95dbf7b9120c1 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -84,6 +84,11 @@ bool Scheduler::GraphProcessor::enqueueCommand( return false; } + // CP + // Reset enqueue status if reattempting + if(Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + Cmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + // Recursively enqueue all the implicit + explicit backend level dependencies // first and exit immediately if any of the commands cannot be enqueued. for (const EventImplPtr &Event : Cmd->getPreparedDepsEvents()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index d41c39468b831..ae461c143e64b 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -159,7 +159,8 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, // restore the enqueue status // this fixes the bug where the buffer is not re-usable. // BUT, ironically, it reintroduces the other scheduler failure I fixed, - // where exceptions lead to memory leaks. if(NewCmd) + // where exceptions lead to memory leaks. + // if(NewCmd) // NewCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) { From 96e423b7e95a3d006c79488b37a381f14515be5d Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Wed, 2 Apr 2025 17:51:04 -0700 Subject: [PATCH 3/3] fix? --- sycl/source/detail/scheduler/graph_builder.cpp | 4 +++- .../source/detail/scheduler/graph_processor.cpp | 6 ++++-- sycl/source/detail/scheduler/scheduler.cpp | 11 +++++++++-- sycl/unittests/scheduler/FailedCommands.cpp | 16 ++++++++-------- sycl/unittests/scheduler/SchedulerTestUtils.hpp | 17 +++++++++++++++++ 5 files changed, 41 insertions(+), 13 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 85bc93f7d6a9a..ee058ac56501e 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -486,8 +486,10 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, std::vector ToCleanUp; for (Command *Dep : Deps) { - if (Dep->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + if (Dep->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) { + std::cout << "graph_builder:490 says HELLO!!" << std::endl; continue; + } Command *ConnCmd = MemCpyCmd->addDep( DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp); diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 95dbf7b9120c1..5d252fd9a833f 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -84,10 +84,12 @@ bool Scheduler::GraphProcessor::enqueueCommand( return false; } - // CP + // CP -- FAIL TWO // Reset enqueue status if reattempting - if(Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + if(Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed){ Cmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + //std::cout << "CP FAIL TWO. EnqueueResult/Cmd/Err: " << EnqueueResult.MResult << "/" << (long)EnqueueResult.MCmd << "/" << EnqueueResult.MErrCode << std::endl; + } // Recursively enqueue all the implicit + explicit backend level dependencies // first and exit immediately if any of the commands cannot be enqueued. diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index ae461c143e64b..a9fea94147108 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -52,8 +52,10 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, #endif std::vector ToCleanUp; for (Command *Cmd : Record->MReadLeaves) { - if (Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + if (Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) { + std::cout << "scheduler.cpp:56 says HELLO!" << std::endl; continue; + } EnqueueResultT Res; bool Enqueued = @@ -68,8 +70,10 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp); } for (Command *Cmd : Record->MWriteLeaves) { - if (Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + if (Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) { + std::cout << "scheduler.cpp:74 says HELLO!!" << std::endl; continue; + } EnqueueResultT Res; bool Enqueued = @@ -156,12 +160,15 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, bool Enqueued; auto CleanUp = [&]() { + + // CP -- FAIL ONE // restore the enqueue status // this fixes the bug where the buffer is not re-usable. // BUT, ironically, it reintroduces the other scheduler failure I fixed, // where exceptions lead to memory leaks. // if(NewCmd) // NewCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + //std::cout << "CleanUp Hit! " << NewCmd->MMarkedForCleanup << std::endl; if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) { if (NewEvent) { diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp index 48f0f906a0fc2..19b2924a773f5 100644 --- a/sycl/unittests/scheduler/FailedCommands.cpp +++ b/sycl/unittests/scheduler/FailedCommands.cpp @@ -20,13 +20,13 @@ TEST_F(SchedulerTest, FailedDependency) { queue Queue(context(Plt), default_selector_v); detail::Requirement MockReq = getMockRequirement(); - MockCommand MDep(detail::getSyclObjImpl(Queue)); - MockCommand MUser(detail::getSyclObjImpl(Queue)); - MDep.addUser(&MUser); + MockCommand MDepFail(false, detail::getSyclObjImpl(Queue)); // <-- will fail to enqueue + MockCommand MUser(detail::getSyclObjImpl(Queue)); + MDepFail.addUser(&MUser); std::vector ToCleanUp; - (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}, ToCleanUp); + (void)MUser.addDep(detail::DepDesc{&MDepFail, &MockReq, nullptr}, ToCleanUp); MUser.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady; - MDep.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueFailed; + MDepFail.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady; MockScheduler MS; auto Lock = MS.acquireGraphReadLock(); @@ -35,13 +35,13 @@ TEST_F(SchedulerTest, FailedDependency) { MockScheduler::enqueueCommand(&MUser, Res, detail::NON_BLOCKING); ASSERT_FALSE(Enqueued) << "Enqueue process must fail\n"; - ASSERT_EQ(Res.MCmd, &MDep) << "Wrong failed command\n"; + ASSERT_EQ(Res.MCmd, &MDepFail) << "Wrong failed command\n"; ASSERT_EQ(Res.MResult, detail::EnqueueResultT::SyclEnqueueFailed) << "Enqueue process must fail\n"; ASSERT_EQ(MUser.MEnqueueStatus, detail::EnqueueResultT::SyclEnqueueReady) << "MUser shouldn't be marked as failed\n"; - ASSERT_EQ(MDep.MEnqueueStatus, detail::EnqueueResultT::SyclEnqueueFailed) - << "MDep should be marked as failed\n"; + ASSERT_EQ(MDepFail.MEnqueueStatus, detail::EnqueueResultT::SyclEnqueueFailed) + << "MDepFail should be marked as failed\n"; } void RunWithFailedCommandsAndCheck(bool SyncExceptionExpected, diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index ffc0567ba7daa..5b870035bf34e 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -53,6 +53,15 @@ class MockCommand : public sycl::detail::Command { EXPECT_CALL(*this, enqueue).Times(AnyNumber()); } + MockCommand(bool, sycl::detail::QueueImplPtr Queue, + sycl::detail::Command::CommandType Type = sycl::detail::Command::RUN_CG) + : Command{Type, Queue}, MRequirement{std::move(getMockRequirement())} { + using namespace testing; + ON_CALL(*this, enqueue) + .WillByDefault(Invoke(this, &MockCommand::enqueueFail)); + EXPECT_CALL(*this, enqueue).Times(AnyNumber()); + } + void printDot(std::ostream &) const override {} void emitInstrumentationData() override {} @@ -70,6 +79,14 @@ class MockCommand : public sycl::detail::Command { std::vector &ToCleanUp) { return sycl::detail::Command::enqueue(EnqueueResult, Blocking, ToCleanUp); } + bool enqueueFail(sycl::detail::EnqueueResultT &EnqueueResult, + sycl::detail::BlockingT Blocking, + std::vector &ToCleanUp) { + this->MEnqueueStatus = sycl::detail::EnqueueResultT::SyclEnqueueFailed; + EnqueueResult = {sycl::detail::EnqueueResultT::SyclEnqueueFailed, this}; + ToCleanUp.push_back(this); + return false; + } ur_result_t MRetVal = UR_RESULT_SUCCESS;