From 9dd5ad84c935a1bbfae9761367a666e79d1a1b21 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 27 Jun 2025 11:41:44 -0700 Subject: [PATCH 1/4] [NFCI][SYCL] Refactor `HandlerAccess::postProcess` Implements the idea from the earlier TODO comment. Instead of having a hacky `handler::MLastEvent` just `swap` original and post-processing `handler`, so that "natural" `finalize` will work on the latest task in the chain. --- .../include/sycl/detail/reduction_forward.hpp | 5 - sycl/include/sycl/handler.hpp | 49 ++++--- sycl/include/sycl/reduction.hpp | 9 -- sycl/source/handler.cpp | 123 ++++++------------ sycl/test/abi/sycl_symbols_linux.dump | 4 + 5 files changed, 80 insertions(+), 110 deletions(-) diff --git a/sycl/include/sycl/detail/reduction_forward.hpp b/sycl/include/sycl/detail/reduction_forward.hpp index 4045a00ec0cce..6bafc93b70ce6 100644 --- a/sycl/include/sycl/detail/reduction_forward.hpp +++ b/sycl/include/sycl/detail/reduction_forward.hpp @@ -46,11 +46,6 @@ enum class strategy : int { multi, }; -// Reductions implementation need access to private members of handler. Those -// are limited to those below. -inline void finalizeHandler(handler &CGH); -template void withAuxHandler(handler &CGH, FunctorTy Func); - template item getDelinearizedItem(range Range, id Id) { return Builder::createItem(Range, Id); diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 852045e2bf3f0..e074e5f1ada42 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -488,6 +488,9 @@ class __SYCL_EXPORT handler { /// \param Graph is a SYCL command_graph handler(std::shared_ptr Graph); #endif + handler(std::unique_ptr &&HandlerImpl); + + ~handler(); void *storeRawArg(const void *Ptr, size_t Size); @@ -619,8 +622,6 @@ class __SYCL_EXPORT handler { addReduction(std::shared_ptr(ReduBuf)); } - ~handler() = default; - #ifdef __SYCL_DEVICE_ONLY__ // In device compilation accessor isn't inherited from host base classes, so // can't detect by it. Since we don't expect it to be ever called in device @@ -3396,9 +3397,7 @@ class __SYCL_EXPORT handler { private: #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - // TODO: Maybe make it a reference when non-preview branch is removed. - // On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might - // be useful in future, pointer here would make that possible/easier. + std::unique_ptr implOwner; detail::handler_impl *impl; #else std::shared_ptr impl; @@ -3423,11 +3422,10 @@ class __SYCL_EXPORT handler { std::unique_ptr MHostKernel; detail::code_location MCodeLoc = {}; - bool MIsFinalized = false; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - detail::EventImplPtr MLastEvent; -#else - event MLastEvent; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + // Was used for the previous reduction implementation (via `withAuxHandler`). + bool MIsFinalizedDoNotUse = false; + event MLastEventDoNotUse; #endif // Make queue_impl class friend to be able to call finalize method. @@ -3452,8 +3450,6 @@ class __SYCL_EXPORT handler { bool ExplicitIdentity, typename RedOutVar> friend class detail::reduction_impl_algo; - friend inline void detail::reduction::finalizeHandler(handler &CGH); - template friend void detail::reduction_parallel_for(handler &CGH, range NDRange, @@ -3920,6 +3916,30 @@ class HandlerAccess { Handler.parallel_for_impl(Range, Props, Kernel); } + static void swap(handler &LHS, handler &RHS) { +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + std::swap(LHS.implOwner, RHS.implOwner); +#endif + std::swap(LHS.impl, RHS.impl); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + std::swap(LHS.MQueueDoNotUse, RHS.MQueueDoNotUse); +#endif + std::swap(LHS.MLocalAccStorage, RHS.MLocalAccStorage); + std::swap(LHS.MStreamStorage, RHS.MStreamStorage); + std::swap(LHS.MKernelName, RHS.MKernelName); + std::swap(LHS.MKernel, RHS.MKernel); + std::swap(LHS.MSrcPtr, RHS.MSrcPtr); + std::swap(LHS.MDstPtr, RHS.MDstPtr); + std::swap(LHS.MLength, RHS.MLength); + std::swap(LHS.MPattern, RHS.MPattern); + std::swap(LHS.MHostKernel, RHS.MHostKernel); + std::swap(LHS.MCodeLoc, RHS.MCodeLoc); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + std::swap(LHS.MIsFinalizedDoNotUse, RHS.MIsFinalizedDoNotUse); + std::swap(LHS.MLastEventDoNotUse, RHS.MLastEventDoNotUse); +#endif + } + // pre/postProcess are used only for reductions right now, but the // abstractions they provide aren't reduction-specific. The main problem they // solve is @@ -3932,9 +3952,8 @@ class HandlerAccess { // // that needs to be implemented as multiple enqueues involving // pre-/post-processing internally. SYCL prohibits recursive submits from - // inside control group function object (lambda above) so we resort to a - // somewhat hacky way of creating multiple `handler`s and manual finalization - // of them (instead of the one in `queue::submit`). + // inside control group function object (lambda above) so we need some + // internal interface to implement that. __SYCL_EXPORT static void preProcess(handler &CGH, type_erased_cgfo_ty F); __SYCL_EXPORT static void postProcess(handler &CGH, type_erased_cgfo_ty F); diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 2da2fb89e61d6..d7afe62562ff3 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -1162,10 +1162,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) { RedVar, std::forward(Rest)...}; } -namespace reduction { -inline void finalizeHandler(handler &CGH) { CGH.finalize(); } -} // namespace reduction - // This method is used for implementation of parallel_for accepting 1 reduction. // TODO: remove this method when everything is switched to general algorithm // implementing arbitrary number of reductions in parallel_for(). @@ -1723,8 +1719,6 @@ struct NDRangeReduction< } }); - reduction::finalizeHandler(CGH); - // Run the additional kernel as many times as needed to reduce all partial // sums into one scalar. @@ -1901,8 +1895,6 @@ template <> struct NDRangeReduction { else First(KernelMultipleWGTag{}); - reduction::finalizeHandler(CGH); - // 2. Run the additional kernel as many times as needed to reduce // all partial sums into one scalar. @@ -2598,7 +2590,6 @@ template <> struct NDRangeReduction { reduCGFuncMulti(CGH, KernelFunc, NDRange, Properties, ReduTuple, ReduIndices); - reduction::finalizeHandler(CGH); size_t NWorkItems = NDRange.get_group_range().size(); while (NWorkItems > 1) { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index d9ed0ad67e785..f5db0860bb4a2 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -318,7 +318,12 @@ fill_copy_args(detail::handler_impl *impl, #ifdef __INTEL_PREVIEW_BREAKING_CHANGES handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {} +handler::handler(std::unique_ptr &&HandlerImpl) + : implOwner(std::move(HandlerImpl)), impl(implOwner.get()) {} #else +handler::handler(std::unique_ptr &&HandlerImpl) + : impl(std::move(HandlerImpl)) {} + handler::handler(std::shared_ptr Queue, bool CallerNeedsEvent) : impl(std::make_shared(*Queue, nullptr, @@ -344,6 +349,7 @@ handler::handler( : impl(std::make_shared(*Graph)) {} #endif +handler::~handler() = default; // Sets the submission state to indicate that an explicit kernel bundle has been // set. Throws a sycl::exception with errc::invalid if the current state @@ -426,12 +432,6 @@ detail::EventImplPtr handler::finalize() { #else event handler::finalize() { #endif - // This block of code is needed only for reduction implementation. - // It is harmless (does nothing) for everything else. - if (MIsFinalized) - return MLastEvent; - MIsFinalized = true; - const auto &type = getType(); detail::queue_impl *Queue = impl->get_queue_or_null(); ext::oneapi::experimental::detail::graph_impl *Graph = @@ -559,12 +559,6 @@ event handler::finalize() { std::vector RawEvents = detail::Command::getUrEvents( impl->CGData.MEvents, impl->get_queue_or_null(), false); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - detail::EventImplPtr &LastEventImpl = MLastEvent; -#else - const detail::EventImplPtr &LastEventImpl = - detail::getSyclObjImpl(MLastEvent); -#endif bool DiscardEvent = !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); @@ -577,11 +571,10 @@ event handler::finalize() { DiscardEvent = !KernelUsesAssert; } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - if (!DiscardEvent) { - LastEventImpl = detail::event_impl::create_completed_host_event(); - } -#endif + std::shared_ptr ResultEvent = + DiscardEvent + ? nullptr + : detail::event_impl::create_device_event(impl->get_queue()); #ifdef XPTI_ENABLE_INSTRUMENTATION const bool xptiEnabled = xptiTraceEnabled(); @@ -612,9 +605,8 @@ event handler::finalize() { enqueueImpKernel( impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr, MKernel.get(), toKernelNameStrT(MKernelName), - impl->MKernelNameBasedCachePtr, RawEvents, - DiscardEvent ? nullptr : LastEventImpl.get(), nullptr, - impl->MKernelCacheConfig, impl->MKernelIsCooperative, + impl->MKernelNameBasedCachePtr, RawEvents, ResultEvent.get(), + nullptr, impl->MKernelCacheConfig, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs, impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures); @@ -624,7 +616,7 @@ event handler::finalize() { if (!DiscardEvent) { detail::emitInstrumentationGeneral( StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, - static_cast(LastEventImpl->getHandle())); + static_cast(ResultEvent->getHandle())); } detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, @@ -635,29 +627,32 @@ event handler::finalize() { if (DiscardEvent) { EnqueueKernel(); -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES - LastEventImpl->setStateDiscarded(); -#endif } else { detail::queue_impl &Queue = impl->get_queue(); - LastEventImpl->setQueue(Queue); - LastEventImpl->setWorkerQueue(Queue.weak_from_this()); - LastEventImpl->setContextImpl(impl->get_context()); - LastEventImpl->setStateIncomplete(); - LastEventImpl->setSubmissionTime(); + ResultEvent->setQueue(Queue); + ResultEvent->setWorkerQueue(Queue.weak_from_this()); + ResultEvent->setContextImpl(impl->get_context()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); EnqueueKernel(); - LastEventImpl->setEnqueued(); + ResultEvent->setEnqueued(); // connect returned event with dependent events if (!Queue.isInOrder()) { // MEvents is not used anymore, so can move. - LastEventImpl->getPreparedDepsEvents() = + ResultEvent->getPreparedDepsEvents() = std::move(impl->CGData.MEvents); - // LastEventImpl is local for current thread, no need to lock. - LastEventImpl->cleanDepEventsThroughOneLevelUnlocked(); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); } } - return MLastEvent; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + return ResultEvent; +#else + return detail::createSyclObjFromImpl( + ResultEvent ? ResultEvent + : detail::event_impl::create_discarded_event()); +#endif } } @@ -939,11 +934,10 @@ event handler::finalize() { std::move(CommandGroup), *Queue, !DiscardEvent); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - MLastEvent = DiscardEvent ? nullptr : Event; + return DiscardEvent ? nullptr : Event; #else - MLastEvent = detail::createSyclObjFromImpl(Event); + return detail::createSyclObjFromImpl(Event); #endif - return MLastEvent; } void handler::addReduction(const std::shared_ptr &ReduObj) { @@ -2474,58 +2468,25 @@ __SYCL_EXPORT void HandlerAccess::preProcess(handler &CGH, AuxHandler.copyCodeLoc(CGH); F(AuxHandler); auto E = AuxHandler.finalize(); - assert(!CGH.MIsFinalized && - "Can't do pre-processing if the command has been enqueued already!"); if (EventNeeded) CGH.depends_on(E); } __SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH, type_erased_cgfo_ty F) { - // The "hacky" `handler`s manipulation mentioned near the declaration in - // `handler.hpp` and implemented here is far from perfect. A better approach - // would be - // - // bool OrigNeedsEvent = CGH.needsEvent() - // assert(CGH.not_finalized/enqueued()); - // if (!InOrderQueue) - // CGH.setNeedsEvent() - // - // handler PostProcessHandler(Queue, OrigNeedsEvent) - // auto E = CGH.finalize(); // enqueue original or current last - // // post-process - // if (!InOrder) - // PostProcessHandler.depends_on(E) - // - // swap_impls(CGH, PostProcessHandler) - // return; // queue::submit finalizes PostProcessHandler and returns its - // // event if necessary. - // - // Still hackier than "real" `queue::submit` but at least somewhat sane. - // That, however hasn't been tried yet and we have an even hackier approach - // copied from what's been done in an old reductions implementation before - // eventless submission work has started. Not sure how feasible the approach - // above is at this moment. - - // This `finalize` is wrong (at least logically) if - // `assert(!CGH.eventNeeded())` - auto E = CGH.finalize(); + bool EventNeeded = CGH.impl->MEventNeeded; queue_impl &Q = CGH.impl->get_queue(); bool InOrder = Q.isInOrder(); - // Cannot use `CGH.eventNeeded()` alone as there might be subsequent - // `postProcess` calls and we cannot address them properly similarly to the - // `finalize` issue described above. `swap_impls` suggested above might be - // able to handle this scenario naturally. -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - handler_impl HandlerImpl{Q, nullptr, CGH.eventNeeded() || !InOrder}; - handler AuxHandler{HandlerImpl}; -#else - handler AuxHandler{Q.shared_from_this(), CGH.eventNeeded() || !InOrder}; -#endif if (!InOrder) - AuxHandler.depends_on(E); - AuxHandler.copyCodeLoc(CGH); - F(AuxHandler); - CGH.MLastEvent = AuxHandler.finalize(); + CGH.impl->MEventNeeded = true; + + handler PostProcessHandler{ + std::make_unique(Q, nullptr, EventNeeded)}; + PostProcessHandler.copyCodeLoc(CGH); + auto E = CGH.finalize(); + if (!InOrder) + PostProcessHandler.depends_on(E); + F(PostProcessHandler); + swap(CGH, PostProcessHandler); } } // namespace detail } // namespace _V1 diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 3c953567c12dd..71c9a68339f11 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3617,12 +3617,16 @@ _ZN4sycl3_V17handler8getQueueEv _ZN4sycl3_V17handler8prefetchEPKvm _ZN4sycl3_V17handler9clearArgsEv _ZN4sycl3_V17handler9fill_implEPvPKvmm +_ZN4sycl3_V17handlerC1EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN4sycl3_V17handlerC2EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN4sycl3_V17handlerD1Ev +_ZN4sycl3_V17handlerD2Ev _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE From 4c99be4ca90bde5dffbd3570bb1ec5bc84350650 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 27 Jun 2025 12:29:43 -0700 Subject: [PATCH 2/4] clang-format --- sycl/source/handler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index f5db0860bb4a2..483e56a562717 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -559,7 +559,6 @@ event handler::finalize() { std::vector RawEvents = detail::Command::getUrEvents( impl->CGData.MEvents, impl->get_queue_or_null(), false); - bool DiscardEvent = !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); if (DiscardEvent) { From 7c5daafdd4586504d112ff1e2a73c1a2c6f4c1b8 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 27 Jun 2025 16:34:48 -0700 Subject: [PATCH 3/4] Copy MAuxiliaryResources --- sycl/source/handler.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 483e56a562717..4444e519b3515 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -2481,6 +2481,9 @@ __SYCL_EXPORT void HandlerAccess::postProcess(handler &CGH, handler PostProcessHandler{ std::make_unique(Q, nullptr, EventNeeded)}; PostProcessHandler.copyCodeLoc(CGH); + // Extend lifetimes of auxiliary resources till the last kernel in the chain + // finishes: + PostProcessHandler.impl->MAuxiliaryResources = CGH.impl->MAuxiliaryResources; auto E = CGH.finalize(); if (!InOrder) PostProcessHandler.depends_on(E); From d7bf37aa0e520dd886ba318d488de0b5ee89b674 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 30 Jun 2025 07:28:37 -0700 Subject: [PATCH 4/4] Update `sycl/test/abi/sycl_symbols_windows.dump` --- sycl/test/abi/sycl_symbols_windows.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 7f96e48b36d47..b9f58f92a8c41 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -417,6 +417,7 @@ ??0gpu_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0gpu_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0gpu_selector@_V1@sycl@@QEAA@XZ +??0handler@_V1@sycl@@AEAA@$$QEAV?$unique_ptr@Vhandler_impl@detail@_V1@sycl@@U?$default_delete@Vhandler_impl@detail@_V1@sycl@@@std@@@std@@@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@PEAVqueue_impl@detail@12@_N@Z