diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 7d58d8868c348..321f51e8863c7 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -424,28 +424,23 @@ template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
 /// \ingroup sycl_api
 class __SYCL_EXPORT handler {
 private:
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+  /// Constructs SYCL handler from a pre-constructed, stack-allocated
+  /// `handler_impl` (not enforced, but a heap allocation outside the handler
+  /// instance would be meaningless).
+  ///
+  /// \param HandlerImpl is a pre-constructed handler_impl.
+  //
+  // Can't provide this overload outside preview because `handler` lacks the
+  // required data members.
+  handler(detail::handler_impl &HandlerImpl);
+#else
   /// Constructs SYCL handler from queue.
   ///
   /// \param Queue is a SYCL queue.
   /// \param CallerNeedsEvent indicates if the event resulting from this handler
   ///        is needed by the caller.
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  handler(const std::shared_ptr<detail::queue_impl> &Queue,
-          bool CallerNeedsEvent);
-#else
   handler(std::shared_ptr<detail::queue_impl> Queue, bool CallerNeedsEvent);
-#endif
-
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  /// Constructs SYCL handler from the pre-constructed handler_impl and the
-  /// associated queue. Inside of Graph implementation, the Queue value is not
-  /// used, for those cases it can be initialized with an empty shared_ptr.
-  ///
-  /// \param HandlerImpl is a pre-constructed handler_impl.
-  /// \param Queue is a SYCL queue.
-  handler(detail::handler_impl *HandlerImpl,
-          const std::shared_ptr<detail::queue_impl> &Queue);
-#else
   /// Constructs SYCL handler from the associated queue and the submission's
   /// primary and secondary queue.
   ///
@@ -456,20 +451,14 @@ class __SYCL_EXPORT handler {
   ///        is null if no secondary queue is associated with the submission.
   /// \param CallerNeedsEvent indicates if the event resulting from this handler
   ///        is needed by the caller.
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-  // TODO: This function is not used anymore, remove it in the next
-  // ABI-breaking window.
   handler(std::shared_ptr<detail::queue_impl> Queue,
           std::shared_ptr<detail::queue_impl> PrimaryQueue,
           std::shared_ptr<detail::queue_impl> SecondaryQueue,
           bool CallerNeedsEvent);
-#endif
 
   __SYCL_DLL_LOCAL handler(std::shared_ptr<detail::queue_impl> Queue,
                            detail::queue_impl *SecondaryQueue,
                            bool CallerNeedsEvent);
-#endif
 
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   /// Constructs SYCL handler from Graph.
   ///
   /// The handler will add the command-group as a node to the graph rather than
@@ -3368,16 +3357,15 @@ class __SYCL_EXPORT handler {
 
 private:
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  // In some cases we need to construct handler_impl in heap. Sole propose
-  // of MImplOwner is to destroy handler_impl in destructor of handler.
-  // Can't use unique_ptr because declaration of handler_impl is not available
-  // in this header.
-  std::shared_ptr<detail::handler_impl> MImplOwner;
+  // TODO: Maybe make it a reference when the non-preview branch is removed.
+  // On the other hand, see `HandlerAccess::postProcess` for how `swap_impl`
+  // might be useful in the future; a pointer here makes that possible/easier.
   detail::handler_impl *impl;
-  const std::shared_ptr<detail::queue_impl> &MQueue;
 #else
   std::shared_ptr<detail::handler_impl> impl;
-  std::shared_ptr<detail::queue_impl> MQueue;
+
+  // Use impl->get_queue*() instead:
+  std::shared_ptr<detail::queue_impl> MQueueDoNotUse;
 #endif
   std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
   std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
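The preview branch ties the `handler_impl` lifetime to the submitting function's stack frame instead of a heap allocation. Below is a minimal sketch of that ownership pattern using toy types (none of the real SYCL classes; `submit` stands in for `queue_impl::submit_impl`), showing why the borrowed raw pointer cannot dangle: the impl and the handler share one stack frame.

```cpp
#include <iostream>

namespace toy {
struct handler_impl {
  int MCGType = 0; // stands in for the real command-group state
};

class handler {
public:
  explicit handler(handler_impl &Impl) : impl(&Impl) {}
  void set_type(int Type) { impl->MCGType = Type; }

private:
  handler_impl *impl; // borrowed, never owned -> no heap allocation
};

// Mirrors queue_impl::submit_impl: the impl and the handler live in the
// same stack frame, so the raw pointer stays valid for the whole submission.
template <typename CGFTy> int submit(CGFTy &&CGF) {
  handler_impl HandlerImplVal; // stack-allocated
  handler Handler{HandlerImplVal};
  CGF(Handler);
  return HandlerImplVal.MCGType; // finalize() would consume the state here
}
} // namespace toy

int main() {
  int Type = toy::submit([](toy::handler &CGH) { CGH.set_type(42); });
  std::cout << Type << "\n"; // prints 42
  return 0;
}
```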
diff --git a/sycl/source/detail/async_alloc.cpp b/sycl/source/detail/async_alloc.cpp
index 1a97b7a804760..8ed468e8dae7a 100644
--- a/sycl/source/detail/async_alloc.cpp
+++ b/sycl/source/detail/async_alloc.cpp
@@ -33,7 +33,7 @@ getUrEvents(const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
 }
 
 std::vector<std::shared_ptr<detail::node_impl>> getDepGraphNodes(
-    sycl::handler &Handler, const std::shared_ptr<detail::queue_impl> &Queue,
+    sycl::handler &Handler, detail::queue_impl *Queue,
     const std::shared_ptr<detail::graph_impl> &Graph,
     const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
   auto HandlerImpl = detail::getSyclObjImpl(Handler);
@@ -46,7 +46,7 @@ std::vector<std::shared_ptr<detail::node_impl>> getDepGraphNodes(
   // If this is being recorded from an in-order queue we need to get the last
   // in-order node if any, since this will later become a dependency of the
   // node being processed here.
-  if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue.get());
+  if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue);
       LastInOrderNode) {
     DepNodes.push_back(LastInOrderNode);
   }
@@ -78,10 +78,11 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
+    auto DepNodes =
+        getDepGraphNodes(h, h.impl->get_queue_or_null(), Graph, DepEvents);
     alloc = Graph->getMemPool().malloc(size, kind, DepNodes);
   } else {
-    auto &Q = h.MQueue->getHandleRef();
+    ur_queue_handle_t Q = h.impl->get_queue().getHandleRef();
     Adapter->call<detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
         Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(),
@@ -128,13 +129,14 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
   ur_event_handle_t Event = nullptr;
   // If a graph is present do the allocation from the graph memory pool instead.
   if (auto Graph = h.getCommandGraph(); Graph) {
-    auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
+    auto DepNodes =
+        getDepGraphNodes(h, h.impl->get_queue_or_null(), Graph, DepEvents);
     // Memory pool is passed as the graph may use some properties of it.
     alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes,
                                        sycl::detail::getSyclObjImpl(pool));
   } else {
-    auto &Q = h.MQueue->getHandleRef();
+    ur_queue_handle_t Q = h.impl->get_queue().getHandleRef();
     Adapter->call<detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
         Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(),
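`get_queue_or_null()` and `get_queue()` are the handler_impl accessors this patch routes all queue access through. Their real implementation is not part of this diff, so the bodies below are an assumption about the intended contract rather than a copy of the actual code: graph-recording paths tolerate a missing queue, eager submission paths assert once.

```cpp
#include <cassert>

struct queue_impl {};

// Hypothetical shape of the accessors on handler_impl.
struct handler_impl {
  queue_impl *MQueue = nullptr; // null when recording a graph without a queue

  // Graph paths tolerate a missing queue and use the *_or_null form.
  queue_impl *get_queue_or_null() { return MQueue; }

  // Eager submission paths require a queue; the invariant is checked once
  // here instead of at every call site.
  queue_impl &get_queue() {
    assert(MQueue && "handler is not associated with a queue");
    return *MQueue;
  }
};

int main() {
  queue_impl Q;
  handler_impl WithQueue{&Q};
  handler_impl GraphOnly{};
  (void)WithQueue.get_queue();         // fine, queue present
  (void)GraphOnly.get_queue_or_null(); // nullptr, no assert fired
  return 0;
}
```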
diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp
index 2de82014c2b5b..d974da6180d53 100644
--- a/sycl/source/detail/context_impl.cpp
+++ b/sycl/source/detail/context_impl.cpp
@@ -338,14 +338,14 @@ void context_impl::addDeviceGlobalInitializer(
   }
 }
 
-std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
-    ur_program_handle_t NativePrg,
-    const std::shared_ptr<queue_impl> &QueueImpl) {
+std::vector<ur_event_handle_t>
+context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
+                                      queue_impl &QueueImpl) {
   if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
     return {};
 
   const AdapterPtr &Adapter = getAdapter();
-  device_impl &DeviceImpl = QueueImpl->getDeviceImpl();
+  device_impl &DeviceImpl = QueueImpl.getDeviceImpl();
   std::lock_guard<std::mutex> NativeProgramLock(MDeviceGlobalInitializersMutex);
   auto ImgIt = MDeviceGlobalInitializers.find(
       std::make_pair(NativePrg, DeviceImpl.getHandleRef()));
@@ -417,7 +417,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
   for (DeviceGlobalMapEntry *DeviceGlobalEntry : DeviceGlobalEntries) {
     // Get or allocate the USM memory associated with the device global.
     DeviceGlobalUSMMem &DeviceGlobalUSM =
-        DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(*QueueImpl);
+        DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(QueueImpl);
 
     // If the device global still has a initialization event it should be
     // added to the initialization events list. Since initialization events
@@ -432,7 +432,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
       ur_event_handle_t InitEvent;
       void *const &USMPtr = DeviceGlobalUSM.getPtr();
       Adapter->call<UrApiKind::urEnqueueDeviceGlobalVariableWrite>(
-          QueueImpl->getHandleRef(), NativePrg,
+          QueueImpl.getHandleRef(), NativePrg,
          DeviceGlobalEntry->MUniqueId.c_str(), false, sizeof(void *), 0,
           &USMPtr, 0, nullptr, &InitEvent);
diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp
index 84787ad50c5e2..3f93b1b03721b 100644
--- a/sycl/source/detail/context_impl.hpp
+++ b/sycl/source/detail/context_impl.hpp
@@ -223,8 +223,7 @@ class context_impl : public std::enable_shared_from_this<context_impl> {
 
   /// Initializes device globals for a program on the associated queue.
   std::vector<ur_event_handle_t>
-  initializeDeviceGlobals(ur_program_handle_t NativePrg,
-                          const std::shared_ptr<queue_impl> &QueueImpl);
+  initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl);
 
   void memcpyToHostOnlyDeviceGlobal(device_impl &DeviceImpl,
                                     const void *DeviceGlobalPtr,
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 822e8bd0808f0..908c3d07fa45f 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -176,9 +176,9 @@ event_impl::event_impl(const QueueImplPtr &Queue)
   MState.store(HES_Complete);
 }
 
-void event_impl::setQueue(const QueueImplPtr &Queue) {
-  MQueue = Queue;
-  MIsProfilingEnabled = Queue->MIsProfilingEnabled;
+void event_impl::setQueue(queue_impl &Queue) {
+  MQueue = Queue.shared_from_this();
+  MIsProfilingEnabled = Queue.MIsProfilingEnabled;
 
   // TODO After setting the queue, the event is no longer default
   // constructed. Consider a design change which would allow
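The `setQueue` change above is representative of many hunks in this patch: the callee takes a plain `queue_impl &` and re-establishes shared ownership internally. A distilled, compilable model with toy classes (the real `queue_impl` derives from `enable_shared_from_this`, as the `queue_impl.hpp` hunk later in this diff shows):

```cpp
#include <memory>

struct queue_impl : std::enable_shared_from_this<queue_impl> {
  bool MIsProfilingEnabled = false;
};

struct event_impl {
  std::shared_ptr<queue_impl> MQueue;
  bool MIsProfilingEnabled = false;

  // Callers hand over a reference; shared ownership is re-established here.
  void setQueue(queue_impl &Queue) {
    MQueue = Queue.shared_from_this(); // requires a shared_ptr-managed queue
    MIsProfilingEnabled = Queue.MIsProfilingEnabled;
  }
};

int main() {
  auto Q = std::make_shared<queue_impl>();
  event_impl E;
  E.setQueue(*Q); // no shared_ptr copy at the call site
  return E.MQueue == Q ? 0 : 1;
}
```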
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index 0297ebfd3e4d3..3a3226738c193 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -74,7 +74,7 @@ class event_impl {
   /// as it was constructed with the queue based constructor.
   ///
   /// \param Queue is a queue to be associated with the event
-  void setQueue(const QueueImplPtr &Queue);
+  void setQueue(queue_impl &Queue);
 
   /// Waits for the event.
   ///
@@ -234,14 +234,14 @@ class event_impl {
   /// Sets worker queue for command.
   ///
   /// @return
-  void setWorkerQueue(const QueueImplPtr &WorkerQueue) {
+  void setWorkerQueue(std::weak_ptr<queue_impl> WorkerQueue) {
     MWorkerQueue = WorkerQueue;
   };
 
   /// Sets original queue used for submission.
   ///
   /// @return
-  void setSubmittedQueue(const QueueImplPtr &SubmittedQueue) {
+  void setSubmittedQueue(std::weak_ptr<queue_impl> SubmittedQueue) {
     MSubmittedQueue = SubmittedQueue;
   };
 
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 1df6d05bb4760..caf50642c3a2c 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -505,7 +505,7 @@ graph_impl::add(std::function<void(handler &)> CGF,
   (void)Args;
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
   detail::handler_impl HandlerImpl{*this};
-  sycl::handler Handler{&HandlerImpl, std::shared_ptr<sycl::detail::queue_impl>{}};
+  sycl::handler Handler{HandlerImpl};
 #else
   sycl::handler Handler{shared_from_this()};
 #endif
@@ -2310,7 +2310,7 @@ void dynamic_command_group_impl::finalizeCGFList(
     // as a single command-group with multiple commands inside.
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
     detail::handler_impl HandlerImpl{*MGraph};
-    sycl::handler Handler{&HandlerImpl, std::shared_ptr<sycl::detail::queue_impl>{}};
+    sycl::handler Handler{HandlerImpl};
 #else
     sycl::handler Handler{MGraph};
 #endif
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 26ee3eaaee3df..f7d8cd6e33644 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -311,15 +311,11 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
                         const v1::SubmissionInfo &SubmitInfo) {
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
   detail::handler_impl HandlerImplVal(*this, SecondaryQueue, CallerNeedsEvent);
-  detail::handler_impl *HandlerImpl = &HandlerImplVal;
-  // Inlining `Self` results in a crash when SYCL RT is built using MSVC with
-  // optimizations enabled. No crash if built using OneAPI.
-  auto Self = shared_from_this();
-  handler Handler(HandlerImpl, Self);
+  handler Handler(HandlerImplVal);
 #else
   handler Handler(shared_from_this(), SecondaryQueue, CallerNeedsEvent);
-  auto &HandlerImpl = detail::getSyclObjImpl(Handler);
 #endif
+  auto &HandlerImpl = detail::getSyclObjImpl(Handler);
 
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   if (xptiTraceEnabled()) {
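Switching `setWorkerQueue`/`setSubmittedQueue` to by-value `std::weak_ptr` parameters makes the no-ownership intent explicit and lets call sites pass `Queue.weak_from_this()` cheaply. A self-contained illustration of the resulting lifetime behavior, with toy types:

```cpp
#include <cstdio>
#include <memory>

struct queue_impl : std::enable_shared_from_this<queue_impl> {};

struct event_impl {
  std::weak_ptr<queue_impl> MWorkerQueue;

  void setWorkerQueue(std::weak_ptr<queue_impl> WorkerQueue) {
    MWorkerQueue = std::move(WorkerQueue);
  }

  // Consumers must lock and handle the destroyed-queue case explicitly.
  void useWorkerQueue() {
    if (MWorkerQueue.lock())
      std::puts("queue still alive");
    else
      std::puts("queue already gone; skip the queue-based path");
  }
};

int main() {
  event_impl E;
  {
    auto Q = std::make_shared<queue_impl>();
    E.setWorkerQueue(Q->weak_from_this());
    E.useWorkerQueue(); // queue still alive
  }
  E.useWorkerQueue(); // the event did not extend the queue's lifetime
  return 0;
}
```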
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 99490ba2851c4..1d4f2e11693ea 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -687,10 +687,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 protected:
   template <typename HandlerType = handler>
   EventImplPtr insertHelperBarrier(const HandlerType &Handler) {
-    auto ResEvent = std::make_shared<detail::event_impl>(Handler.MQueue);
+    auto &Queue = Handler.impl->get_queue();
+    auto ResEvent =
+        std::make_shared<detail::event_impl>(Queue.shared_from_this());
     ur_event_handle_t UREvent = nullptr;
     getAdapter()->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
-        Handler.MQueue->getHandleRef(), 0, nullptr, &UREvent);
+        Queue.getHandleRef(), 0, nullptr, &UREvent);
     ResEvent->setHandle(UREvent);
     return ResEvent;
   }
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 5f3dd7a9ea983..03c09a6476f40 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -104,9 +104,8 @@ static size_t deviceToID(const device &Device) {
   return reinterpret_cast<size_t>(getSyclObjImpl(Device)->getHandleRef());
 }
 
-static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) {
-  xpti::addMetadata(TraceEvent, "sycl_device_type",
-                    queueDeviceToString(Queue.get()));
+static void addDeviceMetadata(xpti_td *TraceEvent, queue_impl *Queue) {
+  xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue));
   if (Queue) {
     xpti::addMetadata(TraceEvent, "sycl_device",
                       deviceToID(Queue->get_device()));
@@ -115,10 +114,17 @@ static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) {
                       getSyclObjImpl(Queue->get_device())->get_info<info::device::name>());
   }
 }
+static void addDeviceMetadata(xpti_td *TraceEvent,
+                              const std::shared_ptr<queue_impl> &Queue) {
+  addDeviceMetadata(TraceEvent, Queue.get());
+}
 
-static unsigned long long getQueueID(const QueueImplPtr &Queue) {
+static unsigned long long getQueueID(queue_impl *Queue) {
   return Queue ? Queue->getQueueID() : 0;
 }
+static unsigned long long getQueueID(const std::shared_ptr<queue_impl> &Queue) {
+  return getQueueID(Queue.get());
+}
 #endif
 
 static ContextImplPtr getContext(const QueueImplPtr &Queue) {
@@ -224,7 +230,7 @@ static std::string commandToName(Command::CommandType Type) {
 
 std::vector<ur_event_handle_t>
 Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls,
-                     const QueueImplPtr &CommandQueue, bool IsHostTaskCommand) {
+                     queue_impl *CommandQueue, bool IsHostTaskCommand) {
   std::vector<ur_event_handle_t> RetUrEvents;
   for (auto &EventImpl : EventImpls) {
     auto Handle = EventImpl->getHandle();
@@ -235,7 +241,7 @@ Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls,
     // At this stage dependency is definitely ur task and need to check if
     // current one is a host task. In this case we should not skip ur event due
     // to different sync mechanisms for different task types on in-order queue.
-    if (CommandQueue && EventImpl->getWorkerQueue() == CommandQueue &&
+    if (CommandQueue && EventImpl->getWorkerQueue().get() == CommandQueue &&
         CommandQueue->isInOrder() && !IsHostTaskCommand)
       continue;
 
@@ -247,7 +253,7 @@ Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls,
 
 std::vector<ur_event_handle_t>
 Command::getUrEvents(const std::vector<EventImplPtr> &EventImpls) const {
-  return getUrEvents(EventImpls, MWorkerQueue, isHostTask());
+  return getUrEvents(EventImpls, MWorkerQueue.get(), isHostTask());
 }
 
 // This function is implemented (duplicating getUrEvents a lot) as short term
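`addDeviceMetadata` and `getQueueID` above show the overload idiom used throughout this patch: the core routine takes a raw `queue_impl *` (no ownership implied, null meaning "no queue"), while a thin `shared_ptr` overload keeps existing call sites compiling and simply forwards. Distilled into a standalone sketch with a toy `queue_impl`:

```cpp
#include <memory>

struct queue_impl {
  unsigned long long getQueueID() const { return 42; }
};

// Core overload: raw pointer, no ownership; nullptr means "no queue"
// (e.g. a kernel enqueued straight into a graph being finalized).
static unsigned long long getQueueID(queue_impl *Queue) {
  return Queue ? Queue->getQueueID() : 0;
}

// Convenience overload: forwards to the core one, so the two can never
// diverge in behavior.
static unsigned long long getQueueID(const std::shared_ptr<queue_impl> &Queue) {
  return getQueueID(Queue.get());
}

int main() {
  auto Q = std::make_shared<queue_impl>();
  return getQueueID(Q) == getQueueID(Q.get()) ? 0 : 1;
}
```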
@@ -1985,8 +1991,7 @@ void instrumentationAddExtraKernelMetadata(
     const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
     KernelNameStrRefT KernelName,
     KernelNameBasedCacheT *KernelNameBasedCachePtr,
-    const std::shared_ptr<detail::kernel_impl> &SyclKernel,
-    const QueueImplPtr &Queue,
+    const std::shared_ptr<detail::kernel_impl> &SyclKernel, queue_impl *Queue,
     std::vector<ArgDesc> &CGArgs) // CGArgs are not const since they could be
                                   // sorted in this function
 {
@@ -2037,7 +2042,7 @@ void instrumentationFillCommonData(const std::string &KernelName,
                                    const std::string &FuncName,
                                    const std::string &FileName, uint64_t Line,
                                    uint64_t Column, const void *const Address,
-                                   const QueueImplPtr &Queue,
+                                   queue_impl *Queue,
                                    std::optional<bool> &FromSource,
                                    uint64_t &OutInstanceID,
                                    xpti_td *&OutTraceEvent) {
@@ -2099,7 +2104,7 @@ std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
     int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc,
     const std::string_view SyclKernelName,
-    KernelNameBasedCacheT *KernelNameBasedCachePtr, const QueueImplPtr &Queue,
+    KernelNameBasedCacheT *KernelNameBasedCachePtr, queue_impl *Queue,
     const NDRDescT &NDRDesc,
     const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
     std::vector<ArgDesc> &CGArgs) {
@@ -2134,7 +2139,7 @@ std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
     // Stash the queue_id mutable metadata in TLS
     // NOTE: Queue can be null when kernel is directly enqueued to a command
     // buffer by graph API, when a modifiable graph is finalized.
-    if (Queue.get())
+    if (Queue)
       xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY,
                                    getQueueID(Queue));
     instrumentationAddExtraKernelMetadata(
@@ -2183,7 +2188,7 @@ void ExecCGCommand::emitInstrumentationData() {
   xpti_td *CmdTraceEvent = nullptr;
   instrumentationFillCommonData(KernelName, FuncName, MCommandGroup->MFileName,
                                 MCommandGroup->MLine, MCommandGroup->MColumn,
-                                MAddress, MQueue, FromSource, MInstanceID,
+                                MAddress, MQueue.get(), FromSource, MInstanceID,
                                 CmdTraceEvent);
 
   if (CmdTraceEvent) {
@@ -2196,7 +2201,7 @@ void ExecCGCommand::emitInstrumentationData() {
     instrumentationAddExtraKernelMetadata(
         CmdTraceEvent, KernelCG->MNDRDesc, KernelCG->getKernelBundle(),
         KernelCG->MKernelName, KernelCG->MKernelNameBasedCachePtr,
-        KernelCG->MSyclKernel, MQueue, KernelCG->MArgs);
+        KernelCG->MSyclKernel, MQueue.get(), KernelCG->MArgs);
   }
 
   xptiNotifySubscribers(
@@ -2660,7 +2665,7 @@ ur_result_t enqueueImpCommandBufferKernel(
 }
 
 void enqueueImpKernel(
-    const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
+    queue_impl &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
     const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
     const detail::kernel_impl *MSyclKernel, KernelNameStrRefT KernelName,
     KernelNameBasedCacheT *KernelNameBasedCachePtr,
@@ -2671,10 +2676,9 @@ void enqueueImpKernel(
     const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs,
     detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
     bool KernelHasSpecialCaptures) {
-  assert(Queue && "Kernel submissions should have an associated queue");
   // Run OpenCL kernel
-  auto &ContextImpl = Queue->getContextImplPtr();
-  device_impl &DeviceImpl = Queue->getDeviceImpl();
+  auto &ContextImpl = Queue.getContextImplPtr();
+  device_impl &DeviceImpl = Queue.getDeviceImpl();
   ur_kernel_handle_t Kernel = nullptr;
   std::mutex *KernelMutex = nullptr;
   ur_program_handle_t Program = nullptr;
@@ -2686,7 +2690,7 @@ void enqueueImpKernel(
 
   if (nullptr != MSyclKernel) {
     assert(MSyclKernel->get_info<info::kernel::context>() ==
-           Queue->get_context());
+           Queue.get_context());
     Kernel = MSyclKernel->getHandleRef();
     Program = MSyclKernel->getProgramRef();
 
@@ -2748,14 +2752,14 @@ void enqueueImpKernel(
     // provided.
     if (KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_SLM ||
         KernelCacheConfig == UR_KERNEL_CACHE_CONFIG_LARGE_DATA) {
-      const AdapterPtr &Adapter = Queue->getAdapter();
+      const AdapterPtr &Adapter = Queue.getAdapter();
       Adapter->call<UrApiKind::urKernelSetExecInfo>(
          Kernel, UR_KERNEL_EXEC_INFO_CACHE_CONFIG,
          sizeof(ur_kernel_cache_config_t), nullptr, &KernelCacheConfig);
     }
 
     Error = SetKernelParamsAndLaunch(
-        *Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
+        Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
         OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
         KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
         BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr,
@@ -3260,7 +3264,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
       assert(BinImage && "Failed to obtain a binary image.");
     }
     enqueueImpKernel(
-        MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel.get(),
+        *MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel.get(),
        KernelName, ExecKernel->MKernelNameBasedCachePtr, RawEvents, EventImpl,
        getMemAllocationFunc, ExecKernel->MKernelCacheConfig,
        ExecKernel->MKernelIsCooperative, ExecKernel->MKernelUsesClusterLaunch,
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 7a82278cbc169..b9bc793f7a2a5 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -240,7 +240,7 @@ class Command {
   static std::vector<ur_event_handle_t>
   getUrEvents(const std::vector<EventImplPtr> &EventImpls,
-              const QueueImplPtr &CommandQueue, bool IsHostTaskCommand);
+              queue_impl *CommandQueue, bool IsHostTaskCommand);
 
   /// Collect UR events from EventImpls and filter out some of them in case of
   /// in order queue. Does blocking enqueue if event is expected to produce ur
   /// event.
@@ -623,7 +623,7 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue,
                                      bool read);
 
 void enqueueImpKernel(
-    const QueueImplPtr &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
+    queue_impl &Queue, NDRDescT &NDRDesc, std::vector<ArgDesc> &Args,
     const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
     const detail::kernel_impl *MSyclKernel, KernelNameStrRefT KernelName,
     KernelNameBasedCacheT *KernelNameBasedCachePtr,
@@ -692,7 +692,7 @@ std::pair<xpti_td *, uint64_t> emitKernelInstrumentationData(
     int32_t StreamID, const std::shared_ptr<detail::kernel_impl> &SyclKernel,
     const detail::code_location &CodeLoc, bool IsTopCodeLoc,
     std::string_view SyclKernelName,
-    KernelNameBasedCacheT *KernelNameBasedCachePtr, const QueueImplPtr &Queue,
+    KernelNameBasedCacheT *KernelNameBasedCachePtr, queue_impl *Queue,
     const NDRDescT &NDRDesc,
     const std::shared_ptr<detail::kernel_bundle_impl> &KernelBundleImplPtr,
     std::vector<ArgDesc> &CGArgs);
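`enqueueImpKernel` now takes `queue_impl &`, which is why its `assert(Queue && ...)` could be deleted: the non-null requirement moved into the signature, and the remaining callers dereference once (`*MQueue`). A minimal before/after model of this migration; `enqueue_old`/`enqueue_new` are hypothetical names, not functions from this patch:

```cpp
#include <cassert>
#include <memory>

struct queue_impl {};

// Before: the pointer-like parameter admits null, so the callee asserts.
void enqueue_old(const std::shared_ptr<queue_impl> &Queue) {
  assert(Queue && "Kernel submissions should have an associated queue");
}

// After: a reference documents the invariant in the signature itself;
// a null queue cannot reach the callee through a well-formed program.
void enqueue_new(queue_impl &Queue) { (void)Queue; }

int main() {
  auto Q = std::make_shared<queue_impl>();
  enqueue_old(Q);  // runtime check inside
  enqueue_new(*Q); // caller dereferences once, as ExecCGCommand does
  return 0;
}
```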
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 4abe77807f81d..fe871a0d85566 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -315,42 +315,27 @@ fill_copy_args(detail::handler_impl *impl,
 } // namespace detail
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-
-handler::handler(const std::shared_ptr<detail::queue_impl> &Queue,
-                 bool CallerNeedsEvent)
-    : MImplOwner(std::make_shared<detail::handler_impl>(*Queue, nullptr,
-                                                        CallerNeedsEvent)),
-      impl(MImplOwner.get()), MQueue(Queue) {}
-
-handler::handler(detail::handler_impl *HandlerImpl,
-                 const std::shared_ptr<detail::queue_impl> &Queue)
-    : impl(HandlerImpl), MQueue(Queue) {}
-
+handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {}
 #else
-
 handler::handler(std::shared_ptr<detail::queue_impl> Queue,
                  bool CallerNeedsEvent)
     : impl(std::make_shared<detail::handler_impl>(*Queue, nullptr,
                                                   CallerNeedsEvent)),
-      MQueue(std::move(Queue)) {}
+      MQueueDoNotUse(std::move(Queue)) {}
 
-#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
-// TODO: This function is not used anymore, remove it in the next
-// ABI-breaking window.
 handler::handler(
     std::shared_ptr<detail::queue_impl> Queue,
     [[maybe_unused]] std::shared_ptr<detail::queue_impl> PrimaryQueue,
     std::shared_ptr<detail::queue_impl> SecondaryQueue, bool CallerNeedsEvent)
     : impl(std::make_shared<detail::handler_impl>(*Queue, SecondaryQueue.get(),
                                                   CallerNeedsEvent)),
-      MQueue(Queue) {}
-#endif
+      MQueueDoNotUse(Queue) {}
 
 handler::handler(std::shared_ptr<detail::queue_impl> Queue,
                  detail::queue_impl *SecondaryQueue, bool CallerNeedsEvent)
     : impl(std::make_shared<detail::handler_impl>(*Queue, SecondaryQueue,
                                                   CallerNeedsEvent)),
-      MQueue(std::move(Queue)) {}
+      MQueueDoNotUse(std::move(Queue)) {}
 
 handler::handler(
     std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph)
@@ -539,8 +524,8 @@ event handler::finalize() {
       // the graph is not changed, then this faster path is used to submit
       // kernel bypassing scheduler and avoiding CommandGroup, Command objects
       // creation.
-      std::vector<ur_event_handle_t> RawEvents =
-          detail::Command::getUrEvents(impl->CGData.MEvents, MQueue, false);
+      std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents(
+          impl->CGData.MEvents, impl->get_queue_or_null(), false);
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
       detail::EventImplPtr &LastEventImpl = MLastEvent;
@@ -578,8 +563,9 @@ event handler::finalize() {
           StreamID = xptiRegisterStream(detail::SYCL_STREAM_NAME);
           std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
              StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc,
-              MKernelName.data(), impl->MKernelNameBasedCachePtr, MQueue,
-              impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs);
+              MKernelName.data(), impl->MKernelNameBasedCachePtr,
+              impl->get_queue_or_null(), impl->MNDRDesc, KernelBundleImpPtr,
+              impl->MArgs);
           detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
                                              xpti::trace_task_begin, nullptr);
@@ -588,11 +574,11 @@ event handler::finalize() {
         const detail::RTDeviceBinaryImage *BinImage = nullptr;
         if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
           std::tie(BinImage, std::ignore) = detail::retrieveKernelBinary(
-              *MQueue, toKernelNameStrT(MKernelName));
+              impl->get_queue(), toKernelNameStrT(MKernelName));
           assert(BinImage && "Failed to obtain a binary image.");
         }
         enqueueImpKernel(
-            MQueue, impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
+            impl->get_queue(), impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr,
             MKernel.get(), toKernelNameStrT(MKernelName),
             impl->MKernelNameBasedCachePtr, RawEvents,
             DiscardEvent ? nullptr : LastEventImpl.get(), nullptr,
@@ -621,16 +607,17 @@ event handler::finalize() {
           LastEventImpl->setStateDiscarded();
 #endif
       } else {
-        LastEventImpl->setQueue(MQueue);
-        LastEventImpl->setWorkerQueue(MQueue);
-        LastEventImpl->setContextImpl(MQueue->getContextImplPtr());
+        detail::queue_impl &Queue = impl->get_queue();
+        LastEventImpl->setQueue(Queue);
+        LastEventImpl->setWorkerQueue(Queue.weak_from_this());
+        LastEventImpl->setContextImpl(impl->get_context().shared_from_this());
         LastEventImpl->setStateIncomplete();
         LastEventImpl->setSubmissionTime();
 
         EnqueueKernel();
         LastEventImpl->setEnqueued();
         // connect returned event with dependent events
-        if (!MQueue->isInOrder()) {
+        if (!Queue.isInOrder()) {
           // MEvents is not used anymore, so can move.
           LastEventImpl->getPreparedDepsEvents() =
               std::move(impl->CGData.MEvents);
@@ -845,11 +832,14 @@ event handler::finalize() {
 #endif
   }
 
+  // Because the graph case is handled right above.
+  assert(Queue);
+
   // If the queue has an associated graph then we need to take the CG and pass
   // it to the graph to create a node, rather than submit it to the scheduler.
-  if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) {
+  if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) {
     auto EventImpl = std::make_shared<detail::event_impl>();
-    EventImpl->setSubmittedQueue(MQueue);
+    EventImpl->setSubmittedQueue(Queue->weak_from_this());
     std::shared_ptr<ext::oneapi::experimental::detail::node_impl> NodeImpl =
         nullptr;
@@ -864,14 +854,13 @@ event handler::finalize() {
             : ext::oneapi::experimental::detail::getNodeTypeFromCG(getType());
 
     // Create a new node in the graph representing this command-group
-    if (MQueue->isInOrder()) {
+    if (Queue->isInOrder()) {
       // In-order queues create implicit linear dependencies between nodes.
       // Find the last node added to the graph from this queue, so our new
       // node can set it as a predecessor.
       std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
          Deps;
-      if (auto DependentNode =
-              GraphImpl->getLastInorderNode(impl->get_queue_or_null())) {
+      if (auto DependentNode = GraphImpl->getLastInorderNode(Queue)) {
         Deps.push_back(std::move(DependentNode));
       }
       NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps);
@@ -879,9 +868,10 @@ event handler::finalize() {
       // If we are recording an in-order queue remember the new node, so it
       // can be used as a dependency for any more nodes recorded from this
       // queue.
-      GraphImpl->setLastInorderNode(*MQueue, NodeImpl);
+      GraphImpl->setLastInorderNode(*Queue, NodeImpl);
     } else {
-      auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue);
+      auto LastBarrierRecordedFromQueue =
+          GraphImpl->getBarrierDep(Queue->weak_from_this());
 
       std::vector<std::shared_ptr<ext::oneapi::experimental::detail::node_impl>>
           Deps;
@@ -891,7 +881,7 @@ event handler::finalize() {
       NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps);
 
       if (NodeImpl->MCGType == sycl::detail::CGType::Barrier) {
-        GraphImpl->setBarrierDep(MQueue, NodeImpl);
+        GraphImpl->setBarrierDep(Queue->weak_from_this(), NodeImpl);
       }
     }
 
@@ -906,7 +896,7 @@ event handler::finalize() {
   }
 
   detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
-      std::move(CommandGroup), std::move(MQueue), impl->MEventNeeded);
+      std::move(CommandGroup), Queue->shared_from_this(), impl->MEventNeeded);
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
   MLastEvent = Event;
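What `getLastInorderNode`/`setLastInorderNode` accomplish during in-order graph recording, reduced to a compilable sketch (toy types; the real `graph_impl` tracks more state and does not key the map by a raw pointer exactly like this):

```cpp
#include <map>
#include <memory>
#include <vector>

struct node_impl {
  std::vector<std::shared_ptr<node_impl>> MPredecessors;
};
struct queue_impl {};

struct graph_impl {
  // Last node recorded from each in-order queue.
  std::map<queue_impl *, std::shared_ptr<node_impl>> MLastInorder;

  std::shared_ptr<node_impl> getLastInorderNode(queue_impl *Queue) {
    auto It = MLastInorder.find(Queue);
    return It == MLastInorder.end() ? nullptr : It->second;
  }

  std::shared_ptr<node_impl> add(queue_impl *Queue) {
    auto Node = std::make_shared<node_impl>();
    // In-order queues create implicit linear dependencies between nodes.
    if (auto Dep = getLastInorderNode(Queue))
      Node->MPredecessors.push_back(std::move(Dep));
    MLastInorder[Queue] = Node; // remember for the next command-group
    return Node;
  }
};

int main() {
  graph_impl Graph;
  queue_impl Queue;
  auto N1 = Graph.add(&Queue);
  auto N2 = Graph.add(&Queue); // automatically depends on N1
  return N2->MPredecessors.at(0) == N1 ? 0 : 1;
}
```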
@@ -1961,8 +1951,9 @@ void handler::depends_on(const detail::EventImplPtr &EventImpl) {
   }
 
   auto EventGraph = EventImpl->getCommandGraph();
-  if (MQueue && EventGraph) {
-    auto QueueGraph = MQueue->getCommandGraph();
+  queue_impl *Queue = impl->get_queue_or_null();
+  if (Queue && EventGraph) {
+    auto QueueGraph = Queue->getCommandGraph();
 
     if (EventGraph->getContextImplPtr().get() != &impl->get_context()) {
       throw sycl::exception(
@@ -1989,7 +1980,7 @@ void handler::depends_on(const detail::EventImplPtr &EventImpl) {
     // we need to set it to recording (implements the transitive queue recording
     // feature).
     if (!QueueGraph) {
-      EventGraph->beginRecording(impl->get_queue());
+      EventGraph->beginRecording(*Queue);
     }
   }
 
@@ -2014,12 +2005,11 @@ void handler::depends_on(const std::vector<event> &Events) {
   }
 }
 
-static bool
-checkContextSupports(const std::shared_ptr<detail::context_impl> &ContextImpl,
-                     ur_context_info_t InfoQuery) {
-  auto &Adapter = ContextImpl->getAdapter();
+static bool checkContextSupports(detail::context_impl &ContextImpl,
+                                 ur_context_info_t InfoQuery) {
+  auto &Adapter = ContextImpl.getAdapter();
   ur_bool_t SupportsOp = false;
-  Adapter->call<detail::UrApiKind::urContextGetInfo>(ContextImpl->getHandleRef(),
+  Adapter->call<detail::UrApiKind::urContextGetInfo>(ContextImpl.getHandleRef(),
       InfoQuery, sizeof(ur_bool_t), &SupportsOp, nullptr);
   return SupportsOp;
@@ -2032,8 +2022,7 @@ void handler::verifyDeviceHasProgressGuarantee(
   using execution_scope = sycl::ext::oneapi::experimental::execution_scope;
   using forward_progress =
       sycl::ext::oneapi::experimental::forward_progress_guarantee;
-  device_impl &deviceImpl = MQueue->getDeviceImpl();
-  const bool supported = deviceImpl.supportsForwardProgress(
+  const bool supported = impl->get_device().supportsForwardProgress(
       guarantee, threadScope, coordinationScope);
   if (threadScope == execution_scope::work_group) {
     if (!supported) {
@@ -2073,20 +2062,18 @@ void handler::verifyDeviceHasProgressGuarantee(
 }
 
 bool handler::supportsUSMMemcpy2D() {
-  // Return true when handler_impl is constructed with a graph.
-  if (!MQueue)
+  if (impl->get_graph_or_null())
     return true;
 
-  return checkContextSupports(MQueue->getContextImplPtr(),
+  return checkContextSupports(impl->get_context(),
                               UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT);
 }
 
 bool handler::supportsUSMFill2D() {
-  // Return true when handler_impl is constructed with a graph.
-  if (!MQueue)
+  if (impl->get_graph_or_null())
    return true;
 
-  return checkContextSupports(MQueue->getContextImplPtr(),
+  return checkContextSupports(impl->get_context(),
                               UR_CONTEXT_INFO_USM_FILL2D_SUPPORT);
 }
@@ -2156,16 +2143,14 @@ void handler::memcpyToHostOnlyDeviceGlobal(const void *DeviceGlobalPtr,
                                            size_t DeviceGlobalTSize,
                                            bool IsDeviceImageScoped,
                                            size_t NumBytes, size_t Offset) {
-  std::weak_ptr<detail::context_impl> WeakContextImpl =
-      MQueue->getContextImplPtr();
-  detail::device_impl &Dev = MQueue->getDeviceImpl();
-  host_task([=, &Dev] {
+  host_task([=, &Dev = impl->get_device(),
+             WeakContextImpl = impl->get_context().weak_from_this()] {
     // Capture context as weak to avoid keeping it alive for too long. If it is
     // dead by the time this executes, the operation would not have been visible
     // anyway. Devices are alive till library shutdown so capturing a reference
     // to one is fine.
-    std::shared_ptr<detail::context_impl> ContextImpl = WeakContextImpl.lock();
-    if (ContextImpl)
+    if (std::shared_ptr<detail::context_impl> ContextImpl =
+            WeakContextImpl.lock())
       ContextImpl->memcpyToHostOnlyDeviceGlobal(
           Dev, DeviceGlobalPtr, Src, DeviceGlobalTSize, IsDeviceImageScoped,
           NumBytes, Offset);
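The weak-capture idiom in `memcpyToHostOnlyDeviceGlobal`, isolated into a compilable sketch; a plain vector of callables stands in for `host_task` scheduling:

```cpp
#include <functional>
#include <memory>
#include <vector>

struct context_impl {
  void doWork() { /* the real code writes to a device_global here */ }
};

// Stands in for host_task scheduling: work runs later, not at submit time.
std::vector<std::function<void()>> Tasks;

void submitCopy(const std::shared_ptr<context_impl> &Ctx) {
  Tasks.push_back([Weak = std::weak_ptr<context_impl>(Ctx)] {
    // If the context died before the task ran, the operation would not
    // have been observable anyway; silently drop it.
    if (std::shared_ptr<context_impl> Ctx = Weak.lock())
      Ctx->doWork();
  });
}

int main() {
  auto Ctx = std::make_shared<context_impl>();
  submitCopy(Ctx);
  for (auto &Task : Tasks)
    Task(); // context still alive here, so the copy happens
  return 0;
}
```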
@@ -2176,18 +2161,15 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest,
                                              const void *DeviceGlobalPtr,
                                              bool IsDeviceImageScoped,
                                              size_t NumBytes, size_t Offset) {
-  const std::shared_ptr<detail::context_impl> &ContextImpl =
-      MQueue->getContextImplPtr();
-  detail::device_impl &DeviceImpl = MQueue->getDeviceImpl();
-  host_task([=, &DeviceImpl] {
+  host_task([=, Context = impl->get_context().shared_from_this(),
+             &Dev = impl->get_device()] {
     // Unlike memcpy to device_global, we need to keep the context alive in the
     // capture of this operation as we must be able to correctly copy the value
-    // to the user-specified pointer.
-    ContextImpl->memcpyFromHostOnlyDeviceGlobal(
-        DeviceImpl, Dest, DeviceGlobalPtr, IsDeviceImageScoped, NumBytes,
-        Offset);
+    // to the user-specified pointer. Device is guaranteed to live until SYCL
+    // RT library shutdown (but even if it wasn't, an alive context has to
+    // guarantee an alive device).
+    Context->memcpyFromHostOnlyDeviceGlobal(
+        Dev, Dest, DeviceGlobalPtr, IsDeviceImageScoped, NumBytes, Offset);
   });
 }
@@ -2418,6 +2400,8 @@ void handler::copyCodeLoc(const handler &other) {
   impl->MIsTopCodeLoc = other.impl->MIsTopCodeLoc;
 }
 
-queue handler::getQueue() { return createSyclObjFromImpl<queue>(MQueue); }
+queue handler::getQueue() {
+  return createSyclObjFromImpl<queue>(impl->get_queue());
+}
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
index c36f3006406b2..e2372256e5431 100644
--- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
@@ -35,7 +35,7 @@ class LimitedHandler {
 public:
   LimitedHandler(sycl::detail::CGType CGType,
                  std::shared_ptr<MockQueueImpl> Queue)
-      : MCGType(CGType), MQueue(Queue) {}
+      : MCGType(CGType), impl(std::make_shared<handler_impl>(Queue)) {}
 
   virtual ~LimitedHandler() {}
   virtual void depends_on(const sycl::detail::EventImplPtr &) {}
@@ -57,8 +57,12 @@ class LimitedHandler {
   sycl::detail::CGType getType() { return MCGType; }
 
   sycl::detail::CGType MCGType;
-  std::shared_ptr<MockQueueImpl> MQueue;
-  std::shared_ptr<sycl::detail::handler_impl> impl;
+  struct handler_impl {
+    handler_impl(std::shared_ptr<MockQueueImpl> Queue) : MQueue(Queue) {}
+    std::shared_ptr<MockQueueImpl> MQueue;
+    MockQueueImpl &get_queue() { return *MQueue; }
+  };
+  std::shared_ptr<handler_impl> impl;
   std::shared_ptr<sycl::detail::kernel_impl> MKernel;
   detail::string MKernelName;
 };
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index a4af3fe73ae86..771d8581965c4 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -258,7 +258,9 @@ class MockHandler : public sycl::handler {
   std::shared_ptr<sycl::detail::HostTask> &getHostTask() {
     return impl->MHostTask;
   }
-  const std::shared_ptr<sycl::detail::queue_impl> &getQueue() { return MQueue; }
+  const std::shared_ptr<sycl::detail::queue_impl> getQueue() {
+    return impl->get_queue().shared_from_this();
+  }
 
   void setType(sycl::detail::CGType Type) { impl->MCGType = Type; }
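A closing note on why the test mock works at all: `queue_impl::insertHelperBarrier` is a template over `HandlerType`, so any type exposing `impl->get_queue()` satisfies it structurally; no inheritance from the real `handler_impl` is needed. A toy demonstration (hypothetical `MockQueue` type):

```cpp
#include <memory>

struct MockQueue {
  int getHandleRef() { return 7; }
};

struct LimitedHandler {
  struct handler_impl {
    std::shared_ptr<MockQueue> MQueue = std::make_shared<MockQueue>();
    MockQueue &get_queue() { return *MQueue; }
  };
  std::shared_ptr<handler_impl> impl = std::make_shared<handler_impl>();
};

// Shaped like queue_impl::insertHelperBarrier: compiles against any
// HandlerType whose impl exposes get_queue().
template <typename HandlerType> int insertHelperBarrier(HandlerType &Handler) {
  return Handler.impl->get_queue().getHandleRef();
}

int main() {
  LimitedHandler Handler;
  return insertHelperBarrier(Handler) == 7 ? 0 : 1;
}
```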