Skip to content

[NFCI][SYCL] Complete handler::MQueue->handler_impl move #18767

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 17 additions & 29 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,28 +424,23 @@ template <int Dims> bool range_size_fits_in_size_t(const range<Dims> &r) {
/// \ingroup sycl_api
class __SYCL_EXPORT handler {
private:
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
/// Constructs SYCL handler from the pre-constructed stack-allocated
/// `handler_impl` (not enforced, but meaningless to do a heap allocation
/// outside handler instance).
///
/// \param HandlerImpl is a pre-constructed handler_impl.
//
// Can't provide this overload outside preview because `handler` lacks
// required data members.
handler(detail::handler_impl &HandlerImpl);
#else
/// Constructs SYCL handler from queue.
///
/// \param Queue is a SYCL queue.
/// \param CallerNeedsEvent indicates if the event resulting from this handler
/// is needed by the caller.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
handler(const std::shared_ptr<detail::queue_impl> &Queue,
bool CallerNeedsEvent);
#else
handler(std::shared_ptr<detail::queue_impl> Queue, bool CallerNeedsEvent);
#endif

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
/// Constructs SYCL handler from the pre-constructed handler_impl and the
/// associated queue. Inside of Graph implementation, the Queue value is not
/// used, for those cases it can be initialized with an empty shared_ptr.
///
/// \param HandlerImpl is a pre-constructed handler_impl.
/// \param Queue is a SYCL queue.
handler(detail::handler_impl *HandlerImpl,
const std::shared_ptr<detail::queue_impl> &Queue);
#else
/// Constructs SYCL handler from the associated queue and the submission's
/// primary and secondary queue.
///
Expand All @@ -456,20 +451,14 @@ class __SYCL_EXPORT handler {
/// is null if no secondary queue is associated with the submission.
/// \param CallerNeedsEvent indicates if the event resulting from this handler
/// is needed by the caller.
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// TODO: This function is not used anymore, remove it in the next
// ABI-breaking window.
handler(std::shared_ptr<detail::queue_impl> Queue,
std::shared_ptr<detail::queue_impl> PrimaryQueue,
std::shared_ptr<detail::queue_impl> SecondaryQueue,
bool CallerNeedsEvent);
#endif
__SYCL_DLL_LOCAL handler(std::shared_ptr<detail::queue_impl> Queue,
detail::queue_impl *SecondaryQueue,
bool CallerNeedsEvent);
#endif

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
/// Constructs SYCL handler from Graph.
///
/// The handler will add the command-group as a node to the graph rather than
Expand Down Expand Up @@ -3368,16 +3357,15 @@ class __SYCL_EXPORT handler {

private:
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
// In some cases we need to construct handler_impl in heap. Sole propose
// of MImplOwner is to destroy handler_impl in destructor of handler.
// Can't use unique_ptr because declaration of handler_impl is not available
// in this header.
std::shared_ptr<detail::handler_impl> MImplOwner;
// TODO: Maybe make it a reference when non-preview branch is removed.
// On the other hand, see `HandlerAccess:postProcess` to how `swap_impl` might
// be useful in future, pointer here would make that possible/easier.
detail::handler_impl *impl;
const std::shared_ptr<detail::queue_impl> &MQueue;
#else
std::shared_ptr<detail::handler_impl> impl;
std::shared_ptr<detail::queue_impl> MQueue;

// Use impl->get_queue*() instead:
std::shared_ptr<detail::queue_impl> MQueueDoNotUse;
#endif
std::vector<detail::LocalAccessorImplPtr> MLocalAccStorage;
std::vector<std::shared_ptr<detail::stream_impl>> MStreamStorage;
Expand Down
14 changes: 8 additions & 6 deletions sycl/source/detail/async_alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ getUrEvents(const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
}

std::vector<std::shared_ptr<detail::node_impl>> getDepGraphNodes(
sycl::handler &Handler, const std::shared_ptr<detail::queue_impl> &Queue,
sycl::handler &Handler, detail::queue_impl *Queue,
const std::shared_ptr<detail::graph_impl> &Graph,
const std::vector<std::shared_ptr<detail::event_impl>> &DepEvents) {
auto HandlerImpl = detail::getSyclObjImpl(Handler);
Expand All @@ -46,7 +46,7 @@ std::vector<std::shared_ptr<detail::node_impl>> getDepGraphNodes(
// If this is being recorded from an in-order queue we need to get the last
// in-order node if any, since this will later become a dependency of the
// node being processed here.
if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue.get());
if (const auto &LastInOrderNode = Graph->getLastInorderNode(Queue);
LastInOrderNode) {
DepNodes.push_back(LastInOrderNode);
}
Expand Down Expand Up @@ -78,10 +78,11 @@ void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size) {
ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
auto DepNodes =
getDepGraphNodes(h, h.impl->get_queue_or_null(), Graph, DepEvents);
alloc = Graph->getMemPool().malloc(size, kind, DepNodes);
} else {
auto &Q = h.MQueue->getHandleRef();
ur_queue_handle_t Q = h.impl->get_queue().getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, (ur_usm_pool_handle_t)0, size, nullptr, UREvents.size(),
Expand Down Expand Up @@ -128,13 +129,14 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size,
ur_event_handle_t Event = nullptr;
// If a graph is present do the allocation from the graph memory pool instead.
if (auto Graph = h.getCommandGraph(); Graph) {
auto DepNodes = getDepGraphNodes(h, h.MQueue, Graph, DepEvents);
auto DepNodes =
getDepGraphNodes(h, h.impl->get_queue_or_null(), Graph, DepEvents);

// Memory pool is passed as the graph may use some properties of it.
alloc = Graph->getMemPool().malloc(size, pool.get_alloc_kind(), DepNodes,
sycl::detail::getSyclObjImpl(pool));
} else {
auto &Q = h.MQueue->getHandleRef();
ur_queue_handle_t Q = h.impl->get_queue().getHandleRef();
Adapter->call<sycl::errc::runtime,
sycl::detail::UrApiKind::urEnqueueUSMDeviceAllocExp>(
Q, memPoolImpl.get()->get_handle(), size, nullptr, UREvents.size(),
Expand Down
12 changes: 6 additions & 6 deletions sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,14 @@ void context_impl::addDeviceGlobalInitializer(
}
}

std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
ur_program_handle_t NativePrg,
const std::shared_ptr<queue_impl> &QueueImpl) {
std::vector<ur_event_handle_t>
context_impl::initializeDeviceGlobals(ur_program_handle_t NativePrg,
queue_impl &QueueImpl) {
if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
return {};

const AdapterPtr &Adapter = getAdapter();
device_impl &DeviceImpl = QueueImpl->getDeviceImpl();
device_impl &DeviceImpl = QueueImpl.getDeviceImpl();
std::lock_guard<std::mutex> NativeProgramLock(MDeviceGlobalInitializersMutex);
auto ImgIt = MDeviceGlobalInitializers.find(
std::make_pair(NativePrg, DeviceImpl.getHandleRef()));
Expand Down Expand Up @@ -417,7 +417,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
for (DeviceGlobalMapEntry *DeviceGlobalEntry : DeviceGlobalEntries) {
// Get or allocate the USM memory associated with the device global.
DeviceGlobalUSMMem &DeviceGlobalUSM =
DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(*QueueImpl);
DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(QueueImpl);

// If the device global still has a initialization event it should be
// added to the initialization events list. Since initialization events
Expand All @@ -432,7 +432,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
ur_event_handle_t InitEvent;
void *const &USMPtr = DeviceGlobalUSM.getPtr();
Adapter->call<UrApiKind::urEnqueueDeviceGlobalVariableWrite>(
QueueImpl->getHandleRef(), NativePrg,
QueueImpl.getHandleRef(), NativePrg,
DeviceGlobalEntry->MUniqueId.c_str(), false, sizeof(void *), 0,
&USMPtr, 0, nullptr, &InitEvent);

Expand Down
3 changes: 1 addition & 2 deletions sycl/source/detail/context_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,7 @@ class context_impl : public std::enable_shared_from_this<context_impl> {

/// Initializes device globals for a program on the associated queue.
std::vector<ur_event_handle_t>
initializeDeviceGlobals(ur_program_handle_t NativePrg,
const std::shared_ptr<queue_impl> &QueueImpl);
initializeDeviceGlobals(ur_program_handle_t NativePrg, queue_impl &QueueImpl);

void memcpyToHostOnlyDeviceGlobal(device_impl &DeviceImpl,
const void *DeviceGlobalPtr,
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ event_impl::event_impl(const QueueImplPtr &Queue)
MState.store(HES_Complete);
}

void event_impl::setQueue(const QueueImplPtr &Queue) {
MQueue = Queue;
MIsProfilingEnabled = Queue->MIsProfilingEnabled;
void event_impl::setQueue(queue_impl &Queue) {
MQueue = Queue.shared_from_this();
MIsProfilingEnabled = Queue.MIsProfilingEnabled;

// TODO After setting the queue, the event is no longer default
// constructed. Consider a design change which would allow
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class event_impl {
/// as it was constructed with the queue based constructor.
///
/// \param Queue is a queue to be associated with the event
void setQueue(const QueueImplPtr &Queue);
void setQueue(queue_impl &Queue);

/// Waits for the event.
///
Expand Down Expand Up @@ -234,14 +234,14 @@ class event_impl {
/// Sets worker queue for command.
///
/// @return
void setWorkerQueue(const QueueImplPtr &WorkerQueue) {
void setWorkerQueue(std::weak_ptr<queue_impl> WorkerQueue) {
MWorkerQueue = WorkerQueue;
};

/// Sets original queue used for submission.
///
/// @return
void setSubmittedQueue(const QueueImplPtr &SubmittedQueue) {
void setSubmittedQueue(std::weak_ptr<queue_impl> SubmittedQueue) {
MSubmittedQueue = SubmittedQueue;
};

Expand Down
4 changes: 2 additions & 2 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ graph_impl::add(std::function<void(handler &)> CGF,
(void)Args;
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImpl{*this};
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
sycl::handler Handler{HandlerImpl};
#else
sycl::handler Handler{shared_from_this()};
#endif
Expand Down Expand Up @@ -2310,7 +2310,7 @@ void dynamic_command_group_impl::finalizeCGFList(
// as a single command-group with multiple commands inside.
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImpl{*MGraph};
sycl::handler Handler{&HandlerImpl, std::shared_ptr<detail::queue_impl>{}};
sycl::handler Handler{HandlerImpl};
#else
sycl::handler Handler{MGraph};
#endif
Expand Down
8 changes: 2 additions & 6 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,15 +311,11 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
const v1::SubmissionInfo &SubmitInfo) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::handler_impl HandlerImplVal(*this, SecondaryQueue, CallerNeedsEvent);
detail::handler_impl *HandlerImpl = &HandlerImplVal;
// Inlining `Self` results in a crash when SYCL RT is built using MSVC with
// optimizations enabled. No crash if built using OneAPI.
auto Self = shared_from_this();
handler Handler(HandlerImpl, Self);
handler Handler(HandlerImplVal);
#else
handler Handler(shared_from_this(), SecondaryQueue, CallerNeedsEvent);
auto &HandlerImpl = detail::getSyclObjImpl(Handler);
#endif
auto &HandlerImpl = detail::getSyclObjImpl(Handler);

#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiTraceEnabled()) {
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -687,10 +687,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
protected:
template <typename HandlerType = handler>
EventImplPtr insertHelperBarrier(const HandlerType &Handler) {
auto ResEvent = std::make_shared<detail::event_impl>(Handler.MQueue);
auto &Queue = Handler.impl->get_queue();
auto ResEvent =
std::make_shared<detail::event_impl>(Queue.shared_from_this());
ur_event_handle_t UREvent = nullptr;
getAdapter()->call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
Handler.MQueue->getHandleRef(), 0, nullptr, &UREvent);
Queue.getHandleRef(), 0, nullptr, &UREvent);
ResEvent->setHandle(UREvent);
return ResEvent;
}
Expand Down
Loading