diff --git a/compiler/lib/utils/OPTIONS.def b/compiler/lib/utils/OPTIONS.def
index 9b71f52e..a8a90636 100644
--- a/compiler/lib/utils/OPTIONS.def
+++ b/compiler/lib/utils/OPTIONS.def
@@ -1272,8 +1272,8 @@ OPTION(OT_UINT32, \
        OA_RUNTIME|OVA_OPTIONAL|OA_SEPARATOR_EQUAL, \
        "code-object-version", NULL, \
        LCCodeObjectVersion, \
-       4, 4, 5, NULL, \
-       "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 4. (COMGR only)")
+       5, 4, 5, NULL, \
+       "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 5. (COMGR only)")
 
 /* Do not remove the following line. Any option should be
diff --git a/device/device.cpp b/device/device.cpp
index 2709febd..8c86b86e 100644
--- a/device/device.cpp
+++ b/device/device.cpp
@@ -751,7 +751,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
 }
 
 bool Device::UpdateStackSize(uint64_t stackSize) {
-  if (stackSize > 16 * Ki) {
+  if (stackSize > ((128 * Ki) - 16)) {
     return false;
   }
   stack_size_ = stackSize;
diff --git a/device/device.hpp b/device/device.hpp
index 64d67465..ee9ac0ee 100644
--- a/device/device.hpp
+++ b/device/device.hpp
@@ -1796,9 +1796,14 @@ class Device : public RuntimeObject {
 
   // Returns the status of HW event, associated with amd::Event
   virtual bool IsHwEventReady(
-      const amd::Event& event,  //!< AMD event for HW status validation
-      bool wait = false         //!< If true then forces the event completion
-      ) const {
+      const amd::Event& event,    //!< AMD event for HW status validation
+      bool wait = false) const {  //!< If true then forces the event completion
+    return false;
+  };
+
+  // Returns the status of HW event, associated with amd::Event
+  virtual bool IsHwEventReadyForcedWait(
+      const amd::Event& event) const {  //!< AMD event for HW status validation
     return false;
   };
 
diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp
index f6b447ca..913b26b6 100644
--- a/device/rocm/rocblit.cpp
+++ b/device/rocm/rocblit.cpp
@@ -2682,31 +2682,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   Memory* schedulerMem = dev().getRocMemory(schedulerParam);
   sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
-
-  sp->hidden_global_offset_x = 0;
-  sp->hidden_global_offset_y = 0;
-  sp->hidden_global_offset_z = 0;
   sp->thread_counter = 0;
   sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
   sp->complete_signal = schedulerSignal;
 
   hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
 
-  sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-      (1 << HSA_PACKET_HEADER_BARRIER) |
-      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-  sp->scheduler_aql.setup = 1;
-  sp->scheduler_aql.workgroup_size_x = 1;
-  sp->scheduler_aql.workgroup_size_y = 1;
-  sp->scheduler_aql.workgroup_size_z = 1;
-  sp->scheduler_aql.grid_size_x = threads;
-  sp->scheduler_aql.grid_size_y = 1;
-  sp->scheduler_aql.grid_size_z = 1;
-  sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle();
-  sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address;
-  sp->scheduler_aql.private_segment_size = 0;
-  sp->scheduler_aql.group_segment_size = 0;
+
   sp->vqueue_header = vqVM;
 
   sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
@@ -2721,7 +2703,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   address parameters = captureArguments(kernels_[Scheduler]);
 
   if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler],
-                                  parameters, nullptr)) {
+                                  parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) {
     return false;
   }
   releaseArguments(parameters);
diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp
index 961227bb..69d5ff05 100644
--- a/device/rocm/rocdevice.cpp
+++ b/device/rocm/rocdevice.cpp
@@ -2331,7 +2331,6 @@ bool Device::IpcDetach (void* dev_ptr) const {
 // ================================================================================================
 void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
                        void* svmPtr) const {
-  constexpr bool kForceAllocation = true;
   amd::Memory* mem = nullptr;
 
   if (nullptr == svmPtr) {
@@ -2343,7 +2342,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
       return nullptr;
     }
-    if (!mem->create(nullptr, false, false, kForceAllocation)) {
+    if (!mem->create(nullptr)) {
      LogError("failed to create a svm hidden buffer!");
      mem->release();
      return nullptr;
    }
@@ -2730,10 +2729,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
   return result;
 }
 
+// ================================================================================================
+bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const {
+  void* hw_event =
+      (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
+  if (hw_event == nullptr) {
+    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
+    return false;
+  }
+  static constexpr bool Timeout = true;
+  return WaitForSignal<Timeout>(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, false, true);
+}
+
 // ================================================================================================
 bool Device::IsHwEventReady(const amd::Event& event, bool wait) const {
-  void* hw_event = (event.NotifyEvent() != nullptr) ?
-      event.NotifyEvent()->HwEvent() : event.HwEvent();
+  void* hw_event =
+      (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
   if (hw_event == nullptr) {
     ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
     return false;
   }
@@ -3210,7 +3221,9 @@ device::Signal* Device::createSignal() const {
 amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) {
   // Only create arena_mem_object if CPU memory is accessible from HMM
   // or if runtime received an interop from another ROCr's client
-  if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) {
+  hsa_amd_pointer_info_t ptr_info = {};
+  ptr_info.size = sizeof(hsa_amd_pointer_info_t);
+  if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) {
     return nullptr;
   }
 
@@ -3227,8 +3240,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size
   }
 
   // Calculate the offset of the pointer.
-  const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory(
-      *arena_mem_obj_->getContext().devices()[0])->virtualAddress());
+  const void* dev_ptr = reinterpret_cast(
+      arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0])
+          ->virtualAddress());
   offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr);
 
   return arena_mem_obj_;
@@ -3242,20 +3256,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
 }
 
 // ================================================================================================
-bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const {
-  hsa_amd_pointer_info_t ptr_info = {};
-  ptr_info.size = sizeof(hsa_amd_pointer_info_t);
+bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
   // Query ptr type to see if it's a HMM allocation
-  hsa_status_t status = hsa_amd_pointer_info(
-      const_cast<void*>(dev_ptr), &ptr_info, nullptr, nullptr, nullptr);
+  hsa_status_t status =
+      hsa_amd_pointer_info(const_cast<void*>(dev_ptr), ptr_info, nullptr, nullptr, nullptr);
   // The call should never fail in ROCR, but just check for an error and continue
   if (status != HSA_STATUS_SUCCESS) {
     LogError("hsa_amd_pointer_info() failed");
   }
-  // Check if it's a legacy non-HMM allocation in ROCr
-  if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
-    if ((size != 0) && ((reinterpret_cast(dev_ptr) -
-        reinterpret_cast(ptr_info.agentBaseAddress)) > size)) {
+
+  // Return false for pinned memory. A true return may result in a race because
+  // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment
+  if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) {
+    return false;
+  }
+
+  if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
+    if ((size != 0) &&
+        ((reinterpret_cast(dev_ptr) -
+          reinterpret_cast(ptr_info->agentBaseAddress)) > size)) {
       return false;
     }
     return true;
diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp
index b3da3783..43898c8e 100644
--- a/device/rocm/rocdevice.hpp
+++ b/device/rocm/rocdevice.hpp
@@ -258,6 +258,7 @@ class NullDevice : public amd::Device {
                                    cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
+  virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; }
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
   virtual void ReleaseGlobalSignal(void* signal) const {}
@@ -443,6 +444,7 @@ class Device : public NullDevice {
                                    cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
+  virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const;
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
   virtual void ReleaseGlobalSignal(void* signal) const;
@@ -549,7 +551,7 @@ class Device : public NullDevice {
   const bool isFineGrainSupported() const;
 
   //! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
-  bool IsValidAllocation(const void* dev_ptr, size_t size) const;
+  bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info);
 
   //! Allocates hidden heap for device memory allocations
   void HiddenHeapAlloc();
diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp
index 56b6022f..451fdcd5 100644
--- a/device/rocm/rocvirtual.cpp
+++ b/device/rocm/rocvirtual.cpp
@@ -1041,7 +1041,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
                                             hsa_signal_t signal, hsa_signal_value_t value,
                                             hsa_signal_value_t mask, hsa_signal_condition32_t cond,
                                             bool skipTs, hsa_signal_t completionSignal) {
-  hsa_amd_barrier_value_packet_t barrier_value_packet_ = {0};
   uint16_t rest = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
@@ -1274,6 +1273,7 @@ bool VirtualGPU::create() {
   // Initialize barrier and barrier value packets
   memset(&barrier_packet_, 0, sizeof(barrier_packet_));
   barrier_packet_.header = kInvalidAql;
+  barrier_value_packet_.header.header = kInvalidAql;
 
   // Create a object of PrintfDbg
   printfdbg_ = new PrintfDbg(roc_device_);
@@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                       const_address parameters, void* eventHandle,
-                                      uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
+                                      uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
+                                      hsa_kernel_dispatch_packet_t* aql_packet) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
   }
 
+  // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
+  if (aql_packet != nullptr) {
+    *aql_packet = dispatchPacket;
+    aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+        (1 << HSA_PACKET_HEADER_BARRIER) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+    aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  }
+
   // Dispatch the packet
   if (!dispatchAqlPacket(
       &dispatchPacket, aqlHeaderWithOrder,
diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp
index 11a3670e..d113a289 100644
--- a/device/rocm/rocvirtual.hpp
+++ b/device/rocm/rocvirtual.hpp
@@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
 
 // Active wait time out incase same sdma engine is used again,
 // then just wait instead of adding dependency wait signal.
-constexpr static uint64_t kSDMAEngineTimeout = 10;
+constexpr static uint64_t kForcedTimeout = 10;
 
 template <bool active_wait_timeout = false>
-inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) {
+inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) {
   if (hsa_signal_load_relaxed(signal) > 0) {
     uint64_t timeout = kTimeout100us;
     if (active_wait) {
@@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd
     }
     if (active_wait_timeout) {
       // If diff engine, wait to 10 ms. Otherwise no wait
-      timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
+      timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
       if (timeout == 0) {
         return false;
       }
@@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice {
                             const_address parameters,     //!< Parameters for the kernel
                             void* event_handle,           //!< Handle to OCL event for debugging
                             uint32_t sharedMemBytes = 0,  //!< Shared memory size
-                            amd::NDRangeKernelCommand* vcmd = nullptr  //!< Original launch command
+                            amd::NDRangeKernelCommand* vcmd = nullptr,  //!< Original launch command
+                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr  //!< Scheduler launch
                             );
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
@@ -502,6 +503,7 @@ class VirtualGPU : public device::VirtualDevice {
   hsa_agent_t gpu_device_;  //!< Physical device
   hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
   hsa_barrier_and_packet_t barrier_packet_;
+  hsa_amd_barrier_value_packet_t barrier_value_packet_;
   uint32_t dispatch_id_;  //!< This variable must be updated atomically.
   Device& roc_device_;    //!< roc device object
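For illustration, a minimal standalone sketch of the relaunch-capture step that the patched submitKernelInternal() performs when a caller passes an aql_packet pointer (as runScheduler() now does instead of hand-building sp->scheduler_aql). The helper name CaptureSchedulerAql is hypothetical; only hsa_kernel_dispatch_packet_t and the HSA packet-header enums from <hsa/hsa.h> are real.

  #include <hsa/hsa.h>

  // Hypothetical helper, not part of the patch: copy the packet that was actually
  // dispatched so a device-side scheduler can relaunch the same kernel later.
  void CaptureSchedulerAql(const hsa_kernel_dispatch_packet_t& dispatched,
                           uint32_t dimensions,
                           hsa_kernel_dispatch_packet_t* aql_packet /* may be nullptr */) {
    if (aql_packet == nullptr) {
      return;  // caller did not ask for a relaunchable copy
    }
    *aql_packet = dispatched;  // start from the packet that was really submitted
    // Rebuild the header with the barrier bit and system-scope acquire/release fences.
    aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
                         (1 << HSA_PACKET_HEADER_BARRIER) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
    // Encode the launch dimensionality (1-3) in the setup field.
    aql_packet->setup = dimensions << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  }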