diff --git a/compiler/lib/utils/OPTIONS.def b/compiler/lib/utils/OPTIONS.def
index 9b71f52e..a8a90636 100644
--- a/compiler/lib/utils/OPTIONS.def
+++ b/compiler/lib/utils/OPTIONS.def
@@ -1272,8 +1272,8 @@ OPTION(OT_UINT32, \
        OA_RUNTIME|OVA_OPTIONAL|OA_SEPARATOR_EQUAL, \
        "code-object-version", NULL, \
        LCCodeObjectVersion, \
-       4, 4, 5, NULL, \
-       "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 4. (COMGR only)")
+       5, 4, 5, NULL, \
+       "Specify code object ABI version. Allowed values are 4, and 5. Defaults to 5. (COMGR only)")
 
 /* Do not remove the following line. Any option should be
diff --git a/device/device.cpp b/device/device.cpp
index 2709febd..8c86b86e 100644
--- a/device/device.cpp
+++ b/device/device.cpp
@@ -751,7 +751,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
 }
 
 bool Device::UpdateStackSize(uint64_t stackSize) {
-  if (stackSize > 16 * Ki) {
+  if (stackSize > ((128 * Ki) - 16)) {
     return false;
   }
   stack_size_ = stackSize;
diff --git a/device/device.hpp b/device/device.hpp
index 64d67465..ee9ac0ee 100644
--- a/device/device.hpp
+++ b/device/device.hpp
@@ -1796,9 +1796,14 @@ class Device : public RuntimeObject {
 
   // Returns the status of HW event, associated with amd::Event
   virtual bool IsHwEventReady(
-      const amd::Event& event,  //!< AMD event for HW status validation
-      bool wait = false         //!< If true then forces the event completion
-      ) const {
+      const amd::Event& event,    //!< AMD event for HW status validation
+      bool wait = false) const {  //!< If true then forces the event completion
+    return false;
+  };
+
+  // Returns the status of HW event, associated with amd::Event
+  virtual bool IsHwEventReadyForcedWait(
+      const amd::Event& event) const {  //!< AMD event for HW status validation
     return false;
   };
 
diff --git a/device/rocm/rocblit.cpp b/device/rocm/rocblit.cpp
index f6b447ca..913b26b6 100644
--- a/device/rocm/rocblit.cpp
+++ b/device/rocm/rocblit.cpp
@@ -2682,31 +2682,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   Memory* schedulerMem = dev().getRocMemory(schedulerParam);
   sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());
-
-  sp->hidden_global_offset_x = 0;
-  sp->hidden_global_offset_y = 0;
-  sp->hidden_global_offset_z = 0;
   sp->thread_counter = 0;
   sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
   sp->complete_signal = schedulerSignal;
 
   hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
 
-  sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
-      (1 << HSA_PACKET_HEADER_BARRIER) |
-      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
-      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
-  sp->scheduler_aql.setup = 1;
-  sp->scheduler_aql.workgroup_size_x = 1;
-  sp->scheduler_aql.workgroup_size_y = 1;
-  sp->scheduler_aql.workgroup_size_z = 1;
-  sp->scheduler_aql.grid_size_x = threads;
-  sp->scheduler_aql.grid_size_y = 1;
-  sp->scheduler_aql.grid_size_z = 1;
-  sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle();
-  sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address;
-  sp->scheduler_aql.private_segment_size = 0;
-  sp->scheduler_aql.group_segment_size = 0;
+
   sp->vqueue_header = vqVM;
 
   sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
@@ -2721,7 +2703,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
   address parameters = captureArguments(kernels_[Scheduler]);
 
   if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler],
-                                  parameters, nullptr)) {
+                                  parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) {
     return false;
   }
   releaseArguments(parameters);
diff --git a/device/rocm/rocdevice.cpp b/device/rocm/rocdevice.cpp
index 961227bb..69d5ff05 100644
--- a/device/rocm/rocdevice.cpp
+++ b/device/rocm/rocdevice.cpp
@@ -2331,7 +2331,6 @@ bool Device::IpcDetach (void* dev_ptr) const {
 // ================================================================================================
 void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
                        void* svmPtr) const {
-  constexpr bool kForceAllocation = true;
   amd::Memory* mem = nullptr;
 
   if (nullptr == svmPtr) {
@@ -2343,7 +2342,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
       return nullptr;
     }
-    if (!mem->create(nullptr, false, false, kForceAllocation)) {
+    if (!mem->create(nullptr)) {
      LogError("failed to create a svm hidden buffer!");
      mem->release();
      return nullptr;
    }
@@ -2730,10 +2729,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
   return result;
 }
 
+// ================================================================================================
+bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const {
+  void* hw_event =
+      (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
+  if (hw_event == nullptr) {
+    ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
+    return false;
+  }
+  static constexpr bool Timeout = true;
+  return WaitForSignal<Timeout>(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, false, true);
+}
+
 // ================================================================================================
 bool Device::IsHwEventReady(const amd::Event& event, bool wait) const {
-  void* hw_event = (event.NotifyEvent() != nullptr) ?
-      event.NotifyEvent()->HwEvent() : event.HwEvent();
+  void* hw_event =
+      (event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
   if (hw_event == nullptr) {
     ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
     return false;
   }
@@ -3210,7 +3221,9 @@ device::Signal* Device::createSignal() const {
 amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) {
   // Only create arena_mem_object if CPU memory is accessible from HMM
   // or if runtime received an interop from another ROCr's client
-  if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) {
+  hsa_amd_pointer_info_t ptr_info = {};
+  ptr_info.size = sizeof(hsa_amd_pointer_info_t);
+  if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) {
     return nullptr;
   }
 
@@ -3227,8 +3240,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size
   }
 
   // Calculate the offset of the pointer.
-  const void* dev_ptr = reinterpret_cast(arena_mem_obj_->getDeviceMemory(
-      *arena_mem_obj_->getContext().devices()[0])->virtualAddress());
+  const void* dev_ptr = reinterpret_cast(
+      arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0])
+          ->virtualAddress());
   offset = reinterpret_cast(ptr) - reinterpret_cast(dev_ptr);
 
   return arena_mem_obj_;
@@ -3242,20 +3256,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
 }
 
 // ================================================================================================
-bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const {
-  hsa_amd_pointer_info_t ptr_info = {};
-  ptr_info.size = sizeof(hsa_amd_pointer_info_t);
+bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
   // Query ptr type to see if it's a HMM allocation
-  hsa_status_t status = hsa_amd_pointer_info(
-      const_cast<void*>(dev_ptr), &ptr_info, nullptr, nullptr, nullptr);
+  hsa_status_t status =
+      hsa_amd_pointer_info(const_cast<void*>(dev_ptr), ptr_info, nullptr, nullptr, nullptr);
   // The call should never fail in ROCR, but just check for an error and continue
   if (status != HSA_STATUS_SUCCESS) {
     LogError("hsa_amd_pointer_info() failed");
   }
-  // Check if it's a legacy non-HMM allocation in ROCr
-  if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
-    if ((size != 0) && ((reinterpret_cast(dev_ptr) -
-        reinterpret_cast(ptr_info.agentBaseAddress)) > size)) {
+
+  // Return false for pinned memory. A true return may result in a race because
+  // ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment
+  if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) {
+    return false;
+  }
+
+  if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
+    if ((size != 0) &&
+        ((reinterpret_cast(dev_ptr) -
+          reinterpret_cast(ptr_info->agentBaseAddress)) > size)) {
       return false;
     }
     return true;
diff --git a/device/rocm/rocdevice.hpp b/device/rocm/rocdevice.hpp
index b3da3783..43898c8e 100644
--- a/device/rocm/rocdevice.hpp
+++ b/device/rocm/rocdevice.hpp
@@ -258,6 +258,7 @@ class NullDevice : public amd::Device {
                                    cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
+  virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; }
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
   virtual void ReleaseGlobalSignal(void* signal) const {}
@@ -443,6 +444,7 @@ class Device : public NullDevice {
                                    cl_set_device_clock_mode_output_amd* pSetClockModeOutput);
 
   virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
+  virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const;
   virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
   virtual void ReleaseGlobalSignal(void* signal) const;
@@ -549,7 +551,7 @@ class Device : public NullDevice {
   const bool isFineGrainSupported() const;
 
   //! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
-  bool IsValidAllocation(const void* dev_ptr, size_t size) const;
+  bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info);
 
   //! Allocates hidden heap for device memory allocations
   void HiddenHeapAlloc();
diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp
index 56b6022f..451fdcd5 100644
--- a/device/rocm/rocvirtual.cpp
+++ b/device/rocm/rocvirtual.cpp
@@ -1041,7 +1041,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
                                             hsa_signal_t signal, hsa_signal_value_t value,
                                             hsa_signal_value_t mask, hsa_signal_condition32_t cond,
                                             bool skipTs, hsa_signal_t completionSignal) {
-  hsa_amd_barrier_value_packet_t barrier_value_packet_ = {0};
   uint16_t rest = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
@@ -1274,6 +1273,7 @@ bool VirtualGPU::create() {
   // Initialize barrier and barrier value packets
   memset(&barrier_packet_, 0, sizeof(barrier_packet_));
   barrier_packet_.header = kInvalidAql;
+  barrier_value_packet_.header.header = kInvalidAql;
 
   // Create a object of PrintfDbg
   printfdbg_ = new PrintfDbg(roc_device_);
@@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
 // ================================================================================================
 bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const amd::Kernel& kernel,
                                       const_address parameters, void* eventHandle,
-                                      uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
+                                      uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
+                                      hsa_kernel_dispatch_packet_t* aql_packet) {
   device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
   Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
   size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
@@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
     dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
   }
 
+  // Copy scheduler's AQL packet for possible relaunch from the scheduler itself
+  if (aql_packet != nullptr) {
+    *aql_packet = dispatchPacket;
+    aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+        (1 << HSA_PACKET_HEADER_BARRIER) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+        (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+    aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  }
+
   // Dispatch the packet
   if (!dispatchAqlPacket(
       &dispatchPacket, aqlHeaderWithOrder,
diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp
index 11a3670e..d113a289 100644
--- a/device/rocm/rocvirtual.hpp
+++ b/device/rocm/rocvirtual.hpp
@@ -46,10 +46,10 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();
 
 // Active wait time out incase same sdma engine is used again,
 // then just wait instead of adding dependency wait signal.
-constexpr static uint64_t kSDMAEngineTimeout = 10;
+constexpr static uint64_t kForcedTimeout = 10;
 
 template <bool active_wait_timeout = false>
-inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) {
+inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) {
   if (hsa_signal_load_relaxed(signal) > 0) {
     uint64_t timeout = kTimeout100us;
     if (active_wait) {
@@ -57,7 +57,7 @@ inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sd
     }
     if (active_wait_timeout) {
       // If diff engine, wait to 10 ms. Otherwise no wait
-      timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
+      timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
       if (timeout == 0) {
         return false;
       }
@@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice {
                             const_address parameters,     //!< Parameters for the kernel
                             void* event_handle,           //!< Handle to OCL event for debugging
                             uint32_t sharedMemBytes = 0,  //!< Shared memory size
-                            amd::NDRangeKernelCommand* vcmd = nullptr  //!< Original launch command
+                            amd::NDRangeKernelCommand* vcmd = nullptr,  //!< Original launch command
+                            hsa_kernel_dispatch_packet_t* aql_packet = nullptr  //!< Scheduler launch
                             );
   void submitNativeFn(amd::NativeFnCommand& cmd);
   void submitMarker(amd::Marker& cmd);
@@ -502,6 +503,7 @@ class VirtualGPU : public device::VirtualDevice {
   hsa_agent_t gpu_device_;  //!< Physical device
   hsa_queue_t* gpu_queue_;  //!< Queue associated with a gpu
   hsa_barrier_and_packet_t barrier_packet_;
+  hsa_amd_barrier_value_packet_t barrier_value_packet_;
   uint32_t dispatch_id_;  //!< This variable must be updated atomically.
   Device& roc_device_;    //!< roc device object
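For illustration, a minimal standalone sketch of the relaunch-capture step that the patched submitKernelInternal() performs when a caller passes an aql_packet pointer (as runScheduler() now does instead of hand-building sp->scheduler_aql). The helper name CaptureSchedulerAql is hypothetical; only hsa_kernel_dispatch_packet_t and the HSA packet-header enums from <hsa/hsa.h> are real.

  #include <hsa/hsa.h>

  // Hypothetical helper, not part of the patch: copy the packet that was actually
  // dispatched so a device-side scheduler can relaunch the same kernel later.
  void CaptureSchedulerAql(const hsa_kernel_dispatch_packet_t& dispatched,
                           uint32_t dimensions,
                           hsa_kernel_dispatch_packet_t* aql_packet /* may be nullptr */) {
    if (aql_packet == nullptr) {
      return;  // caller did not ask for a relaunchable copy
    }
    *aql_packet = dispatched;  // start from the packet that was really submitted
    // Rebuild the header with the barrier bit and system-scope acquire/release fences.
    aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
                         (1 << HSA_PACKET_HEADER_BARRIER) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
                         (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
    // Encode the launch dimensionality (1-3) in the setup field.
    aql_packet->setup = dimensions << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  }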