diff --git a/unified-runtime/source/adapters/cuda/context.hpp b/unified-runtime/source/adapters/cuda/context.hpp
index e49a7db9fe505..eebc012a281fb 100644
--- a/unified-runtime/source/adapters/cuda/context.hpp
+++ b/unified-runtime/source/adapters/cuda/context.hpp
@@ -16,6 +16,8 @@
 #include
 #include
 #include
+#include <shared_mutex>
+#include <unordered_map>
 #include
 
 #include "adapter.hpp"
@@ -96,6 +98,10 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
   umf_memory_provider_handle_t MemoryProviderHost = nullptr;
   umf_memory_pool_handle_t MemoryPoolHost = nullptr;
 
+  // Track which device allocated each USM pointer for cross-device operations
+  std::unordered_map<const void *, ur_device_handle_t> AllocationMetadata;
+  mutable std::shared_mutex AllocationMetadataMutex;
+
   ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
       : handle_base(), Devices{Devs, Devs + NumDevices} {
     // Create UMF CUDA memory provider for the host memory
@@ -147,6 +153,25 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
 
   ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);
 
+  // Register USM allocation metadata for cross-device operation tracking
+  void registerAllocation(const void *Ptr, ur_device_handle_t Device) {
+    std::unique_lock Lock(AllocationMetadataMutex);
+    AllocationMetadata[Ptr] = Device;
+  }
+
+  // Unregister USM allocation metadata
+  void unregisterAllocation(const void *Ptr) {
+    std::unique_lock Lock(AllocationMetadataMutex);
+    AllocationMetadata.erase(Ptr);
+  }
+
+  // Query which device allocated a USM pointer
+  ur_device_handle_t getAllocationDevice(const void *Ptr) const {
+    std::shared_lock Lock(AllocationMetadataMutex);
+    auto It = AllocationMetadata.find(Ptr);
+    return (It != AllocationMetadata.end()) ? It->second : nullptr;
+  }
+
 private:
   std::mutex Mutex;
   std::vector ExtendedDeleters;
@@ -165,19 +190,32 @@ class ScopedContext {
 
   ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
 
-  ~ScopedContext() {}
+  ~ScopedContext() {
+    // Restore original context if we changed it and there was a previous
+    // context If Original was nullptr, leave the current context active to
+    // maintain compatibility with code that expects context to remain set
+    if (NeedToRestore && Original != nullptr) {
+      cuCtxSetCurrent(Original);
+    }
+  }
 
 private:
   void setContext(CUcontext Desired) {
-    CUcontext Original = nullptr;
-    UR_CHECK_ERROR(cuCtxGetCurrent(&Original));
+    // Capture the previously-current context so the destructor can restore it
+    UR_CHECK_ERROR(cuCtxGetCurrent(&Original));
     // Make sure the desired context is active on the current thread, setting
     // it if necessary
     if (Original != Desired) {
       UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
+      NeedToRestore = true;
+    } else {
+      NeedToRestore = false;
     }
   }
+
+  CUcontext Original = nullptr;
+  bool NeedToRestore = false;
 };
 
 } // namespace
diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp
index 582cdbe7a9d90..dc0269ccdfc65 100644
--- a/unified-runtime/source/adapters/cuda/device.cpp
+++ b/unified-runtime/source/adapters/cuda/device.cpp
@@ -1174,8 +1174,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(true);
   case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
     return ReturnValue(true);
-  case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP:
-    return ReturnValue(true);
+  case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: {
+    // P2P support requires compute capability >= 2.0
+    // Check if device supports Unified Virtual Addressing (UVA)
+    int Major = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));
+
+    // Compute capability 2.0+ supports UVA which is required for P2P
+    if (Major < 2) {
+      return ReturnValue(false);
+    }
+
+    // Check if device can actually access peers
+    // We need to check against other devices in the platform
+    int canAccessPeer = 0;
+    int deviceCount = 0;
+    UR_CHECK_ERROR(cuDeviceGetCount(&deviceCount));
+
+    // If there's only one device, P2P is not applicable
+    if (deviceCount < 2) {
+      return ReturnValue(false);
+    }
+
+    // Check if this device can access at least one other device
+    CUdevice currentDevice = hDevice->get();
+    bool hasP2PCapability = false;
+
+    for (int i = 0; i < deviceCount; ++i) {
+      CUdevice peerDevice;
+      UR_CHECK_ERROR(cuDeviceGet(&peerDevice, i));
+
+      if (peerDevice != currentDevice) {
+        UR_CHECK_ERROR(
+            cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peerDevice));
+        if (canAccessPeer) {
+          hasP2PCapability = true;
+          break;
+        }
+      }
+    }
+
+    return ReturnValue(hasP2PCapability);
+  }
   case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP:
     return ReturnValue(false);
   case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP:
diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp
index 6ba5b8c2c27f2..d44b056ae0d9d 100644
--- a/unified-runtime/source/adapters/cuda/enqueue.cpp
+++ b/unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -1580,8 +1580,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
                                   hQueue, CuStream);
       UR_CHECK_ERROR(EventPtr->start());
     }
-    UR_CHECK_ERROR(
-        cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
+
+    // Check memory types and device ownership
+    CUmemorytype SrcType = CU_MEMORYTYPE_HOST;
+    CUmemorytype DstType = CU_MEMORYTYPE_HOST;
+    cuPointerGetAttribute(&SrcType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                          (CUdeviceptr)pSrc);
+    cuPointerGetAttribute(&DstType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                          (CUdeviceptr)pDst);
+
+    // Detect cross-device copy for Managed Memory
+    bool isManagedMemory =
+        (SrcType == CU_MEMORYTYPE_UNIFIED || DstType == CU_MEMORYTYPE_UNIFIED);
+
+    if (isManagedMemory) {
+      // For Managed Memory cross-device copies without P2P:
+      // CUDA driver automatically stages through CPU memory.
+      // We just need to ensure the queue's device can access both pointers.
+      // Prefetch both to CPU to enable staging, then let CUDA handle migration.
+
+      // Prefetch SRC to CPU (system memory) if it's Managed
+      if (SrcType == CU_MEMORYTYPE_UNIFIED) {
+        UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pSrc, size,
+                                          CU_DEVICE_CPU, CuStream));
+      }
+
+      // Prefetch DST to CPU if it's Managed
+      if (DstType == CU_MEMORYTYPE_UNIFIED) {
+        UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pDst, size,
+                                          CU_DEVICE_CPU, CuStream));
+      }
+
+      // Wait for prefetches to complete
+      UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+
+      // Now copy - CUDA will handle cross-device migration via CPU
+      UR_CHECK_ERROR(
+          cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
+
+    } else {
+      // For Device memory: try to detect cross-device copy
+      int SrcDevice = -1;
+      int DstDevice = -1;
+
+      // Get device ordinals (ignore errors for host memory)
+      cuPointerGetAttribute(&SrcDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+                            (CUdeviceptr)pSrc);
+      cuPointerGetAttribute(&DstDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+                            (CUdeviceptr)pDst);
+
+      bool isCrossDevice =
+          (SrcDevice != -1 && DstDevice != -1 && SrcDevice != DstDevice);
+
+      if (isCrossDevice) {
+        // Cross-device Device memory copy
+        // This requires P2P or staging through host
+        // cuMemcpyAsync will handle this - with P2P it's direct,
+        // without P2P driver stages through host (slower)
+        UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
+                                     CuStream));
+      } else {
+        // Same device or host-device copy
+        UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
+                                     CuStream));
+      }
+    }
+
     if (phEvent) {
       UR_CHECK_ERROR(EventPtr->record());
     }
diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp
index c805c1084ec0f..00daf020898f8 100644
--- a/unified-runtime/source/adapters/cuda/usm.cpp
+++ b/unified-runtime/source/adapters/cuda/usm.cpp
@@ -47,7 +47,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
 /// USM: Implements USM device allocations using a normal CUDA device pointer
 ///
 UR_APIEXPORT ur_result_t UR_APICALL
-urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
+urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
                  const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
                  size_t size, void **ppMem) {
@@ -65,15 +65,23 @@ urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
     auto umfErr = umfPoolGetLastAllocationError(pool);
     return umf::umf2urResult(umfErr);
   }
+
+  // Register allocation with context for cross-device operation tracking
+  if (hContext && *ppMem && hDevice) {
+    hContext->registerAllocation(*ppMem, hDevice);
+  }
+
   return UR_RESULT_SUCCESS;
 }
 
 /// USM: Implements USM Shared allocations using CUDA Managed Memory
 ///
 UR_APIEXPORT ur_result_t UR_APICALL
-urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
+urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
                  const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
                  size_t size, void **ppMem) {
+  // hContext unused - Shared memory not tracked (CUDA handles migration)
+  (void)hContext;
   auto alignment = pUSMDesc ? pUSMDesc->align : 0u;
 
   ScopedContext SC(hDevice);
@@ -89,12 +97,23 @@ urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
     auto umfErr = umfPoolGetLastAllocationError(pool);
     return umf::umf2urResult(umfErr);
   }
+
+  // Do NOT register Managed Memory allocations
+  // CUDA Unified Memory system handles migration automatically.
+  // Manual tracking interferes with automatic page migration.
+
   return UR_RESULT_SUCCESS;
 }
 
 /// USM: Frees the given USM pointer associated with the context.
 ///
-UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) {
+UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
+                                              void *pMem) {
+  // Unregister allocation metadata before freeing
+  if (hContext && pMem) {
+    hContext->unregisterAllocation(pMem);
+  }
+
   umf_memory_pool_handle_t hPool = NULL;
   umf_result_t ret = umfPoolByPtr(pMem, &hPool);
   if (ret == UMF_RESULT_SUCCESS) {
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
index d4a9f0188aa2f..88136db77355a 100644
--- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
+++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
@@ -10,6 +10,7 @@
 #include
 #include
+#include <iostream>
 #include
 #include
 
@@ -23,6 +24,8 @@ struct urMultiQueueLaunchMemcpyTest
   std::vector<ur_program_handle_t> programs;
   std::vector<ur_kernel_handle_t> kernels;
   std::vector<void *> SharedMem;
+  std::vector<void *> HostMem; // For CUDA: host-accessible verification buffer
+  bool useCudaDeviceMemory = false;
 
   static constexpr char ProgramName[] = "increment";
   static constexpr size_t ArraySize = 100;
@@ -46,6 +49,16 @@ struct urMultiQueueLaunchMemcpyTest
     kernels.resize(devices.size());
     SharedMem.resize(devices.size());
 
+    // Check if we're on CUDA backend
+    ur_platform_backend_t backend;
+    ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                     sizeof(backend), &backend, nullptr));
+    useCudaDeviceMemory = (backend == UR_PLATFORM_BACKEND_CUDA);
+
+    if (useCudaDeviceMemory) {
+      HostMem.resize(devices.size());
+    }
+
     std::shared_ptr il_binary;
     std::vector metadatas{};
 
@@ -71,10 +84,25 @@ struct urMultiQueueLaunchMemcpyTest
       ASSERT_SUCCESS(
           urKernelCreate(programs[i], KernelName.data(), &kernels[i]));
 
-      ASSERT_SUCCESS(urUSMSharedAlloc(context, devices[i], nullptr, nullptr,
+      // CUDA: Use USM Device memory for multi-GPU peer transfers
+      // Other backends: Use USM Shared memory
+      if (useCudaDeviceMemory) {
+        ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[i], nullptr, nullptr,
+                                        ArraySize * sizeof(uint32_t),
+                                        &SharedMem[i]));
+        ASSERT_NE(SharedMem[i], nullptr);
+
+        // Also allocate host-accessible buffer for verification
+        ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr,
                                       ArraySize * sizeof(uint32_t),
-                                      &SharedMem[i]));
-      ASSERT_NE(SharedMem[i], nullptr);
+                                      &HostMem[i]));
+        ASSERT_NE(HostMem[i], nullptr);
+      } else {
+        ASSERT_SUCCESS(urUSMSharedAlloc(context, devices[i], nullptr, nullptr,
+                                        ArraySize * sizeof(uint32_t),
+                                        &SharedMem[i]));
+        ASSERT_NE(SharedMem[i], nullptr);
+      }
 
       ASSERT_SUCCESS(urEnqueueUSMFill(
           queues[i], SharedMem[i], sizeof(uint32_t), &InitialValue,
@@ -90,6 +118,11 @@ struct urMultiQueueLaunchMemcpyTest
     for (auto &Ptr : SharedMem) {
       urUSMFree(context, Ptr);
     }
+    if (useCudaDeviceMemory) {
+      for (auto &Ptr : HostMem) {
+        urUSMFree(context, Ptr);
+      }
+    }
     for (const auto &kernel : kernels) {
       urKernelRelease(kernel);
     }
@@ -112,9 +145,21 @@ struct urMultiQueueLaunchMemcpyTest
         } while (status != UR_EVENT_STATUS_COMPLETE);
 
         auto ExpectedValue = InitialValue + i + 1;
-        for (uint32_t j = 0; j < ArraySize; ++j) {
-          ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j],
-                    ExpectedValue);
+
+        // CUDA: Copy from device to host buffer before verification
+        if (useCudaDeviceMemory) {
+          ASSERT_SUCCESS(urEnqueueUSMMemcpy(
+              queues[i], true, HostMem[i], SharedMem[i],
+              ArraySize * sizeof(uint32_t), 0, nullptr, nullptr));
+          for (uint32_t j = 0; j < ArraySize; ++j) {
+            ASSERT_EQ(reinterpret_cast<uint32_t *>(HostMem[i])[j],
+                      ExpectedValue);
+          }
+        } else {
+          for (uint32_t j = 0; j < ArraySize; ++j) {
+            ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j],
+                      ExpectedValue);
+          }
         }
       });
     }
@@ -268,8 +313,6 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
 // ... ops
 TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
   UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{});
-  // https://github.com/intel/llvm/issues/19033
-  UUR_KNOWN_FAILURE_ON(uur::CUDA{});
 
   auto waitOnEvent = std::get<0>(getParam()).value;
   auto runBackgroundCheck = std::get<1>(getParam()).value;
@@ -278,6 +321,23 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
   ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
                                  sizeof(usm_p2p_support), &usm_p2p_support,
                                  nullptr));
+
+  // Log P2P support status for debugging
+  std::cout << "[P2P INFO] Device 0 USM P2P Support: "
+            << (usm_p2p_support ? "ENABLED" : "DISABLED") << std::endl;
+
+  // Check all device pairs for P2P support
+  for (size_t i = 0; i < devices.size(); i++) {
+    for (size_t j = i + 1; j < devices.size(); j++) {
+      ur_bool_t pair_p2p = false;
+      ASSERT_SUCCESS(urDeviceGetInfo(devices[i],
+                                     UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
+                                     sizeof(pair_p2p), &pair_p2p, nullptr));
+      std::cout << "[P2P INFO] Device " << i << " <-> Device " << j
+                << " P2P: " << (pair_p2p ? "ENABLED" : "DISABLED") << std::endl;
+    }
+  }
+
   if (!usm_p2p_support) {
     GTEST_SKIP() << "EXP usm p2p feature is not supported.";
   }
@@ -324,11 +384,22 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
     ASSERT_SUCCESS(urQueueFinish(queues.back()));
   }
 
+  // CUDA: Copy device buffers to host for verification
+  if (this->useCudaDeviceMemory) {
+    for (size_t i = 0; i < devices.size(); i++) {
+      ASSERT_SUCCESS(urEnqueueUSMMemcpy(
+          queues[i], true, this->HostMem[i], this->SharedMem[i],
+          ArraySize * sizeof(uint32_t), 0, nullptr, nullptr));
+    }
+  }
+
   size_t ExpectedValue = InitialValue;
   for (size_t i = 0; i < devices.size(); i++) {
     ExpectedValue++;
+    void *verifyPtr =
+        this->useCudaDeviceMemory ? this->HostMem[i] : this->SharedMem[i];
     for (uint32_t j = 0; j < ArraySize; ++j) {
-      ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j], ExpectedValue);
+      ASSERT_EQ(reinterpret_cast<uint32_t *>(verifyPtr)[j], ExpectedValue);
    }
   }
 }