Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
66c6a4c
cuda: Track USM allocation metadata for cross-device operations
kekaczma Feb 20, 2026
1f45f89
[CUDA][UR] Track USM allocation metadata for cross-device operations
kekaczma Feb 20, 2026
b5aa0d8
[CUDA][UR] Add fallback for cross-device USM memcpy
kekaczma Feb 20, 2026
570d6c5
[CUDA][UR] Use synchronous memcpy for cross-device transfers
kekaczma Feb 20, 2026
3447c1f
[CUDA][UR] Use cuMemcpyPeer for cross-device USM copies
kekaczma Feb 20, 2026
5f7f9fa
[CUDA][UR] Retry cuMemcpyPeerAsync without P2P enabling
kekaczma Feb 20, 2026
885e9c8
Add stream synchronization before cross-device peer copy
kekaczma Feb 20, 2026
204074c
Activate source context before cuMemcpyPeerAsync
kekaczma Feb 20, 2026
9fd41dd
Fix ScopedContext to restore original context
kekaczma Feb 20, 2026
4da8175
Remove nested ScopedContext from cross-device copy
kekaczma Feb 20, 2026
abf0748
Fix ScopedContext to not restore nullptr context
kekaczma Feb 20, 2026
dcad3b2
Do not register USM Shared allocations for cross-device tracking
kekaczma Feb 20, 2026
1d81a9c
[CUDA] Suppress unused parameter warning in urUSMSharedAlloc
kekaczma Feb 23, 2026
bd224eb
[CUDA] Use different strategies for Managed vs Device memory cross-device copies
kekaczma Feb 23, 2026
8e5fa1a
[CUDA] Revert to not tracking USM Shared allocations
kekaczma Feb 23, 2026
0fd95d2
[CUDA] Simplify urEnqueueUSMMemcpy - remove all cross-device detection
kekaczma Feb 23, 2026
3aab64b
[CUDA] Fix unused parameter warning in urUSMSharedAlloc
kekaczma Feb 23, 2026
15ef1e0
[CUDA] Use cuMemPrefetchAsync for Managed Memory in urEnqueueUSMMemcpy
kekaczma Feb 23, 2026
21ff7a5
[CUDA] Use USM Device memory for multi-GPU tests instead of USM Shared
kekaczma Feb 23, 2026
c5a1646
[CUDA] Fix Managed Memory cross-device copy to work without P2P
kekaczma Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions unified-runtime/source/adapters/cuda/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <atomic>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "adapter.hpp"
Expand Down Expand Up @@ -96,6 +98,10 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
umf_memory_provider_handle_t MemoryProviderHost = nullptr;
umf_memory_pool_handle_t MemoryPoolHost = nullptr;

// Track which device allocated each USM pointer for cross-device operations
std::unordered_map<const void *, ur_device_handle_t> AllocationMetadata;
mutable std::shared_mutex AllocationMetadataMutex;

ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
: handle_base(), Devices{Devs, Devs + NumDevices} {
// Create UMF CUDA memory provider for the host memory
Expand Down Expand Up @@ -147,6 +153,25 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {

ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);

// Register USM allocation metadata for cross-device operation tracking
void registerAllocation(const void *Ptr, ur_device_handle_t Device) {
  // Exclusive (writer) lock: inserting/updating mutates the map.
  std::lock_guard<std::shared_mutex> Guard(AllocationMetadataMutex);
  AllocationMetadata.insert_or_assign(Ptr, Device);
}

// Unregister USM allocation metadata
void unregisterAllocation(const void *Ptr) {
  // Exclusive (writer) lock: erasing mutates the map. Erasing a pointer
  // that was never registered is a harmless no-op.
  std::lock_guard<std::shared_mutex> Guard(AllocationMetadataMutex);
  AllocationMetadata.erase(Ptr);
}

// Query which device allocated a USM pointer
ur_device_handle_t getAllocationDevice(const void *Ptr) const {
  // Shared (reader) lock: lookup only, so concurrent readers may proceed.
  std::shared_lock<std::shared_mutex> Guard(AllocationMetadataMutex);
  // Returns nullptr for pointers that were never registered (e.g. host or
  // shared allocations, which are not tracked).
  if (auto Entry = AllocationMetadata.find(Ptr);
      Entry != AllocationMetadata.end()) {
    return Entry->second;
  }
  return nullptr;
}

private:
std::mutex Mutex;
std::vector<deleter_data> ExtendedDeleters;
Expand All @@ -165,19 +190,30 @@ class ScopedContext {

ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }

~ScopedContext() {}
~ScopedContext() {
  // Restore the original context if we changed it and there was a previous
  // context. If Original was nullptr, leave the current context active to
  // maintain compatibility with code that expects the context to remain set.
  // NOTE(review): the cuCtxSetCurrent return value is deliberately ignored —
  // destructors must not propagate errors.
  if (NeedToRestore && Original != nullptr) {
    cuCtxSetCurrent(Original);
  }
}

private:
// Make Desired the active CUDA context on this thread, remembering the
// previously active context so the destructor can restore it.
//
// Bug fix: the previous version declared a local `CUcontext Original`,
// shadowing the member of the same name. The member therefore stayed
// nullptr and the destructor's restore path could never trigger. The
// snapshot must be written into the *member* Original.
void setContext(CUcontext Desired) {
  UR_CHECK_ERROR(cuCtxGetCurrent(&Original));

  // Make sure the desired context is active on the current thread, setting
  // it if necessary; only flag a restore when we actually switched.
  if (Original != Desired) {
    UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
    NeedToRestore = true;
  } else {
    NeedToRestore = false;
  }
}

CUcontext Original = nullptr;
bool NeedToRestore = false;
};
} // namespace
45 changes: 43 additions & 2 deletions unified-runtime/source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(true);
case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
return ReturnValue(true);
case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP:
return ReturnValue(true);
case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: {
// P2P support requires compute capability >= 2.0
// Check if device supports Unified Virtual Addressing (UVA)
int Major = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));

// Compute capability 2.0+ supports UVA which is required for P2P
if (Major < 2) {
return ReturnValue(false);
}

// Check if device can actually access peers
// We need to check against other devices in the platform
int canAccessPeer = 0;
int deviceCount = 0;
UR_CHECK_ERROR(cuDeviceGetCount(&deviceCount));

// If there's only one device, P2P is not applicable
if (deviceCount < 2) {
return ReturnValue(false);
}

// Check if this device can access at least one other device
CUdevice currentDevice = hDevice->get();
bool hasP2PCapability = false;

for (int i = 0; i < deviceCount; ++i) {
CUdevice peerDevice;
UR_CHECK_ERROR(cuDeviceGet(&peerDevice, i));

if (peerDevice != currentDevice) {
UR_CHECK_ERROR(
cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peerDevice));
if (canAccessPeer) {
hasP2PCapability = true;
break;
}
}
}

return ReturnValue(hasP2PCapability);
}
case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP:
return ReturnValue(false);
case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP:
Expand Down
68 changes: 66 additions & 2 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1580,8 +1580,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
hQueue, CuStream);
UR_CHECK_ERROR(EventPtr->start());
}
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));

// Check memory types and device ownership
CUmemorytype SrcType = CU_MEMORYTYPE_HOST;
CUmemorytype DstType = CU_MEMORYTYPE_HOST;
cuPointerGetAttribute(&SrcType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pSrc);
cuPointerGetAttribute(&DstType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pDst);

// Detect cross-device copy for Managed Memory
bool isManagedMemory =
(SrcType == CU_MEMORYTYPE_UNIFIED || DstType == CU_MEMORYTYPE_UNIFIED);

if (isManagedMemory) {
// For Managed Memory cross-device copies without P2P:
// CUDA driver automatically stages through CPU memory.
// We just need to ensure the queue's device can access both pointers.
// Prefetch both to CPU to enable staging, then let CUDA handle migration.

// Prefetch SRC to CPU (system memory) if it's Managed
if (SrcType == CU_MEMORYTYPE_UNIFIED) {
UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pSrc, size,
CU_DEVICE_CPU, CuStream));
}

// Prefetch DST to CPU if it's Managed
if (DstType == CU_MEMORYTYPE_UNIFIED) {
UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pDst, size,
CU_DEVICE_CPU, CuStream));
}

// Wait for prefetches to complete
UR_CHECK_ERROR(cuStreamSynchronize(CuStream));

// Now copy - CUDA will handle cross-device migration via CPU
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));

} else {
// For Device memory: try to detect cross-device copy
int SrcDevice = -1;
int DstDevice = -1;

// Get device ordinals (ignore errors for host memory)
cuPointerGetAttribute(&SrcDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr)pSrc);
cuPointerGetAttribute(&DstDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr)pDst);

bool isCrossDevice =
(SrcDevice != -1 && DstDevice != -1 && SrcDevice != DstDevice);

if (isCrossDevice) {
// Cross-device Device memory copy
// This requires P2P or staging through host
// cuMemcpyAsync will handle this - with P2P it's direct,
// without P2P driver stages through host (slower)
UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream));
} else {
// Same device or host-device copy
UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream));
}
}

if (phEvent) {
UR_CHECK_ERROR(EventPtr->record());
}
Expand Down
25 changes: 22 additions & 3 deletions unified-runtime/source/adapters/cuda/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
/// USM: Implements USM device allocations using a normal CUDA device pointer
///
UR_APIEXPORT ur_result_t UR_APICALL
urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
size_t size, void **ppMem) {
auto alignment = pUSMDesc ? pUSMDesc->align : 0u;
Expand All @@ -65,15 +65,23 @@ urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
auto umfErr = umfPoolGetLastAllocationError(pool);
return umf::umf2urResult(umfErr);
}

// Register allocation with context for cross-device operation tracking
if (hContext && *ppMem && hDevice) {
hContext->registerAllocation(*ppMem, hDevice);
}

return UR_RESULT_SUCCESS;
}

/// USM: Implements USM Shared allocations using CUDA Managed Memory
///
UR_APIEXPORT ur_result_t UR_APICALL
urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
size_t size, void **ppMem) {
// hContext unused - Shared memory not tracked (CUDA handles migration)
(void)hContext;
auto alignment = pUSMDesc ? pUSMDesc->align : 0u;

ScopedContext SC(hDevice);
Expand All @@ -89,12 +97,23 @@ urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
auto umfErr = umfPoolGetLastAllocationError(pool);
return umf::umf2urResult(umfErr);
}

// Do NOT register Managed Memory allocations
// CUDA Unified Memory system handles migration automatically.
// Manual tracking interferes with automatic page migration.

return UR_RESULT_SUCCESS;
}

/// USM: Frees the given USM pointer associated with the context.
///
UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) {
UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
void *pMem) {
// Unregister allocation metadata before freeing
if (hContext && pMem) {
hContext->unregisterAllocation(pMem);
}

umf_memory_pool_handle_t hPool = NULL;
umf_result_t ret = umfPoolByPtr(pMem, &hPool);
if (ret == UMF_RESULT_SUCCESS) {
Expand Down
Loading
Loading