Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
66c6a4c
cuda: Track USM allocation metadata for cross-device operations
kekaczma Feb 20, 2026
1f45f89
[CUDA][UR] Track USM allocation metadata for cross-device operations
kekaczma Feb 20, 2026
b5aa0d8
[CUDA][UR] Add fallback for cross-device USM memcpy
kekaczma Feb 20, 2026
570d6c5
[CUDA][UR] Use synchronous memcpy for cross-device transfers
kekaczma Feb 20, 2026
3447c1f
[CUDA][UR] Use cuMemcpyPeer for cross-device USM copies
kekaczma Feb 20, 2026
5f7f9fa
[CUDA][UR] Retry cuMemcpyPeerAsync without P2P enabling
kekaczma Feb 20, 2026
885e9c8
Add stream synchronization before cross-device peer copy
kekaczma Feb 20, 2026
204074c
Activate source context before cuMemcpyPeerAsync
kekaczma Feb 20, 2026
9fd41dd
Fix ScopedContext to restore original context
kekaczma Feb 20, 2026
4da8175
Remove nested ScopedContext from cross-device copy
kekaczma Feb 20, 2026
abf0748
Fix ScopedContext to not restore nullptr context
kekaczma Feb 20, 2026
dcad3b2
Do not register USM Shared allocations for cross-device tracking
kekaczma Feb 20, 2026
1d81a9c
[CUDA] Suppress unused parameter warning in urUSMSharedAlloc
kekaczma Feb 23, 2026
bd224eb
[CUDA] Use different strategies for Managed vs Device memory cross-device copies
kekaczma Feb 23, 2026
8e5fa1a
[CUDA] Revert to not tracking USM Shared allocations
kekaczma Feb 23, 2026
0fd95d2
[CUDA] Simplify urEnqueueUSMMemcpy - remove all cross-device detection
kekaczma Feb 23, 2026
3aab64b
[CUDA] Fix unused parameter warning in urUSMSharedAlloc
kekaczma Feb 23, 2026
15ef1e0
[CUDA] Use cuMemPrefetchAsync for Managed Memory in urEnqueueUSMMemcpy
kekaczma Feb 23, 2026
21ff7a5
[CUDA] Use USM Device memory for multi-GPU tests instead of USM Shared
kekaczma Feb 23, 2026
c5a1646
[CUDA] Fix Managed Memory cross-device copy to work without P2P
kekaczma Feb 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions unified-runtime/source/adapters/cuda/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <atomic>
#include <mutex>
#include <set>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "adapter.hpp"
Expand Down Expand Up @@ -96,6 +98,10 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
umf_memory_provider_handle_t MemoryProviderHost = nullptr;
umf_memory_pool_handle_t MemoryPoolHost = nullptr;

// Track which device allocated each USM pointer for cross-device operations
std::unordered_map<const void *, ur_device_handle_t> AllocationMetadata;
mutable std::shared_mutex AllocationMetadataMutex;

ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
: handle_base(), Devices{Devs, Devs + NumDevices} {
// Create UMF CUDA memory provider for the host memory
Expand Down Expand Up @@ -147,6 +153,25 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {

ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);

// Register USM allocation metadata for cross-device operation tracking
void registerAllocation(const void *Ptr, ur_device_handle_t Device) {
  // Exclusive (writer) lock: inserting/updating mutates the map.
  std::lock_guard<std::shared_mutex> Guard(AllocationMetadataMutex);
  AllocationMetadata.insert_or_assign(Ptr, Device);
}

// Unregister USM allocation metadata
void unregisterAllocation(const void *Ptr) {
  // Exclusive (writer) lock: erasing mutates the map. Erasing a pointer
  // that was never registered is a harmless no-op.
  std::lock_guard<std::shared_mutex> Guard(AllocationMetadataMutex);
  AllocationMetadata.erase(Ptr);
}

// Query which device allocated a USM pointer
ur_device_handle_t getAllocationDevice(const void *Ptr) const {
  // Shared (reader) lock: lookup only, so concurrent readers may proceed.
  std::shared_lock<std::shared_mutex> Guard(AllocationMetadataMutex);
  // Returns nullptr for pointers that were never registered (e.g. host or
  // shared allocations, which are not tracked).
  if (auto Entry = AllocationMetadata.find(Ptr);
      Entry != AllocationMetadata.end()) {
    return Entry->second;
  }
  return nullptr;
}

private:
std::mutex Mutex;
std::vector<deleter_data> ExtendedDeleters;
Expand All @@ -165,19 +190,30 @@ class ScopedContext {

ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }

~ScopedContext() {}
~ScopedContext() {
  // Restore the original context if we changed it and there was a previous
  // context. If Original was nullptr, leave the current context active to
  // maintain compatibility with code that expects the context to remain set.
  // NOTE(review): the cuCtxSetCurrent return value is deliberately ignored —
  // destructors must not propagate errors.
  if (NeedToRestore && Original != nullptr) {
    cuCtxSetCurrent(Original);
  }
}

private:
// Make Desired the active CUDA context on this thread, remembering the
// previously active context so the destructor can restore it.
//
// Bug fix: the previous version declared a local `CUcontext Original`,
// shadowing the member of the same name. The member therefore stayed
// nullptr and the destructor's restore path could never trigger. The
// snapshot must be written into the *member* Original.
void setContext(CUcontext Desired) {
  UR_CHECK_ERROR(cuCtxGetCurrent(&Original));

  // Make sure the desired context is active on the current thread, setting
  // it if necessary; only flag a restore when we actually switched.
  if (Original != Desired) {
    UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
    NeedToRestore = true;
  } else {
    NeedToRestore = false;
  }
}

CUcontext Original = nullptr;
bool NeedToRestore = false;
};
} // namespace
45 changes: 43 additions & 2 deletions unified-runtime/source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(true);
case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
return ReturnValue(true);
case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP:
return ReturnValue(true);
case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: {
// P2P support requires compute capability >= 2.0
// Check if device supports Unified Virtual Addressing (UVA)
int Major = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));

// Compute capability 2.0+ supports UVA which is required for P2P
if (Major < 2) {
return ReturnValue(false);
}

// Check if device can actually access peers
// We need to check against other devices in the platform
int canAccessPeer = 0;
int deviceCount = 0;
UR_CHECK_ERROR(cuDeviceGetCount(&deviceCount));

// If there's only one device, P2P is not applicable
if (deviceCount < 2) {
return ReturnValue(false);
}

// Check if this device can access at least one other device
CUdevice currentDevice = hDevice->get();
bool hasP2PCapability = false;

for (int i = 0; i < deviceCount; ++i) {
CUdevice peerDevice;
UR_CHECK_ERROR(cuDeviceGet(&peerDevice, i));

if (peerDevice != currentDevice) {
UR_CHECK_ERROR(
cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peerDevice));
if (canAccessPeer) {
hasP2PCapability = true;
break;
}
}
}

return ReturnValue(hasP2PCapability);
}
case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP:
return ReturnValue(false);
case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP:
Expand Down
68 changes: 66 additions & 2 deletions unified-runtime/source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1580,8 +1580,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
hQueue, CuStream);
UR_CHECK_ERROR(EventPtr->start());
}
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));

// Check memory types and device ownership
CUmemorytype SrcType = CU_MEMORYTYPE_HOST;
CUmemorytype DstType = CU_MEMORYTYPE_HOST;
cuPointerGetAttribute(&SrcType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pSrc);
cuPointerGetAttribute(&DstType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr)pDst);

// Detect cross-device copy for Managed Memory
bool isManagedMemory =
(SrcType == CU_MEMORYTYPE_UNIFIED || DstType == CU_MEMORYTYPE_UNIFIED);

if (isManagedMemory) {
// For Managed Memory cross-device copies without P2P:
// CUDA driver automatically stages through CPU memory.
// We just need to ensure the queue's device can access both pointers.
// Prefetch both to CPU to enable staging, then let CUDA handle migration.

// Prefetch SRC to CPU (system memory) if it's Managed
if (SrcType == CU_MEMORYTYPE_UNIFIED) {
UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pSrc, size,
CU_DEVICE_CPU, CuStream));
}

// Prefetch DST to CPU if it's Managed
if (DstType == CU_MEMORYTYPE_UNIFIED) {
UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pDst, size,
CU_DEVICE_CPU, CuStream));
}

// Wait for prefetches to complete
UR_CHECK_ERROR(cuStreamSynchronize(CuStream));

// Now copy - CUDA will handle cross-device migration via CPU
UR_CHECK_ERROR(
cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));

} else {
// For Device memory: try to detect cross-device copy
int SrcDevice = -1;
int DstDevice = -1;

// Get device ordinals (ignore errors for host memory)
cuPointerGetAttribute(&SrcDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr)pSrc);
cuPointerGetAttribute(&DstDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr)pDst);

bool isCrossDevice =
(SrcDevice != -1 && DstDevice != -1 && SrcDevice != DstDevice);

if (isCrossDevice) {
// Cross-device Device memory copy
// This requires P2P or staging through host
// cuMemcpyAsync will handle this - with P2P it's direct,
// without P2P driver stages through host (slower)
UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream));
} else {
// Same device or host-device copy
UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
CuStream));
}
}

if (phEvent) {
UR_CHECK_ERROR(EventPtr->record());
}
Expand Down
25 changes: 22 additions & 3 deletions unified-runtime/source/adapters/cuda/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
/// USM: Implements USM device allocations using a normal CUDA device pointer
///
UR_APIEXPORT ur_result_t UR_APICALL
urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
size_t size, void **ppMem) {
auto alignment = pUSMDesc ? pUSMDesc->align : 0u;
Expand All @@ -65,15 +65,23 @@ urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
auto umfErr = umfPoolGetLastAllocationError(pool);
return umf::umf2urResult(umfErr);
}

// Register allocation with context for cross-device operation tracking
if (hContext && *ppMem && hDevice) {
hContext->registerAllocation(*ppMem, hDevice);
}

return UR_RESULT_SUCCESS;
}

/// USM: Implements USM Shared allocations using CUDA Managed Memory
///
UR_APIEXPORT ur_result_t UR_APICALL
urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
size_t size, void **ppMem) {
// hContext unused - Shared memory not tracked (CUDA handles migration)
(void)hContext;
auto alignment = pUSMDesc ? pUSMDesc->align : 0u;

ScopedContext SC(hDevice);
Expand All @@ -89,12 +97,23 @@ urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
auto umfErr = umfPoolGetLastAllocationError(pool);
return umf::umf2urResult(umfErr);
}

// Do NOT register Managed Memory allocations
// CUDA Unified Memory system handles migration automatically.
// Manual tracking interferes with automatic page migration.

return UR_RESULT_SUCCESS;
}

/// USM: Frees the given USM pointer associated with the context.
///
UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) {
UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
void *pMem) {
// Unregister allocation metadata before freeing
if (hContext && pMem) {
hContext->unregisterAllocation(pMem);
}

umf_memory_pool_handle_t hPool = NULL;
umf_result_t ret = umfPoolByPtr(pMem, &hPool);
if (ret == UMF_RESULT_SUCCESS) {
Expand Down
Loading
Loading