diff --git a/unified-runtime/source/adapters/cuda/context.hpp b/unified-runtime/source/adapters/cuda/context.hpp
index e49a7db9fe505..eebc012a281fb 100644
--- a/unified-runtime/source/adapters/cuda/context.hpp
+++ b/unified-runtime/source/adapters/cuda/context.hpp
@@ -16,6 +16,8 @@
 #include
 #include
 #include
+#include <shared_mutex>
+#include <unordered_map>
 #include
 
 #include "adapter.hpp"
@@ -96,6 +98,10 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
   umf_memory_provider_handle_t MemoryProviderHost = nullptr;
   umf_memory_pool_handle_t MemoryPoolHost = nullptr;
 
+  // Track which device allocated each USM pointer for cross-device operations
+  std::unordered_map<const void *, ur_device_handle_t> AllocationMetadata;
+  mutable std::shared_mutex AllocationMetadataMutex;
+
   ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
       : handle_base(), Devices{Devs, Devs + NumDevices} {
     // Create UMF CUDA memory provider for the host memory
@@ -147,6 +153,25 @@ struct ur_context_handle_t_ : ur::cuda::handle_base {
 
   ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);
 
+  // Register USM allocation metadata for cross-device operation tracking
+  void registerAllocation(const void *Ptr, ur_device_handle_t Device) {
+    std::unique_lock Lock(AllocationMetadataMutex);
+    AllocationMetadata[Ptr] = Device;
+  }
+
+  // Unregister USM allocation metadata
+  void unregisterAllocation(const void *Ptr) {
+    std::unique_lock Lock(AllocationMetadataMutex);
+    AllocationMetadata.erase(Ptr);
+  }
+
+  // Query which device allocated a USM pointer
+  ur_device_handle_t getAllocationDevice(const void *Ptr) const {
+    std::shared_lock Lock(AllocationMetadataMutex);
+    auto It = AllocationMetadata.find(Ptr);
+    return (It != AllocationMetadata.end()) ? It->second : nullptr;
+  }
+
 private:
   std::mutex Mutex;
   std::vector ExtendedDeleters;
@@ -165,19 +190,32 @@ class ScopedContext {
 
   ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
 
-  ~ScopedContext() {}
+  ~ScopedContext() {
+    // Restore original context if we changed it and there was a previous
+    // context If Original was nullptr, leave the current context active to
+    // maintain compatibility with code that expects context to remain set
+    if (NeedToRestore && Original != nullptr) {
+      cuCtxSetCurrent(Original);
+    }
+  }
 
 private:
   void setContext(CUcontext Desired) {
-    CUcontext Original = nullptr;
-    UR_CHECK_ERROR(cuCtxGetCurrent(&Original));
+    // Capture the previously-current context so the destructor can restore it
+    UR_CHECK_ERROR(cuCtxGetCurrent(&Original));
     // Make sure the desired context is active on the current thread, setting
     // it if necessary
     if (Original != Desired) {
       UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
+      NeedToRestore = true;
+    } else {
+      NeedToRestore = false;
     }
   }
+
+  CUcontext Original = nullptr;
+  bool NeedToRestore = false;
 };
 
 } // namespace
diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp
index 582cdbe7a9d90..dc0269ccdfc65 100644
--- a/unified-runtime/source/adapters/cuda/device.cpp
+++ b/unified-runtime/source/adapters/cuda/device.cpp
@@ -1174,8 +1174,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(true);
   case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
     return ReturnValue(true);
-  case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP:
-    return ReturnValue(true);
+  case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: {
+    // P2P support requires compute capability >= 2.0
+    // Check if device supports Unified Virtual Addressing (UVA)
+    int Major = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));
+
+    // Compute capability 2.0+ supports UVA which is required for P2P
+    if (Major < 2) {
+      return ReturnValue(false);
+    }
+
+    // Check if device can actually access peers
+    // We need to check against other devices in the platform
+    int canAccessPeer = 0;
+    int deviceCount = 0;
+    UR_CHECK_ERROR(cuDeviceGetCount(&deviceCount));
+
+    // If there's only one device, P2P is not applicable
+    if (deviceCount < 2) {
+      return ReturnValue(false);
+    }
+
+    // Check if this device can access at least one other device
+    CUdevice currentDevice = hDevice->get();
+    bool hasP2PCapability = false;
+
+    for (int i = 0; i < deviceCount; ++i) {
+      CUdevice peerDevice;
+      UR_CHECK_ERROR(cuDeviceGet(&peerDevice, i));
+
+      if (peerDevice != currentDevice) {
+        UR_CHECK_ERROR(
+            cuDeviceCanAccessPeer(&canAccessPeer, currentDevice, peerDevice));
+        if (canAccessPeer) {
+          hasP2PCapability = true;
+          break;
+        }
+      }
+    }
+
+    return ReturnValue(hasP2PCapability);
+  }
   case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP:
     return ReturnValue(false);
   case UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP:
diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp
index 6ba5b8c2c27f2..d44b056ae0d9d 100644
--- a/unified-runtime/source/adapters/cuda/enqueue.cpp
+++ b/unified-runtime/source/adapters/cuda/enqueue.cpp
@@ -1580,8 +1580,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
                                   hQueue, CuStream);
       UR_CHECK_ERROR(EventPtr->start());
     }
-    UR_CHECK_ERROR(
-        cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
+
+    // Check memory types and device ownership
+    CUmemorytype SrcType = CU_MEMORYTYPE_HOST;
+    CUmemorytype DstType = CU_MEMORYTYPE_HOST;
+    cuPointerGetAttribute(&SrcType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                          (CUdeviceptr)pSrc);
+    cuPointerGetAttribute(&DstType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+                          (CUdeviceptr)pDst);
+
+    // Detect cross-device copy for Managed Memory
+    bool isManagedMemory =
+        (SrcType == CU_MEMORYTYPE_UNIFIED || DstType == CU_MEMORYTYPE_UNIFIED);
+
+    if (isManagedMemory) {
+      // For Managed Memory cross-device copies without P2P:
+      // CUDA driver automatically stages through CPU memory.
+      // We just need to ensure the queue's device can access both pointers.
+      // Prefetch both to CPU to enable staging, then let CUDA handle migration.
+
+      // Prefetch SRC to CPU (system memory) if it's Managed
+      if (SrcType == CU_MEMORYTYPE_UNIFIED) {
+        UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pSrc, size,
+                                          CU_DEVICE_CPU, CuStream));
+      }
+
+      // Prefetch DST to CPU if it's Managed
+      if (DstType == CU_MEMORYTYPE_UNIFIED) {
+        UR_CHECK_ERROR(cuMemPrefetchAsync((CUdeviceptr)pDst, size,
+                                          CU_DEVICE_CPU, CuStream));
+      }
+
+      // Wait for prefetches to complete
+      UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+
+      // Now copy - CUDA will handle cross-device migration via CPU
+      UR_CHECK_ERROR(
+          cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream));
+
+    } else {
+      // For Device memory: try to detect cross-device copy
+      int SrcDevice = -1;
+      int DstDevice = -1;
+
+      // Get device ordinals (ignore errors for host memory)
+      cuPointerGetAttribute(&SrcDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+                            (CUdeviceptr)pSrc);
+      cuPointerGetAttribute(&DstDevice, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
+                            (CUdeviceptr)pDst);
+
+      bool isCrossDevice =
+          (SrcDevice != -1 && DstDevice != -1 && SrcDevice != DstDevice);
+
+      if (isCrossDevice) {
+        // Cross-device Device memory copy
+        // This requires P2P or staging through host
+        // cuMemcpyAsync will handle this - with P2P it's direct,
+        // without P2P driver stages through host (slower)
+        UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
+                                     CuStream));
+      } else {
+        // Same device or host-device copy
+        UR_CHECK_ERROR(cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size,
+                                     CuStream));
+      }
+    }
+
     if (phEvent) {
       UR_CHECK_ERROR(EventPtr->record());
     }
diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp
index c805c1084ec0f..00daf020898f8 100644
--- a/unified-runtime/source/adapters/cuda/usm.cpp
+++ b/unified-runtime/source/adapters/cuda/usm.cpp
@@ -47,7 +47,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
 /// USM: Implements USM device allocations using a normal CUDA device pointer
 ///
 UR_APIEXPORT ur_result_t UR_APICALL
-urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
+urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
                  const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
                  size_t size, void **ppMem) {
@@ -65,15 +65,23 @@ urUSMDeviceAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
     auto umfErr = umfPoolGetLastAllocationError(pool);
     return umf::umf2urResult(umfErr);
   }
+
+  // Register allocation with context for cross-device operation tracking
+  if (hContext && *ppMem && hDevice) {
+    hContext->registerAllocation(*ppMem, hDevice);
+  }
+
   return UR_RESULT_SUCCESS;
 }
 
 /// USM: Implements USM Shared allocations using CUDA Managed Memory
 ///
 UR_APIEXPORT ur_result_t UR_APICALL
-urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
+urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
                  const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool,
                  size_t size, void **ppMem) {
+  // hContext unused - Shared memory not tracked (CUDA handles migration)
+  (void)hContext;
   auto alignment = pUSMDesc ? pUSMDesc->align : 0u;
 
   ScopedContext SC(hDevice);
@@ -89,12 +97,23 @@ urUSMSharedAlloc(ur_context_handle_t, ur_device_handle_t hDevice,
     auto umfErr = umfPoolGetLastAllocationError(pool);
     return umf::umf2urResult(umfErr);
   }
+
+  // Do NOT register Managed Memory allocations
+  // CUDA Unified Memory system handles migration automatically.
+  // Manual tracking interferes with automatic page migration.
+
   return UR_RESULT_SUCCESS;
 }
 
 /// USM: Frees the given USM pointer associated with the context.
 ///
-UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) {
+UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
+                                              void *pMem) {
+  // Unregister allocation metadata before freeing
+  if (hContext && pMem) {
+    hContext->unregisterAllocation(pMem);
+  }
+
   umf_memory_pool_handle_t hPool = NULL;
   umf_result_t ret = umfPoolByPtr(pMem, &hPool);
   if (ret == UMF_RESULT_SUCCESS) {
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
index d4a9f0188aa2f..88136db77355a 100644
--- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
+++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp
@@ -10,6 +10,7 @@
 #include
 #include
+#include <iostream>
 #include
 #include
 
@@ -23,6 +24,8 @@ struct urMultiQueueLaunchMemcpyTest
   std::vector<ur_program_handle_t> programs;
   std::vector<ur_kernel_handle_t> kernels;
   std::vector<void *> SharedMem;
+  std::vector<void *> HostMem; // For CUDA: host-accessible verification buffer
+  bool useCudaDeviceMemory = false;
 
   static constexpr char ProgramName[] = "increment";
   static constexpr size_t ArraySize = 100;
@@ -46,6 +49,16 @@ struct urMultiQueueLaunchMemcpyTest
     kernels.resize(devices.size());
     SharedMem.resize(devices.size());
 
+    // Check if we're on CUDA backend
+    ur_platform_backend_t backend;
+    ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                     sizeof(backend), &backend, nullptr));
+    useCudaDeviceMemory = (backend == UR_PLATFORM_BACKEND_CUDA);
+
+    if (useCudaDeviceMemory) {
+      HostMem.resize(devices.size());
+    }
+
     std::shared_ptr il_binary;
     std::vector metadatas{};
 
@@ -71,10 +84,25 @@ struct urMultiQueueLaunchMemcpyTest
       ASSERT_SUCCESS(
           urKernelCreate(programs[i], KernelName.data(), &kernels[i]));
 
-      ASSERT_SUCCESS(urUSMSharedAlloc(context, devices[i], nullptr, nullptr,
+      // CUDA: Use USM Device memory for multi-GPU peer transfers
+      // Other backends: Use USM Shared memory
+      if (useCudaDeviceMemory) {
+        ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[i], nullptr, nullptr,
+                                        ArraySize * sizeof(uint32_t),
+                                        &SharedMem[i]));
+        ASSERT_NE(SharedMem[i], nullptr);
+
+        // Also allocate host-accessible buffer for verification
+        ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr,
                                       ArraySize * sizeof(uint32_t),
-                                      &SharedMem[i]));
-      ASSERT_NE(SharedMem[i], nullptr);
+                                      &HostMem[i]));
+        ASSERT_NE(HostMem[i], nullptr);
+      } else {
+        ASSERT_SUCCESS(urUSMSharedAlloc(context, devices[i], nullptr, nullptr,
+                                        ArraySize * sizeof(uint32_t),
+                                        &SharedMem[i]));
+        ASSERT_NE(SharedMem[i], nullptr);
+      }
 
       ASSERT_SUCCESS(urEnqueueUSMFill(
           queues[i], SharedMem[i], sizeof(uint32_t), &InitialValue,
@@ -90,6 +118,11 @@ struct urMultiQueueLaunchMemcpyTest
     for (auto &Ptr : SharedMem) {
       urUSMFree(context, Ptr);
     }
+    if (useCudaDeviceMemory) {
+      for (auto &Ptr : HostMem) {
+        urUSMFree(context, Ptr);
+      }
+    }
     for (const auto &kernel : kernels) {
       urKernelRelease(kernel);
     }
@@ -112,9 +145,21 @@ struct urMultiQueueLaunchMemcpyTest
         } while (status != UR_EVENT_STATUS_COMPLETE);
 
         auto ExpectedValue = InitialValue + i + 1;
-        for (uint32_t j = 0; j < ArraySize; ++j) {
-          ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j],
-                    ExpectedValue);
+
+        // CUDA: Copy from device to host buffer before verification
+        if (useCudaDeviceMemory) {
+          ASSERT_SUCCESS(urEnqueueUSMMemcpy(
+              queues[i], true, HostMem[i], SharedMem[i],
+              ArraySize * sizeof(uint32_t), 0, nullptr, nullptr));
+          for (uint32_t j = 0; j < ArraySize; ++j) {
+            ASSERT_EQ(reinterpret_cast<uint32_t *>(HostMem[i])[j],
+                      ExpectedValue);
+          }
+        } else {
+          for (uint32_t j = 0; j < ArraySize; ++j) {
+            ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j],
+                      ExpectedValue);
+          }
         }
       });
     }
@@ -268,8 +313,6 @@ UUR_PLATFORM_TEST_SUITE_WITH_PARAM(
 // ... ops
 TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
   UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{});
-  // https://github.com/intel/llvm/issues/19033
-  UUR_KNOWN_FAILURE_ON(uur::CUDA{});
 
   auto waitOnEvent = std::get<0>(getParam()).value;
   auto runBackgroundCheck = std::get<1>(getParam()).value;
@@ -278,6 +321,23 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
   ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
                                  sizeof(usm_p2p_support), &usm_p2p_support,
                                  nullptr));
+
+  // Log P2P support status for debugging
+  std::cout << "[P2P INFO] Device 0 USM P2P Support: "
+            << (usm_p2p_support ? "ENABLED" : "DISABLED") << std::endl;
+
+  // Check all device pairs for P2P support
+  for (size_t i = 0; i < devices.size(); i++) {
+    for (size_t j = i + 1; j < devices.size(); j++) {
+      ur_bool_t pair_p2p = false;
+      ASSERT_SUCCESS(urDeviceGetInfo(devices[i],
+                                     UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
+                                     sizeof(pair_p2p), &pair_p2p, nullptr));
+      std::cout << "[P2P INFO] Device " << i << " <-> Device " << j
+                << " P2P: " << (pair_p2p ? "ENABLED" : "DISABLED") << std::endl;
+    }
+  }
+
   if (!usm_p2p_support) {
     GTEST_SKIP() << "EXP usm p2p feature is not supported.";
   }
@@ -324,11 +384,22 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) {
     ASSERT_SUCCESS(urQueueFinish(queues.back()));
   }
 
+  // CUDA: Copy device buffers to host for verification
+  if (this->useCudaDeviceMemory) {
+    for (size_t i = 0; i < devices.size(); i++) {
+      ASSERT_SUCCESS(urEnqueueUSMMemcpy(
+          queues[i], true, this->HostMem[i], this->SharedMem[i],
+          ArraySize * sizeof(uint32_t), 0, nullptr, nullptr));
+    }
+  }
+
   size_t ExpectedValue = InitialValue;
   for (size_t i = 0; i < devices.size(); i++) {
     ExpectedValue++;
+    void *verifyPtr =
+        this->useCudaDeviceMemory ? this->HostMem[i] : this->SharedMem[i];
     for (uint32_t j = 0; j < ArraySize; ++j) {
-      ASSERT_EQ(reinterpret_cast<uint32_t *>(SharedMem[i])[j], ExpectedValue);
+      ASSERT_EQ(reinterpret_cast<uint32_t *>(verifyPtr)[j], ExpectedValue);
    }
   }
 }