Skip to content

Commit eb4568d

Browse files
committed
Enable/disable P2P per test instance to avoid race conditions
Changes: - Track enabled P2P pairs in member variable enabledP2PPairs - SetUp: Only record pairs WE successfully enabled (both SUCCESS) - TearDown: Disable P2P bidirectionally for our pairs, ignore errors - Removes global P2P state dependency between test instances Works for both: - 2 physical GPUs duplicated 4× (8 logical devices) - 8 distinct physical GPUs Fixes #19033
1 parent 9f6512e commit eb4568d

1 file changed

Lines changed: 57 additions & 26 deletions

File tree

unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ struct urMultiQueueLaunchMemcpyTest
2323
std::vector<ur_program_handle_t> programs;
2424
std::vector<ur_kernel_handle_t> kernels;
2525
std::vector<void *> SharedMem;
26+
27+
// Track P2P pairs that we successfully enabled in SetUp, so we can
28+
// disable them in TearDown without interfering with other tests
29+
std::set<std::pair<ur_device_handle_t, ur_device_handle_t>> enabledP2PPairs;
2630

2731
static constexpr char ProgramName[] = "increment";
2832
static constexpr size_t ArraySize = 100;
@@ -49,29 +53,49 @@ struct urMultiQueueLaunchMemcpyTest
4953
urDeviceGetInfo(devices[0], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
5054
sizeof(usm_p2p_support), &usm_p2p_support, nullptr));
5155
if (usm_p2p_support) {
56+
// Track unique physical device pairs to avoid enabling P2P multiple times.
57+
// Important: When devices are duplicated (same physical GPU appears multiple
58+
// times in the devices vector), we must only enable P2P once per unique
59+
// physical GPU pair to avoid CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED.
60+
// We store successfully enabled pairs in member variable so we can disable
61+
// them in TearDown, ensuring clean state for subsequent tests.
62+
5263
for (size_t i = 0; i < devices.size(); i++) {
53-
for (size_t j = 0; j < devices.size(); j++) {
54-
// Skip if same device (either by index or handle)
55-
if (i != j && devices[i] != devices[j]) {
56-
// First check if P2P is actually supported between this pair
57-
int32_t p2p_access_supported = 0;
58-
auto query_result = urUsmP2PPeerAccessGetInfoExp(
59-
devices[i], devices[j], UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED,
60-
sizeof(p2p_access_supported), &p2p_access_supported, nullptr);
61-
62-
if (query_result != UR_RESULT_SUCCESS || !p2p_access_supported) {
63-
// P2P not supported between this pair, skip the test
64-
GTEST_SKIP() << "P2P access not supported between device pair ("
65-
<< i << ", " << j << ")";
66-
}
67-
68-
// Enable peer access from device[i] to device[j]
69-
// Ignore errors - P2P may already be enabled by another test
70-
auto result = urUsmP2PEnablePeerAccessExp(devices[i], devices[j]);
71-
if (result != UR_RESULT_SUCCESS &&
72-
result != UR_RESULT_ERROR_INVALID_OPERATION) {
73-
ASSERT_SUCCESS(result);
74-
}
64+
for (size_t j = i + 1; j < devices.size(); j++) {
65+
// Skip if same physical device (duplicate handles)
66+
if (devices[i] == devices[j]) {
67+
continue;
68+
}
69+
70+
// Create canonical pair (always smaller handle first) to track unique pairs
71+
auto pair = devices[i] < devices[j]
72+
? std::make_pair(devices[i], devices[j])
73+
: std::make_pair(devices[j], devices[i]);
74+
75+
// Skip if we already processed this physical device pair
76+
if (enabledP2PPairs.count(pair)) {
77+
continue;
78+
}
79+
80+
// First check if P2P is actually supported between this physical pair
81+
int32_t p2p_access_supported = 0;
82+
auto query_result = urUsmP2PPeerAccessGetInfoExp(
83+
devices[i], devices[j], UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED,
84+
sizeof(p2p_access_supported), &p2p_access_supported, nullptr);
85+
86+
if (query_result != UR_RESULT_SUCCESS || !p2p_access_supported) {
87+
GTEST_SKIP() << "P2P access not supported between device pair ("
88+
<< i << ", " << j << ")";
89+
}
90+
91+
// Enable bidirectional peer access (both directions needed for memcpy)
92+
auto result_ij = urUsmP2PEnablePeerAccessExp(devices[i], devices[j]);
93+
auto result_ji = urUsmP2PEnablePeerAccessExp(devices[j], devices[i]);
94+
95+
// Only track pairs that WE successfully enabled (not already-enabled ones)
96+
// This way TearDown only disables what we enabled, avoiding conflicts
97+
if (result_ij == UR_RESULT_SUCCESS && result_ji == UR_RESULT_SUCCESS) {
98+
enabledP2PPairs.insert(pair);
7599
}
76100
}
77101
}
@@ -133,10 +157,17 @@ struct urMultiQueueLaunchMemcpyTest
133157
urProgramRelease(program);
134158
}
135159

136-
// NOTE: We intentionally do NOT disable P2P access here.
137-
// P2P state is per-context and shared across all test instances.
138-
// Disabling P2P in one test's TearDown would break other concurrent tests.
139-
// P2P can safely remain enabled for the lifetime of the process.
160+
// Disable P2P access for pairs that WE enabled in SetUp.
161+
// This ensures clean state for subsequent tests and prevents race conditions
162+
// where multiple test instances try to enable the same P2P pairs.
163+
// We only disable pairs we successfully enabled (tracked in enabledP2PPairs).
164+
for (const auto &pair : enabledP2PPairs) {
165+
// Disable bidirectionally (both directions)
166+
// Ignore errors - pair might already be disabled by another test
167+
urUsmP2PDisablePeerAccessExp(pair.first, pair.second);
168+
urUsmP2PDisablePeerAccessExp(pair.second, pair.first);
169+
}
170+
enabledP2PPairs.clear();
140171

141172
UUR_RETURN_ON_FATAL_FAILURE(
142173
uur::urMultiQueueMultiDeviceTestWithParam<minDevices, T>::TearDown());

0 commit comments

Comments
 (0)