PR #16921: [PJRT:GPU] Treat GPU collective memory space as device memory space

Imported from GitHub PR #16921

This fixes a regression when running with --xla_gpu_enable_nccl_user_buffers=true.
Return the device memory space when the collective memory space is used for an output on GPU.
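
For orientation, here is a minimal C++ sketch (not the actual XLA code) of the mapping this fix establishes. The constants are stand-ins assumed to mirror the memory-space values in xla/layout.h; the real change simply adds the kGenericFastMemorySpace case to two existing switches in pjrt_stream_executor_client.cc, as the diff below shows.

#include <cstdint>
#include <string>

// Stand-ins for xla::Layout's memory-space constants (values assumed to
// mirror xla/layout.h; the new test below checks that the collective
// output's memory space is 1).
constexpr int64_t kDefaultMemorySpace = 0;      // ordinary device memory
constexpr int64_t kGenericFastMemorySpace = 1;  // collective (NCCL user-buffer) memory
constexpr int64_t kHostMemorySpace = 5;         // pinned host memory

// After this fix, an output annotated with the collective memory space is
// reported with the default device memory kind rather than being rejected.
std::string MemoryKindForOutput(int64_t memory_space) {
  switch (memory_space) {
    case kHostMemorySpace:
      return "pinned_host";
    case kGenericFastMemorySpace:  // new: treated as device memory
    case kDefaultMemorySpace:
      return "device";
    default:
      return "unsupported memory space";
  }
}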
Copybara import of the project:

--
8113e6f by Jane Liu <[email protected]>:

Treat collective memory space as device memory space when used as an output

--
b5e43d6 by Jane Liu <[email protected]>:

fix the test

Merging this change closes #16921

FUTURE_COPYBARA_INTEGRATE_REVIEW=#16921 from zhenying-liu:nccl-buffer-output b5e43d6
PiperOrigin-RevId: 672618973
zhenying-liu authored and Google-ML-Automation committed Sep 12, 2024
1 parent dfe1fa1 commit 46ea4e8
Showing 4 changed files with 64 additions and 5 deletions.
1 change: 0 additions & 1 deletion xla/pjrt/gpu/BUILD
@@ -277,7 +277,6 @@ cc_library(
]) + if_cuda([
"@local_config_cuda//cuda:cuda_headers",
"//xla/stream_executor/cuda:cuda_platform_id",
"//xla/stream_executor/cuda:cuda_activation_header",
"//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
"//xla/service/gpu:nvptx_compiler_impl",
]) + if_rocm([
62 changes: 62 additions & 0 deletions xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -1131,6 +1131,24 @@ constexpr char const* kD2HProgramTupleOutput = R"(
}
)";

constexpr char const* kCollectiveMemorySpaceOutput = R"(
HloModule jit__psum, entry_computation_layout={(s32[1,4]{1,0})->s32[4]{0}}
region_0.3 {
Arg_0.0 = s32[] parameter(0)
Arg_1.0 = s32[] parameter(1)
ROOT add.0 = s32[] add(Arg_0.0, Arg_1.0)
}
ENTRY main.10_spmd {
param = s32[1,4]{1,0} parameter(0)
reshape = s32[4]{0} reshape(param)
ROOT all-reduce = s32[4]{0} all-reduce(reshape), channel_id=1, to_apply=region_0.3
}
)";

} // namespace

TEST(StreamExecutorGpuClientTest, ExecutePinnedHostOutputTest) {
@@ -1197,6 +1215,50 @@ TEST(StreamExecutorGpuClientTest, ExecutablePinnedHostOutputMemoryKindTest) {
EXPECT_EQ(memory_kinds[0][0], "pinned_host");
}

// Verify that the output memory kind is device memory when the output shape
// uses the collective memory space and NCCL user buffers are enabled.
TEST(StreamExecutorGpuClientTest,
ExecutableCollectiveMemoryOutputMemoryKindTest) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
GetStreamExecutorGpuClient(GpuClientOptions()));
xla::CompileOptions options;
options.executable_build_options.mutable_debug_options()
->set_xla_gpu_enable_nccl_user_buffers(true);

TF_ASSERT_OK_AND_ASSIGN(
auto executable,
CompileExecutable(kCollectiveMemorySpaceOutput, *client, options));
std::vector<int32_t> data{1, 2, 3, 4};
// Build the input shape with the correct memory space set.
Shape shape = ShapeUtil::MakeShapeWithDenseLayout(S32, {1, 4},
/*major_to_minor=*/{1, 0});
shape.mutable_layout()->set_memory_space(Layout::kDefaultMemorySpace);

auto device = client->addressable_devices()[0];
TF_EXPECT_OK(device->default_memory_space());
TF_ASSERT_OK_AND_ASSIGN(
auto input, client->BufferFromHostBuffer(
data.data(), shape.element_type(), shape.dimensions(),
/*byte_strides=*/std::nullopt,
PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall,
/*on_done_with_host_buffer=*/nullptr, device));
EXPECT_EQ(input->memory_space()->kind(), "device");

TF_ASSERT_OK_AND_ASSIGN(auto memory_kinds,
executable->GetOutputMemoryKinds());
EXPECT_EQ(memory_kinds.size(), 1);
EXPECT_EQ(memory_kinds[0].size(), 1);
EXPECT_EQ(memory_kinds[0][0], "device");

TF_ASSERT_OK_AND_ASSIGN(
auto result, executable->Execute({{input.get()}}, ExecuteOptions()));
std::vector<std::unique_ptr<xla::PjRtBuffer>>& result_buffers = result[0];
EXPECT_EQ(result_buffers[0]->memory_space()->kind(), "device");
Shape result_shape = result_buffers[0]->on_device_shape();
auto memory_space = result_shape.layout().memory_space();
EXPECT_EQ(memory_space, 1);  // 1 == Layout::kGenericFastMemorySpace
}

TEST(StreamExecutorGpuClientTest,
ExecutablePinnedHostTupleOutputMemoryKindTest) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
2 changes: 2 additions & 0 deletions xla/pjrt/pjrt_stream_executor_client.cc
@@ -2286,6 +2286,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> OutputBufferHelper(
device->default_memory_space().value_or(nullptr);
if (shape.has_layout()) {
switch (shape.layout().memory_space()) {
case Layout::kGenericFastMemorySpace:
case Layout::kDefaultMemorySpace:
// Nothing to do, we have already set the default memory space.
break;
@@ -3322,6 +3323,7 @@ absl::StatusOr<absl::string_view> MemoryKindFromSimpleShape(
switch (shape.layout().memory_space()) {
case Layout::kHostMemorySpace:
return PinnedHostMemorySpace::kKind;
case Layout::kGenericFastMemorySpace:
case Layout::kDefaultMemorySpace:
return default_memory_kind;
default:
4 changes: 0 additions & 4 deletions xla/stream_executor/cuda/BUILD
@@ -93,7 +93,6 @@ cuda_only_cc_library(
visibility = ["//visibility:public"],
deps =
[
":cuda_activation",
":cuda_collectives",
":cuda_driver",
":cuda_executor",
@@ -325,7 +324,6 @@ cuda_only_cc_library(
],
visibility = ["//visibility:public"],
deps = [
":cuda_activation",
":cuda_blas_utils",
":cuda_executor",
":cuda_helpers",
@@ -425,7 +423,6 @@ cc_library(
name = "cuda_dnn_headers",
textual_hdrs = ["cuda_dnn.h"],
deps = if_cuda_is_configured([
":cuda_activation_header",
"//xla/stream_executor:dnn",
"//xla/stream_executor:plugin_registry",
]) + [
@@ -466,7 +463,6 @@ cuda_only_cc_library(
copts = tf_additional_cudnn_plugin_copts(),
visibility = ["//visibility:public"],
deps = [
":cuda_activation",
":cuda_diagnostics",
":cuda_driver",
":cuda_executor",
