-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Offload] Implement olShutDown
#144055
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[Offload] Implement olShutDown
#144055
Conversation
@llvm/pr-subscribers-offload @llvm/pr-subscribers-backend-amdgpu Author: Ross Brunton (RossBrunton) Changes
The spec has been updated to remove references to This flagged up some memory issues, which have been fixed in #143873 . Please ignore the first three commits of this MR. Full diff: https://github.com/llvm/llvm-project/pull/144055.diff 7 Files Affected:
diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 7674da0438c29..e611dd9105251 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -152,8 +152,7 @@ def : Function {
let name = "olInit";
let desc = "Perform initialization of the Offload library and plugins";
let details = [
- "This must be the first API call made by a user of the Offload library",
- "Each call will increment an internal reference count that is decremented by `olShutDown`"
+ "This must be the first API call made by a user of the Offload library"
];
let params = [];
let returns = [];
@@ -163,8 +162,8 @@ def : Function {
let name = "olShutDown";
let desc = "Release the resources in use by Offload";
let details = [
- "This decrements an internal reference count. When this reaches 0, all resources will be released",
- "Subsequent API calls made after this are not valid"
+ "All resources owned by the Offload library and plugins will be released",
+ "Subsequent API calls made after calling `olShutDown` are undefined behavior"
];
let params = [];
let returns = [];
diff --git a/offload/liboffload/API/Program.td b/offload/liboffload/API/Program.td
index 8c88fe6e21e6a..0476fa1f7c27a 100644
--- a/offload/liboffload/API/Program.td
+++ b/offload/liboffload/API/Program.td
@@ -13,7 +13,9 @@
def : Function {
let name = "olCreateProgram";
let desc = "Create a program for the device from the binary image pointed to by `ProgData`.";
- let details = [];
+ let details = [
+ "The provided `ProgData` will be copied and need not outlive the returned handle",
+ ];
let params = [
Param<"ol_device_handle_t", "Device", "handle of the device", PARAM_IN>,
Param<"const void*", "ProgData", "pointer to the program binary data", PARAM_IN>,
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d2b331905ab77..ae2975e3ec3c5 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -168,15 +168,27 @@ void initPlugins() {
!std::getenv("OFFLOAD_DISABLE_VALIDATION");
}
-// TODO: We can properly reference count here and manage the resources in a more
-// clever way
Error olInit_impl() {
static std::once_flag InitFlag;
std::call_once(InitFlag, initPlugins);
return Error::success();
}
-Error olShutDown_impl() { return Error::success(); }
+
+Error olShutDown_impl() {
+ llvm::Error Result = Error::success();
+
+ for (auto &P : Platforms()) {
+ // Host plugin is nullptr and has no deinit
+ if (!P.Plugin)
+ continue;
+
+ if (auto Res = P.Plugin->deinit())
+ Result = llvm::joinErrors(std::move(Result), std::move(Res));
+ }
+
+ return Result;
+}
Error olGetPlatformInfoImplDetail(ol_platform_handle_t Platform,
ol_platform_info_t PropName, size_t PropSize,
@@ -465,6 +477,9 @@ Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
}
Error olDestroyProgram_impl(ol_program_handle_t Program) {
+ if (auto Err = Program->Image->getDevice().unloadBinary(Program->Image))
+ return Err;
+
return olDestroy(Program);
}
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..5f80a97d02cb6 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2023,6 +2023,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
+ Error unloadBinaryImpl(DeviceImageTy *Image) override {
+ AMDGPUDeviceImageTy &AMDImage = static_cast<AMDGPUDeviceImageTy &>(*Image);
+
+ // Unload the executable of the image.
+ return AMDImage.unloadExecutable();
+ }
+
/// Deinitialize the device and release its resources.
Error deinitImpl() override {
// Deinitialize the stream and event pools.
@@ -2035,19 +2042,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = AMDGPUSignalManager.deinit())
return Err;
- // Close modules if necessary.
- if (!LoadedImages.empty()) {
- // Each image has its own module.
- for (DeviceImageTy *Image : LoadedImages) {
- AMDGPUDeviceImageTy &AMDImage =
- static_cast<AMDGPUDeviceImageTy &>(*Image);
-
- // Unload the executable of the image.
- if (auto Err = AMDImage.unloadExecutable())
- return Err;
- }
- }
-
// Invalidate agent reference.
Agent = {0};
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..7af61074bb322 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -712,6 +712,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Expected<DeviceImageTy *>
loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
+ /// Unload a previously loaded Image from the device
+ Error unloadBinary(DeviceImageTy *Image);
+ virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
+
/// Setup the device environment if needed. Notice this setup may not be run
/// on some plugins. By default, it will be executed, but plugins can change
/// this behavior by overriding the shouldSetupDeviceEnvironment function.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9a6b3c1f4324..bb503c1ff7a54 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -821,26 +821,52 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
return Plugin::success();
}
-Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
- for (DeviceImageTy *Image : LoadedImages)
- if (auto Err = callGlobalDestructors(Plugin, *Image))
- return Err;
+Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
+ if (auto Err = callGlobalDestructors(Plugin, *Image))
+ return Err;
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
- for (auto *Image : LoadedImages) {
- DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
- GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
- sizeof(DeviceMemoryPoolTrackingTy),
- &ImageDeviceMemoryPoolTracking);
- if (auto Err =
- GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
- consumeError(std::move(Err));
- continue;
- }
- DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
+ DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
+ GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
+ sizeof(DeviceMemoryPoolTrackingTy),
+ &ImageDeviceMemoryPoolTracking);
+ if (auto Err =
+ GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
+ consumeError(std::move(Err));
}
+ DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
+ }
+
+ GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
+ auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
+ if (!ProfOrErr)
+ return ProfOrErr.takeError();
+
+ if (!ProfOrErr->empty()) {
+ // Dump out profdata
+ if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
+ uint32_t(DeviceDebugKind::PGODump))
+ ProfOrErr->dump();
+
+ // Write data to profiling file
+ if (auto Err = ProfOrErr->write())
+ return Err;
+ }
+
+ LoadedImages.erase(
+ std::find(LoadedImages.begin(), LoadedImages.end(), Image));
+ return unloadBinaryImpl(Image);
+}
+
+Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
+ while (!LoadedImages.empty()) {
+ if (auto Err = unloadBinary(LoadedImages.back()))
+ return Err;
+ }
+
+ if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
// TODO: Write this by default into a file.
printf("\n\n|-----------------------\n"
"| Device memory tracker:\n"
@@ -856,25 +882,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
DeviceMemoryPoolTracking.AllocationMax);
}
- for (auto *Image : LoadedImages) {
- GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
- auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
- if (!ProfOrErr)
- return ProfOrErr.takeError();
-
- if (ProfOrErr->empty())
- continue;
-
- // Dump out profdata
- if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
- uint32_t(DeviceDebugKind::PGODump))
- ProfOrErr->dump();
-
- // Write data to profiling file
- if (auto Err = ProfOrErr->write())
- return Err;
- }
-
// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..eb313d0a4f093 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -358,6 +358,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
+ Error unloadBinaryImpl(DeviceImageTy *Image) override {
+ assert(Context && "Invalid CUDA context");
+
+ // Each image has its own module.
+ CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
+
+ // Unload the module of the image.
+ if (auto Err = CUDAImage.unloadModule())
+ return Err;
+
+ return Plugin::success();
+ }
+
/// Deinitialize the device and release its resources.
Error deinitImpl() override {
if (Context) {
@@ -372,20 +385,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = CUDAEventManager.deinit())
return Err;
- // Close modules if necessary.
- if (!LoadedImages.empty()) {
- assert(Context && "Invalid CUDA context");
-
- // Each image has its own module.
- for (DeviceImageTy *Image : LoadedImages) {
- CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
-
- // Unload the module of the image.
- if (auto Err = CUDAImage.unloadModule())
- return Err;
- }
- }
-
if (Context) {
CUresult Res = cuDevicePrimaryCtxRelease(Device);
if (auto Err =
|
Initialization like this should always be reference counted. Otherwise the first thread to call |
@jhuber6 Ah, okay I'll add that then. What about calling |
Yes, see https://github.com/llvm/llvm-project/blob/main/offload/libomptarget/OffloadRTL.cpp for how OpenMP does it. It's tough to do this without mutexes because you need to prevent other threads from making progress until you're done initializing stuff. |
olShutDown
and remove global reference countingolShutDown
✅ With the latest revision this PR passed the C/C++ code formatter. |
// Host plugin is nullptr and has no deinit | ||
if (!P.Plugin) | ||
continue; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need to handle this more cleanly in the future.
if (auto Res = P.Plugin->deinit()) | ||
Result = llvm::joinErrors(std::move(Result), std::move(Res)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we have multiple plugins active this will potentially drop a previous error and hit an assertion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should collect all the errors from each plugin into a list. But I don't think that is actually handled correctly when returning it from the C api.
f555d49
to
ebf32b7
Compare
`olShutDown` was not properly calling deinit on the platforms, resulting in random segfaults on AMD devices.
if (!isOffloadInitialized()) | ||
InitResult = initPlugins(); | ||
|
||
OffloadContext::get().RefCount++; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't we be calling new and delete on this if it's null?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The context is new
'd in initPlugins()
, but perhaps it makes sense doing it in olInit_impl
, so I'll move it here.
@@ -197,8 +195,10 @@ Error olInit_impl() { | |||
std::lock_guard<std::mutex> Lock{OffloadContextValMutex}; | |||
|
|||
std::optional<Error> InitResult; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A std::optional<Error>
is weird here. The result of Error::success()
is supposed to be the optional result.
olShutDown
was not properly calling deinit on the platforms, resulting in random segfaults on AMD devices.As part of this,
olInit
andolShutDown
now alloc and free the platform list and alloc info list rather than it being static. This allowsolShutDown
to be called within a destructor of a static object (like the tests do) without having to worry about destructor ordering.This flagged up some memory issues, which have been fixed in #143873 . Please ignore the first three commits of this MR.Dependant MR has been merged.