Commit edf67c0

Merge branch 'lwawrzyniak/deferred-unload' into 'main'

Fix graph capture errors caused by module unloading

Closes GH-401
See merge request omniverse/warp!932

2 parents: 4b68286 + e79ed07

File tree: 4 files changed, +99 -3 lines


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -25,6 +25,7 @@
 - Fix for occasional failure to update .meta files into Warp kernel cache on Windows
 - Mark kernel arrays as written to when passed to `wp.atomic_add()` or `wp.atomic_sub()`
 - Fix the OpenGL renderer not being able to run without CUDA ([GH-344](https://github.com/NVIDIA/warp/issues/344)).
+- Fix errors during graph capture caused by module unloading ([GH-401](https://github.com/NVIDIA/warp/issues/401)).
 
 ## [1.5.0] - 2024-12-02
```

warp/native/warp.cu

Lines changed: 54 additions & 3 deletions
```diff
@@ -187,6 +187,13 @@ struct FreeInfo
     bool is_async = false;
 };
 
+// Information used when deferring module unloading.
+struct ModuleInfo
+{
+    void* context = NULL;
+    void* module = NULL;
+};
+
 static std::unordered_map<CUfunction, std::string> g_kernel_names;
 
 // cached info for all devices, indexed by ordinal
@@ -214,6 +221,9 @@ static std::unordered_map<void*, GraphAllocInfo> g_graph_allocs;
 // Call free_deferred_allocs() to release.
 static std::vector<FreeInfo> g_deferred_free_list;
 
+// Modules that cannot be unloaded immediately get queued here.
+// Call unload_deferred_modules() to release.
+static std::vector<ModuleInfo> g_deferred_module_list;
 
 void cuda_set_context_restore_policy(bool always_restore)
 {
@@ -410,6 +420,31 @@ static int free_deferred_allocs(void* context = NULL)
     return num_freed_allocs;
 }
 
+static int unload_deferred_modules(void* context = NULL)
+{
+    if (g_deferred_module_list.empty() || !g_captures.empty())
+        return 0;
+
+    int num_unloaded_modules = 0;
+    for (auto it = g_deferred_module_list.begin(); it != g_deferred_module_list.end(); /*noop*/)
+    {
+        // free the module if it matches the given context or if the context is unspecified
+        const ModuleInfo& module_info = *it;
+        if (module_info.context == context || !context)
+        {
+            cuda_unload_module(module_info.context, module_info.module);
+            ++num_unloaded_modules;
+            it = g_deferred_module_list.erase(it);
+        }
+        else
+        {
+            ++it;
+        }
+    }
+
+    return num_unloaded_modules;
+}
+
 static void CUDART_CB on_graph_destroy(void* user_data)
 {
     if (!user_data)
@@ -1920,6 +1955,8 @@ void cuda_context_synchronize(void* context)
         check_cu(cuCtxSynchronize_f());
     }
 
+    unload_deferred_modules(context);
+
     // check_cuda(cudaDeviceGraphMemTrim(cuda_context_get_device_ordinal(context)));
 }
 
@@ -2542,7 +2579,10 @@ bool cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
 
     // process deferred free list if no more captures are ongoing
     if (g_captures.empty())
+    {
         free_deferred_allocs();
+        unload_deferred_modules();
+    }
 
     if (graph_ret)
         *graph_ret = graph_exec;
@@ -3104,9 +3144,20 @@ void* cuda_load_module(void* context, const char* path)
 
 void cuda_unload_module(void* context, void* module)
 {
-    ContextGuard guard(context);
-
-    check_cu(cuModuleUnload_f((CUmodule)module));
+    // ensure there are no graph captures in progress
+    if (g_captures.empty())
+    {
+        ContextGuard guard(context);
+        check_cu(cuModuleUnload_f((CUmodule)module));
+    }
+    else
+    {
+        // defer until graph capture completes
+        ModuleInfo module_info;
+        module_info.context = context ? context : get_current_context();
+        module_info.module = module;
+        g_deferred_module_list.push_back(module_info);
+    }
 }
```

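Taken together, the warp.cu changes implement a defer-until-safe pattern: `cuda_unload_module()` queues the request while any graph capture is active, and the queue is flushed from `cuda_context_synchronize()` and `cuda_graph_end_capture()` once no captures remain. The sketch below transliterates that bookkeeping into plain Python to make the flow easy to follow; it is not Warp's implementation, and `do_unload()` is a hypothetical stand-in for the `ContextGuard` + `cuModuleUnload` calls above.

```python
# Minimal Python sketch of the deferral bookkeeping in warp.cu (not Warp's
# actual implementation; do_unload() is a hypothetical stand-in for the
# ContextGuard + cuModuleUnload driver calls).

deferred_modules = []    # queued (context, module) pairs, like g_deferred_module_list
active_captures = set()  # ongoing graph captures, like g_captures


def do_unload(context, module):
    # Stand-in for the real driver-level unload.
    print(f"unloading {module} on {context}")


def unload_module(context, module):
    # Mirrors cuda_unload_module(): unload immediately only when no capture
    # is in progress, otherwise queue the request for later.
    if active_captures:
        deferred_modules.append((context, module))
    else:
        do_unload(context, module)


def unload_deferred_modules(context=None):
    # Mirrors unload_deferred_modules(): flush queued unloads once captures
    # are done. A context of None matches every entry, like the NULL default
    # in the C++ version.
    if active_captures:
        return 0
    remaining, unloaded = [], 0
    for ctx, module in deferred_modules:
        if context is None or ctx == context:
            do_unload(ctx, module)
            unloaded += 1
        else:
            remaining.append((ctx, module))
    deferred_modules[:] = remaining
    return unloaded
```

The key invariant is that a CUDA module backing kernels recorded into an open capture stays alive until the capture completes; both flush points re-check the set of active captures before touching the queue.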
warp/tests/aux_test_module_unload.py

Lines changed: 15 additions & 0 deletions

```diff
@@ -0,0 +1,15 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+"""Dummy module used in test_reload.py"""
+
+import warp as wp
+
+
+@wp.kernel
+def k():
+    pass
```
warp/tests/test_reload.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -241,6 +241,32 @@ def foo(a: wp.array(dtype=int)):
     test.assertEqual(a.numpy()[0], 42)
 
 
+def test_module_unload_during_graph_capture(test, device):
+    @wp.kernel
+    def foo(a: wp.array(dtype=int)):
+        a[0] = 42
+
+    # preload module before graph capture
+    wp.load_module(device=device)
+
+    # load another module to test unloading during graph capture
+    other_module = wp.get_module("warp.tests.aux_test_module_unload")
+    other_module.load(device)
+
+    with wp.ScopedDevice(device):
+        a = wp.zeros(1, dtype=int)
+
+        with wp.ScopedCapture(force_module_load=False) as capture:
+            wp.launch(foo, dim=1, inputs=[a])
+
+            # unloading a module during graph capture should be fine (deferred until capture completes)
+            other_module.unload()
+
+        wp.capture_launch(capture.graph)
+
+        test.assertEqual(a.numpy()[0], 42)
+
+
 devices = get_test_devices()
 cuda_devices = get_cuda_test_devices()
@@ -258,6 +284,9 @@ class TestReload(unittest.TestCase):
 add_function_test(
     TestReload, "test_graph_launch_after_module_reload", test_graph_launch_after_module_reload, devices=cuda_devices
 )
+add_function_test(
+    TestReload, "test_module_unload_during_graph_capture", test_module_unload_during_graph_capture, devices=cuda_devices
+)
 
 
 if __name__ == "__main__":
```
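From the user's side, the new test doubles as a recipe: a module unload issued inside `wp.ScopedCapture` is now deferred instead of corrupting the capture. Below is a hedged usage sketch built only from calls that appear in the test above; the device string and `"my_pkg.my_module"` are placeholders.

```python
import warp as wp


@wp.kernel
def write_answer(a: wp.array(dtype=int)):
    a[0] = 42


with wp.ScopedDevice("cuda:0"):
    # preload so the capture does not trigger module loading
    wp.load_module(device="cuda:0")
    a = wp.zeros(1, dtype=int)

    with wp.ScopedCapture(force_module_load=False) as capture:
        wp.launch(write_answer, dim=1, inputs=[a])
        # Before this commit, an unload here could break the capture;
        # it is now deferred until the capture completes
        # ("my_pkg.my_module" is a placeholder module name).
        wp.get_module("my_pkg.my_module").unload()

    wp.capture_launch(capture.graph)
    assert a.numpy()[0] == 42
```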
