Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NRT Memsys #78

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0490a55
initial
isVoid Dec 2, 2024
cbb12dd
fix error code bug
isVoid Dec 3, 2024
7f6b099
add stream handle
isVoid Dec 3, 2024
81118de
comment out no allocation case
isVoid Dec 3, 2024
8805e99
Passing no return test
isVoid Dec 3, 2024
a7d2887
do not force reinit memsys on every kernel launch, add another test
isVoid Dec 4, 2024
f1f8377
add another test from CPU target
isVoid Dec 4, 2024
f7ac1c6
porting the correct np_empty_like implementation
isVoid Dec 4, 2024
050f21c
Merge branch 'main' of https://github.com/NVIDIA/numba-cuda into nrt-…
isVoid Dec 4, 2024
8f454ac
pull in another test from numba cpu target
isVoid Dec 4, 2024
8843765
move memsys tests to test_nrt_refct
isVoid Dec 4, 2024
06f5e53
rename into memsys stats enabled
isVoid Dec 5, 2024
f4d1a80
Explicitly control the use of stream in tests with NRT libraries
isVoid Dec 5, 2024
ec2736b
move readenv to utils
isVoid Dec 15, 2024
1f2b6d1
rename a few API, default stream argument to None, read environment v…
isVoid Dec 15, 2024
1382a5d
add cuda_ones mock api
isVoid Dec 15, 2024
8494547
add memsys get enabled status API, cleanups
isVoid Dec 15, 2024
5433b54
add 3 tests in TestNRTStatistics
isVoid Dec 15, 2024
5879098
add 3 more tests, augment API with single getters
isVoid Dec 16, 2024
5273e4a
make cuda nrt test mixin
isVoid Dec 16, 2024
bb1cf0f
comments and clean up
isVoid Dec 16, 2024
cc51a8a
Merge branch 'main' of https://github.com/NVIDIA/numba-cuda into nrt-…
isVoid Dec 16, 2024
237dae4
add cuh file to package data
isVoid Dec 17, 2024
7710372
remove all patches, use override_config; Only allocate memsys when NR…
isVoid Dec 18, 2024
7ff3a04
relaxed memory order
isVoid Dec 18, 2024
685eb64
rename as managed_result
isVoid Dec 19, 2024
bd9f9ad
use .load to avoid compile warnings
isVoid Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions numba_cuda/numba/cuda/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,11 +362,14 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):

stream_handle = stream and stream.handle or zero_stream

rtsys.ensure_allocated(stream_handle)
rtsys.set_memsys_to_module(cufunc.module, stream_handle)
rtsys.ensure_initialized(stream_handle)
if config.CUDA_NRT_STATS:
rtsys.memsys_enable_stats(stream_handle)
if hasattr(self, "target_context") and self.target_context.enable_nrt:
Copy link
Collaborator Author

@isVoid isVoid Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check for target_context indicates that for cached kernels that don't have a target_context attribute, NRT is always considered off. (Is this correct?)

# If NRT is enabled, we also initialize the memsys. The statistics
# are controlled by a different config setting `NRT_STATS`.
rtsys.ensure_allocated(stream_handle)
rtsys.set_memsys_to_module(cufunc.module, stream_handle)
rtsys.ensure_initialized(stream_handle)
if config.CUDA_NRT_STATS:
rtsys.memsys_enable_stats(stream_handle)

# Invoke kernel
driver.launch_kernel(cufunc.handle,
Expand Down
10 changes: 5 additions & 5 deletions numba_cuda/numba/cuda/runtime/nrt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size)
{
void* ptr = NULL;
ptr = malloc(size);
if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
isVoid marked this conversation as resolved.
Show resolved Hide resolved
return ptr;
}

Expand All @@ -48,7 +48,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
mi->dtor_info = dtor_info;
mi->data = data;
mi->size = size;
if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
}

extern "C"
Expand All @@ -63,7 +63,7 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
// Free a block previously obtained from NRT_Allocate (device-heap free),
// and record the event in the memsys statistics when they are enabled.
extern "C" __device__ void NRT_Free(void* ptr)
{
free(ptr);
// TheMSys may be NULL when the memory subsystem was never allocated
// (e.g. stats disabled), so guard the pointer before touching stats.
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.free++; }
}

extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
Expand All @@ -74,7 +74,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
// Destroy a MemInfo structure: release its storage via NRT_dealloc and,
// when memsys statistics are enabled, count the MemInfo free.
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
{
NRT_dealloc(mi);
// NULL-check TheMSys: the memsys is only allocated when NRT is enabled.
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
}

extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
Expand Down Expand Up @@ -151,7 +151,7 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
ptr = malloc(size);
//NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));

if (TheMSys->stats.enabled)
if (TheMSys && TheMSys->stats.enabled)
{
TheMSys->stats.alloc++;
}
Expand Down
65 changes: 32 additions & 33 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@

import numpy as np
import unittest
from unittest.mock import patch
from numba.cuda.testing import CUDATestCase

from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_ones, cuda_arange
from numba.tests.support import run_in_subprocess
from numba.tests.support import run_in_subprocess, override_config

from numba import cuda
from numba.cuda.runtime.nrt import rtsys


class TestNrtBasic(CUDATestCase):
def run(self, result=None):
    """Run every test in this case with NRT enabled.

    Wraps the standard ``unittest.TestCase.run`` so that the
    ``CUDA_ENABLE_NRT`` config flag is active for the duration of each
    test, and restored afterwards by ``override_config``.
    """
    with override_config("CUDA_ENABLE_NRT", True):
        super(TestNrtBasic, self).run(result)

def test_nrt_launches(self):
@cuda.jit
def f(x):
Expand All @@ -24,8 +27,7 @@ def g():
x = cuda_empty(10, np.int64)
f(x)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1]()
g[1,1]()
cuda.synchronize()

def test_nrt_ptx_contains_refcount(self):
Expand All @@ -38,8 +40,7 @@ def g():
x = cuda_empty(10, np.int64)
f(x)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1]()
g[1,1]()

ptx = next(iter(g.inspect_asm().values()))

Expand Down Expand Up @@ -72,8 +73,7 @@ def g(out_ary):

out_ary = np.zeros(1, dtype=np.int64)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1](out_ary)
g[1,1](out_ary)

self.assertEqual(out_ary[0], 1)

Expand Down Expand Up @@ -168,36 +168,35 @@ def foo():
arr = cuda_arange(5 * tmp[0]) # noqa: F841
return None

# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())

for i in range(2):
# capture the stats state
stats_1 = rtsys.get_allocation_stats()
# Switch off stats
rtsys.memsys_disable_stats()
# check the stats are off
self.assertFalse(rtsys.memsys_stats_enabled())
# run something that would move the counters were they enabled
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
foo[1, 1]()
with override_config('CUDA_ENABLE_NRT', True):
# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())
# capture the stats state (should not have changed)
stats_2 = rtsys.get_allocation_stats()
# run something that will move the counters
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):

for i in range(2):
# capture the stats state
stats_1 = rtsys.get_allocation_stats()
# Switch off stats
rtsys.memsys_disable_stats()
# check the stats are off
self.assertFalse(rtsys.memsys_stats_enabled())
# run something that would move the counters were they enabled
foo[1, 1]()
# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())
# capture the stats state (should not have changed)
stats_2 = rtsys.get_allocation_stats()
# run something that will move the counters
foo[1, 1]()
# capture the stats state (should have changed)
stats_3 = rtsys.get_allocation_stats()
# check stats_1 == stats_2
self.assertEqual(stats_1, stats_2)
# check stats_2 < stats_3
self.assertLess(stats_2, stats_3)
# capture the stats state (should have changed)
stats_3 = rtsys.get_allocation_stats()
# check stats_1 == stats_2
self.assertEqual(stats_1, stats_2)
# check stats_2 < stats_3
self.assertLess(stats_2, stats_3)

def test_rtsys_stats_query_raises_exception_when_disabled(self):
# Checks that the standard rtsys.get_allocation_stats() query raises
Expand Down
22 changes: 13 additions & 9 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import gc
import numpy as np
import unittest
from unittest.mock import patch
from numba.tests.support import override_config
from numba.cuda.runtime import rtsys
from numba.cuda.tests.support import EnableNRTStatsMixin
from numba.cuda.testing import CUDATestCase
Expand All @@ -18,10 +18,18 @@ def setUp(self):
gc.collect()
super(TestNrtRefCt, self).setUp()

def tearDown(self):
    # No extra teardown beyond the mixin/base-class behavior; kept explicit
    # to mirror setUp and as a hook for future cleanup.
    super(TestNrtRefCt, self).tearDown()

def run(self, result=None):
    """Run every test in this case with NRT enabled.

    ``override_config`` activates ``CUDA_ENABLE_NRT`` for the duration of
    the test and restores the previous value on exit.
    """
    with override_config("CUDA_ENABLE_NRT", True):
        super(TestNrtRefCt, self).run(result)

def test_no_return(self):
"""
Test issue #1291
"""

n = 10

@cuda.jit
Expand All @@ -31,8 +39,7 @@ def kernel():
return None

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
kernel[1, 1]()
kernel[1, 1]()
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
self.assertEqual(cur_stats.free - init_stats.free, n)
Expand All @@ -56,8 +63,7 @@ def g(n):
return None

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1, 1](10)
g[1, 1](10)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
self.assertEqual(cur_stats.free - init_stats.free, 1)
Expand All @@ -79,8 +85,7 @@ def if_with_allocation_and_initialization(arr1, test1):
arr = np.random.random((5, 5)) # the values are not consumed

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
if_with_allocation_and_initialization[1, 1](arr, False)
if_with_allocation_and_initialization[1, 1](arr, False)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc,
cur_stats.free - init_stats.free)
Expand All @@ -103,8 +108,7 @@ def f(arr):
arr = np.ones((2, 2))

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
f[1, 1](arr)
f[1, 1](arr)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc,
cur_stats.free - init_stats.free)
Expand Down
Loading