diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
index 3d91570..79cc5e4 100644
--- a/numba_cuda/numba/cuda/dispatcher.py
+++ b/numba_cuda/numba/cuda/dispatcher.py
@@ -362,11 +362,14 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
 
         stream_handle = stream and stream.handle or zero_stream
 
-        rtsys.ensure_allocated(stream_handle)
-        rtsys.set_memsys_to_module(cufunc.module, stream_handle)
-        rtsys.ensure_initialized(stream_handle)
-        if config.CUDA_NRT_STATS:
-            rtsys.memsys_enable_stats(stream_handle)
+        if hasattr(self, "target_context") and self.target_context.enable_nrt:
+            # If NRT is enabled, we also initialize the memsys. The statistics
+            # are controlled by a different config setting `NRT_STATS`.
+            rtsys.ensure_allocated(stream_handle)
+            rtsys.set_memsys_to_module(cufunc.module, stream_handle)
+            rtsys.ensure_initialized(stream_handle)
+            if config.CUDA_NRT_STATS:
+                rtsys.memsys_enable_stats(stream_handle)
 
         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
diff --git a/numba_cuda/numba/cuda/runtime/nrt.cu b/numba_cuda/numba/cuda/runtime/nrt.cu
index 19bbcf2..337fe35 100644
--- a/numba_cuda/numba/cuda/runtime/nrt.cu
+++ b/numba_cuda/numba/cuda/runtime/nrt.cu
@@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size) {
   void* ptr = NULL;
   ptr = malloc(size);
-  if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
   return ptr;
 }
 
@@ -48,7 +48,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
   mi->dtor_info = dtor_info;
   mi->data = data;
   mi->size = size;
-  if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
 }
 
 extern "C"
@@ -63,7 +63,7 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
 
 extern "C" __device__ void NRT_Free(void* ptr) {
   free(ptr);
-  if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.free++; }
 }
 
 extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -74,7 +74,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
 
 extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi) {
   NRT_dealloc(mi);
-  if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
 }
 
 extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
@@ -151,7 +151,7 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
   ptr = malloc(size);
   //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));
-  if (TheMSys->stats.enabled)
+  if (TheMSys && TheMSys->stats.enabled)
   {
     TheMSys->stats.alloc++;
   }
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
index 161afe8..3e927d8 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -3,17 +3,20 @@
 
 import numpy as np
 import unittest
-from unittest.mock import patch
 
 from numba.cuda.testing import CUDATestCase
 from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_ones, cuda_arange
-from numba.tests.support import run_in_subprocess
+from numba.tests.support import run_in_subprocess, override_config
 from numba import cuda
 from numba.cuda.runtime.nrt import rtsys
 
 
 class TestNrtBasic(CUDATestCase):
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtBasic, self).run(result)
+
     def test_nrt_launches(self):
         @cuda.jit
         def f(x):
@@ -24,8 +27,7 @@ def g():
             x = cuda_empty(10, np.int64)
             f(x)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()
         cuda.synchronize()
 
     def test_nrt_ptx_contains_refcount(self):
@@ -38,8 +40,7 @@ def g():
             x = cuda_empty(10, np.int64)
             f(x)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()
 
         ptx = next(iter(g.inspect_asm().values()))
 
@@ -72,8 +73,7 @@ def g(out_ary):
 
         out_ary = np.zeros(1, dtype=np.int64)
 
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1](out_ary)
+        g[1,1](out_ary)
 
         self.assertEqual(out_ary[0], 1)
 
@@ -168,36 +168,35 @@ def foo():
             arr = cuda_arange(5 * tmp[0])  # noqa: F841
             return None
 
-        # Switch on stats
-        rtsys.memsys_enable_stats()
-        # check the stats are on
-        self.assertTrue(rtsys.memsys_stats_enabled())
-
-        for i in range(2):
-            # capture the stats state
-            stats_1 = rtsys.get_allocation_stats()
-            # Switch off stats
-            rtsys.memsys_disable_stats()
-            # check the stats are off
-            self.assertFalse(rtsys.memsys_stats_enabled())
-            # run something that would move the counters were they enabled
-            with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-                foo[1, 1]()
+        with override_config('CUDA_ENABLE_NRT', True):
             # Switch on stats
             rtsys.memsys_enable_stats()
             # check the stats are on
             self.assertTrue(rtsys.memsys_stats_enabled())
-            # capture the stats state (should not have changed)
-            stats_2 = rtsys.get_allocation_stats()
-            # run something that will move the counters
-            with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
+
+            for i in range(2):
+                # capture the stats state
+                stats_1 = rtsys.get_allocation_stats()
+                # Switch off stats
+                rtsys.memsys_disable_stats()
+                # check the stats are off
+                self.assertFalse(rtsys.memsys_stats_enabled())
+                # run something that would move the counters were they enabled
+                foo[1, 1]()
+                # Switch on stats
+                rtsys.memsys_enable_stats()
+                # check the stats are on
+                self.assertTrue(rtsys.memsys_stats_enabled())
+                # capture the stats state (should not have changed)
+                stats_2 = rtsys.get_allocation_stats()
+                # run something that will move the counters
                 foo[1, 1]()
-            # capture the stats state (should have changed)
-            stats_3 = rtsys.get_allocation_stats()
-            # check stats_1 == stats_2
-            self.assertEqual(stats_1, stats_2)
-            # check stats_2 < stats_3
-            self.assertLess(stats_2, stats_3)
+                # capture the stats state (should have changed)
+                stats_3 = rtsys.get_allocation_stats()
+                # check stats_1 == stats_2
+                self.assertEqual(stats_1, stats_2)
+                # check stats_2 < stats_3
+                self.assertLess(stats_2, stats_3)
 
     def test_rtsys_stats_query_raises_exception_when_disabled(self):
         # Checks that the standard rtsys.get_allocation_stats() query raises
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
index d237fb1..fb2ceda 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
@@ -2,7 +2,7 @@
 import gc
 import numpy as np
 import unittest
-from unittest.mock import patch
+from numba.tests.support import override_config
 from numba.cuda.runtime import rtsys
 from numba.cuda.tests.support import EnableNRTStatsMixin
 from numba.cuda.testing import CUDATestCase
@@ -18,10 +18,18 @@ def setUp(self):
         gc.collect()
         super(TestNrtRefCt, self).setUp()
 
+    def tearDown(self):
+        super(TestNrtRefCt, self).tearDown()
+
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtRefCt, self).run(result)
+
     def test_no_return(self):
         """
         Test issue #1291
         """
+
         n = 10
 
         @cuda.jit
@@ -31,8 +39,7 @@ def kernel():
             return None
 
         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            kernel[1, 1]()
+        kernel[1, 1]()
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)
@@ -56,8 +63,7 @@ def g(n):
             return None
 
         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1, 1](10)
+        g[1, 1](10)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
         self.assertEqual(cur_stats.free - init_stats.free, 1)
@@ -79,8 +85,7 @@ def if_with_allocation_and_initialization(arr1, test1):
         arr = np.random.random((5, 5))  # the values are not consumed
 
         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            if_with_allocation_and_initialization[1, 1](arr, False)
+        if_with_allocation_and_initialization[1, 1](arr, False)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc,
                          cur_stats.free - init_stats.free)
@@ -103,8 +108,7 @@ def f(arr):
         arr = np.ones((2, 2))
 
         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            f[1, 1](arr)
+        f[1, 1](arr)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc,
                          cur_stats.free - init_stats.free)