remove all patches, use override_config; Only allocate memsys when NRT is enabled
isVoid committed Dec 18, 2024
1 parent 237dae4 commit 7710372
Showing 4 changed files with 58 additions and 52 deletions.
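Note: the tests previously forced NRT on by monkey-patching numba.config with unittest.mock.patch(..., create=True). This commit replaces those patches with numba.tests.support.override_config, which sets the option for the duration of a with-block and restores the previous value on exit. A minimal sketch of the new pattern (the kernel body is illustrative only, not taken from this commit):

    from numba import cuda
    from numba.tests.support import override_config

    @cuda.jit
    def kernel():
        pass  # placeholder; the real tests allocate NRT-managed arrays here

    # CUDA_ENABLE_NRT is switched on only inside the block and restored afterwards.
    with override_config("CUDA_ENABLE_NRT", True):
        kernel[1, 1]()
    cuda.synchronize()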
13 changes: 8 additions & 5 deletions numba_cuda/numba/cuda/dispatcher.py
@@ -362,11 +362,14 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):

         stream_handle = stream and stream.handle or zero_stream

-        rtsys.ensure_allocated(stream_handle)
-        rtsys.set_memsys_to_module(cufunc.module, stream_handle)
-        rtsys.ensure_initialized(stream_handle)
-        if config.CUDA_NRT_STATS:
-            rtsys.memsys_enable_stats(stream_handle)
+        if hasattr(self, "target_context") and self.target_context.enable_nrt:
+            # If NRT is enabled, we also initialize the memsys. The statistics
+            # are controlled by a different config setting `NRT_STATS`.
+            rtsys.ensure_allocated(stream_handle)
+            rtsys.set_memsys_to_module(cufunc.module, stream_handle)
+            rtsys.ensure_initialized(stream_handle)
+            if config.CUDA_NRT_STATS:
+                rtsys.memsys_enable_stats(stream_handle)

         # Invoke kernel
         driver.launch_kernel(cufunc.handle,
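With the change above, the memsys is allocated, attached to the module, and initialized at launch only when the kernel's target context has NRT enabled; allocation statistics stay behind the separate CUDA_NRT_STATS setting. A hedged sketch of exercising the two switches together, mirroring the tests later in this commit (the kernel is a placeholder, not part of the diff):

    from numba import cuda
    from numba.cuda.runtime.nrt import rtsys
    from numba.tests.support import override_config

    @cuda.jit
    def kernel():
        pass  # placeholder; a real kernel would allocate NRT-managed memory

    with override_config("CUDA_ENABLE_NRT", True):
        rtsys.memsys_enable_stats()          # stats are opt-in, separate from NRT itself
        kernel[1, 1]()
        print(rtsys.get_allocation_stats())  # would raise if stats had not been enabled

Per the dispatcher change, statistics can also be switched on automatically at launch via the CUDA_NRT_STATS config setting.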
10 changes: 5 additions & 5 deletions numba_cuda/numba/cuda/runtime/nrt.cu
@@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size)
 {
   void* ptr = NULL;
   ptr = malloc(size);
-  if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
   return ptr;
 }

@@ -48,7 +48,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
   mi->dtor_info = dtor_info;
   mi->data = data;
   mi->size = size;
-  if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
 }

extern "C"
@@ -63,7 +63,7 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
 extern "C" __device__ void NRT_Free(void* ptr)
 {
   free(ptr);
-  if (TheMSys->stats.enabled) { TheMSys->stats.free++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.free++; }
 }

extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
@@ -74,7 +74,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
 extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
 {
   NRT_dealloc(mi);
-  if (TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
+  if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
 }

extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
@@ -151,7 +151,7 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
   ptr = malloc(size);
   //NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));

-  if (TheMSys->stats.enabled)
+  if (TheMSys && TheMSys->stats.enabled)
   {
     TheMSys->stats.alloc++;
   }
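All five changes in this file are the same defensive pattern: the memsys is now allocated only when NRT is enabled, so TheMSys may be a null pointer and every statistics update checks it first. A self-contained, illustrative reduction of the pattern (not part of the commit; the real NRT_MemSys struct has more fields, and the real counters may be updated atomically):

    #include <cstddef>

    // Hypothetical reduction of the memsys type used in nrt.cu.
    struct NRT_MemSys {
      struct {
        bool enabled;
        size_t alloc;
      } stats;
    };

    // Only allocated when NRT is enabled, hence the null check below.
    __device__ NRT_MemSys* TheMSys = nullptr;

    // Illustrative helper: guard the stats counter against a missing memsys.
    __device__ void nrt_bump_alloc_stat()
    {
      if (TheMSys && TheMSys->stats.enabled) {
        TheMSys->stats.alloc++;
      }
    }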
65 changes: 32 additions & 33 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -3,17 +3,20 @@

 import numpy as np
 import unittest
-from unittest.mock import patch
 from numba.cuda.testing import CUDATestCase

 from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_ones, cuda_arange
-from numba.tests.support import run_in_subprocess
+from numba.tests.support import run_in_subprocess, override_config

 from numba import cuda
 from numba.cuda.runtime.nrt import rtsys


 class TestNrtBasic(CUDATestCase):
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtBasic, self).run(result)
+
     def test_nrt_launches(self):
         @cuda.jit
         def f(x):
@@ -24,8 +27,7 @@ def g():
             x = cuda_empty(10, np.int64)
             f(x)

-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()
         cuda.synchronize()

     def test_nrt_ptx_contains_refcount(self):
@@ -38,8 +40,7 @@ def g():
             x = cuda_empty(10, np.int64)
             f(x)

-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1]()
+        g[1,1]()

         ptx = next(iter(g.inspect_asm().values()))

@@ -72,8 +73,7 @@ def g(out_ary):

         out_ary = np.zeros(1, dtype=np.int64)

-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1,1](out_ary)
+        g[1,1](out_ary)

         self.assertEqual(out_ary[0], 1)

@@ -168,36 +168,35 @@ def foo():
             arr = cuda_arange(5 * tmp[0])  # noqa: F841
             return None

-        # Switch on stats
-        rtsys.memsys_enable_stats()
-        # check the stats are on
-        self.assertTrue(rtsys.memsys_stats_enabled())
-
-        for i in range(2):
-            # capture the stats state
-            stats_1 = rtsys.get_allocation_stats()
-            # Switch off stats
-            rtsys.memsys_disable_stats()
-            # check the stats are off
-            self.assertFalse(rtsys.memsys_stats_enabled())
-            # run something that would move the counters were they enabled
-            with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-                foo[1, 1]()
-            # Switch on stats
-            rtsys.memsys_enable_stats()
-            # check the stats are on
-            self.assertTrue(rtsys.memsys_stats_enabled())
-            # capture the stats state (should not have changed)
-            stats_2 = rtsys.get_allocation_stats()
-            # run something that will move the counters
-            with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-                foo[1, 1]()
-            # capture the stats state (should have changed)
-            stats_3 = rtsys.get_allocation_stats()
-            # check stats_1 == stats_2
-            self.assertEqual(stats_1, stats_2)
-            # check stats_2 < stats_3
-            self.assertLess(stats_2, stats_3)
+        with override_config('CUDA_ENABLE_NRT', True):
+            # Switch on stats
+            rtsys.memsys_enable_stats()
+            # check the stats are on
+            self.assertTrue(rtsys.memsys_stats_enabled())
+
+            for i in range(2):
+                # capture the stats state
+                stats_1 = rtsys.get_allocation_stats()
+                # Switch off stats
+                rtsys.memsys_disable_stats()
+                # check the stats are off
+                self.assertFalse(rtsys.memsys_stats_enabled())
+                # run something that would move the counters were they enabled
+                foo[1, 1]()
+                # Switch on stats
+                rtsys.memsys_enable_stats()
+                # check the stats are on
+                self.assertTrue(rtsys.memsys_stats_enabled())
+                # capture the stats state (should not have changed)
+                stats_2 = rtsys.get_allocation_stats()
+                # run something that will move the counters
+                foo[1, 1]()
+                # capture the stats state (should have changed)
+                stats_3 = rtsys.get_allocation_stats()
+                # check stats_1 == stats_2
+                self.assertEqual(stats_1, stats_2)
+                # check stats_2 < stats_3
+                self.assertLess(stats_2, stats_3)

     def test_rtsys_stats_query_raises_exception_when_disabled(self):
         # Checks that the standard rtsys.get_allocation_stats() query raises
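The two test classes changed in this commit (TestNrtBasic above and TestNrtRefCt below) also gain a run() override, so the config applies to every test method in the class rather than to individual launches. A sketch of that pattern (class and method names here are illustrative):

    from numba.cuda.testing import CUDATestCase
    from numba.tests.support import override_config

    class MyNRTTests(CUDATestCase):
        def run(self, result=None):
            # Every test method in the class executes with CUDA_ENABLE_NRT on;
            # the previous value is restored once the test finishes.
            with override_config("CUDA_ENABLE_NRT", True):
                super().run(result)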
22 changes: 13 additions & 9 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
@@ -2,7 +2,7 @@
 import gc
 import numpy as np
 import unittest
-from unittest.mock import patch
+from numba.tests.support import override_config
 from numba.cuda.runtime import rtsys
 from numba.cuda.tests.support import EnableNRTStatsMixin
 from numba.cuda.testing import CUDATestCase
@@ -18,10 +18,18 @@ def setUp(self):
         gc.collect()
         super(TestNrtRefCt, self).setUp()

+    def tearDown(self):
+        super(TestNrtRefCt, self).tearDown()
+
+    def run(self, result=None):
+        with override_config("CUDA_ENABLE_NRT", True):
+            super(TestNrtRefCt, self).run(result)
+
     def test_no_return(self):
         """
         Test issue #1291
         """
+
         n = 10

         @cuda.jit
@@ -31,8 +39,7 @@ def kernel():
             return None

         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            kernel[1, 1]()
+        kernel[1, 1]()
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
         self.assertEqual(cur_stats.free - init_stats.free, n)
@@ -56,8 +63,7 @@ def g(n):
             return None

         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            g[1, 1](10)
+        g[1, 1](10)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
         self.assertEqual(cur_stats.free - init_stats.free, 1)
@@ -79,8 +85,7 @@ def if_with_allocation_and_initialization(arr1, test1):
         arr = np.random.random((5, 5))  # the values are not consumed

         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            if_with_allocation_and_initialization[1, 1](arr, False)
+        if_with_allocation_and_initialization[1, 1](arr, False)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc,
                          cur_stats.free - init_stats.free)
@@ -103,8 +108,7 @@ def f(arr):
         arr = np.ones((2, 2))

         init_stats = rtsys.get_allocation_stats()
-        with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
-            f[1, 1](arr)
+        f[1, 1](arr)
         cur_stats = rtsys.get_allocation_stats()
         self.assertEqual(cur_stats.alloc - init_stats.alloc,
                          cur_stats.free - init_stats.free)
