Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NRT Memsys #78

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
0490a55
initial
isVoid Dec 2, 2024
cbb12dd
fix error code bug
isVoid Dec 3, 2024
7f6b099
add stream handle
isVoid Dec 3, 2024
81118de
comment out no allocation case
isVoid Dec 3, 2024
8805e99
Passing no return test
isVoid Dec 3, 2024
a7d2887
do not force reinit memsys on every kernel launch, add another test
isVoid Dec 4, 2024
f1f8377
add another test from CPU target
isVoid Dec 4, 2024
f7ac1c6
porting the correct np_empty_like implementation
isVoid Dec 4, 2024
050f21c
Merge branch 'main' of https://github.com/NVIDIA/numba-cuda into nrt-…
isVoid Dec 4, 2024
8f454ac
pull in another test from numba cpu target
isVoid Dec 4, 2024
8843765
move memsys tests to test_nrt_refct
isVoid Dec 4, 2024
06f5e53
rename into memsys stats enabled
isVoid Dec 5, 2024
f4d1a80
Explicitly control the use of stream in tests with NRT libraries
isVoid Dec 5, 2024
ec2736b
move readenv to utils
isVoid Dec 15, 2024
1f2b6d1
rename a few API, default stream argument to None, read environment v…
isVoid Dec 15, 2024
1382a5d
add cuda_ones mock api
isVoid Dec 15, 2024
8494547
add memsys get enabled status API, cleanups
isVoid Dec 15, 2024
5433b54
add 3 tests in TestNRTStatistics
isVoid Dec 15, 2024
5879098
add 3 more tests, augment API with single getters
isVoid Dec 16, 2024
5273e4a
make cuda nrt test mixin
isVoid Dec 16, 2024
bb1cf0f
comments and clean up
isVoid Dec 16, 2024
cc51a8a
Merge branch 'main' of https://github.com/NVIDIA/numba-cuda into nrt-…
isVoid Dec 16, 2024
237dae4
add cuh file to package data
isVoid Dec 17, 2024
7710372
remove all patches, use override_config; Only allocate memsys when NR…
isVoid Dec 18, 2024
7ff3a04
relaxed memory order
isVoid Dec 18, 2024
685eb64
rename as managed_result
isVoid Dec 19, 2024
bd9f9ad
use .load to avoid compile warnings
isVoid Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions numba_cuda/numba/cuda/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,11 +362,14 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):

stream_handle = stream and stream.handle or zero_stream

rtsys.ensure_allocated(stream_handle)
rtsys.set_memsys_to_module(cufunc.module, stream_handle)
rtsys.ensure_initialized(stream_handle)
if config.CUDA_NRT_STATS:
rtsys.memsys_enable_stats(stream_handle)
if hasattr(self, "target_context") and self.target_context.enable_nrt:
Copy link
Collaborator Author

@isVoid isVoid Dec 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check for target_context indicates that for cached kernels that don't have a target_context attribute, NRT is always considered off. (Is this correct?)

# If NRT is enabled, we also initialize the memsys. The statistics
# are controlled by a different config setting `NRT_STATS`.
rtsys.ensure_allocated(stream_handle)
rtsys.set_memsys_to_module(cufunc.module, stream_handle)
rtsys.ensure_initialized(stream_handle)
if config.CUDA_NRT_STATS:
rtsys.memsys_enable_stats(stream_handle)

# Invoke kernel
driver.launch_kernel(cufunc.handle,
Expand Down
10 changes: 5 additions & 5 deletions numba_cuda/numba/cuda/runtime/nrt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ extern "C" __device__ void* NRT_Allocate(size_t size)
{
void* ptr = NULL;
ptr = malloc(size);
if (TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.alloc++; }
isVoid marked this conversation as resolved.
Show resolved Hide resolved
return ptr;
}

Expand All @@ -48,7 +48,7 @@ extern "C" __device__ void NRT_MemInfo_init(NRT_MemInfo* mi,
mi->dtor_info = dtor_info;
mi->data = data;
mi->size = size;
if (TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_alloc++; }
}

extern "C"
Expand All @@ -63,7 +63,7 @@ __device__ NRT_MemInfo* NRT_MemInfo_new(
// Free a block previously obtained from NRT_Allocate (device-heap free),
// and record the event in the memsys statistics when they are enabled.
extern "C" __device__ void NRT_Free(void* ptr)
{
free(ptr);
// TheMSys may be NULL when the memory subsystem was never allocated
// (e.g. stats disabled), so guard the pointer before touching stats.
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.free++; }
}

extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
Expand All @@ -74,7 +74,7 @@ extern "C" __device__ void NRT_dealloc(NRT_MemInfo* mi)
// Destroy a MemInfo structure: release its storage via NRT_dealloc and,
// when memsys statistics are enabled, count the MemInfo free.
extern "C" __device__ void NRT_MemInfo_destroy(NRT_MemInfo* mi)
{
NRT_dealloc(mi);
// NULL-check TheMSys: the memsys is only allocated when NRT is enabled.
if (TheMSys && TheMSys->stats.enabled) { TheMSys->stats.mi_free++; }
}

extern "C" __device__ void NRT_MemInfo_call_dtor(NRT_MemInfo* mi)
Expand Down Expand Up @@ -151,7 +151,7 @@ extern "C" __device__ void* NRT_Allocate_External(size_t size) {
ptr = malloc(size);
//NRT_Debug(nrt_debug_print("NRT_Allocate_External bytes=%zu ptr=%p\n", size, ptr));

if (TheMSys->stats.enabled)
if (TheMSys && TheMSys->stats.enabled)
{
TheMSys->stats.alloc++;
}
Expand Down
65 changes: 32 additions & 33 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@

import numpy as np
import unittest
from unittest.mock import patch
from numba.cuda.testing import CUDATestCase

from numba.cuda.tests.nrt.mock_numpy import cuda_empty, cuda_ones, cuda_arange
from numba.tests.support import run_in_subprocess
from numba.tests.support import run_in_subprocess, override_config

from numba import cuda
from numba.cuda.runtime.nrt import rtsys


class TestNrtBasic(CUDATestCase):
def run(self, result=None):
    """Run every test in this case with NRT enabled.

    Wraps the standard ``unittest.TestCase.run`` so that the
    ``CUDA_ENABLE_NRT`` config flag is active for the duration of each
    test, and restored afterwards by ``override_config``.
    """
    with override_config("CUDA_ENABLE_NRT", True):
        super(TestNrtBasic, self).run(result)

def test_nrt_launches(self):
@cuda.jit
def f(x):
Expand All @@ -24,8 +27,7 @@ def g():
x = cuda_empty(10, np.int64)
f(x)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1]()
g[1,1]()
cuda.synchronize()

def test_nrt_ptx_contains_refcount(self):
Expand All @@ -38,8 +40,7 @@ def g():
x = cuda_empty(10, np.int64)
f(x)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1]()
g[1,1]()

ptx = next(iter(g.inspect_asm().values()))

Expand Down Expand Up @@ -72,8 +73,7 @@ def g(out_ary):

out_ary = np.zeros(1, dtype=np.int64)

with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1,1](out_ary)
g[1,1](out_ary)

self.assertEqual(out_ary[0], 1)

Expand Down Expand Up @@ -168,36 +168,35 @@ def foo():
arr = cuda_arange(5 * tmp[0]) # noqa: F841
return None

# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())

for i in range(2):
# capture the stats state
stats_1 = rtsys.get_allocation_stats()
# Switch off stats
rtsys.memsys_disable_stats()
# check the stats are off
self.assertFalse(rtsys.memsys_stats_enabled())
# run something that would move the counters were they enabled
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
foo[1, 1]()
with override_config('CUDA_ENABLE_NRT', True):
# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())
# capture the stats state (should not have changed)
stats_2 = rtsys.get_allocation_stats()
# run something that will move the counters
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):

for i in range(2):
# capture the stats state
stats_1 = rtsys.get_allocation_stats()
# Switch off stats
rtsys.memsys_disable_stats()
# check the stats are off
self.assertFalse(rtsys.memsys_stats_enabled())
# run something that would move the counters were they enabled
foo[1, 1]()
# Switch on stats
rtsys.memsys_enable_stats()
# check the stats are on
self.assertTrue(rtsys.memsys_stats_enabled())
# capture the stats state (should not have changed)
stats_2 = rtsys.get_allocation_stats()
# run something that will move the counters
foo[1, 1]()
# capture the stats state (should have changed)
stats_3 = rtsys.get_allocation_stats()
# check stats_1 == stats_2
self.assertEqual(stats_1, stats_2)
# check stats_2 < stats_3
self.assertLess(stats_2, stats_3)
# capture the stats state (should have changed)
stats_3 = rtsys.get_allocation_stats()
# check stats_1 == stats_2
self.assertEqual(stats_1, stats_2)
# check stats_2 < stats_3
self.assertLess(stats_2, stats_3)

def test_rtsys_stats_query_raises_exception_when_disabled(self):
# Checks that the standard rtsys.get_allocation_stats() query raises
Expand Down
22 changes: 13 additions & 9 deletions numba_cuda/numba/cuda/tests/nrt/test_nrt_refct.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import gc
import numpy as np
import unittest
from unittest.mock import patch
from numba.tests.support import override_config
from numba.cuda.runtime import rtsys
from numba.cuda.tests.support import EnableNRTStatsMixin
from numba.cuda.testing import CUDATestCase
Expand All @@ -18,10 +18,18 @@ def setUp(self):
gc.collect()
super(TestNrtRefCt, self).setUp()

def tearDown(self):
    # No extra teardown beyond the mixin/base-class behavior; kept explicit
    # to mirror setUp and as a hook for future cleanup.
    super(TestNrtRefCt, self).tearDown()

def run(self, result=None):
    """Run every test in this case with NRT enabled.

    ``override_config`` activates ``CUDA_ENABLE_NRT`` for the duration of
    the test and restores the previous value on exit.
    """
    with override_config("CUDA_ENABLE_NRT", True):
        super(TestNrtRefCt, self).run(result)

def test_no_return(self):
"""
Test issue #1291
"""

n = 10

@cuda.jit
Expand All @@ -31,8 +39,7 @@ def kernel():
return None

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
kernel[1, 1]()
kernel[1, 1]()
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc, n)
self.assertEqual(cur_stats.free - init_stats.free, n)
Expand All @@ -56,8 +63,7 @@ def g(n):
return None

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
g[1, 1](10)
g[1, 1](10)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc, 1)
self.assertEqual(cur_stats.free - init_stats.free, 1)
Expand All @@ -79,8 +85,7 @@ def if_with_allocation_and_initialization(arr1, test1):
arr = np.random.random((5, 5)) # the values are not consumed

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
if_with_allocation_and_initialization[1, 1](arr, False)
if_with_allocation_and_initialization[1, 1](arr, False)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc,
cur_stats.free - init_stats.free)
Expand All @@ -103,8 +108,7 @@ def f(arr):
arr = np.ones((2, 2))

init_stats = rtsys.get_allocation_stats()
with patch('numba.config.CUDA_ENABLE_NRT', True, create=True):
f[1, 1](arr)
f[1, 1](arr)
cur_stats = rtsys.get_allocation_stats()
self.assertEqual(cur_stats.alloc - init_stats.alloc,
cur_stats.free - init_stats.free)
Expand Down
Loading