From 816a23a8022be303e6c0788b873c0c0d043b15c7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 19 May 2024 23:52:51 -0500 Subject: [PATCH 1/2] Add CMake bit to force build type --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b251c204c..b83a96589 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,11 @@ find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) # }}} +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + # {{{ Detect nanobind and import it execute_process( From 558e1d7a7ea2a28d335ff18b57d0b626faf09b7b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 18 May 2024 23:40:42 -0500 Subject: [PATCH 2/2] Use intrusive reference counting --- src/mempool.hpp | 17 ++++++++-- src/wrap_cl.cpp | 11 +++++++ src/wrap_cl.hpp | 23 +++++++------- src/wrap_cl_part_1.cpp | 14 ++++++-- src/wrap_cl_part_2.cpp | 2 +- src/wrap_helpers.hpp | 3 +- src/wrap_mempool.cpp | 72 +++++++++++++++++++++++++++--------------- 7 files changed, 97 insertions(+), 45 deletions(-) diff --git a/src/mempool.hpp b/src/mempool.hpp index a0eca827e..66af4852f 100644 --- a/src/mempool.hpp +++ b/src/mempool.hpp @@ -36,6 +36,14 @@ #include #include "bitlog.hpp" +#ifndef PYGPU_PYCUDA +#include +#include + + +namespace nb = nanobind; +#endif + namespace PYGPU_PACKAGE { @@ -53,7 +61,7 @@ namespace PYGPU_PACKAGE #ifdef PYGPU_PYCUDA #define PYGPU_SHARED_PTR boost::shared_ptr #else -#define PYGPU_SHARED_PTR std::shared_ptr +#define PYGPU_SHARED_PTR nb::ref #endif template @@ -89,6 +97,9 @@ namespace PYGPU_PACKAGE template class memory_pool : mp_noncopyable + #ifndef PYGPU_PYCUDA + , public nb::intrusive_base + #endif { public: typedef typename Allocator::pointer_type pointer_type; @@ -102,7 +113,7 @@ 
namespace PYGPU_PACKAGE container_t m_container; typedef typename container_t::value_type bin_pair_t; - std::shared_ptr m_allocator; + PYGPU_SHARED_PTR m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. @@ -125,7 +136,7 @@ namespace PYGPU_PACKAGE unsigned m_leading_bits_in_bin_id; public: - memory_pool(std::shared_ptr alloc, unsigned leading_bits_in_bin_id=4) + memory_pool(PYGPU_SHARED_PTR alloc, unsigned leading_bits_in_bin_id=4) : m_allocator(alloc), m_held_blocks(0), m_active_blocks(0), m_managed_bytes(0), m_active_bytes(0), diff --git a/src/wrap_cl.cpp b/src/wrap_cl.cpp index 8c1710476..cbf855b06 100644 --- a/src/wrap_cl.cpp +++ b/src/wrap_cl.cpp @@ -27,6 +27,7 @@ #define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API #include "wrap_cl.hpp" +#include @@ -49,6 +50,16 @@ static bool import_numpy_helper() NB_MODULE(_cl, m) { + py::intrusive_init( + [](PyObject *o) noexcept { + py::gil_scoped_acquire guard; + Py_INCREF(o); + }, + [](PyObject *o) noexcept { + py::gil_scoped_acquire guard; + Py_DECREF(o); + }); + if (!import_numpy_helper()) throw py::python_error(); diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 1896c618e..f127ef218 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -1143,7 +1143,7 @@ namespace pyopencl // {{{ context - class context : public noncopyable + class context : public noncopyable, public py::intrusive_base { private: cl_context m_context; @@ -1415,7 +1415,7 @@ namespace pyopencl // {{{ command_queue - class command_queue + class command_queue: public py::intrusive_base { private: cl_command_queue m_queue; @@ -1625,13 +1625,12 @@ namespace pyopencl } } - std::unique_ptr get_context() const + py::ref get_context() const { cl_context param_value; PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, (data(), CL_QUEUE_CONTEXT, sizeof(param_value), &param_value, 0)); - return std::unique_ptr( - new context(param_value, /*retain*/ true)); + return py::ref(new
context(param_value, /*retain*/ true)); } #if PYOPENCL_CL_VERSION < 0x1010 @@ -3437,12 +3436,12 @@ namespace pyopencl { private: bool m_valid; - std::shared_ptr m_queue; + py::ref m_queue; memory_object m_mem; void *m_ptr; public: - memory_map(std::shared_ptr cq, memory_object const &mem, void *ptr) + memory_map(py::ref cq, memory_object const &mem, void *ptr) : m_valid(true), m_queue(cq), m_mem(mem), m_ptr(ptr) { } @@ -3479,7 +3478,7 @@ namespace pyopencl #ifndef PYPY_VERSION inline py::object enqueue_map_buffer( - std::shared_ptr cq, + py::ref cq, memory_object_holder &buf, cl_map_flags flags, size_t offset, @@ -3563,7 +3562,7 @@ namespace pyopencl #ifndef PYPY_VERSION inline py::object enqueue_map_image( - std::shared_ptr cq, + py::ref cq, memory_object_holder &img, cl_map_flags flags, py::object py_origin, @@ -3697,7 +3696,7 @@ namespace pyopencl class svm_allocation : public svm_pointer { private: - std::shared_ptr m_context; + py::ref m_context; void *m_allocation; size_t m_size; command_queue_ref m_queue; @@ -3705,7 +3704,7 @@ namespace pyopencl // wait for users to finish in the case of out-of-order queues. 
public: - svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, + svm_allocation(py::ref const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags, const command_queue *queue = nullptr) : m_context(ctx), m_size(size) { @@ -3738,7 +3737,7 @@ namespace pyopencl } } - svm_allocation(std::shared_ptr const &ctx, void *allocation, size_t size, + svm_allocation(py::ref const &ctx, void *allocation, size_t size, const cl_command_queue queue) : m_context(ctx), m_allocation(allocation), m_size(size) { diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp index 512ede748..41f7b7e44 100644 --- a/src/wrap_cl_part_1.cpp +++ b/src/wrap_cl_part_1.cpp @@ -79,7 +79,13 @@ void pyopencl_expose_part_1(py::module_ &m) { typedef context cls; - py::class_(m, "Context", py::dynamic_attr(), py::is_weak_referenceable()) + py::class_( + m, "Context", + py::dynamic_attr(), + py::is_weak_referenceable(), + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ) .def( "__init__", [](cls *self, py::object py_devices, py::object py_properties, @@ -112,7 +118,11 @@ void pyopencl_expose_part_1(py::module_ &m) // {{{ command queue { typedef command_queue cls; - py::class_(m, "CommandQueue", py::dynamic_attr()) + py::class_( + m, "CommandQueue", + py::dynamic_attr(), + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) ) .def( py::init(), py::arg("context"), diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 807b45af6..76cfed278 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -359,7 +359,7 @@ void pyopencl_expose_part_2(py::module_ &m) { typedef svm_allocation cls; py::class_(m, "SVMAllocation", py::dynamic_attr()) - .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), + .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), py::arg("context"), py::arg("size"), py::arg("alignment"), diff --git a/src/wrap_helpers.hpp 
b/src/wrap_helpers.hpp index c878c36c3..367749535 100644 --- a/src/wrap_helpers.hpp +++ b/src/wrap_helpers.hpp @@ -30,7 +30,8 @@ #include #include -#include +#include +#include #include diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index e375a1944..d04aa5dad 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -43,7 +43,7 @@ namespace pyopencl { // {{{ test_allocator - class test_allocator + class test_allocator : public py::intrusive_base { public: typedef void *pointer_type; @@ -79,14 +79,14 @@ namespace pyopencl { // {{{ buffer allocators - class buffer_allocator_base + class buffer_allocator_base : public py::intrusive_base { protected: - std::shared_ptr m_context; + py::ref m_context; cl_mem_flags m_flags; public: - buffer_allocator_base(std::shared_ptr const &ctx, + buffer_allocator_base(py::ref const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : m_context(ctx), m_flags(flags) { @@ -131,7 +131,7 @@ namespace pyopencl { typedef buffer_allocator_base super; public: - deferred_buffer_allocator(std::shared_ptr const &ctx, + deferred_buffer_allocator(py::ref const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(ctx, flags) { } @@ -158,7 +158,7 @@ namespace pyopencl { public: immediate_buffer_allocator(pyopencl::command_queue &queue, cl_mem_flags flags=CL_MEM_READ_WRITE) - : super(std::shared_ptr(queue.get_context()), flags), + : super(queue.get_context(), flags), m_queue(queue.data(), /*retain*/ true) { } @@ -229,7 +229,7 @@ namespace pyopencl { public: pooled_buffer( - std::shared_ptr p, super::size_type s) + py::ref p, super::size_type s) : super(p, s) { } @@ -306,7 +306,7 @@ namespace pyopencl { // {{{ allocate_from_buffer_pool pooled_buffer *allocate_from_buffer_pool( - std::shared_ptr > pool, + py::ref > pool, memory_pool::size_type sz) { return new pooled_buffer(pool, sz); @@ -326,20 +326,20 @@ namespace pyopencl { // {{{ svm allocator - class svm_allocator + class svm_allocator : public py::intrusive_base { public: typedef 
svm_held_pointer pointer_type; typedef size_t size_type; protected: - std::shared_ptr m_context; + py::ref m_context; cl_uint m_alignment; cl_svm_mem_flags m_flags; pyopencl::command_queue_ref m_queue; public: - svm_allocator(std::shared_ptr const &ctx, + svm_allocator(py::ref const &ctx, cl_uint alignment=0, cl_svm_mem_flags flags=CL_MEM_READ_WRITE, pyopencl::command_queue *queue=nullptr) : m_context(ctx), m_alignment(alignment), m_flags(flags) @@ -367,7 +367,7 @@ namespace pyopencl { return false; } - std::shared_ptr context() const + py::ref context() const { return m_context; } @@ -453,7 +453,7 @@ namespace pyopencl { public: pooled_svm( - std::shared_ptr p, super::size_type s) + py::ref p, super::size_type s) : super(p, s) { } @@ -552,7 +552,7 @@ namespace pyopencl { // {{{ allocate_from_svm_pool pooled_svm *allocate_from_svm_pool( - std::shared_ptr > pool, + py::ref > pool, pyopencl::memory_pool::size_type sz) { return new pooled_svm(pool, sz); @@ -594,7 +594,11 @@ void pyopencl_expose_mempool(py::module_ &m) { typedef pyopencl::buffer_allocator_base cls; - py::class_ wrapper(m, "AllocatorBase"); + py::class_ wrapper( + m, "AllocatorBase", + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ); wrapper .def("__call__", pyopencl::allocate_from_buffer_allocator, py::arg("size")) ; @@ -604,19 +608,23 @@ void pyopencl_expose_mempool(py::module_ &m) { typedef pyopencl::memory_pool cls; - py::class_ wrapper(m, "_TestMemoryPool"); + py::class_ wrapper( + m, "_TestMemoryPool", + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ); wrapper .def("__init__", [](cls *self, unsigned leading_bits_in_bin_id) { new (self) cls( - std::shared_ptr( + py::ref( new pyopencl::test_allocator()), leading_bits_in_bin_id); }, py::arg("leading_bits_in_bin_id")=4 ) - .def("allocate", [](std::shared_ptr pool, cls::size_type sz) + .def("allocate", [](py::ref pool, cls::size_type sz) { pool->allocate(sz); return py::none(); @@ 
-631,9 +639,9 @@ void pyopencl_expose_mempool(py::module_ &m) py::class_ wrapper( m, "DeferredAllocator"); wrapper - .def(py::init const &>()) + .def(py::init const &>()) .def(py::init< - std::shared_ptr const &, + py::ref const &, cl_mem_flags>(), py::arg("queue"), py::arg("mem_flags")) ; @@ -663,9 +671,13 @@ void pyopencl_expose_mempool(py::module_ &m) { typedef pyopencl::memory_pool cls; - py::class_ wrapper( m, "MemoryPool"); + py::class_ wrapper( + m, "MemoryPool", + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ); wrapper - .def(py::init, unsigned>(), + .def(py::init, unsigned>(), py::arg("allocator"), py::arg("leading_bits_in_bin_id")=4 ) @@ -679,9 +691,13 @@ void pyopencl_expose_mempool(py::module_ &m) #if PYOPENCL_CL_VERSION >= 0x2000 { typedef pyopencl::svm_allocator cls; - py::class_ wrapper(m, "SVMAllocator"); + py::class_ wrapper( + m, "SVMAllocator", + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ); wrapper - .def(py::init const &, cl_uint, cl_uint, pyopencl::command_queue *>(), + .def(py::init const &, cl_uint, cl_uint, pyopencl::command_queue *>(), py::arg("context"), /* py::kw_only(), */ py::arg("alignment")=0, @@ -719,9 +735,13 @@ void pyopencl_expose_mempool(py::module_ &m) { typedef pyopencl::memory_pool cls; - py::class_ wrapper( m, "SVMPool"); + py::class_ wrapper( + m, "SVMPool", + py::intrusive_ptr( + [](cls *o, PyObject *po) noexcept { o->set_self_py(po); }) + ); wrapper - .def(py::init, unsigned>(), + .def(py::init, unsigned>(), py::arg("allocator"), /* py::kw_only(), */ py::arg("leading_bits_in_bin_id")=4