From d350e1e59af4b7197d5452674ff690001b9988e8 Mon Sep 17 00:00:00 2001
From: Eyal Rozenberg <eyalroz1@gmx.com>
Date: Sun, 20 Oct 2024 21:43:49 +0300
Subject: [PATCH] Fixes #689: All remaining `async` subnamespace operations -
 `typed_set()`, `set()` and `zero()` - now out of the namespace, and unified
 with their non-async variants

---
 src/cuda/api/memory.hpp                     | 144 +++++++++-----------
 src/cuda/api/multi_wrapper_impls/memory.hpp |  31 +++--
 src/cuda/api/stream.hpp                     |   4 +-
 3 files changed, 85 insertions(+), 94 deletions(-)
diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index ec906af9..8f60e655 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -373,9 +373,10 @@ struct deleter {
  * @param start The first location to set to @p value ; must be properly aligned.
  * @param value A (properly aligned) value to set T-elements to.
  * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
 template <typename T>
-void typed_set(T* start, const T& value, size_t num_elements);
+void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -386,10 +387,26 @@ void typed_set(T* start, const T& value, size_t num_elements);
  * @param start starting address of the memory region to set, in a CUDA
  * device's global memory
  * @param num_bytes size of the memory region in bytes
+ * @param stream a stream on which to schedule the operation; may be omitted
+ *
+ * @note This function covers both the non-asynchronous and the asynchronous
+ * setting of all bytes in a stretch of memory to a single value:
+ *
+ * - When @p stream is omitted, this is the plain, non-asynchronous setting.
+ * - When @p stream is specified, this is the asynchronous version of the
+ *   same operation, scheduled on that stream.
+ *
+ * In both cases, @p start is the starting address of the memory region to
+ * set, in a CUDA device's global memory; @p num_bytes is the size of that
+ * region in bytes; and @p byte_value is the value to set the region to.
  */
-inline void set(void* start, int byte_value, size_t num_bytes)
+inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return typed_set<unsigned char>(static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
+	return typed_set<unsigned char>(
+		static_cast<unsigned char*>(start),
+		static_cast<unsigned char>(byte_value),
+		num_bytes,
+		stream);
 }
 
 /**
@@ -399,10 +416,11 @@ inline void set(void* start, int byte_value, size_t num_bytes)
  *
  * @param byte_value value to set the memory region to
  * @param region a region to zero-out, in a CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	set(region.start(), byte_value, region.size());
+	set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -411,10 +429,11 @@ inline void set(region_t region, int byte_value)
  * @param start the beginning of a region of memory to zero-out, accessible
  *     within a CUDA device's global memory
  * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(void* start, size_t num_bytes)
+inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	set(start, 0, num_bytes);
+	set(start, 0, num_bytes, stream);
 }
 
 /**
@@ -422,23 +441,24 @@ inline void zero(void* start, size_t num_bytes)
  *
  * @param region the memory region to zero-out, accessible as a part of a
  * CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	zero(region.start(), region.size());
+	zero(region.start(), region.size(), stream);
 }
 
-
 /**
  * Sets all bytes of a single pointed-to value to 0
  *
  * @param ptr pointer to a value of a certain type, accessible within
  *     in a CUDA device's global memory
+ * @param stream an existing stream on which to schedule this action; may be omitted
  */
 template <typename T>
-inline void zero(T* ptr)
+inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
 {
-	zero(ptr, sizeof(T));
+	zero(ptr, sizeof(T), stream);
 }
 
 } // namespace device
@@ -446,15 +466,6 @@ inline void zero(T* ptr)
 /// Asynchronous memory operations
 namespace detail_ {
 
-/**
- * Asynchronous versions of @ref memory::copy functions.
- *
- *
- * @note Since we assume Compute Capability >= 2.0, all devices support the
- * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer,
- * where the data is located, and one does not have to specify this.
- */
-
 ///@{
 
 /**
@@ -712,8 +723,9 @@ inline void copy(T(&destination)[N], T* source, optional_ref<const stream_t> str
  *     memory, global CUDA-device-side memory or CUDA-managed memory.
  * @param byte_value value to set the memory region to
  * @param num_bytes The amount of memory to set to @p byte_value
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-void set(void* ptr, int byte_value, size_t num_bytes);
+void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -724,10 +736,11 @@ void set(void* ptr, int byte_value, size_t num_bytes);
  * @param region the memory region to set; may be in host-side memory,
  * global CUDA-device-side memory or CUDA-managed memory.
  * @param byte_value value to set the memory region to
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	return set(region.start(), byte_value, region.size());
+	return set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -735,10 +748,11 @@ inline void set(region_t region, int byte_value)
  *
  * @param region the memory region to zero-out; may be in host-side memory,
  * global CUDA-device-side memory or CUDA-managed memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	return set(region, 0);
+	return set(region, 0, stream);
 }
 
 /**
@@ -747,10 +761,11 @@ inline void zero(region_t region)
  * @param ptr the beginning of a region of memory to zero-out; may be in host-side
  *     memory, global CUDA-device-side memory or CUDA-managed memory.
  * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(void* ptr, size_t num_bytes)
+inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return set(ptr, 0, num_bytes);
+	return set(ptr, 0, num_bytes, stream);
 }
 
 /**
@@ -758,7 +773,8 @@ inline void zero(void* ptr, size_t num_bytes)
  *
  * @param ptr pointer to a single element of a certain type, which may
  * be in host-side memory, global CUDA-device-side memory or CUDA-managed
- * memory
+ * memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
 template <typename T>
 inline void zero(T* ptr)
@@ -1289,11 +1305,8 @@ inline void copy(void* destination, const_region_t source, optional_ref<const st
 	copy(destination, source, source.size(), stream);
 }
 
-
 namespace device {
 
-namespace async {
-
 namespace detail_ {
 
 inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
@@ -1343,67 +1356,27 @@ inline void typed_set(T* start, const T& value, size_t num_elements, stream::han
 /**
  * Sets consecutive elements of a region of memory to a fixed value of some width
  *
- * @note A generalization of `async::set()`, for different-size units.
+ * @note A generalization of `set()`, for different-size units.
  *
  * @tparam T An unsigned integer type of size 1, 2, 4 or 8
  * @param start The first location to set to @p value ; must be properly aligned.
  * @param value A (properly aligned) value to set T-elements to.
  * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
- * @param stream The stream on which to enqueue the operation.
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
 template <typename T>
 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes in a stretch of memory to a single value
- *
- * @note asynchronous version of @ref memory::set(void*, int, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param byte_value value to set the memory region to
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
- */
-inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
-{
-	return typed_set<unsigned char>(
-		static_cast<unsigned char*>(start),
-		static_cast<unsigned char>(byte_value),
-		num_bytes,
-		stream);
-}
-
 /**
  * Asynchronously sets all bytes in a stretch of memory to 0.
  *
- * @note asynchronous version of @ref memory::zero(void*, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
+ * @param start      starting address of the memory region to set, in a CUDA device's global memory
+ * @param num_bytes  size of the memory region in bytes
+ * @param stream     A stream on which to schedule (enqueue) this action;
+ *                   may be omitted.
  */
 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes of a single pointed-to value
- * to 0 (zero).
- *
- * @note asynchronous version of @ref memory::zero(T*)
- *
- * @param ptr a pointer to the value to be to zero; must be valid in the
- * CUDA context of @p stream
- * @param stream stream on which to schedule this action
- */
-template <typename T>
-inline void zero(T* ptr, optional_ref<const stream_t> stream)
-{
-	zero(ptr, sizeof(T), stream);
-}
-
-} // namespace async
-
 } // namespace device
 
 namespace inter_context {
@@ -1846,10 +1819,13 @@ inline void deregister(const_region_t region)
  * Sets all bytes in a stretch of host-side memory to a single value
  *
  * @note a wrapper for @ref ::std::memset
- *
+ * @param byte_value The value to set each byte in the memory region to.
+ */
+///@{
+
+/**
  * @param start starting address of the memory region to set,
  * in host memory; can be either CUDA-allocated or otherwise.
- * @param byte_value value to set the memory region to
  * @param num_bytes size of the memory region in bytes
  */
 inline void set(void* start, int byte_value, size_t num_bytes)
@@ -1858,6 +1834,14 @@ inline void set(void* start, int byte_value, size_t num_bytes)
 	// TODO: Error handling?
 }
 
+/**
+ * @param region The region of memory to set to the fixed value
+ */
+inline void set(region_t region, int byte_value)
+{
+	set(region.start(), byte_value, region.size());
+}
+
 /**
  * Zero-out a region of host memory
  *
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index 57d0926b..800fb0ba 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -140,11 +140,6 @@ inline void free(void* ptr)
 
 namespace async {
 
-template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
-{
-	detail_::set(start, value, num_elements, stream.handle());
-}
 
 inline void zero(void* start, size_t num_bytes, const stream_t& stream)
 {
@@ -409,8 +404,11 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
 namespace device {
 
 template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements)
+inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream)
 {
+	// When a stream is provided, the switch below selects the asynchronous
+	// (cuMemsetD*Async) variant; no additional enqueueing may happen here, or
+	// the memory would be set twice (and byte-wise, regardless of sizeof(T)).
 	context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
 	static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
 	static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
@@ -418,25 +416,34 @@ inline void typed_set(T* start, const T& value, size_t num_elements)
 	// TODO: Consider checking for alignment when compiling without NDEBUG
 	status_t result {CUDA_SUCCESS};
 	switch(sizeof(T)) {
-	case 1: result = cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
-	case 2: result = cuMemsetD16(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
-	case 4: result = cuMemsetD32(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
+	case 1: result = stream ?
+		cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
+		cuMemsetD8      (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
+	case 2: result = stream ?
+		cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD16     (address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
+	case 4: result = stream ?
+		cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD32     (address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
 	}
 	throw_if_error_lazy(result, "Setting global device memory bytes");
 }
 
 } // namespace device
 
-inline void set(void* ptr, int byte_value, size_t num_bytes)
+inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
 {
 	switch ( type_of(ptr) ) {
 	case device_:
 //		case managed_:
 	case unified_:
-		memory::device::set(ptr, byte_value, num_bytes); break;
+		memory::device::set(ptr, byte_value, num_bytes, stream); break;
 //		case unregistered_:
 	case host_:
-		::std::memset(ptr, byte_value, num_bytes); break;
+		if (stream) {
+			throw ::std::invalid_argument("Asynchronous host-memory set's not currently supported");
+		} else { ::std::memset(ptr, byte_value, num_bytes); }
+		break;
 	default:
 		throw runtime_error(
 			cuda::status::invalid_value,
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index a7e3f0cb..1912e8ac 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -481,7 +481,7 @@ class stream_t {
 		{
 			// Is it necessary to set the device? I wonder.
 			CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-			memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
+			memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
 		}
 
 		/// @copydoc memset(void *, int, size_t) const
@@ -504,7 +504,7 @@ class stream_t {
 		void memzero(void *start, size_t num_bytes) const
 		{
 			CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-			memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_);
+			memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
 		}
 
 		/**