diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp
index 3ee85249..f41efadd 100644
--- a/src/cuda/api/memory.hpp
+++ b/src/cuda/api/memory.hpp
@@ -180,11 +180,30 @@ namespace detail_ {
  *
  * @param num_bytes amount of memory to allocate in bytes
  */
+#if CUDA_VERSION >= 11020
+inline cuda::memory::region_t allocate_in_current_context(
+	size_t num_bytes, optional<stream::handle_t> stream_handle = {})
+#else
 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
+#endif
 {
+#if CUDA_VERSION >= 11020
+	if (stream_handle) {
+		device::address_t allocated = 0;
+		// Note: the typed cudaMalloc also takes its size in bytes, apparently,
+		// not in number of elements
+		auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
+		if (is_success(status) && allocated == 0) {
+			// Can this even happen? hopefully not
+			status = static_cast<status_t>(status::unknown);
+		}
+		throw_if_error_lazy(status,
+			"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
+			" bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
+		return {as_pointer(allocated), num_bytes};
+	}
+#endif
 	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
 	auto status = cuMemAlloc(&allocated, num_bytes);
 	if (is_success(status) && allocated == 0) {
 		// Can this even happen? hopefully not
@@ -195,110 +214,90 @@ inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
 	return {as_pointer(allocated), num_bytes};
 }
 
-inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
+#if CUDA_VERSION >= 11020
+inline region_t allocate(
+	context::handle_t context_handle,
+	size_t size_in_bytes,
+	optional<stream::handle_t> stream_handle = {})
+{
+	CAW_SET_SCOPE_CONTEXT(context_handle);
+	return allocate_in_current_context(size_in_bytes, stream_handle);
+}
+#else
+inline region_t allocate(
+	context::handle_t context_handle,
+	size_t size_in_bytes)
 {
 	CAW_SET_SCOPE_CONTEXT(context_handle);
 	return allocate_in_current_context(size_in_bytes);
 }
-
-} // namespace detail_
+#endif
 
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
-
-/// Allocate memory asynchronously on a specified stream.
-inline region_t allocate(
-	context::handle_t context_handle,
-	stream::handle_t stream_handle,
-	size_t num_bytes)
+inline void free_on_stream(
+	void* allocated_region_start,
+	stream::handle_t stream_handle)
 {
-	device::address_t allocated = 0;
-	// Note: the typed cudaMalloc also takes its size in bytes, apparently,
-	// not in number of elements
-	auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
-	if (is_success(status) && allocated == 0) {
-		// Can this even happen? hopefully not
-		status = static_cast<status_t>(status::unknown);
-	}
+	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
 	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
-		" bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
-	return {as_pointer(allocated), num_bytes};
+		"Failed scheduling an asynchronous freeing of the global memory region starting at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
+		+ stream::detail_::identify(stream_handle));
 }
+#endif // CUDA_VERSION >= 11020
-} // namespace detail_
-
-/**
- * Schedule an allocation of device-side memory on a CUDA stream.
- *
- * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
- * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
- *
- * @throws cuda::runtime_error if scheduling fails for any reason
- *
- * @param stream the stream on which to register the allocation
- * @param size_in_bytes the amount of memory to allocate
- * @return a pointer to the region of memory which will become allocated once the stream
- * completes all previous tasks and proceeds to also complete the allocation.
- */
-region_t allocate(const stream_t& stream, size_t size_in_bytes);
-
-} // namespace async
-#endif
-
-/// Free a region of device-side memory (regardless of how it was allocated)
-inline void free(void* ptr)
+inline void free_in_current_context(
+	context::handle_t current_context_handle,
+	void* allocated_region_start)
 {
-	auto result = cuMemFree(address(ptr));
-#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	auto result = cuMemFree(address(allocated_region_start));
 	if (result == status::success) { return; }
-#else
-	if (result == status::success or result == status::context_is_destroyed) { return; }
+#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
+	if (result == status::context_is_destroyed) { return; }
 #endif
-	throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
+	throw runtime_error(result, "Freeing device memory at "
+		+ cuda::detail_::ptr_as_hex(allocated_region_start)
+		+ " in " + context::detail_::identify(current_context_handle));
 }
 
-/// @copydoc free(void*)
-inline void free(region_t region) { free(region.start()); }
+} // namespace detail_
 
+/// Free a region of device-side memory (regardless of how it was allocated)
 #if CUDA_VERSION >= 11020
-namespace async {
-
-namespace detail_ {
+inline void free(void* region_start, optional_ref<const stream_t> stream = {});
+#else
+inline void free(void* ptr);
+#endif
 
-inline void free(
-	context::handle_t context_handle,
-	stream::handle_t stream_handle,
-	void* allocated_region_start)
+#if CUDA_VERSION >= 11020
+/// @copydoc free(void*, optional_ref<const stream_t>)
+inline void free(region_t region, optional_ref<const stream_t> stream = {})
 {
-	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
-	throw_if_error_lazy(status,
-		"Failed scheduling an asynchronous freeing of the global memory region starting at "
-		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
-		+ stream::detail_::identify(stream_handle, context_handle) );
+	free(region.start(), stream);
 }
+#else
+/// @copydoc free(void*)
+inline void free(region_t region)
+{
+	free(region.start());
+}
+#endif
 
-} // namespace detail_
-
+#if CUDA_VERSION >= 11020
 /**
- * Schedule a de-allocation of device-side memory on a CUDA stream.
+ * Schedule an allocation of device-side memory on a CUDA stream.
  *
- * @throws cuda::runtime_error if freeing fails
+ * @note The CUDA memory allocator guarantees alignment "suitabl[e] for any kind of variable"
+ * (CUDA 9.0 Runtime API documentation), so probably at least 128 bytes.
+ *
+ * @throws cuda::runtime_error if scheduling fails for any reason
  *
  * @param stream the stream on which to register the allocation
+ * @param size_in_bytes the amount of memory to allocate
+ * @return a pointer to the region of memory which will become allocated once the stream
+ * completes all previous tasks and proceeds to also complete the allocation.
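+ *
+ * A minimal usage sketch (illustrative only; assumes a valid current device,
+ * and uses only the optional-stream overloads introduced in this change):
+ * @code
+ * auto stream = cuda::device::current::get().default_stream();
+ * auto region = cuda::memory::device::allocate(1024, stream);
+ * // ... enqueue work on `stream` which uses `region` ...
+ * cuda::memory::device::free(region, stream);
+ * @endcode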
 */
-	///@{
-void free(const stream_t& stream, void* region_start);
-
-inline void free(const stream_t& stream, region_t region)
-{
-	free(stream, region.data());
-}
-///@}
-
-} // namespace async
+region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
 #endif
 
 /**
@@ -335,7 +334,9 @@ namespace detail_ {
 // Note: Allocates _in the current context_! No current context => failure!
 struct allocator {
-	void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); }
+	void* operator()(size_t num_bytes) const {
+		return detail_::allocate_in_current_context(num_bytes).start();
+	}
 };
 
 struct deleter {
@@ -354,9 +355,10 @@ struct deleter {
  * @param start The first location to set to @p value ; must be properly aligned.
  * @param value A (properly aligned) value to set T-elements to.
  * @param num_elements The number of type-T elements (i.e. _not_ necessarily the number of bytes).
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
 template <typename T>
-void typed_set(T* start, const T& value, size_t num_elements);
+void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -367,10 +369,26 @@
  * @param start starting address of the memory region to set, in a CUDA
  * device's global memory
  * @param num_bytes size of the memory region in bytes
+ * @param stream a stream on which to schedule the operation; may be omitted,
+ * in which case the bytes are set synchronously
 */
-inline void set(void* start, int byte_value, size_t num_bytes)
+inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return typed_set<unsigned char>(static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
+	return typed_set<unsigned char>(
+		static_cast<unsigned char*>(start),
+		static_cast<unsigned char>(byte_value),
+		num_bytes,
+		stream);
 }
 
 /**
@@ -380,10 +398,11 @@ inline void set(region_t region, int byte_value)
  *
  * @param byte_value value to set the memory region to
  * @param region a region to zero-out, in a CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	set(region.start(), byte_value, region.size());
+	set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -392,10 +411,11 @@ inline void set(region_t region, int byte_value)
  * @param start the beginning of a region of memory to zero-out, accessible
  * within a CUDA device's global memory
  * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
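+ *
+ * For example (a sketch, relying on this header's own allocate()):
+ * @code
+ * auto region = cuda::memory::device::allocate(4096);
+ * cuda::memory::device::zero(region.start(), region.size()); // synchronous; no stream passed
+ * @endcode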
 */
-inline void zero(void* start, size_t num_bytes)
+inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	set(start, 0, num_bytes);
+	set(start, 0, num_bytes, stream);
 }
 
 /**
@@ -403,23 +423,24 @@
  *
  * @param region the memory region to zero-out, accessible as a part of a
  * CUDA device's global memory
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	zero(region.start(), region.size());
+	zero(region.start(), region.size(), stream);
 }
-
 /**
  * Sets all bytes of a single pointed-to value to 0
  *
  * @param ptr pointer to a value of a certain type, accessible within
  * a CUDA device's global memory
+ * @param stream an existing stream on which to schedule this action; may be omitted
 */
 template <typename T>
-inline void zero(T* ptr)
+inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
 {
-	zero(ptr, sizeof(T));
+	zero(ptr, sizeof(T), stream);
 }
 
 } // namespace device
@@ -427,15 +448,6 @@ inline void zero(T* ptr)
 
 /// Asynchronous memory operations
 namespace detail_ {
 
-/**
- * Asynchronous versions of @ref memory::copy functions.
- *
- *
- * @note Since we assume Compute Capability >= 2.0, all devices support the
- * Unified Virtual Address Space, so the CUDA driver can determine, for each pointer,
- * where the data is located, and one does not have to specify this.
- */
-
 ///@{
 
 /**
@@ -692,8 +704,9 @@ inline void copy(c_array<T, N>& destination, T* source, optional_ref<const stream_t> stream = {});
 
 /**
  * Sets all bytes in a region of memory to a fixed value
@@ -704,10 +717,11 @@ void set(void* ptr, int byte_value, size_t num_bytes);
  * @param region the memory region to set; may be in host-side memory,
  * global CUDA-device-side memory or CUDA-managed memory.
  * @param byte_value value to set the memory region to
+ * @param stream A stream on which to schedule this action; may be omitted.
  */
-inline void set(region_t region, int byte_value)
+inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
 {
-	return set(region.start(), byte_value, region.size());
+	return set(region.start(), byte_value, region.size(), stream);
 }
 
 /**
@@ -715,10 +729,11 @@ inline void set(region_t region, int byte_value)
  *
 * @param region the memory region to zero-out; may be in host-side memory,
 * global CUDA-device-side memory or CUDA-managed memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
 */
-inline void zero(region_t region)
+inline void zero(region_t region, optional_ref<const stream_t> stream = {})
 {
-	return set(region, 0);
+	return set(region, 0, stream);
 }
 
 /**
@@ -727,10 +742,11 @@ inline void zero(region_t region)
  *
 * @param ptr the beginning of a region of memory to zero-out; may be in host-side
 * memory, global CUDA-device-side memory or CUDA-managed memory.
 * @param num_bytes the size in bytes of the region of memory to zero-out
+ * @param stream A stream on which to schedule this action; may be omitted.
 */
-inline void zero(void* ptr, size_t num_bytes)
+inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
 {
-	return set(ptr, 0, num_bytes);
+	return set(ptr, 0, num_bytes, stream);
 }
 
 /**
@@ -738,7 +754,8 @@ inline void zero(void* ptr, size_t num_bytes)
 * Sets all bytes of a single pointed-to value to 0
 *
 * @param ptr pointer to a single element of a certain type, which may
 * be in host-side memory, global CUDA-device-side memory or CUDA-managed
- * memory
+ * memory.
+ * @param stream A stream on which to schedule this action; may be omitted.
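+ *
+ * For example (a sketch; the pointer could equally be host-side or managed):
+ * @code
+ * auto p = static_cast<float*>(cuda::memory::device::allocate(sizeof(float)).start());
+ * cuda::memory::zero(p); // zeroes the single pointed-to float, synchronously
+ * @endcode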
 */
 template <typename T>
-inline void zero(T* ptr)
+inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
 {
-	zero(ptr, sizeof(T));
+	zero(ptr, sizeof(T), stream);
 }
@@ -1269,11 +1286,8 @@ inline void copy(void* destination, const_region_t source, optional_ref<const stream_t> stream = {})
 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes in a stretch of memory to a single value
- *
- * @note asynchronous version of @ref memory::set(void*, int, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param byte_value value to set the memory region to
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
- */
-inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
-{
-	return typed_set<unsigned char>(
-		static_cast<unsigned char*>(start),
-		static_cast<unsigned char>(byte_value),
-		num_bytes,
-		stream);
-}
-
 /**
  * Asynchronously sets all bytes in a stretch of memory to 0.
  *
- * @note asynchronous version of @ref memory::zero(void*, size_t)
- *
- * @param start starting address of the memory region to set,
- * in a CUDA device's global memory
- * @param num_bytes size of the memory region in bytes
- * @param stream stream on which to schedule this action
+ * @param start starting address of the memory region to set, in a CUDA device's global memory
+ * @param num_bytes size of the memory region in bytes
+ * @param stream A stream on which to enqueue the operation; may be omitted.
 */
 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
 
-/**
- * Asynchronously sets all bytes of a single pointed-to value
- * to 0 (zero).
- *
- * @note asynchronous version of @ref memory::zero(T*)
- *
- * @param ptr a pointer to the value to be to zero; must be valid in the
- * CUDA context of @p stream
- * @param stream stream on which to schedule this action
- */
-template <typename T>
-inline void zero(T* ptr, optional_ref<const stream_t> stream)
-{
-	zero(ptr, sizeof(T), stream);
-}
-
-} // namespace async
-
 } // namespace device
 
 namespace inter_context {
@@ -1826,10 +1801,13 @@ inline void deregister(const_region_t region)
 * Sets all bytes in a stretch of host-side memory to a single value
 *
 * @note a wrapper for @ref ::std::memset
- *
+ * @param byte_value The value to set each byte in the memory region to.
+ */
+///@{
+
+/**
 * @param start starting address of the memory region to set,
 * in host memory; can be either CUDA-allocated or otherwise.
- * @param byte_value value to set the memory region to
 * @param num_bytes size of the memory region in bytes
 */
 inline void set(void* start, int byte_value, size_t num_bytes)
 {
 	::std::memset(start, byte_value, num_bytes);
@@ -1838,6 +1816,14 @@
 	// TODO: Error handling?
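+	// Note: ::std::memset reports no errors, so unlike the CUDA memset
+	// wrappers above, there is no status to check here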
 }
+
+/**
+ * @param region The region of memory to set to the fixed value
+ */
+inline void set(region_t region, int byte_value)
+{
+	memory::set(region.start(), byte_value, region.size(), nullopt);
+}
+
 /**
  * Zero-out a region of host memory
  *
@@ -1856,7 +1842,7 @@ inline void zero(void* start, size_t num_bytes)
 */
 inline void zero(region_t region)
 {
-	set(region, 0);
+	host::set(region, 0);
 }
 
 /**
@@ -2128,8 +2114,6 @@ inline void free(region_t region)
 	free(region.start());
 }
 
-namespace async {
-
 namespace detail_ {
 
 inline void prefetch(
@@ -2164,8 +2148,6 @@ void prefetch_to_host(
 	const_region_t region,
 	const stream_t& stream);
 
-} // namespace async
-
 } // namespace managed
 
 namespace mapped {
@@ -2374,8 +2356,9 @@ namespace detail_ {
 
 template <typename T>
 unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
 {
+	auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
 	CAW_SET_SCOPE_CONTEXT(context_handle);
-	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context);
+	return memory::detail_::make_convenient_type_unique_span<T>(size, allocate_in_current_context_);
 }
 
 } // namespace detail_
diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp
index d8ab0514..3a896405 100644
--- a/src/cuda/api/multi_wrapper_impls/memory.hpp
+++ b/src/cuda/api/multi_wrapper_impls/memory.hpp
@@ -114,32 +114,32 @@ inline region_t allocate(const device_t& device, size_t size_in_bytes)
 	return allocate(pc, size_in_bytes);
 }
 
-namespace async {
 #if CUDA_VERSION >= 11020
-inline region_t allocate(const stream_t& stream, size_t size_in_bytes)
+inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream = {})
 {
-	return detail_::allocate(stream.context().handle(), stream.handle(), size_in_bytes);
+	return stream ?
+		detail_::allocate(stream->context().handle(), size_in_bytes, stream->handle()) :
+		detail_::allocate_in_current_context(size_in_bytes);
 }
 
-inline void free(const stream_t& stream, void* region_start)
-{
-	return detail_::free(stream.context().handle(), stream.handle(), region_start);
-}
 #endif // CUDA_VERSION >= 11020
 
-template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream)
-{
-	detail_::set(start, value, num_elements, stream.handle());
-}
-
-inline void zero(void* start, size_t num_bytes, const stream_t& stream)
+#if CUDA_VERSION >= 11020
+inline void free(void* region_start, optional_ref<const stream_t> stream)
+#else
+inline void free(void* region_start)
+#endif // CUDA_VERSION >= 11020
 {
-	detail_::zero(start, num_bytes, stream.handle());
+#if CUDA_VERSION >= 11020
+	if (stream) {
+		detail_::free_on_stream(region_start, stream->handle());
+		return;
+	}
+#endif
+	context::current::detail_::scoped_existence_ensurer_t ensurer;
+	detail_::free_in_current_context(ensurer.context_handle, region_start);
 }
 
-} // namespace async
-
 } // namespace device
 
 namespace inter_context {
@@ -237,8 +237,6 @@ ::std::vector<device_t> expected_accessors(const_region_t region, con
 	return devices;
 }
 
-namespace async {
-
 inline void prefetch(
 	const_region_t region,
 	const cuda::device_t& destination,
@@ -252,8 +250,6 @@ inline void prefetch_to_host(const_region_t region, const stream_t& stream)
 {
 	detail_::prefetch(region, CU_DEVICE_CPU, stream.handle());
 }
 
-} // namespace async
-
 inline region_t allocate(
 	const context_t& context,
 	size_t num_bytes,
@@ -396,8 +392,8 @@ inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attrib
 namespace device {
 
 template <typename T>
-inline void typed_set(T* start, const T& value, size_t num_elements)
+inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream)
 {
 	context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
 	static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
 	static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
@@ -405,25 +404,34 @@ inline void typed_set(T* start, const T& value, size_t num_elements)
 	// TODO: Consider checking for alignment when compiling without NDEBUG
 	status_t result {CUDA_SUCCESS};
 	switch(sizeof(T)) {
-	case 1: result = cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
-	case 2: result = cuMemsetD16(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
-	case 4: result = cuMemsetD32(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
+	case 1: result = stream ?
+		cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
+		cuMemsetD8 (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
+	case 2: result = stream ?
+		cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD16 (address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
+	case 4: result = stream ?
+		cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
+		cuMemsetD32 (address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
 	}
 	throw_if_error_lazy(result, "Setting global device memory bytes");
 }
 
 } // namespace device
 
-inline void set(void* ptr, int byte_value, size_t num_bytes)
+inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
 {
 	switch ( type_of(ptr) ) {
 	case device_:
 //	case managed_:
 	case unified_:
-		memory::device::set(ptr, byte_value, num_bytes); break;
+		memory::device::set(ptr, byte_value, num_bytes, stream); break;
 //	case unregistered_:
 	case host_:
-		::std::memset(ptr, byte_value, num_bytes); break;
+		if (stream) {
+			throw ::std::invalid_argument("Asynchronously setting host-side memory is not currently supported");
+		} else { ::std::memset(ptr, byte_value, num_bytes); }
+		break;
 	default:
 		throw runtime_error(
 			cuda::status::invalid_value,
diff --git a/src/cuda/api/stream.hpp b/src/cuda/api/stream.hpp
index cf6547e2..1912e8ac 100644
--- a/src/cuda/api/stream.hpp
+++ b/src/cuda/api/stream.hpp
@@ -481,7 +481,7 @@ class stream_t {
 	{
 		// Is it necessary to set the device? I wonder.
 		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-		memory::device::async::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
+		memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
 	}
 
 	/// @copydoc memset(void *, int, size_t) const
@@ -504,7 +504,7 @@ class stream_t {
 	void memzero(void *start, size_t num_bytes) const
 	{
 		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
-		memory::device::async::detail_::zero(start, num_bytes, associated_stream.handle_);
+		memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
 	}
 
 	/**
@@ -590,7 +590,7 @@ class stream_t {
 	 */
 	memory::region_t allocate(size_t num_bytes) const
 	{
-		return memory::device::async::allocate(associated_stream, num_bytes);
+		return memory::device::allocate(num_bytes, associated_stream);
 	}
 
 	memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes);
@@ -601,14 +601,14 @@ class stream_t {
 	///@{
 	void free(void* region_start) const
 	{
-		memory::device::async::free(associated_stream, region_start);
+		memory::device::free(region_start, associated_stream);
 	}
 
 	void free(memory::region_t region) const
 	{
-		memory::device::async::free(associated_stream, region);
+		memory::device::free(region, associated_stream);
 	}
-#endif
+#endif // CUDA_VERSION >= 11020
 
 	/**
 	 * Sets the attachment of a region of managed memory (i.e. in the address space visible