From dab09dbde6d9a93cb0f9c4dd9e73510537550220 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 23 Sep 2024 00:27:46 +0300 Subject: [PATCH] Warning fixes: * Corrected a CUDA version include guard in `graph/instance.hpp` which was enabling an unused parameter in CUDA 11.0 - 11.3.x * Explicitly cast a `size_t` to `unsigned int` in `module.hpp`, to avoid narrowing warnings * In the asyncAPI example, now using iostreams printing rather than a `printf()` with an inexact format specifier (`size_t` vs `unsigned long`) * Now using float literals, rather than double literals, to set float variables or fill float buffers, in the bandwidthTest and jacobiCudaGraphs examples * streamOrderedAllocation example: Explicit cast from `size_t` to `int` to avoid a warning about narrowing --- examples/modified_cuda_samples/asyncAPI/asyncAPI.cu | 2 +- .../modified_cuda_samples/bandwidthtest/bandwidthtest.cpp | 4 ++-- examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp | 2 +- .../streamOrderedAllocation/streamOrderedAllocation.cu | 2 +- src/cuda/api/graph/instance.hpp | 2 +- src/cuda/api/module.hpp | 2 +- src/cuda/api/multi_wrapper_impls/graph.hpp | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu index f578cfdc..386ddb02 100644 --- a/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu +++ b/examples/modified_cuda_samples/asyncAPI/asyncAPI.cu @@ -34,7 +34,7 @@ bool correct_output(cuda::span data, const int x) for (size_t i = 0; i < data.size(); i++) if (data[i] != x) { - printf("Error! data[%lu] = %d, ref = %d\n", i, data[i], x); + std::cout << "Error! 
data[" << i << "] = " << data[i] << ", ref = " << x << '\n'; return false; } return true; diff --git a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp index d21c91c5..19857887 100644 --- a/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp +++ b/examples/modified_cuda_samples/bandwidthtest/bandwidthtest.cpp @@ -105,8 +105,8 @@ int main() cuda::memory::copy(h_aPinned, h_aPageable, bytes); // Note: the following two instructions can be replaced with CUDA API wrappers // calls - cuda::memory::host::zero(), but that won't improve anything - std::fill_n(h_bPageable, nElements, 0.0); - std::fill_n(h_bPinned, nElements, 0.0); + std::fill_n(h_bPageable, nElements, 0.0f); + std::fill_n(h_bPinned, nElements, 0.0f); std::cout << "\nDevice: " << cuda::device::current::get().name() << "\n"; std::cout << "\nTransfer size (MB): " << (bytes / Mi) << "\n"; diff --git a/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp index eb11c8a2..ea556f42 100644 --- a/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp +++ b/examples/modified_cuda_samples/jacobiCudaGraphs/main.cpp @@ -149,7 +149,7 @@ int main(int argc, char **argv) createLinearSystem(A, b); - float convergence_threshold = 1.0e-2; + float convergence_threshold = 1.0e-2f; int max_num_iterations = 4 * N_ROWS * N_ROWS; // create timer diff --git a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu index 1a6be6e2..fd3ed314 100644 --- a/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu +++ b/examples/modified_cuda_samples/streamOrderedAllocation/streamOrderedAllocation.cu @@ -115,7 +115,7 @@ int basicStreamOrderedAllocation( auto d_c = span(stream.enqueue.allocate(c.size() * sizeof(float))); stream.enqueue.copy(d_a, a); 
stream.enqueue.copy(d_b, b); - stream.enqueue.kernel_launch(vectorAddGPU, launch_config, d_a.data(), d_b.data(), d_c.data(), c.size()); + stream.enqueue.kernel_launch(vectorAddGPU, launch_config, d_a.data(), d_b.data(), d_c.data(), (int) c.size()); stream.enqueue.free(d_a); stream.enqueue.free(d_b); stream.enqueue.copy(c, d_c); diff --git a/src/cuda/api/graph/instance.hpp b/src/cuda/api/graph/instance.hpp index 027a7cf4..49612b08 100644 --- a/src/cuda/api/graph/instance.hpp +++ b/src/cuda/api/graph/instance.hpp @@ -446,7 +446,7 @@ void set_node_parameters( inline instance_t instantiate( const template_t& template_ -#if CUDA_VERSION >= 11000 +#if CUDA_VERSION >= 11040 , bool free_previous_allocations_before_relaunch = false #endif #if CUDA_VERSION >= 12000 diff --git a/src/cuda/api/module.hpp b/src/cuda/api/module.hpp index f3a1fbd4..f6a7d89d 100644 --- a/src/cuda/api/module.hpp +++ b/src/cuda/api/module.hpp @@ -67,7 +67,7 @@ inline void destroy(handle_t handle, context::handle_t context_handle, device::i inline unique_span get_kernel_handles(handle_t module_handle, size_t num_kernels) { auto result = make_unique_span(num_kernels); - auto status = cuModuleEnumerateFunctions(result.data(), num_kernels, module_handle); + auto status = cuModuleEnumerateFunctions(result.data(), (unsigned int) num_kernels, module_handle); throw_if_error_lazy(status, "Failed enumerating the kernels in " + module::detail_::identify(module_handle)); return result; } diff --git a/src/cuda/api/multi_wrapper_impls/graph.hpp b/src/cuda/api/multi_wrapper_impls/graph.hpp index 89d9c939..5f710b8f 100644 --- a/src/cuda/api/multi_wrapper_impls/graph.hpp +++ b/src/cuda/api/multi_wrapper_impls/graph.hpp @@ -112,7 +112,7 @@ inline instance_t template_t::instantiate( { return graph::instantiate( *this -#if CUDA_VERSION >= 11000 +#if CUDA_VERSION >= 11040 , free_previous_allocations_before_relaunch #endif #if CUDA_VERSION >= 11700