Skip to content

Commit

Permalink
Misc Cleanup (#814)
Browse files Browse the repository at this point in the history
* Cleanup #define's in filter.cuh

* Cleanup #define's in other files

* Fix dereferencing type-punned pointer bug in Release mode

* Fix Werror=uninitialized compile error when MATX_EN_OPENBLAS=ON in Release mode

* Fix uninitialized variable bug in svd plan

* Update PrintTests for default tensor name
  • Loading branch information
tmartin-gh authored Dec 20, 2024
1 parent b1a02f1 commit 1bfe456
Show file tree
Hide file tree
Showing 51 changed files with 650 additions and 631 deletions.
10 changes: 5 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ if (MATX_NVTX_FLAGS)
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
endif()
if (MATX_BUILD_32_BIT)
set(INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
set(MATX_NVPL_INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)
else()
set(INT_TYPE "ilp64")
set(MATX_NVPL_INT_TYPE "ilp64")
endif()

# Host support
Expand All @@ -211,13 +211,13 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS)
endif()

if (MATX_EN_NVPL)
message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface")
message(STATUS "Enabling NVPL library support for ARM CPUs with ${MATX_NVPL_INT_TYPE} interface")
find_package(nvpl REQUIRED COMPONENTS fft blas lapack HINTS ${blas_DIR})
if (NOT MATX_BUILD_32_BIT)
target_compile_definitions(matx INTERFACE NVPL_ILP64)
endif()
target_compile_definitions(matx INTERFACE NVPL_LAPACK_COMPLEX_CUSTOM)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp nvpl::lapack_${INT_TYPE}_omp)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${MATX_NVPL_INT_TYPE}_omp nvpl::lapack_${MATX_NVPL_INT_TYPE}_omp)
target_compile_definitions(matx INTERFACE MATX_EN_NVPL)
else()
# FFTW
Expand Down
28 changes: 14 additions & 14 deletions examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ using namespace matx;
* instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
* kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
* boilerplate code around the original expression. This custom operator can then be used either alone or inside
 * other arithmetic expressions, and only a single load is issued for each tensor.
 *
 * other arithmetic expressions, and only a single load is issued for each tensor.
 *
 * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
* shows the performance difference.
*/
Expand Down Expand Up @@ -76,7 +76,7 @@ public:
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

out_(idx) = S * cdf_d1 - K * expRT * cdf_d2;
}
Expand All @@ -87,20 +87,20 @@ public:

/* Arithmetic expression */
template<typename T1>
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
cudaExecutor& exec)
{
auto VsqrtT = V * sqrt(T);
auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
}
Expand All @@ -120,13 +120,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
tensor_t<dtype, 1> V_tensor{{input_size}};
tensor_t<dtype, 1> r_tensor{{input_size}};
tensor_t<dtype, 1> T_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};

cudaStream_t stream;
cudaStreamCreate(&stream);
cudaExecutor exec{stream};

compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

cudaEvent_t start, stop;
cudaEventCreate(&start);
Expand Down Expand Up @@ -154,11 +154,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventElapsedTime(&time_ms, start, stop);

printf("Time with custom operator = %.2fms per iteration\n",
time_ms / num_iterations);
time_ms / num_iterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
2 changes: 1 addition & 1 deletion examples/cgsolve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/channelize_poly_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
}
cudaEventRecord(stop, stream);
exec.sync();
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
cudaEventElapsedTime(&elapsed_ms, start, stop);

const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
Expand All @@ -112,7 +112,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
printf("\n");
}

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();

cudaEventDestroy(start);
cudaEventDestroy(stop);
Expand Down
16 changes: 8 additions & 8 deletions examples/conv2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,23 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

index_t iN = 4;
index_t iM = 6;

index_t fN = 4;
index_t fM = 2;

auto in = make_tensor<int>({iN,iM});
auto filter = make_tensor<int>({fN,fM});

in.SetVals({ {1,2,3,4,5,6},
{5,4,3,2,1,0},
{3,4,5,6,7,8},
{1,2,3,4,5,6},
});

filter.SetVals({ {1,2},
filter.SetVals({ {1,2},
{3,4},
{5,6},
{7,8}});
Expand All @@ -73,9 +73,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t oM = iM - fM + 1;
auto mode = MATX_C_MODE_VALID;
#endif

auto out = make_tensor<int>({oN,oM});

(out = conv2d(in, filter, mode)).run();

printf("in:\n");
Expand All @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
print(out);


CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
for (uint32_t i = 0; i < iterations; i++) {
(outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);
}


cudaEventRecord(stop, stream);
exec.sync();
Expand Down Expand Up @@ -149,6 +149,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

matxPrintMemoryStatistics();

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
Loading

0 comments on commit 1bfe456

Please sign in to comment.