Skip to content

Commit

Permalink
Misc Cleanup (#814)
Browse files Browse the repository at this point in the history
* Cleanup #define's in filter.cuh

* Cleanup #define's in other files

* Fix dereferencing type-punned pointer bug in Release mode

* Fix Werror=uninitialized compile error when MATX_EN_OPENBLAS=ON in Release mode

* Fix uninitialized variable bug in svd plan

* Update PrintTests for default tensor name
  • Loading branch information
tmartin-gh authored Dec 20, 2024
1 parent b1a02f1 commit 1bfe456
Show file tree
Hide file tree
Showing 51 changed files with 650 additions and 631 deletions.
10 changes: 5 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,10 @@ if (MATX_NVTX_FLAGS)
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
endif()
if (MATX_BUILD_32_BIT)
set(INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE INDEX_32_BIT)
set(MATX_NVPL_INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)
else()
set(INT_TYPE "ilp64")
set(MATX_NVPL_INT_TYPE "ilp64")
endif()

# Host support
Expand All @@ -211,13 +211,13 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS)
endif()

if (MATX_EN_NVPL)
message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface")
message(STATUS "Enabling NVPL library support for ARM CPUs with ${MATX_NVPL_INT_TYPE} interface")
find_package(nvpl REQUIRED COMPONENTS fft blas lapack HINTS ${blas_DIR})
if (NOT MATX_BUILD_32_BIT)
target_compile_definitions(matx INTERFACE NVPL_ILP64)
endif()
target_compile_definitions(matx INTERFACE NVPL_LAPACK_COMPLEX_CUSTOM)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp nvpl::lapack_${INT_TYPE}_omp)
target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${MATX_NVPL_INT_TYPE}_omp nvpl::lapack_${MATX_NVPL_INT_TYPE}_omp)
target_compile_definitions(matx INTERFACE MATX_EN_NVPL)
else()
# FFTW
Expand Down
28 changes: 14 additions & 14 deletions examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ using namespace matx;
* instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
* kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
* boilerplate code around the original expression. This custom operator can then be used either alone or inside
 * other arithmetic expressions, and only a single load is issued for each tensor.
 *
 * other arithmetic expressions, and only a single load is issued for each tensor.
 *
 * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
* shows the performance difference.
*/
Expand Down Expand Up @@ -76,7 +76,7 @@ public:
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

out_(idx) = S * cdf_d1 - K * expRT * cdf_d2;
}
Expand All @@ -87,20 +87,20 @@ public:

/* Arithmetic expression */
template<typename T1>
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
void compute_black_scholes_matx(tensor_t<T1,1>& K,
tensor_t<T1,1>& S,
tensor_t<T1,1>& V,
tensor_t<T1,1>& r,
tensor_t<T1,1>& T,
tensor_t<T1,1>& output,
cudaExecutor& exec)
{
auto VsqrtT = V * sqrt(T);
auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;
auto d2 = d1 - VsqrtT;
auto cdf_d1 = normcdf(d1);
auto cdf_d2 = normcdf(d2);
auto expRT = exp(-1 * r * T);
auto expRT = exp(-1 * r * T);

(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);
}
Expand All @@ -120,13 +120,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
tensor_t<dtype, 1> V_tensor{{input_size}};
tensor_t<dtype, 1> r_tensor{{input_size}};
tensor_t<dtype, 1> T_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};

cudaStream_t stream;
cudaStreamCreate(&stream);
cudaExecutor exec{stream};

compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

cudaEvent_t start, stop;
cudaEventCreate(&start);
Expand Down Expand Up @@ -154,11 +154,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventElapsedTime(&time_ms, start, stop);

printf("Time with custom operator = %.2fms per iteration\n",
time_ms / num_iterations);
time_ms / num_iterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
2 changes: 1 addition & 1 deletion examples/cgsolve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/channelize_poly_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
}
cudaEventRecord(stop, stream);
exec.sync();
CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
cudaEventElapsedTime(&elapsed_ms, start, stop);

const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
Expand All @@ -112,7 +112,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
printf("\n");
}

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();

cudaEventDestroy(start);
cudaEventDestroy(stop);
Expand Down
16 changes: 8 additions & 8 deletions examples/conv2d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,23 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();

index_t iN = 4;
index_t iM = 6;

index_t fN = 4;
index_t fM = 2;

auto in = make_tensor<int>({iN,iM});
auto filter = make_tensor<int>({fN,fM});

in.SetVals({ {1,2,3,4,5,6},
{5,4,3,2,1,0},
{3,4,5,6,7,8},
{1,2,3,4,5,6},
});

filter.SetVals({ {1,2},
filter.SetVals({ {1,2},
{3,4},
{5,6},
{7,8}});
Expand All @@ -73,9 +73,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t oM = iM - fM + 1;
auto mode = MATX_C_MODE_VALID;
#endif

auto out = make_tensor<int>({oN,oM});

(out = conv2d(in, filter, mode)).run();

printf("in:\n");
Expand All @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
print(out);


CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
4 changes: 2 additions & 2 deletions examples/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
for (uint32_t i = 0; i < iterations; i++) {
(outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);
}


cudaEventRecord(stop, stream);
exec.sync();
Expand Down Expand Up @@ -149,6 +149,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

matxPrintMemoryStatistics();

CUDA_CHECK_LAST_ERROR();
MATX_CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
Loading

0 comments on commit 1bfe456

Please sign in to comment.