225 changes: 225 additions & 0 deletions ci/compute-sanitizer-suppressions.xml
@@ -0,0 +1,225 @@
<?xml version="1.0" encoding="utf-8"?>
Contributor

Do we need this xml?

Contributor Author

It is needed in order to run compute-sanitizer's initcheck tool (compute-sanitizer --tool initcheck) without false positives. It will be useful if we include initcheck runs as part of CI in the future (which I think we should consider).
CCCL has a similar file in its repo: https://github.com/NVIDIA/cccl/blob/main/ci/compute-sanitizer-suppressions.xml

<ComputeSanitizerOutput>
<record>
<kind>Initcheck</kind>
<what>
<text>Uninitialized __global__ memory read of size 4 bytes</text>
<size>4</size>
</what>
<where>
<func>.*</func>
</where>
<hostStack>
<frame>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<func>cusparseCsr2cscEx2</func>
<module>.*libcusparse.so.*</module>
</frame>
</hostStack>
</record>
<record>
<kind>Initcheck</kind>
<what>
<text>Uninitialized __global__ memory read of size 4 bytes</text>
<size>4</size>
</what>
<where>
<func>ThreadLoad</func>
</where>
<hostStack>
<frame>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>libcudart.*</module>
</frame>
<frame>
<func>cudaLaunchKernel</func>
</frame>
<frame>
<func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
</frame>
</hostStack>
</record>
<record>
<kind>Initcheck</kind>
<what>
<text>Uninitialized __global__ memory read of size 2 bytes</text>
<size>2</size>
</what>
<where>
<func>ThreadLoad</func>
</where>
<hostStack>
<frame>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>libcudart.*</module>
</frame>
<frame>
<func>cudaLaunchKernel</func>
</frame>
<frame>
<func>.*cub::.*::Device(Segmented)?(Reduce|Scan)(SingleTile)?Kernel.*</func>
</frame>
</hostStack>
</record>
<record>
<kind>Initcheck</kind>
<what>
<text>Uninitialized __global__ memory read of size 8 bytes</text>
<size>8</size>
</what>
<where>
<func>DeviceSegmentedReduceKernel</func>
</where>
</record>
<record>
<kind>Initcheck</kind>
<what>
<text>Uninitialized __global__ memory read of size 4 bytes</text>
<size>4</size>
</what>
<where>
<func>ThreadLoad</func>
</where>
<hostStack>
<frame>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>libcudart.*</module>
</frame>
<frame>
<module>libcudart.*</module>
</frame>
<frame>
<module>.*libcuopt.*</module>
</frame>
<frame>
<func>.*Device(Reduce|Scan)Kernel.*</func>
</frame>
</hostStack>
</record>
<!-- Rule matching CCCL's pattern of copying tuples back to the host after reduce_by_key; the tuples contain uninitialized padding -->
<!-- Because of aggressive inlining, thrust calls are elided from the host stack, which prevents a finer-grained rule. In practice this is good enough. -->
<record>
<kind>InitcheckApiError</kind>
<level>Error</level>
<what>
<text>Host API uninitialized memory access</text>
<accessSize>16</accessSize>
</what>
<hostStack>
<frame>
<func>cuMemcpyDtoHAsync.*</func>
<module>.*libcuda.so.*</module>
</frame>
</hostStack>
</record>
<!-- Suppress uninitialized copies in rmm container copy constructors (device_buffer / device_uvector) - often the underlying storage is allocated but not yet filled -->
<record>
<kind>InitcheckApiError</kind>
<level>Error</level>
<what>
<text>Host API uninitialized memory access</text>
</what>
<hostStack>
<frame>
<func>cuMemcpyAsync</func>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*librmm.so.*</module>
</frame>
<frame>
<func>rmm::device_buffer::device_buffer</func>
<module>.*librmm.so.*</module>
</frame>
</hostStack>
</record>
<record>
<kind>InitcheckApiError</kind>
<level>Error</level>
<what>
<text>Host API uninitialized memory access</text>
</what>
<hostStack>
<frame>
<func>cuMemcpyAsync</func>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*librmm.so.*</module>
</frame>
<frame>
<module>.*librmm.so.*</module>
</frame>
<frame>
<func>rmm::device_uvector.*::device_uvector</func>
<module>.*libcuopt.so.*</module>
</frame>
</hostStack>
</record>
<!-- Uninitialized device-to-device copies are usually harmless - if actually bogus, errors may be caught later on -->
<record>
<kind>InitcheckApiError</kind>
<level>Error</level>
<what>
<text>Host API uninitialized memory access</text>
</what>
<hostStack>
<frame>
<func>cuMemcpyDtoDAsync.*</func>
<module>.*libcuda.so.*</module>
</frame>
</hostStack>
</record>
<record>
<kind>InitcheckApiError</kind>
<level>Error</level>
<what>
<text>Host API uninitialized memory access</text>
</what>
<hostStack>
<frame>
<func>cuMemcpyAsync</func>
<module>.*libcuda.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<module>.*libcudart.so.*</module>
</frame>
<frame>
<func>cudaMemcpyAsync</func>
</frame>
<frame>
<func>rmm::device_buffer::resize</func>
<module>.*librmm.so.*</module>
</frame>
</hostStack>
</record>
</ComputeSanitizerOutput>
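
The reduce_by_key padding rule above corresponds to a pattern like the minimal sketch below (illustrative only, not part of this PR; all names are made up): a value type with internal padding is reduced on the device and copied back to the host. Since the reduction only writes the members, the padding bytes remain uninitialized and initcheck reports the host copy. The rmm copy-constructor rules cover the analogous case where freshly allocated, not yet written storage is copied.

// padding_sketch.cu - illustrative only
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>

// 4 bytes of padding sit between `i` and `d`; nothing below ever writes them.
struct item_t {
  int i;
  double d;
};

struct add_items {
  __host__ __device__ item_t operator()(item_t a, item_t b) const
  {
    return item_t{a.i + b.i, a.d + b.d};  // members written, padding untouched
  }
};

int main()
{
  thrust::device_vector<int> keys(8, 0);
  thrust::device_vector<item_t> vals(8, item_t{1, 1.0});
  thrust::device_vector<int> keys_out(1);
  thrust::device_vector<item_t> vals_out(1);

  thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                        keys_out.begin(), vals_out.begin(),
                        thrust::equal_to<int>{}, add_items{});

  // The 16-byte result is copied back with its padding bytes, which initcheck
  // reports as "Host API uninitialized memory access".
  thrust::host_vector<item_t> host_result = vals_out;
  return 0;
}
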
2 changes: 1 addition & 1 deletion cpp/libmps_parser/src/mps_parser.cpp
@@ -1139,7 +1139,7 @@ void mps_parser_t<i_t, f_t>::parse_bounds(std::string_view line)
c_values.emplace_back(f_t(0));
variable_lower_bounds.emplace_back(0);
variable_upper_bounds.emplace_back(+std::numeric_limits<f_t>::infinity());
var_types.resize(var_types.size() + 1);
var_types.emplace_back('C');
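// 'C' marks the variable as continuous; the removed resize() appended a value-initialized '\0' instead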
itr = var_names_map.find(std::string(var_name));
}
i_t var_id = itr->second;
43 changes: 43 additions & 0 deletions cpp/src/linear_programming/cusparse_view.cu
@@ -20,6 +20,7 @@
#include <linear_programming/cusparse_view.hpp>
#include <linear_programming/utils.cuh>
#include <mip/mip_constants.hpp>
#include <utilities/cuda_helpers.cuh>

#include <raft/sparse/detail/cusparse_macros.h>
#include <raft/sparse/detail/cusparse_wrappers.h>
@@ -541,12 +542,54 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
A_indices_(dummy_int)
{
}
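
// Wraps raft::sparse::detail::cusparsespmv but first marks the destination vector's
// buffer as initialized, so that compute-sanitizer's initcheck does not report a
// false positive on the SpMV output buffer.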
template <typename T>
cusparseStatus_t cusparsespmv_wrapper(cusparseHandle_t handle,
cusparseOperation_t opA,
const T* alpha,
const cusparseSpMatDescr_t matA,
const cusparseDnVecDescr_t vecX,
const T* beta,
const cusparseDnVecDescr_t vecY,
cusparseSpMVAlg_t alg,
T* externalBuffer,
cudaStream_t stream)
{
void* dest_ptr;
int64_t dest_size;
cudaDataType valtype;
RAFT_CUSPARSE_TRY(cusparseDnVecGet(vecY, &dest_size, &dest_ptr, &valtype));
// compute-sanitizer flags a false positive here on the destination tmp buffer; silence it
cuopt::mark_memory_as_initialized(dest_ptr, dest_size, stream);

return raft::sparse::detail::cusparsespmv(
handle, opA, alpha, matA, vecX, beta, vecY, alg, externalBuffer, stream);
}

#if MIP_INSTANTIATE_FLOAT
template class cusparse_view_t<int, float>;
template cusparseStatus_t cusparsespmv_wrapper<float>(cusparseHandle_t handle,
cusparseOperation_t opA,
const float* alpha,
const cusparseSpMatDescr_t matA,
const cusparseDnVecDescr_t vecX,
const float* beta,
const cusparseDnVecDescr_t vecY,
cusparseSpMVAlg_t alg,
float* externalBuffer,
cudaStream_t stream);
#endif
#if MIP_INSTANTIATE_DOUBLE
template class cusparse_view_t<int, double>;
template cusparseStatus_t cusparsespmv_wrapper<double>(cusparseHandle_t handle,
cusparseOperation_t opA,
const double* alpha,
const cusparseSpMatDescr_t matA,
const cusparseDnVecDescr_t vecX,
const double* beta,
const cusparseDnVecDescr_t vecY,
cusparseSpMVAlg_t alg,
double* externalBuffer,
cudaStream_t stream);
#endif

} // namespace cuopt::linear_programming::detail
13 changes: 13 additions & 0 deletions cpp/src/linear_programming/cusparse_view.hpp
@@ -103,4 +103,17 @@ class cusparse_view_t {
const rmm::device_uvector<i_t>& A_offsets_;
const rmm::device_uvector<i_t>& A_indices_;
};

template <typename T>
cusparseStatus_t cusparsespmv_wrapper(cusparseHandle_t handle,
cusparseOperation_t opA,
const T* alpha,
const cusparseSpMatDescr_t matA,
const cusparseDnVecDescr_t vecX,
const T* beta,
const cusparseDnVecDescr_t vecY,
cusparseSpMVAlg_t alg,
T* externalBuffer,
cudaStream_t stream);

} // namespace cuopt::linear_programming::detail
43 changes: 22 additions & 21 deletions cpp/src/linear_programming/pdhg.cu
@@ -19,6 +19,8 @@
#include <linear_programming/utilities/ping_pong_graph.cuh>
#include <linear_programming/utils.cuh>
#include <mip/mip_constants.hpp>
#include <utilities/copy_helpers.hpp>
#include <utilities/cuda_helpers.cuh>

#include <raft/sparse/detail/cusparse_macros.h>
#include <raft/sparse/detail/cusparse_wrappers.h>
@@ -85,17 +87,16 @@ void pdhg_solver_t<i_t, f_t>::compute_next_dual_solution(rmm::device_scalar<f_t>
// Done in previous function

// K(x'+delta_x)
RAFT_CUSPARSE_TRY(
raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
CUSPARSE_OPERATION_NON_TRANSPOSE,
reusable_device_scalar_value_1_.data(), // 1
cusparse_view_.A,
cusparse_view_.tmp_primal,
reusable_device_scalar_value_0_.data(), // 0
cusparse_view_.dual_gradient,
CUSPARSE_SPMV_CSR_ALG2,
(f_t*)cusparse_view_.buffer_non_transpose.data(),
stream_view_));
RAFT_CUSPARSE_TRY(cusparsespmv_wrapper(handle_ptr_->get_cusparse_handle(),
CUSPARSE_OPERATION_NON_TRANSPOSE,
reusable_device_scalar_value_1_.data(), // 1
cusparse_view_.A,
cusparse_view_.tmp_primal,
reusable_device_scalar_value_0_.data(), // 0
cusparse_view_.dual_gradient,
CUSPARSE_SPMV_CSR_ALG2,
(f_t*)cusparse_view_.buffer_non_transpose.data(),
stream_view_));

// y - (sigma*dual_gradient)
// max(min(0, sigma*constraint_upper+primal_product), sigma*constraint_lower+primal_product)
@@ -122,16 +123,16 @@ void pdhg_solver_t<i_t, f_t>::compute_At_y()
{
// A_t @ y

RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
CUSPARSE_OPERATION_NON_TRANSPOSE,
reusable_device_scalar_value_1_.data(),
cusparse_view_.A_T,
cusparse_view_.dual_solution,
reusable_device_scalar_value_0_.data(),
cusparse_view_.current_AtY,
CUSPARSE_SPMV_CSR_ALG2,
(f_t*)cusparse_view_.buffer_transpose.data(),
stream_view_));
RAFT_CUSPARSE_TRY(cusparsespmv_wrapper(handle_ptr_->get_cusparse_handle(),
CUSPARSE_OPERATION_NON_TRANSPOSE,
reusable_device_scalar_value_1_.data(),
cusparse_view_.A_T,
cusparse_view_.dual_solution,
reusable_device_scalar_value_0_.data(),
cusparse_view_.current_AtY,
CUSPARSE_SPMV_CSR_ALG2,
(f_t*)cusparse_view_.buffer_transpose.data(),
stream_view_));
}

template <typename i_t, typename f_t>
5 changes: 5 additions & 0 deletions cpp/src/linear_programming/pdlp.cu
@@ -22,6 +22,8 @@
#include <linear_programming/pdlp.cuh>
#include <linear_programming/utils.cuh>
#include <mip/mip_constants.hpp>
#include <utilities/copy_helpers.hpp>
#include <utilities/cuda_helpers.cuh>
#include "cuopt/linear_programming/pdlp/solver_solution.hpp"

#include <raft/common/nvtx.hpp>
@@ -1048,6 +1050,9 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(
primal_size_h_,
clamp<f_t>(),
stream_view_);
// Triggers a false positive in compute-sanitizer otherwise (lack of initialization doesn't
// matter here)
cuopt::mark_span_as_initialized(make_span(unscaled_primal_avg_solution_), stream_view_);
raft::linalg::ternaryOp(unscaled_primal_avg_solution_.data(),
unscaled_primal_avg_solution_.data(),
op_problem_scaled_.variable_lower_bounds.data(),