Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions conda/recipes/libcuopt/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ cache:
- libcurand-dev
- libcusparse-dev
- cuda-cudart-dev
- cuda-driver-dev
- boost
- tbb-devel
- zlib
Expand All @@ -89,8 +88,6 @@ outputs:
cmake --install cpp/libmps_parser/build
dynamic_linking:
overlinking_behavior: "error"
missing_dso_allowlist:
- libcuda.so.1
prefix_detection:
ignore:
# See https://github.com/rapidsai/build-planning/issues/160
Expand Down Expand Up @@ -138,8 +135,6 @@ outputs:
cmake --install cpp/build
dynamic_linking:
overlinking_behavior: "error"
missing_dso_allowlist:
- libcuda.so.1
prefix_detection:
ignore:
# See https://github.com/rapidsai/build-planning/issues/160
Expand All @@ -156,7 +151,6 @@ outputs:
- rapids-logger =0.1
- librmm =${{ dep_minor_version }}
- cuda-cudart-dev
- cuda-driver-dev
- libcublas
- libcudss-dev >=0.7
- libcusparse-dev
Expand Down Expand Up @@ -198,8 +192,6 @@ outputs:
cmake --install cpp/build --component testing
dynamic_linking:
overlinking_behavior: "error"
missing_dso_allowlist:
- libcuda.so.1
string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }}
requirements:
build:
Expand Down
1 change: 0 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ set(CUOPT_PRIVATE_CUDA_LIBS
CUDA::curand
CUDA::cusolver
TBB::tbb
CUDA::cuda_driver
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this does a dlopen now, rather than dynamically linking, which is why this is removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't checked the trace but I think so. I wanted to avoid linking to the driver with these changes.

OpenMP::OpenMP_CXX)

list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt)
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dual_simplex/device_sparse_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ class device_csc_matrix_t {

// Inclusive cumulative sum to have the corresponding column for each entry
rmm::device_buffer d_temp_storage;
size_t temp_storage_bytes;
size_t temp_storage_bytes{0};
cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream);
d_temp_storage.resize(temp_storage_bytes, stream);
Expand Down
104 changes: 62 additions & 42 deletions cpp/src/dual_simplex/sparse_cholesky.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
#include "dual_simplex/tic_toc.hpp"

#include <cuda_runtime.h>
#include <utilities/driver_helpers.cuh>

#include <raft/common/nvtx.hpp>

#include "cuda.h"
#include "cudss.h"

namespace cuopt::linear_programming::dual_simplex {
Expand Down Expand Up @@ -157,37 +157,46 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
cudssGetProperty(PATCH_LEVEL, &patch);
settings.log.printf("cuDSS Version : %d.%d.%d\n", major, minor, patch);

CU_CHECK(cuDriverGetVersion(&driver_version));
settings_.log.printf("CUDA Driver Version : %d\n", driver_version);

cuda_error = cudaSuccess;
status = CUDSS_STATUS_SUCCESS;

if (settings_.concurrent_halt != nullptr && driver_version >= 13000) {
#if defined(SPLIT_SM_FOR_BARRIER) && CUDART_VERSION >= 13000
if (CUDART_VERSION >= 13000 && settings_.concurrent_halt != nullptr) {
cuGetErrorString_func = cuopt::detail::get_driver_entry_point("cuGetErrorString");
// 1. Set up the GPU resources
CUdevResource initial_device_GPU_resources = {};
CU_CHECK(cuDeviceGetDevResource(
handle_ptr_->get_device(), &initial_device_GPU_resources, CU_DEV_RESOURCE_TYPE_SM));
auto cuDeviceGetDevResource_func =
cuopt::detail::get_driver_entry_point("cuDeviceGetDevResource");
CU_CHECK(reinterpret_cast<decltype(::cuDeviceGetDevResource)*>(cuDeviceGetDevResource_func)(
handle_ptr_->get_device(), &initial_device_GPU_resources, CU_DEV_RESOURCE_TYPE_SM),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));

#ifdef DEBUG
std::cout << "Initial GPU resources retrieved via cuDeviceGetDevResource() have type "
<< initial_device_GPU_resources.type << " and SM count "
<< initial_device_GPU_resources.sm.smCount << std::endl;
settings.log.printf(
" Initial GPU resources retrieved via "
"cuDeviceGetDevResource() have type "
"%d and SM count %d\n",
initial_device_GPU_resources.type,
initial_device_GPU_resources.sm.smCount);
#endif

// 2. Partition the GPU resources
auto total_SMs = initial_device_GPU_resources.sm.smCount;
auto barrier_sms = raft::alignTo(static_cast<i_t>(total_SMs * 0.75f), 8);
CUdevResource input;
CUdevResource resource;
auto cuDevSmResourceSplitByCount_func =
cuopt::detail::get_driver_entry_point("cuDevSmResourceSplitByCount");
auto n_groups = 1u;
auto use_flags = CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING; // or 0
CU_CHECK(cuDevSmResourceSplitByCount(
&resource, &n_groups, &initial_device_GPU_resources, nullptr, use_flags, barrier_sms));
CU_CHECK(
reinterpret_cast<decltype(::cuDevSmResourceSplitByCount)*>(
cuDevSmResourceSplitByCount_func)(
&resource, &n_groups, &initial_device_GPU_resources, nullptr, use_flags, barrier_sms),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
#ifdef DEBUG
printf(
" Resources were split into %d resource groups (had requested %d) with %d SMs each (had "
"requested %d)\n",
settings.log.printf(
" Resources were split into %d resource groups (had "
"requested %d) with %d SMs each (had "
"requested % d)\n",
n_groups,
n_groups,
resource.sm.smCount,
Expand All @@ -196,34 +205,42 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
// 3. Create the resource descriptor
auto constexpr const n_resource_desc = 1;
CUdevResourceDesc resource_desc;
CU_CHECK(cuDevResourceGenerateDesc(&resource_desc, &resource, n_resource_desc));
auto cuDevResourceGenerateDesc_func =
cuopt::detail::get_driver_entry_point("cuDevResourceGenerateDesc");
CU_CHECK(reinterpret_cast<decltype(::cuDevResourceGenerateDesc)*>(
cuDevResourceGenerateDesc_func)(&resource_desc, &resource, n_resource_desc),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
#ifdef DEBUG
printf(
" For the resource descriptor of barrier green context we will combine %d resources of "
"%d "
"SMs each\n",
settings.log.printf(
" For the resource descriptor of barrier green context "
"we will combine %d resources of "
"%d SMs each\n",
n_resource_desc,
resource.sm.smCount);
#endif

// Only perform this if CUDA version is more than 13
// (all resource splitting and descriptor creation already above)
// No additional code needed here as the logic is already guarded above.
// 4. Create the green context and stream for that green context
// CUstream barrier_green_ctx_stream;
// (all resource splitting and descriptor creation already
// above) No additional code needed here as the logic is
// already guarded above.
// 4. Create the green context and stream for that green
// context CUstream barrier_green_ctx_stream;
i_t stream_priority;
cudaStream_t cuda_stream = handle_ptr_->get_stream();
cudaError_t priority_result = cudaStreamGetPriority(cuda_stream, &stream_priority);
RAFT_CUDA_TRY(priority_result);
CU_CHECK(cuGreenCtxCreate(
&barrier_green_ctx, resource_desc, handle_ptr_->get_device(), CU_GREEN_CTX_DEFAULT_STREAM));
CU_CHECK(cuGreenCtxStreamCreate(
&stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority));
#endif
} else {
// Convert runtime API stream to driver API stream for consistency
cudaStream_t cuda_stream = handle_ptr_->get_stream();
stream = reinterpret_cast<CUstream>(cuda_stream);
auto cuGreenCtxCreate_func = cuopt::detail::get_driver_entry_point("cuGreenCtxCreate");
CU_CHECK(reinterpret_cast<decltype(::cuGreenCtxCreate)*>(cuGreenCtxCreate_func)(
&barrier_green_ctx,
resource_desc,
handle_ptr_->get_device(),
CU_GREEN_CTX_DEFAULT_STREAM),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
auto cuGreenCtxStreamCreate_func =
cuopt::detail::get_driver_entry_point("cuGreenCtxStreamCreate");
CU_CHECK(reinterpret_cast<decltype(::cuGreenCtxStreamCreate)*>(cuGreenCtxStreamCreate_func)(
&stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
}

CUDSS_CALL_AND_CHECK_EXIT(cudssCreate(&handle), status, "cudssCreate");
Expand Down Expand Up @@ -336,12 +353,15 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
#ifdef SPLIT_SM_FOR_BARRIER
if (settings_.concurrent_halt != nullptr && driver_version >= 13000) {
CU_CHECK(cuStreamDestroy(stream));
#if CUDART_VERSION >= 13000
CU_CHECK(cuGreenCtxDestroy(barrier_green_ctx));
#endif
if (settings_.concurrent_halt != nullptr) {
auto cuStreamDestroy_func = cuopt::detail::get_driver_entry_point("cuStreamDestroy");
CU_CHECK(reinterpret_cast<decltype(::cuStreamDestroy)*>(cuStreamDestroy_func)(stream),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
auto cuGreenCtxDestroy_func = cuopt::detail::get_driver_entry_point("cuGreenCtxDestroy");
CU_CHECK(
reinterpret_cast<decltype(::cuGreenCtxDestroy)*>(cuGreenCtxDestroy_func)(barrier_green_ctx),
reinterpret_cast<decltype(::cuGetErrorString)*>(cuGetErrorString_func));
handle_ptr_->get_stream().synchronize();
}
#endif
Expand Down Expand Up @@ -473,7 +493,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {

auto d_nnz = Arow.row_start.element(Arow.m, Arow.row_start.stream());
if (nnz != d_nnz) {
printf("Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, d_nnz);
settings_.log.printf("Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, d_nnz);
exit(1);
}

Expand Down Expand Up @@ -796,11 +816,11 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
f_t* csr_values_d;
f_t* x_values_d;
f_t* b_values_d;
i_t driver_version;

const simplex_solver_settings_t<i_t, f_t>& settings_;
CUgreenCtx barrier_green_ctx;
CUstream stream;
void* cuGetErrorString_func;
};

} // namespace cuopt::linear_programming::dual_simplex
38 changes: 38 additions & 0 deletions cpp/src/utilities/driver_helpers.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
* reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "cuda.h"

namespace cuopt {

namespace detail {

inline auto get_driver_entry_point(const char* name)
{
void* func;
cudaDriverEntryPointQueryResult driver_status;
cudaGetDriverEntryPointByVersion(name, &func, CUDART_VERSION, cudaEnableDefault, &driver_status);
if (driver_status != cudaDriverEntryPointSuccess) {
fprintf(stderr, "Failed to fetch symbol for %s\n", name);
}
return func;
}

} // namespace detail
} // namespace cuopt
4 changes: 2 additions & 2 deletions cpp/src/utilities/macros.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@
#endif

// For CUDA Driver API
#define CU_CHECK(expr_to_check) \
#define CU_CHECK(expr_to_check, err_func) \
do { \
CUresult result = expr_to_check; \
if (result != CUDA_SUCCESS) { \
const char* pErrStr; \
cuGetErrorString(result, &pErrStr); \
err_func(result, &pErrStr); \
fprintf(stderr, "CUDA Error: %s:%i:%s\n", __FILE__, __LINE__, pErrStr); \
} \
} while (0)