diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml
index ae9a2420f..e18cea2a6 100644
--- a/conda/recipes/libcuopt/recipe.yaml
+++ b/conda/recipes/libcuopt/recipe.yaml
@@ -73,7 +73,6 @@ cache:
       - libcurand-dev
       - libcusparse-dev
       - cuda-cudart-dev
-      - cuda-driver-dev
       - boost
       - tbb-devel
       - zlib
@@ -89,8 +88,6 @@ outputs:
           cmake --install cpp/libmps_parser/build
       dynamic_linking:
         overlinking_behavior: "error"
-        missing_dso_allowlist:
-          - libcuda.so.1
       prefix_detection:
         ignore:
           # See https://github.com/rapidsai/build-planning/issues/160
@@ -138,8 +135,6 @@ outputs:
           cmake --install cpp/build
       dynamic_linking:
         overlinking_behavior: "error"
-        missing_dso_allowlist:
-          - libcuda.so.1
       prefix_detection:
         ignore:
           # See https://github.com/rapidsai/build-planning/issues/160
@@ -156,7 +151,6 @@ outputs:
         - rapids-logger =0.1
         - librmm =${{ dep_minor_version }}
        - cuda-cudart-dev
-        - cuda-driver-dev
        - libcublas
        - libcudss-dev >=0.7
        - libcusparse-dev
@@ -198,8 +192,6 @@ outputs:
           cmake --install cpp/build --component testing
       dynamic_linking:
         overlinking_behavior: "error"
-        missing_dso_allowlist:
-          - libcuda.so.1
       string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }}
     requirements:
       build:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e03d2581f..65db6f406 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -283,7 +283,6 @@ set(CUOPT_PRIVATE_CUDA_LIBS
     CUDA::curand
     CUDA::cusolver
     TBB::tbb
-    CUDA::cuda_driver
     OpenMP::OpenMP_CXX)
 
 list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt)
diff --git a/cpp/src/dual_simplex/device_sparse_matrix.cuh b/cpp/src/dual_simplex/device_sparse_matrix.cuh
index f347f956b..00c198d3f 100644
--- a/cpp/src/dual_simplex/device_sparse_matrix.cuh
+++ b/cpp/src/dual_simplex/device_sparse_matrix.cuh
@@ -184,7 +184,7 @@ class device_csc_matrix_t {
 
   // Inclusive cumulative sum to have the corresponding column for each entry
   rmm::device_buffer d_temp_storage;
-  size_t temp_storage_bytes;
+  size_t temp_storage_bytes{0};
   cub::DeviceScan::InclusiveSum(
     nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream);
   d_temp_storage.resize(temp_storage_bytes, stream);
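Context for the `device_sparse_matrix.cuh` hunk above: `cub::DeviceScan::InclusiveSum` is a two-phase API whose first call (null workspace) only reports the required scratch size, so `temp_storage_bytes` is now zero-initialized rather than left indeterminate. A minimal, self-contained sketch of that pattern, assuming an `rmm` stream and device vector (illustrative only, not the cuOpt member function):

```cpp
#include <cub/cub.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

// Two-phase CUB scan: query the workspace size first, then allocate and run.
template <typename i_t>
void inclusive_sum_in_place(rmm::device_uvector<i_t>& col_index, rmm::cuda_stream_view stream)
{
  size_t temp_storage_bytes{0};  // written by the size query below; start from a defined value
  cub::DeviceScan::InclusiveSum(
    nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream);

  rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);  // exactly the queried amount
  cub::DeviceScan::InclusiveSum(d_temp_storage.data(),
                                temp_storage_bytes,
                                col_index.data(),
                                col_index.data(),
                                col_index.size(),
                                stream);
}
```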
diff --git a/cpp/src/dual_simplex/sparse_cholesky.cuh b/cpp/src/dual_simplex/sparse_cholesky.cuh
index 489ee98d0..51145a36b 100644
--- a/cpp/src/dual_simplex/sparse_cholesky.cuh
+++ b/cpp/src/dual_simplex/sparse_cholesky.cuh
@@ -23,10 +23,10 @@
 #include "dual_simplex/tic_toc.hpp"
 
 #include <...>
+#include <utilities/driver_helpers.cuh>
 
 #include <...>
 
-#include "cuda.h"
 #include "cudss.h"
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -157,37 +157,46 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     cudssGetProperty(PATCH_LEVEL, &patch);
     settings.log.printf("cuDSS Version : %d.%d.%d\n", major, minor, patch);
 
-    CU_CHECK(cuDriverGetVersion(&driver_version));
-    settings_.log.printf("CUDA Driver Version : %d\n", driver_version);
-
     cuda_error = cudaSuccess;
     status     = CUDSS_STATUS_SUCCESS;
-    if (settings_.concurrent_halt != nullptr && driver_version >= 13000) {
-#if defined(SPLIT_SM_FOR_BARRIER) && CUDART_VERSION >= 13000
+    if (CUDART_VERSION >= 13000 && settings_.concurrent_halt != nullptr) {
+      cuGetErrorString_func = cuopt::detail::get_driver_entry_point("cuGetErrorString");
       // 1. Set up the GPU resources
       CUdevResource initial_device_GPU_resources = {};
-      CU_CHECK(cuDeviceGetDevResource(
-        handle_ptr_->get_device(), &initial_device_GPU_resources, CU_DEV_RESOURCE_TYPE_SM));
+      auto cuDeviceGetDevResource_func =
+        cuopt::detail::get_driver_entry_point("cuDeviceGetDevResource");
+      CU_CHECK(reinterpret_cast<CUresult (*)(CUdevice, CUdevResource*, CUdevResourceType)>(
+                 cuDeviceGetDevResource_func)(
+                 handle_ptr_->get_device(), &initial_device_GPU_resources, CU_DEV_RESOURCE_TYPE_SM),
+               reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
+
 #ifdef DEBUG
-      std::cout << "Initial GPU resources retrieved via cuDeviceGetDevResource() have type "
-                << initial_device_GPU_resources.type << " and SM count "
-                << initial_device_GPU_resources.sm.smCount << std::endl;
+      settings.log.printf(
+        " Initial GPU resources retrieved via "
+        "cuDeviceGetDevResource() have type "
+        "%d and SM count %d\n",
+        initial_device_GPU_resources.type,
+        initial_device_GPU_resources.sm.smCount);
 #endif
       // 2. Partition the GPU resources
       auto total_SMs   = initial_device_GPU_resources.sm.smCount;
       auto barrier_sms = raft::alignTo(static_cast<i_t>(total_SMs * 0.75f), 8);
-      CUdevResource input;
       CUdevResource resource;
+      auto cuDevSmResourceSplitByCount_func =
+        cuopt::detail::get_driver_entry_point("cuDevSmResourceSplitByCount");
       auto n_groups  = 1u;
       auto use_flags = CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING;  // or 0
-      CU_CHECK(cuDevSmResourceSplitByCount(
-        &resource, &n_groups, &initial_device_GPU_resources, nullptr, use_flags, barrier_sms));
+      CU_CHECK(
+        reinterpret_cast<CUresult (*)(
+          CUdevResource*, unsigned int*, CUdevResource*, CUdevResource*, unsigned int, unsigned int)>(
+          cuDevSmResourceSplitByCount_func)(
+          &resource, &n_groups, &initial_device_GPU_resources, nullptr, use_flags, barrier_sms),
+        reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
 #ifdef DEBUG
-      printf(
-        "  Resources were split into %d resource groups (had requested %d) with %d SMs each (had "
-        "requested %d)\n",
+      settings.log.printf(
+        "  Resources were split into %d resource groups (had "
+        "requested %d) with %d SMs each (had "
+        "requested %d)\n",
         n_groups,
         n_groups,
         resource.sm.smCount,
@@ -196,34 +205,42 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       // 3. Create the resource descriptor
       auto constexpr const n_resource_desc = 1;
       CUdevResourceDesc resource_desc;
-      CU_CHECK(cuDevResourceGenerateDesc(&resource_desc, &resource, n_resource_desc));
+      auto cuDevResourceGenerateDesc_func =
+        cuopt::detail::get_driver_entry_point("cuDevResourceGenerateDesc");
+      CU_CHECK(reinterpret_cast<CUresult (*)(CUdevResourceDesc*, CUdevResource*, unsigned int)>(
+                 cuDevResourceGenerateDesc_func)(&resource_desc, &resource, n_resource_desc),
+               reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
 #ifdef DEBUG
-      printf(
-        "  For the resource descriptor of barrier green context we will combine %d resources of "
-        "%d "
-        "SMs each\n",
+      settings.log.printf(
+        "  For the resource descriptor of barrier green context "
+        "we will combine %d resources of "
+        "%d SMs each\n",
         n_resource_desc,
         resource.sm.smCount);
 #endif
       // Only perform this if CUDA version is more than 13
-      // (all resource splitting and descriptor creation already above)
-      // No additional code needed here as the logic is already guarded above.
-
-      // 4. Create the green context and stream for that green context
-      // CUstream barrier_green_ctx_stream;
+      // (all resource splitting and descriptor creation already
+      // above) No additional code needed here as the logic is
+      // already guarded above.
+      // 4. Create the green context and stream for that green
+      // context CUstream barrier_green_ctx_stream;
       i_t stream_priority;
       cudaStream_t cuda_stream    = handle_ptr_->get_stream();
       cudaError_t priority_result = cudaStreamGetPriority(cuda_stream, &stream_priority);
       RAFT_CUDA_TRY(priority_result);
-      CU_CHECK(cuGreenCtxCreate(
-        &barrier_green_ctx, resource_desc, handle_ptr_->get_device(), CU_GREEN_CTX_DEFAULT_STREAM));
-      CU_CHECK(cuGreenCtxStreamCreate(
-        &stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority));
-#endif
-    } else {
-      // Convert runtime API stream to driver API stream for consistency
-      cudaStream_t cuda_stream = handle_ptr_->get_stream();
-      stream                   = reinterpret_cast<CUstream>(cuda_stream);
+      auto cuGreenCtxCreate_func = cuopt::detail::get_driver_entry_point("cuGreenCtxCreate");
+      CU_CHECK(reinterpret_cast<CUresult (*)(CUgreenCtx*, CUdevResourceDesc, CUdevice, unsigned int)>(
+                 cuGreenCtxCreate_func)(
+                 &barrier_green_ctx,
+                 resource_desc,
+                 handle_ptr_->get_device(),
+                 CU_GREEN_CTX_DEFAULT_STREAM),
+               reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
+      auto cuGreenCtxStreamCreate_func =
+        cuopt::detail::get_driver_entry_point("cuGreenCtxStreamCreate");
+      CU_CHECK(reinterpret_cast<CUresult (*)(CUstream*, CUgreenCtx, unsigned int, int)>(
+                 cuGreenCtxStreamCreate_func)(
+                 &stream, barrier_green_ctx, CU_STREAM_NON_BLOCKING, stream_priority),
+               reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
     }
 
     CUDSS_CALL_AND_CHECK_EXIT(cudssCreate(&handle), status, "cudssCreate");
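Between the constructor hunks above and the destructor hunk below, the recurring pattern is: resolve a driver symbol by name through the runtime, cast the returned `void*` back to the driver function's signature, then call it, so `libcuda.so.1` never becomes a link-time dependency. A standalone sketch of that pattern (illustrative; `load_driver_symbol` and `queryable_sm_count` are hypothetical stand-ins, and the function-pointer type is spelled out from the CUDA driver API rather than taken from this patch):

```cpp
#include <cuda.h>
#include <cuda_runtime_api.h>

#include <cstdio>

// Hypothetical stand-in for cuopt::detail::get_driver_entry_point: resolve a driver
// symbol via the runtime (needs CUDA 12.5+ for cudaGetDriverEntryPointByVersion).
inline void* load_driver_symbol(const char* name)
{
  void* fn                               = nullptr;
  cudaDriverEntryPointQueryResult status = cudaDriverEntryPointSuccess;
  cudaGetDriverEntryPointByVersion(name, &fn, CUDART_VERSION, cudaEnableDefault, &status);
  return status == cudaDriverEntryPointSuccess ? fn : nullptr;
}

// Cast the fetched pointer to the expected signature and invoke it, mirroring the
// reinterpret_cast calls in the hunks above.
inline int queryable_sm_count(CUdevice device)
{
  int sm_count = -1;
#if CUDART_VERSION >= 12040  // CUdevResource / CU_DEV_RESOURCE_TYPE_SM need CUDA 12.4+
  using cuDeviceGetDevResource_t = CUresult (*)(CUdevice, CUdevResource*, CUdevResourceType);
  auto fn = reinterpret_cast<cuDeviceGetDevResource_t>(load_driver_symbol("cuDeviceGetDevResource"));
  CUdevResource sm_resource{};
  if (fn != nullptr && fn(device, &sm_resource, CU_DEV_RESOURCE_TYPE_SM) == CUDA_SUCCESS) {
    sm_count = static_cast<int>(sm_resource.sm.smCount);
  }
#endif
  std::printf("SMs available for splitting: %d\n", sm_count);
  return sm_count;
}
```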
@@ -336,12 +353,15 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     CUDSS_CALL_AND_CHECK_EXIT(cudssConfigDestroy(solverConfig), status, "cudssConfigDestroy");
     CUDSS_CALL_AND_CHECK_EXIT(cudssDestroy(handle), status, "cudssDestroy");
     CUDA_CALL_AND_CHECK_EXIT(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
-#ifdef SPLIT_SM_FOR_BARRIER
-    if (settings_.concurrent_halt != nullptr && driver_version >= 13000) {
-      CU_CHECK(cuStreamDestroy(stream));
 #if CUDART_VERSION >= 13000
-      CU_CHECK(cuGreenCtxDestroy(barrier_green_ctx));
-#endif
+    if (settings_.concurrent_halt != nullptr) {
+      auto cuStreamDestroy_func = cuopt::detail::get_driver_entry_point("cuStreamDestroy");
+      CU_CHECK(reinterpret_cast<CUresult (*)(CUstream)>(cuStreamDestroy_func)(stream),
+               reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
+      auto cuGreenCtxDestroy_func = cuopt::detail::get_driver_entry_point("cuGreenCtxDestroy");
+      CU_CHECK(
+        reinterpret_cast<CUresult (*)(CUgreenCtx)>(cuGreenCtxDestroy_func)(barrier_green_ctx),
+        reinterpret_cast<CUresult (*)(CUresult, const char**)>(cuGetErrorString_func));
       handle_ptr_->get_stream().synchronize();
     }
 #endif
@@ -473,7 +493,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     auto d_nnz = Arow.row_start.element(Arow.m, Arow.row_start.stream());
 
     if (nnz != d_nnz) {
-      printf("Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, d_nnz);
+      settings_.log.printf("Error: nnz %d != A_in.col_start[A_in.n] %d\n", nnz, d_nnz);
       exit(1);
     }
 
@@ -796,11 +816,11 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
   f_t* csr_values_d;
   f_t* x_values_d;
   f_t* b_values_d;
-  i_t driver_version;
 
   const simplex_solver_settings_t<i_t, f_t>& settings_;
 
   CUgreenCtx barrier_green_ctx;
   CUstream stream;
+  void* cuGetErrorString_func;
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
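The helper introduced below returns an untyped `void*`, which is why every call site above carries a `reinterpret_cast`. If those casts become noisy, one possible refinement is a thin typed wrapper; the sketch below is hypothetical (`as_driver_fn` and `cu_result_string` are not part of this patch) and only restates the cast step:

```cpp
#include <cuda.h>

// Hypothetical convenience (not in this patch): wrap the void* returned by a lookup
// such as cuopt::detail::get_driver_entry_point below into a typed function pointer,
// so call sites write the cast once per symbol instead of once per call.
template <typename FnPtr>
FnPtr as_driver_fn(void* entry_point)
{
  return reinterpret_cast<FnPtr>(entry_point);
}

// Usage sketch with the cuGetErrorString signature that CU_CHECK relies on above.
using cuGetErrorString_t = CUresult (*)(CUresult, const char**);

inline const char* cu_result_string(void* cuGetErrorString_entry, CUresult result)
{
  const char* msg = "unknown CUDA driver error";
  auto fn         = as_driver_fn<cuGetErrorString_t>(cuGetErrorString_entry);
  if (fn != nullptr) { fn(result, &msg); }
  return msg;
}
```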
diff --git a/cpp/src/utilities/driver_helpers.cuh b/cpp/src/utilities/driver_helpers.cuh
new file mode 100644
index 000000000..71065b47d
--- /dev/null
+++ b/cpp/src/utilities/driver_helpers.cuh
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cuda.h"
+
+namespace cuopt {
+
+namespace detail {
+
+inline auto get_driver_entry_point(const char* name)
+{
+  void* func;
+  cudaDriverEntryPointQueryResult driver_status;
+  cudaGetDriverEntryPointByVersion(name, &func, CUDART_VERSION, cudaEnableDefault, &driver_status);
+  if (driver_status != cudaDriverEntryPointSuccess) {
+    fprintf(stderr, "Failed to fetch symbol for %s\n", name);
+  }
+  return func;
+}
+
+}  // namespace detail
+}  // namespace cuopt
diff --git a/cpp/src/utilities/macros.cuh b/cpp/src/utilities/macros.cuh
index c641103bc..f5401d52a 100644
--- a/cpp/src/utilities/macros.cuh
+++ b/cpp/src/utilities/macros.cuh
@@ -38,12 +38,12 @@
 #endif
 
 // For CUDA Driver API
-#define CU_CHECK(expr_to_check)                                                  \
+#define CU_CHECK(expr_to_check, err_func)                                        \
   do {                                                                           \
     CUresult result = expr_to_check;                                             \
     if (result != CUDA_SUCCESS) {                                                \
       const char* pErrStr;                                                       \
-      cuGetErrorString(result, &pErrStr);                                        \
+      err_func(result, &pErrStr);                                                \
       fprintf(stderr, "CUDA Error: %s:%i:%s\n", __FILE__, __LINE__, pErrStr);    \
     }                                                                            \
   } while (0)
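Finally, a usage sketch tying the two utilities together: the reworked two-argument `CU_CHECK` takes the expression to check plus a `cuGetErrorString`-shaped callable for formatting any failure. The include paths and `destroy_stream_example` below are assumptions for illustration (the sketch pretends to live next to the headers in `cpp/src/utilities/`), not part of the patch:

```cpp
#include <cuda.h>

#include <cstdio>

#include "driver_helpers.cuh"  // cuopt::detail::get_driver_entry_point (added above)
#include "macros.cuh"          // two-argument CU_CHECK (changed above)

// Resolve both the function to call and the error-string function, then let
// CU_CHECK report any failure through the resolved cuGetErrorString.
inline void destroy_stream_example(CUstream stream)
{
  using cuGetErrorString_t = CUresult (*)(CUresult, const char**);
  using cuStreamDestroy_t  = CUresult (*)(CUstream);

  auto err_fn = reinterpret_cast<cuGetErrorString_t>(
    cuopt::detail::get_driver_entry_point("cuGetErrorString"));
  auto destroy_fn = reinterpret_cast<cuStreamDestroy_t>(
    cuopt::detail::get_driver_entry_point("cuStreamDestroy"));

  if (err_fn == nullptr || destroy_fn == nullptr) { return; }  // symbols not available
  CU_CHECK(destroy_fn(stream), err_fn);
}
```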