From 2b3650c4d46d754e730d83f315dfadcb98954f56 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 22 Sep 2024 01:17:58 +0300 Subject: [PATCH] Regards #681: WIP: Added a fatbin builder based on NVIDIA's libnvfatbin; but... * not tested it yet * need to add the nvFatbin documentation example program, rewritten to use the APIs --- .github/action-scripts/install-cuda-ubuntu.sh | 1 + .../action-scripts/install-cuda-windows.ps1 | 1 + CMakeLists.txt | 27 +- README.md | 1 + src/cuda/api.hpp | 3 + src/cuda/api/fatbin_builder.hpp | 260 ++++++++++++++++++ src/cuda/api/fatbin_options.hpp | 162 +++++++++++ src/cuda/api/versions.hpp | 24 ++ 8 files changed, 475 insertions(+), 4 deletions(-) create mode 100644 src/cuda/api/fatbin_builder.hpp create mode 100644 src/cuda/api/fatbin_options.hpp diff --git a/.github/action-scripts/install-cuda-ubuntu.sh b/.github/action-scripts/install-cuda-ubuntu.sh index 728f4fba..d36caacb 100755 --- a/.github/action-scripts/install-cuda-ubuntu.sh +++ b/.github/action-scripts/install-cuda-ubuntu.sh @@ -30,6 +30,7 @@ CUDA_PACKAGES_IN=( "cudart-dev" "nvcc" "profiler-api" + "libnvfatbin" ) ## ------------------- diff --git a/.github/action-scripts/install-cuda-windows.ps1 b/.github/action-scripts/install-cuda-windows.ps1 index 02fed875..b6597e49 100755 --- a/.github/action-scripts/install-cuda-windows.ps1 +++ b/.github/action-scripts/install-cuda-windows.ps1 @@ -70,6 +70,7 @@ $CUDA_PACKAGES_IN = @( "nvrtc_dev"; "nsight_nvtx"; "nvtx"; + "nvfatbin"; "cudart"; "visual_studio_integration"; "cuda_profiler_api"; diff --git a/CMakeLists.txt b/CMakeLists.txt index 02c548e6..17e0ffbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ if(libm_exists) set(c_math_lib m) endif() -if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1) - foreach(tgt in nvptxcompiler nvptxcompiler_static) +if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4) + foreach(tgt in nvfatbin nvfatbin_static) if (NOT TARGET ${tgt}) _CUDAToolkit_find_and_add_import_lib(${tgt}) 
endif() @@ -79,11 +79,30 @@ add_library("${caw_namespace}::driver-and-runtime" ALIAS caw_runtime-and-driver) target_link_libraries(caw_rtc INTERFACE cuda-api-wrappers::runtime-and-driver CUDA::nvrtc) if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1) if (TARGET CUDA::nvptxcompiler) + target_link_libraries(caw_rtc INTERFACE CUDA::nvptxcompiler) set(ptx_compiler_target nvptxcompiler) + elseif (TARGET CUDA::nvptxcompiler_static) + target_link_libraries(caw_rtc INTERFACE CUDA::nvptxcompiler_static) + elseif(EXISTS "${CUDA_nvptxcompiler_LIBRARY}") + target_link_libraries(caw_rtc INTERFACE "${CUDA_nvptxcompiler_LIBRARY}") + elseif(EXISTS "${CUDA_nvptxcompiler_static_LIBRARY}") + target_link_libraries(caw_rtc INTERFACE "${CUDA_nvptxcompiler_static_LIBRARY}" pthread) + else() + message(WARNING "Could not locate a valid NVIDIA PTX Compiler target or library file") + endif() +endif() +if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4) + if (TARGET CUDA::nvfatbin) + target_link_libraries(caw_runtime-and-driver INTERFACE CUDA::nvfatbin) + elseif (TARGET CUDA::nvfatbin_static) + target_link_libraries(caw_runtime-and-driver INTERFACE CUDA::nvfatbin_static) + elseif(EXISTS "${CUDA_nvfatbin_LIBRARY}") + target_link_libraries(caw_runtime-and-driver INTERFACE "${CUDA_nvfatbin_LIBRARY}") + elseif(EXISTS "${CUDA_nvfatbin_static_LIBRARY}") + target_link_libraries(caw_runtime-and-driver INTERFACE "${CUDA_nvfatbin_static_LIBRARY}") else() - set(ptx_compiler_target nvptxcompiler_static) + message(WARNING "Could not locate a valid NVIDIA fatbin creator target or library file") endif() - target_link_libraries(caw_rtc INTERFACE CUDA::${ptx_compiler_target}) endif() target_link_libraries(caw_nvtx INTERFACE cuda-api-wrappers::runtime-and-driver) diff --git a/README.md b/README.md index 27d29a0c..c4cc21ef 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ This is a header-only library of integrated wrappers around the core parts of NV * The slightly higher-level CUDA [Runtime 
API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) * NVIDIA's dynamic CUDA code compilation library, [NVRTC](http://docs.nvidia.com/cuda/nvrtc/index.html) * NVIDIA's out-of-driver, full-featured [PTX compiler library](https://docs.nvidia.com/cuda/ptx-compiler-api/index.html) (available since CUDA 11.1) +* NVIDIA's fat binary creation library, [nvFatbin](https://docs.nvidia.com/cuda/nvfatbin/index.html) (available since CUDA 12.4) * The NVIDIA profiler in-program API, also known as [NVTX](https://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx) (the NVIDIA Toolkit Extensions library). It is intended for those who would otherwise use these APIs directly, to make working with them be more intuitive and consistent, making use of modern C++ language capabilities, programming idioms and best practices. In a nutshell - making CUDA API work more fun :-) diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index 51af11f0..d1731ea6 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -41,6 +41,9 @@ #include "api/kernels/in_library.hpp" #endif #include "api/link.hpp" +#if CUDA_VERSION >= 12040 +#include "api/fatbin_builder.hpp" +#endif #include "api/current_device.hpp" diff --git a/src/cuda/api/fatbin_builder.hpp b/src/cuda/api/fatbin_builder.hpp new file mode 100644 index 00000000..616d0ea7 --- /dev/null +++ b/src/cuda/api/fatbin_builder.hpp @@ -0,0 +1,260 @@ +/** + * @file + * + * @brief Contains the @ref fatbin_builder_t class and related code. 
+ */ +#pragma once +#ifndef CUDA_API_WRAPPERS_FATBIN_HPP_ +#define CUDA_API_WRAPPERS_FATBIN_HPP_ + +#include "detail/region.hpp" +#include "fatbin_options.hpp" +#include "types.hpp" + +#include + +namespace cuda { + +///@cond +class fatbin_builder_t; +///@endcond + +namespace fatbin_builder { + +using handle_t = nvFatbinHandle; + +inline fatbin_builder_t wrap(handle_t handle, bool take_ownership = false) noexcept; + +inline fatbin_builder_t create(const options_t & options); + +namespace detail_ { + +inline ::std::string identify(handle_t handle) +{ + return "Fatbin builder with handle " + cuda::detail_::ptr_as_hex(handle); +} + +inline ::std::string identify(const fatbin_builder_t&); + +} // namespace detail_ + +} // namespace fatbin_builder + + +class fatbin_builder_t { +public: // type definitions + using size_type = ::size_t; + + struct deleter_type { + void operator()(void * data) { operator delete(data); } + }; + +public: // getters + + fatbin_builder::handle_t handle() const + { return handle_; } + + /// True if this wrapper is responsible for telling CUDA to destroy + /// the fatbin handle upon the wrapper's own destruction + bool is_owning() const noexcept + { return owning; } + +protected: // unsafe actions + + void build_without_size_check_in(memory::region_t target_region) const + { + auto status = nvFatbinGet(handle_, target_region.data()); + throw_if_error_lazy(status, "Failed completing the generation of a fatbin at " + + cuda::detail_::ptr_as_hex(target_region.data())); + } + +public: + size_type size() const + { + size_type result; + auto status = nvFatbinSize(handle_, &result); + throw_if_error_lazy(status, "Failed determining prospective fatbin size for " + fatbin_builder::detail_::identify(*this)); + return result; + } + + void build_in(memory::region_t target_region) const + { + auto required_size = size(); + if (target_region.size() < required_size) { + throw ::std::invalid_argument("Provided region for fatbin creation is of size " + + 
::std::to_string(target_region.size()) + " bytes, while the fatbin requires " + ::std::to_string(required_size)); + } + return build_without_size_check_in(target_region); + } + + memory::unique_region build() const + { + auto size_ = size(); + auto ptr = operator new(size_); + memory::region_t target_region{ptr, size_}; + build_in(target_region); + return memory::unique_region(target_region); + } + + void add_ptx_source( + const char* identifier, + span nul_terminated_ptx_source, + device::compute_capability_t target_compute_capability) const // no support for options, for now + { +#ifndef NDEBUG + if (nul_terminated_ptx_source.empty()) { + throw ::std::invalid_argument("Empty PTX source code passed for addition into fatbin"); + } + if (nul_terminated_ptx_source[nul_terminated_ptx_source.size() - 1] != '\0') { + throw ::std::invalid_argument("PTX source code passed for addition into fatbin was not nul-character-terminated"); + } +#endif + auto compute_capability_str = ::std::to_string(target_compute_capability.as_combined_number()); + auto empty_cmdline = ""; + auto status = nvFatbinAddPTX(handle_, + nul_terminated_ptx_source.data(), + nul_terminated_ptx_source.size(), + compute_capability_str.c_str(), + identifier, + empty_cmdline); + throw_if_error_lazy(status, "Failed adding PTX source fragment " + + ::std::string(identifier) + " at " + detail_::ptr_as_hex(nul_terminated_ptx_source.data()) + + " to a fat binary for target compute capability " + compute_capability_str); + } + + void add_lto_ir( + const char* identifier, + memory::region_t lto_ir, + device::compute_capability_t target_compute_capability) const + { + auto compute_capability_str = ::std::to_string(target_compute_capability.as_combined_number()); + auto empty_cmdline = ""; + auto status = nvFatbinAddLTOIR( + handle_, lto_ir.data(), lto_ir.size(), compute_capability_str.c_str(), identifier, empty_cmdline); + throw_if_error_lazy(status, "Failed adding LTO IR fragment " + + ::std::string(identifier) + " 
at " + detail_::ptr_as_hex(lto_ir.data()) + + " to a fat binary for target compute capability " + compute_capability_str); + } + + void add_cubin( + const char* identifier, + memory::region_t cubin, + device::compute_capability_t target_compute_capability) const + { + auto compute_capability_str = ::std::to_string(target_compute_capability.as_combined_number()); + auto status = nvFatbinAddCubin( + handle_, cubin.data(), cubin.size(), compute_capability_str.c_str(), identifier); + throw_if_error_lazy(status, "Failed adding cubin fragment " + + ::std::string(identifier) + " at " + detail_::ptr_as_hex(cubin.data()) + + " to a fat binary for target compute capability " + compute_capability_str); + } + +#if CUDA_VERSION >= 12050 + /** + * Adds relocatable PTX entries from a host object to the fat binary being built + * + * @param ptx_code PTX "host object". TODO: Is this PTX code in text mode? Something else? + * + * @note The builder's options (specified on creation) are ignored for these operations. + */ + void add_relocatable_ptx(memory::region_t ptx_code) const + { + auto status = nvFatbinAddReloc(handle_, ptx_code.data(), ptx_code.size()); + throw_if_error_lazy(status, "Failed adding relocatable PTX code at " + detail_::ptr_as_hex(ptx_code.data()) + + " to fatbin builder " + fatbin_builder::detail_::identify(*this) ); + } + + // TODO: WTF is an index? 
+ void add_index(const char* identifier, memory::region_t index) const + { + auto status = nvFatbinAddIndex(handle_, index.data(), index.size(), identifier); + throw_if_error_lazy(status, "Failed adding index " + ::std::string(identifier) + " at " + + detail_::ptr_as_hex(index.data()) + " to a fat binary"); + } +#endif // CUDA_VERSION >= 12050 + +protected: // constructors + + fatbin_builder_t( + fatbin_builder::handle_t handle, + // no support for options, for now + bool take_ownership) noexcept + : handle_(handle), owning(take_ownership) + {} + +public: // friendship + + friend fatbin_builder_t fatbin_builder::wrap(fatbin_builder::handle_t, bool) noexcept; + +public: // constructors and destructor + + fatbin_builder_t(const fatbin_builder_t &) = delete; + + fatbin_builder_t(fatbin_builder_t &&other) noexcept: + fatbin_builder_t(other.handle_, other.owning) + { + other.owning = false; // the moved-from wrapper must no longer destroy the handle + } + + ~fatbin_builder_t() noexcept(false) + { + if (owning) { + auto status = nvFatbinDestroy(&handle_); // this nullifies the handle :-O - NOTE(review): so the message below presumably reports a null handle; confirm + throw_if_error_lazy(status, + ::std::string("Failed destroying fatbin builder ") + detail_::ptr_as_hex(handle_) + + " in " + fatbin_builder::detail_::identify(handle_)); + } + } + +public: // operators + + fatbin_builder_t &operator=(const fatbin_builder_t &) = delete; + + fatbin_builder_t &operator=(fatbin_builder_t &&other) noexcept + { + ::std::swap(handle_, other.handle_); + ::std::swap(owning, other.owning); // ownership travels with the handle; 'other' now holds our previous handle and ownership flag + return *this; + } + +protected: // data members + fatbin_builder::handle_t handle_; + bool owning; + // this field is non-const only for enabling move construction and move assignment; other + // than in those cases it must not be altered +}; + +namespace fatbin_builder { + +/// Create a new fatbin builder, configured with @p options (before adding any code fragments to it) +inline fatbin_builder_t create(const options_t & options) +{ + handle_t new_handle; + auto marshalled_options = marshalling::marshal(options); + auto option_ptrs = marshalled_options.option_ptrs(); + auto 
status = nvFatbinCreate(&new_handle, option_ptrs.data(), option_ptrs.size()); + throw_if_error_lazy(status, "Failed creating a new fatbin builder"); + auto do_take_ownership = true; + return wrap(new_handle, do_take_ownership); +} + +inline fatbin_builder_t wrap(handle_t handle, bool take_ownership) noexcept +{ + return fatbin_builder_t{handle, take_ownership}; +} + +namespace detail_ { + +inline ::std::string identify(const fatbin_builder_t& builder) +{ + return identify(builder.handle()); +} + +} // namespace detail_ + +} // namespace fatbin_builder + + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_FATBIN_HPP_ diff --git a/src/cuda/api/fatbin_options.hpp b/src/cuda/api/fatbin_options.hpp new file mode 100644 index 00000000..7d0ee4a1 --- /dev/null +++ b/src/cuda/api/fatbin_options.hpp @@ -0,0 +1,162 @@ +/** + * @file + * +* @brief Contains @ref fatbin_builder::options_t class and related definitions + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_FATBIN_OPTIONS_HPP_ +#define CUDA_API_WRAPPERS_FATBIN_OPTIONS_HPP_ + +#include "device_properties.hpp" +#include "detail/option_marshalling.hpp" +#include "types.hpp" + +#include +#include + +namespace cuda { + +///@cond +class module_t; +///@endcond + +namespace fatbin_builder { + + +/* + +Fatbin options (not including deprecated ones): + + -compress= Enable (true) / disable (false) compression (default: true). + + -compress-all Compress everything in the fatbin, even if it’s small. + + -cuda Specify CUDA (rather than OpenCL). + -opencl Specify OpenCL (rather than CUDA). + -host= Specify host operating system. Valid options are “linux”, “windows” (“mac” is deprecated) + + -g Generate debug information. 
+ +*/ + +struct options_t final { // Fatbin-building settings; marshalling::detail_::gadget::process turns the fields into option flags + + enum : bool { + width_32_bits = false, width_64_bits = true + }; + optional use_64_bit_entry_width{width_64_bits}; // when set, marshalled as "-64" or "-32" + + enum : bool { + dont_compress = false, do_compress = true + }; + optional compress{do_compress}; // when set, marshalled as "-compress=true" or "-compress=false" + + enum : bool { + only_compress_large_objects = false, compression_for_everything = true + }; + optional apply_compression_to_small_objects{only_compress_large_objects}; // "-compress-all" is emitted only when this holds true + + enum ecosystem_t { + cuda, opencl + }; + optional ecosystem; // when set, marshalled as "-opencl" or "-cuda" + + enum host_os_t { // NOTE(review): 'linux' is a predefined macro under GCC's gnu++ dialects - confirm this enumerator compiles there + windows, linux + }; + optional targeted_host_os; // when set, marshalled as "-host=windows" or "-host=linux" +}; + +namespace detail_ { + +struct marshalled_options_t { + ::std::size_t num_options; + ::std::string option_str; +}; + +} // namespace detail_ + +} // namespace fatbin_builder + +namespace marshalling { + +namespace detail_ { + +template +struct gadget { + static void process( + const fatbin_builder::options_t &opts, + MarshalTarget &marshalled, Delimiter delimiter, + bool need_delimiter_after_last_option) + { + using fatbin_builder::options_t; + opt_start_t opt_start { delimiter }; + if (opts.use_64_bit_entry_width) { + marshalled << opt_start << '-' << (opts.use_64_bit_entry_width.value() ? "64" : "32"); + } + if (opts.compress) { + marshalled << opt_start << "-compress=" << (opts.compress.value() ? "true" : "false"); + } + if (opts.apply_compression_to_small_objects.value_or(false)) { + marshalled << opt_start << "-compress-all"; + } + if (opts.ecosystem) { + marshalled << opt_start << '-' << ((opts.ecosystem.value() == options_t::opencl) ? "opencl" : "cuda"); + } + if (opts.targeted_host_os) { + marshalled << opt_start << "-host=" << ((opts.targeted_host_os.value() == options_t::windows) ? 
"windows" : "linux"); + } + if (need_delimiter_after_last_option) { + marshalled << opt_start; + } + } +}; + +} // namespace detail_ + +/* +inline marshalled_options_t marshal(const options_t& options) +{ + marshalled_options_t result; + result.num_options = 0; + auto advance_option= [&]() { + if (result.num_options > 0) { oss << ' '; } + result.num_options++; + }; + + thread_local ::std::ostringstream oss; + oss.str(""); + + // TODO: With C++14/17, we can lambda-ish this better + + if (options.use_64_bit_entry_width.has_value()) { + advance_option(); + oss << '-' << (options.use_64_bit_entry_width.value() ? "64" : "32"); + } + if (options.compress.has_value()) { + advance_option(); + oss << "-compress=" << (options.compress.value() ? "true" : "false"); + } + if (options.apply_compression_to_small_objects.value_or(false)) { + advance_option(); + oss << "-compress-all"; + } + if (options.ecosystem.has_value()) { + advance_option(); + oss << '-' << ((options.ecosystem.value() == options_t::opencl) ? "opencl" : "cuda"); + } + if (options.targeted_host_os.has_value()) { + advance_option(); + oss << "-host=" << ((options.targeted_host_os.value() == options_t::windows) ? 
"windows" : "linux"); + } + + result.option_str = oss.str(); + return result; +} +*/ + +} // namespace marshalling + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_FATBIN_OPTIONS_HPP_ diff --git a/src/cuda/api/versions.hpp b/src/cuda/api/versions.hpp index d0cd4c44..763e1f28 100644 --- a/src/cuda/api/versions.hpp +++ b/src/cuda/api/versions.hpp @@ -12,8 +12,14 @@ #include "error.hpp" +#if CUDA_VERSION >= 12040 +#include +#endif + #include #include +#include + namespace cuda { @@ -161,6 +167,24 @@ inline version_t runtime() { return version_t::from_single_number(version); } +#if CUDA_VERSION >= 12040 + +inline version_t fatbin() { + unsigned int major { 0 }, minor { 0 }; + + auto status = nvFatbinVersion(&major, &minor); + throw_if_error_lazy(status, "Failed obtaining the nvfatbin library version"); +#ifndef NDEBUG + if ((major == 0) or (major > ::std::numeric_limits::max()) + or (minor == 0) or (minor > ::std::numeric_limits::max())) { + throw ::std::logic_error("Invalid version encountered: (" + + ::std::to_string(major) + ", " + ::std::to_string(minor) + ')' ); + } +#endif + return version_t{ static_cast(major), static_cast(minor) }; +} +#endif // CUDA_VERSION >= 12040 + } // namespace version_numbers } // namespace cuda