boostorg · mborland · Apr 2, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/include/boost/decimal/decimal64_t.hpp b/include/boost/decimal/decimal64_t.hpp
diff --git a/include/boost/decimal/detail/attributes.hpp b/include/boost/decimal/detail/attributes.hpp
@@ -25,83 +25,83 @@ namespace detail {
 namespace impl {
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto storage_width_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto storage_width_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 32 :
            decimal_val_v<DecimalType> < 128 ? 64 : 128;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto precision_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto precision_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 7 :
            decimal_val_v<DecimalType> < 128 ? 16 : 34;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto bias_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto bias_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 101 :
            decimal_val_v<DecimalType> < 128 ? 398 : 6176;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto max_biased_exp_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto max_biased_exp_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 191 :
            decimal_val_v<DecimalType> < 128 ? 767 : 12287;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto emax_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto emax_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 96 :
            decimal_val_v<DecimalType> < 128 ? 384 : 6144;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto emin_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto emin_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? -95 :
            decimal_val_v<DecimalType> < 128 ? -383 : -6143;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto combination_field_width_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto combination_field_width_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 11 :
            decimal_val_v<DecimalType> < 128 ? 13 : 17;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto trailing_significand_field_width_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto trailing_significand_field_width_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 20 :
            decimal_val_v<DecimalType> < 128 ? 50 : 110;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType, std::enable_if_t<decimal_val_v<DecimalType> < 128, bool> = true>
-constexpr auto max_significand_v() noexcept
+BOOST_DECIMAL_CUDA_CONSTEXPR auto max_significand_v() noexcept
 {
     return decimal_val_v<DecimalType> < 64 ? 9'999'999 : 9'999'999'999'999'999;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType, std::enable_if_t<decimal_val_v<DecimalType> >= 128, bool> = true>
-constexpr auto max_significand_v() noexcept
+BOOST_DECIMAL_CUDA_CONSTEXPR auto max_significand_v() noexcept
 {
     // 34x 9s
     return BOOST_DECIMAL_DETAIL_INT128_UINT128_C(9999999999999999999999999999999999);
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto max_string_length_v() noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto max_string_length_v() noexcept -> int
 {
     return decimal_val_v<DecimalType> < 64 ? 15 :
            decimal_val_v<DecimalType> < 128 ? 25 : 41;
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-constexpr auto is_fast_type_v() noexcept -> bool
+BOOST_DECIMAL_CUDA_CONSTEXPR auto is_fast_type_v() noexcept -> bool
 {
     // The fast types all assign 1 additional bit over the regular types
     return decimal_val_v<DecimalType> % 2 == 1;

diff --git a/include/boost/decimal/detail/config.hpp b/include/boost/decimal/detail/config.hpp
@@ -13,8 +13,8 @@
 #  define BOOST_DECIMAL_DEC_EVAL_METHOD 0
 #endif
 
-// Fundamental NVCC options
-#if defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA)
+// Fundamental CUDACC options
+#if defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA)
 #  define BOOST_DECIMAL_HOST_DEVICE __host__ __device__
 #  define BOOST_DECIMAL_CUDA_CONSTEXPR __host__ __device__ constexpr
 #  define BOOST_DECIMAL_DETAIL_INT128_ENABLE_CUDA
@@ -97,7 +97,7 @@
 #  endif
 #endif
 
-#if defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA)
+#if defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA)
 
 // Include intrinsics if available
 #if defined(_MSC_VER)
@@ -147,7 +147,7 @@
 #endif
 
 // Use 128-bit integers and suppress warnings for using extensions
-#if (defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER))) && !(defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
+#if (defined(BOOST_HAS_INT128) || (defined(__SIZEOF_INT128__) && !defined(_MSC_VER))) && !(defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
 
 namespace boost { namespace decimal { namespace detail {
 
@@ -170,7 +170,7 @@ typedef unsigned __int128 builtin_uint128_t;
 #endif
 
 // 128-bit floats
-#if (defined(BOOST_HAS_FLOAT128) || defined(__SIZEOF_FLOAT128__)) && !(defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
+#if (defined(BOOST_HAS_FLOAT128) || defined(__SIZEOF_FLOAT128__)) && !(defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
 #  define BOOST_DECIMAL_HAS_FLOAT128
 #endif
 
@@ -283,9 +283,9 @@ typedef unsigned __int128 builtin_uint128_t;
 #  define BOOST_DECIMAL_HAS_BUILTIN_IS_CONSTANT_EVALUATED
 #endif
 
-#if defined(BOOST_DECIMAL_HAS_IS_CONSTANT_EVALUATED) && !(defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
+#if defined(BOOST_DECIMAL_HAS_IS_CONSTANT_EVALUATED) && !(defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
 #  define BOOST_DECIMAL_IS_CONSTANT_EVALUATED(x) std::is_constant_evaluated()
-#elif defined(BOOST_DECIMAL_HAS_BUILTIN_IS_CONSTANT_EVALUATED) && !(defined(__NVCC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
+#elif defined(BOOST_DECIMAL_HAS_BUILTIN_IS_CONSTANT_EVALUATED) && !(defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
 #  define BOOST_DECIMAL_IS_CONSTANT_EVALUATED(x) __builtin_is_constant_evaluated()
 #else
 #  define BOOST_DECIMAL_IS_CONSTANT_EVALUATED(x) false

diff --git a/include/boost/decimal/detail/div_impl.hpp b/include/boost/decimal/detail/div_impl.hpp
@@ -49,7 +49,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto generic_div_impl(co
 }
 
 template <typename DecimalType, typename T>
-BOOST_DECIMAL_FORCE_INLINE constexpr auto d64_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType
 {
     using unsigned_int128_type = boost::int128::uint128_t;
 

diff --git a/include/boost/decimal/detail/fenv_rounding.hpp b/include/boost/decimal/detail/fenv_rounding.hpp
@@ -61,7 +61,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod(T dividend, T divisor) noexcept -> divm
     return {q, r};
 }
 
-#ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_INT128
+#if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) && !(defined(__CUDACC__) && defined(BOOST_DECIMAL_ENABLE_CUDA))
 
 constexpr auto divmod(const int128::uint128_t dividend, const int128::uint128_t divisor) -> divmod_result<int128::uint128_t>
 {
@@ -101,7 +101,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto divmod10(const int128::uint128_t lhs) noexcept
 }
 
 template <typename TargetType, typename T>
-BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, const bool sticky, const rounding_mode round = _boost_decimal_global_rounding_mode) noexcept -> int
+BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, const bool sticky, const rounding_mode round) noexcept -> int
 {
     using significand_type = std::conditional_t<decimal_val_v<TargetType> >= 128, int128::uint128_t, std::int64_t>;
 
@@ -166,7 +166,8 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round_impl(T& val, const bool is_neg, con
 template <typename TargetType, typename T, std::enable_if_t<is_integral_v<T>, bool> = true>
 BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round(T& val, bool is_neg = false, bool sticky = false) noexcept -> int
 {
-    return impl::fenv_round_impl<TargetType>(val, is_neg, sticky);
+    constexpr auto round {_boost_decimal_global_rounding_mode};
+    return impl::fenv_round_impl<TargetType>(val, is_neg, sticky, round);
 }
 
 #else
@@ -176,7 +177,8 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto fenv_round(T& val, bool is_neg = false, bool s
 {
     if (BOOST_DECIMAL_IS_CONSTANT_EVALUATED(coeff))
     {
-        return impl::fenv_round_impl<TargetType>(val, is_neg, sticky);
+        constexpr auto round {_boost_decimal_global_rounding_mode};
+        return impl::fenv_round_impl<TargetType>(val, is_neg, sticky, round);
     }
     else
     {

diff --git a/include/boost/decimal/detail/int128.hpp b/include/boost/decimal/detail/int128.hpp
@@ -5,10 +5,11 @@
 #ifndef BOOST_DECIMAL_DETAIL_INT128_HPP
 #define BOOST_DECIMAL_DETAIL_INT128_HPP
 
-#include "int128/int128.hpp"
-#include "int128/bit.hpp"
-#include "int128/iostream.hpp"
-#include "int128/literals.hpp"
-#include "int128/numeric.hpp"
+#include <boost/decimal/detail/config.hpp>
+#include <boost/decimal/detail/int128/int128.hpp>
+#include <boost/decimal/detail/int128/bit.hpp>
+#include <boost/decimal/detail/int128/iostream.hpp>
+#include <boost/decimal/detail/int128/literals.hpp>
+#include <boost/decimal/detail/int128/numeric.hpp>
 
 #endif // BOOST_DECIMAL_DETAIL_INT128_HPP
diff --git a/include/boost/decimal/detail/mul_impl.hpp b/include/boost/decimal/detail/mul_impl.hpp
@@ -64,7 +64,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(T lhs_sig,
 
 // In the fast case we are better served doing our 128-bit division here since we are at a know starting point
 template <typename ReturnType, typename T>
-constexpr auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
+BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
 {
     using unsigned_int128_type = boost::int128::uint128_t;
 
@@ -85,7 +85,7 @@ constexpr auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
 }
 
 template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T, BOOST_DECIMAL_INTEGRAL U>
-BOOST_DECIMAL_FORCE_INLINE constexpr auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
                                                        T rhs_sig, U rhs_exp, bool rhs_sign) noexcept
                                                        -> std::enable_if_t<detail::is_decimal_floating_point_v<ReturnType>, ReturnType>
 {
@@ -108,7 +108,7 @@ BOOST_DECIMAL_FORCE_INLINE constexpr auto d64_mul_impl(T lhs_sig, U lhs_exp, boo
 }
 
 template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T, BOOST_DECIMAL_INTEGRAL U>
-BOOST_DECIMAL_FORCE_INLINE constexpr auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
                                                        T rhs_sig, U rhs_exp, bool rhs_sign) noexcept
                                                        -> std::enable_if_t<!detail::is_decimal_floating_point_v<ReturnType>, ReturnType>
 {

diff --git a/include/boost/decimal/detail/utilities.hpp b/include/boost/decimal/detail/utilities.hpp
@@ -16,7 +16,7 @@ namespace decimal {
 namespace detail {
 
 template <typename T>
-constexpr auto swap(T& x, T& y) noexcept -> void
+BOOST_DECIMAL_CUDA_CONSTEXPR auto swap(T& x, T& y) noexcept -> void
 {
     const T temp {x};
     x = y;

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
@@ -9,3 +9,9 @@ run test_cuda_decimal32_t_add.cu ;
 run test_cuda_decimal32_t_sub.cu ;
 run test_cuda_decimal32_t_mul.cu ;
 run test_cuda_decimal32_t_div.cu ;
+
+run test_cuda_decimal64_t_construct.cu ;
+run test_cuda_decimal64_t_add.cu ;
+run test_cuda_decimal64_t_sub.cu ;
+run test_cuda_decimal64_t_mul.cu ;
+run test_cuda_decimal64_t_div.cu ;
diff --git a/test/test_cuda_decimal64_t_add.cu b/test/test_cuda_decimal64_t_add.cu
@@ -0,0 +1,112 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+#include <random>
+#include <boost/decimal/decimal64_t.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::decimal::decimal64_t;
+
+// Device kernel: one thread per element, writing in[i] + in[i] to out[i].
+// The bounds check skips the surplus threads of the final, partially used block.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = in[i] + in[i];
+    }
+}
+
+// Returns true when a CUDA runtime call succeeded; otherwise prints the
+// failing step and the runtime's error string to stderr and returns false.
+static bool cuda_ok(cudaError_t err, const char* what)
+{
+    if (err != cudaSuccess)
+    {
+        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
+        return false;
+    }
+
+    return true;
+}
+
+// Runs the addition kernel over random integer-valued decimals and checks every
+// device result against the same sum computed on the host.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<int> dist{1, 4999};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type(dist(rng));
+    }
+
+    // Round the block count up so every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    if (!cuda_ok(cudaDeviceSetLimit(cudaLimitStackSize, 2048), "cudaDeviceSetLimit"))
+    {
+        return EXIT_FAILURE;
+    }
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+
+    // A kernel launch returns no status itself; query it with cudaGetLastError
+    if (!cuda_ok(cudaGetLastError(), "cuda_test kernel launch"))
+    {
+        return EXIT_FAILURE;
+    }
+
+    // Errors raised while the kernel runs surface when we synchronize
+    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
+    {
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference: the same sums computed without the GPU
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] + input_vector[i]);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}