Skip to content
This repository was archived by the owner on Feb 17, 2025. It is now read-only.

Commit

Permalink
Implementation of no-carry Montgomery multiplication.
Browse files Browse the repository at this point in the history
  • Loading branch information
martun committed Jul 22, 2024
1 parent 93edae7 commit 6b3c8e8
Show file tree
Hide file tree
Showing 4 changed files with 267 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,18 +99,15 @@ namespace boost {

// Bitwise complement for non-trivial cpp_int_modular backends: result = ~o,
// computed limb by limb, then normalized.
// NOTE(review): this span is a unified-diff paste; the duplicated enable_if
// lines below are the removed (old) and added (new) variants of the same
// SFINAE condition interleaved — not valid stand-alone C++ as shown here.
template<unsigned Bits>
BOOST_MP_FORCEINLINE BOOST_MP_CXX14_CONSTEXPR typename std::enable_if<
boost::multiprecision::is_unsigned_number<cpp_int_modular_backend<Bits>>::value &&
!boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value &&
!boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value>::
type
!boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value>::type
eval_complement(cpp_int_modular_backend<Bits>& result, const cpp_int_modular_backend<Bits>& o) noexcept {

unsigned os = o.size();
// Flip every limb of the operand into the result.
for (unsigned i = 0; i < os; ++i)
result.limbs()[i] = ~o.limbs()[i];
// normalize() presumably masks off the unused high bits that the complement
// sets above position Bits — confirm against the backend's definition.
result.normalize();
}
#ifndef TVM

// Left shift will throw away upper bits.
// This function must be called only when s % 8 == 0, i.e. we shift bytes.
template<unsigned Bits>
Expand All @@ -129,7 +126,6 @@ namespace boost {
std::memset(pc, 0, bytes);
}
}
#endif

// Left shift will throw away upper bits.
// This function must be called only when s % limb_bits == 0, i.e. we shift limbs, which are normally 64 bit.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@ namespace boost {
initialize_modulus(m);
initialize_barrett_params();
initialize_montgomery_params();

m_no_carry_montgomery_mul_allowed = is_applicable_for_no_carry_montgomery_mul();
}

public:
Expand Down Expand Up @@ -417,9 +419,27 @@ namespace boost {
barrett_reduce(result, tmp);
}

template<typename Backend1>
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul(Backend1 &result, const Backend1 &y) const {
return montgomery_mul_impl(result, y, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend1>::value>());
// Dispatches Montgomery multiplication (non-trivial backends) to one of the
// two algorithms: the faster no-carry variant when the modulus qualifies,
// otherwise the classic CIOS implementation.
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul(
    Backend &result, const Backend &y,
    std::integral_constant<bool, false> const&) const {
    if (m_no_carry_montgomery_mul_allowed) {
        montgomery_mul_no_carry_impl(result, y);
        return;
    }
    montgomery_mul_CIOS_impl(result, y, std::integral_constant<bool, false>());
}

void montgomery_mul(Backend &result, const Backend &y, std::integral_constant<bool, true> const&) const {
montgomery_mul_CIOS_impl(
result,
y,
std::integral_constant<bool, true>() );
}

//
Expand All @@ -428,7 +448,9 @@ namespace boost {
//
// A specialization for trivial cpp_int_modular types only.
template<typename Backend1>
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(Backend1 &result, const Backend1 &y, std::integral_constant<bool, true> const&) const {
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(
Backend1 &result, const Backend1 &y,
std::integral_constant<bool, true> const&) const {
BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));

Backend_padded_limbs A(internal_limb_type(0u));
Expand Down Expand Up @@ -487,9 +509,160 @@ namespace boost {
result = A;
}

// Given a value represented in 'double_limb_type', decomposes it into
// two 'limb_type' variables, based on high order bits and low order bits.
// Here 'a' receives the high-order bits of 'X', and 'b' the low-order bits.
static BOOST_MP_CXX14_CONSTEXPR void dbl_limb_to_limbs(
        const internal_double_limb_type& X,
        internal_limb_type& a,
        internal_limb_type& b ) {
    // Explicit casts document the intentional narrowing of the double limb
    // (the original relied on implicit truncation, which trips -Wconversion).
    b = static_cast<internal_limb_type>(X);
    a = static_cast<internal_limb_type>(X >> limb_bits);
}

// Tests if the faster no-carry Montgomery multiplication is applicable.
// Requirements (per the gnark write-up referenced in montgomery_mul_no_carry_impl):
// 1. The modulus leaves at least one unused (spare) bit in its top limb,
//    i.e. Bits is not an exact multiple of the limb width. E.g. a 255-bit
//    modulus stored in 4 x 64-bit limbs has 1 spare bit.
// 2. Some other bit in the modulus is 0 (checked via m_mod_compliment != 1).
// 3. The number has < 12 limbs.
// We don't need the template argument Backend1, it's just here to enable specialization.
template<class Backend1 = Backend>
BOOST_MP_CXX14_CONSTEXPR typename boost::enable_if_c<!is_trivial_cpp_int_modular<Backend1>::value, bool>::type
is_applicable_for_no_carry_montgomery_mul() const {
    // FIX(review): the original test was 'Bits % sizeof(internal_limb_type)',
    // which compares a bit count against a size in BYTES. That was sound but
    // overly strict (it rejected e.g. 248-bit moduli that do have spare bits
    // in a 64-bit top limb). 'limb_bits' is the limb width in bits, matching
    // the stated intent of requirement 1 above.
    return m_mod.internal_limb_count < 12 && (Bits % limb_bits != 0) &&
        !eval_eq(m_mod_compliment, Backend(internal_limb_type(1u)));
}

// Trivial cpp_int_modular backends never qualify for the no-carry algorithm;
// they always take the CIOS path.
template<class Backend1 = Backend>
BOOST_MP_CXX14_CONSTEXPR typename boost::enable_if_c<is_trivial_cpp_int_modular<Backend1>::value, bool>::type
is_applicable_for_no_carry_montgomery_mul() const {
return false;
}

// Non-carry implementation of Montgomery multiplication.
// Implemented from pseudo-code at
// "https://hackmd.io/@gnark/modular_multiplication".
// Computes c <- c * b * R^{-1} mod m, where 'c' is both the first factor and
// the output. Preconditions (asserted below): both inputs are already reduced
// modulo m, and the modulus passes is_applicable_for_no_carry_montgomery_mul(),
// i.e. it has a spare high bit so the limb-level additions below cannot
// overflow a limb pair — that is what removes the explicit carry tracking.
template< typename Backend1 >
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_no_carry_impl(
Backend1& c,
const Backend1& b) const {

BOOST_ASSERT( eval_lt(c, m_mod) && eval_lt(b, m_mod) );
BOOST_ASSERT( is_applicable_for_no_carry_montgomery_mul() );

// Obtain number of limbs
constexpr int N = Backend1::internal_limb_count;

const Backend1 a( c ); // Copy the first argument, as the implemented
// algorithm doesn't work in-place.

// Prepare temporary variables
c = internal_limb_type(0u);
internal_limb_type A( 0u ), C( 0u );
internal_double_limb_type tmp( 0u );
internal_limb_type dummy ( 0u );

auto* a_limbs = a.limbs();
auto* b_limbs = b.limbs();
auto* c_limbs = c.limbs();
auto* m_mod_limbs = m_mod.limbs();

for ( int i = 0; i < N; ++i ) {
// "(A,t[0]) := t[0] + a[0]*b[i]"
tmp = a_limbs[0];
tmp *= b_limbs[i];
tmp += c_limbs[0];
modular_functions_fixed::dbl_limb_to_limbs( tmp, A, c_limbs[0] );

// "m := t[0]*q'[0] mod W"
// m_montgomery_p_dash is presumably the standard Montgomery constant
// -m^{-1} mod 2^limb_bits — confirm against initialize_montgomery_params.
tmp = c_limbs[0];
//tmp *= q.limbs()[0];
tmp *= m_montgomery_p_dash;
// tmp = -tmp;
// Note that m is a shorter integer, and we are taking the last bits of tmp.
internal_limb_type m = tmp;

// "(C,_) := t[0] + m*q[0]"
// Only the carry C matters here; 'dummy' receives the discarded low half.
tmp = m;
tmp *= m_mod_limbs[0];
tmp += c_limbs[0];
modular_functions_fixed::dbl_limb_to_limbs( tmp, C, dummy );

// The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4.
// Each step updates accumulator limb j and, one position behind, stores the
// reduced limb at j-1 — this realizes the implicit one-limb right shift of
// Montgomery reduction.
std::size_t j = 1;
for (; j + 3 <= N; j += 3) {
// For j
// "(A,t[j]) := t[j] + a[j]*b[i] + A"
tmp = a_limbs[j];
tmp *= b_limbs[i];
tmp += c_limbs[j];
tmp += A;
modular_functions_fixed::dbl_limb_to_limbs( tmp, A, c_limbs[j] );

// "(C,t[j-1]) := t[j] + m*q[j] + C"
tmp = m;
tmp *= m_mod_limbs[j];
tmp += c_limbs[j];
tmp += C;
modular_functions_fixed::dbl_limb_to_limbs( tmp, C, c_limbs[j-1] );

// For j + 1
// "(A,t[j]) := t[j] + a[j]*b[i] + A"
tmp = a_limbs[j + 1];
tmp *= b_limbs[i];
tmp += c_limbs[j + 1];
tmp += A;
modular_functions_fixed::dbl_limb_to_limbs( tmp, A, c_limbs[j + 1] );

// "(C,t[j-1]) := t[j] + m*q[j] + C"
tmp = m;
tmp *= m_mod_limbs[j + 1];
tmp += c_limbs[j + 1];
tmp += C;
modular_functions_fixed::dbl_limb_to_limbs( tmp, C, c_limbs[j] );

// For j + 2
// "(A,t[j]) := t[j] + a[j]*b[i] + A"
tmp = a_limbs[j + 2];
tmp *= b_limbs[i];
tmp += c_limbs[j + 2];
tmp += A;
modular_functions_fixed::dbl_limb_to_limbs( tmp, A, c_limbs[j + 2] );

// "(C,t[j-1]) := t[j] + m*q[j] + C"
tmp = m;
tmp *= m_mod_limbs[j + 2];
tmp += c_limbs[j + 2];
tmp += C;
modular_functions_fixed::dbl_limb_to_limbs( tmp, C, c_limbs[j + 1] );
}

// Tail loop for the remaining (N - 1) % 3 limbs.
for ( ; j < N; ++j ) {
// "(A,t[j]) := t[j] + a[j]*b[i] + A"
tmp = a_limbs[j];
tmp *= b_limbs[i];
tmp += c_limbs[j];
tmp += A;
modular_functions_fixed::dbl_limb_to_limbs( tmp, A, c_limbs[j] );

// "(C,t[j-1]) := t[j] + m*q[j] + C"
tmp = m;
tmp *= m_mod_limbs[j];
tmp += c_limbs[j];
tmp += C;
modular_functions_fixed::dbl_limb_to_limbs( tmp, C, c_limbs[j-1] );
}

// "t[N-1] = C + A"
// The spare top bit of the modulus is what keeps C + A from overflowing a
// limb here. NOTE(review): there is no final conditional subtraction in
// this routine, so it relies on the no-carry variant keeping the result
// below the modulus — confirm against the gnark write-up referenced above.
c_limbs[N-1] = C + A;
}
}

// A specialization for non-trivial cpp_int_modular types only.
template<typename Backend1>
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(Backend1 &result, const Backend1 &y,
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_CIOS_impl(Backend1 &result, const Backend1 &y,
std::integral_constant<bool, false> const&) const {
BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));

Expand All @@ -514,8 +687,7 @@ namespace boost {
internal_double_limb_type z = 0;
internal_double_limb_type z2 = 0;

std::size_t i = 0;
while (i < mod_size) {
for (std::size_t i = 0; i < mod_size; ++i) {
x_i = x_limbs[i];
A_0 = A_limbs[0];
u_i = (A_0 + x_i * y_last_limb) * m_montgomery_p_dash;
Expand All @@ -534,7 +706,7 @@ namespace boost {

std::size_t j = 1;

// We want to do this for every 3, because normally mod_size == 4.
// The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4.
internal_double_limb_type t = 0, t2 = 0;
for (; j + 3 <= mod_size; j += 3) {
// For j
Expand Down Expand Up @@ -589,7 +761,6 @@ namespace boost {
A_limbs[mod_size - 1] = static_cast<internal_limb_type>(tmp);
carry = static_cast<internal_limb_type>(
tmp >> std::numeric_limits<internal_limb_type>::digits);
++i;
}

if (carry) {
Expand All @@ -602,6 +773,75 @@ namespace boost {
result = A;
}

//
// WARNING: could be errors here due to trivial backend -- more tests needed
// TODO(martun): optimize this function, it obviously does not need to be this long.
//
// A specialization for trivial cpp_int_modular types only.
// CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication:
// result <- result * y * R^{-1} mod m. 'A' is the running accumulator held
// in a padded backend with room for one extra carry limb.
template< typename Backend1 >
BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_CIOS_impl(
Backend1& result,
const Backend1& y,
std::integral_constant<bool, true> const& ) const {

BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));

Backend_padded_limbs A(internal_limb_type(0u));
const size_t mod_size = m_mod.size();
// NOTE(review): despite the "_last_limb" names, index 0 here is the
// least-significant limb of m_mod / y.
auto mod_last_limb = static_cast<internal_double_limb_type>(get_limb_value(m_mod, 0));
auto y_last_limb = get_limb_value(y, 0);

// One outer iteration per limb x_i of the first factor.
for (size_t i = 0; i < mod_size; i++) {
auto x_i = get_limb_value(result, i);
auto A_0 = A.limbs()[0];
// u_i is chosen so that A + x_i*y + u_i*m is divisible by the limb base.
internal_limb_type u_i = (A_0 + x_i * y_last_limb) * m_montgomery_p_dash;

// A += x[i] * y + u_i * m followed by a 1 limb-shift to the right
// k carries the multiply-accumulate chain for x_i*y, k2 for u_i*m.
internal_limb_type k = 0;
internal_limb_type k2 = 0;

internal_double_limb_type z = static_cast<internal_double_limb_type>(y_last_limb) *
static_cast<internal_double_limb_type>(x_i) +
A_0 + k;
internal_double_limb_type z2 = mod_last_limb * static_cast<internal_double_limb_type>(u_i) +
static_cast<internal_limb_type>(z) + k2;
k = static_cast<internal_limb_type>(z >> std::numeric_limits<internal_limb_type>::digits);
k2 = static_cast<internal_limb_type>(z2 >> std::numeric_limits<internal_limb_type>::digits);

// Writing to A[j-1] performs the one-limb right shift of the reduction.
for (size_t j = 1; j < mod_size; ++j) {
internal_double_limb_type t =
static_cast<internal_double_limb_type>(get_limb_value(y, j)) *
static_cast<internal_double_limb_type>(x_i) +
A.limbs()[j] + k;
internal_double_limb_type t2 =
static_cast<internal_double_limb_type>(get_limb_value(m_mod, j)) *
static_cast<internal_double_limb_type>(u_i) +
static_cast<internal_limb_type>(t) + k2;
A.limbs()[j - 1] = static_cast<internal_limb_type>(t2);
k = static_cast<internal_limb_type>(t >>
std::numeric_limits<internal_limb_type>::digits);
k2 = static_cast<internal_limb_type>(t2 >>
std::numeric_limits<internal_limb_type>::digits);
}
// Fold the two outstanding carries into the top limbs of A.
internal_double_limb_type tmp =
static_cast<internal_double_limb_type>(
custom_get_limb_value<internal_limb_type>(A, mod_size)) +
k + k2;
custom_set_limb_value<internal_limb_type>(A, mod_size - 1,
static_cast<internal_limb_type>(tmp));
custom_set_limb_value<internal_limb_type>(
A, mod_size,
static_cast<internal_limb_type>(tmp >>
std::numeric_limits<internal_limb_type>::digits));
}

// Final conditional subtraction brings the accumulator into [0, m).
if (!eval_lt(A, m_mod)) {
eval_subtract(A, m_mod);
}

result = A;
}

template<typename Backend1, typename Backend2, typename Backend3,
/// result should fit in the output parameter
typename = typename boost::enable_if_c<boost::multiprecision::backends::max_precision<Backend1>::value >=
Expand Down Expand Up @@ -670,12 +910,12 @@ namespace boost {
internal_limb_type lsb = exp.limbs()[0] & 1u;
custom_right_shift(exp, static_cast<internal_limb_type>(1u));
if (lsb) {
montgomery_mul(R_mod_m, base);
montgomery_mul(R_mod_m, base, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend>::value>());
if (eval_eq(exp, static_cast<internal_limb_type>(0u))) {
break;
}
}
montgomery_mul(base, base);
montgomery_mul(base, base, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend>::value>());
}
result = R_mod_m;
}
Expand All @@ -686,6 +926,7 @@ namespace boost {
m_montgomery_r2 = o.get_r2();
m_montgomery_p_dash = o.get_p_dash();
m_mod_compliment = o.get_mod_compliment();
m_no_carry_montgomery_mul_allowed = is_applicable_for_no_carry_montgomery_mul();

return *this;
}
Expand All @@ -703,9 +944,14 @@ namespace boost {
Backend_doubled_1 m_barrett_mu;
Backend m_montgomery_r2;
internal_limb_type m_montgomery_p_dash = 0;

// If set, no-carry optimization is allowed. Must be initialized by function
// is_applicable_for_no_carry_montgomery_mul() after initialization.
bool m_no_carry_montgomery_mul_allowed = false;
};
} // namespace backends
} // namespace multiprecision
} // namespace boost

#endif // CRYPTO3_MULTIPRECISION_MODULAR_FUNCTIONS_FIXED_PRECISION_HPP

Loading

0 comments on commit 6b3c8e8

Please sign in to comment.