diff --git a/libs/multiprecision/include/nil/crypto3/multiprecision/cpp_int_modular/bitwise.hpp b/libs/multiprecision/include/nil/crypto3/multiprecision/cpp_int_modular/bitwise.hpp
index 258630d2f1..7c466a9415 100644
--- a/libs/multiprecision/include/nil/crypto3/multiprecision/cpp_int_modular/bitwise.hpp
+++ b/libs/multiprecision/include/nil/crypto3/multiprecision/cpp_int_modular/bitwise.hpp
@@ -99,10 +99,7 @@ namespace boost {
             template<unsigned Bits>
             BOOST_MP_FORCEINLINE BOOST_MP_CXX14_CONSTEXPR typename std::enable_if<
-                boost::multiprecision::is_unsigned_number<cpp_int_modular_backend<Bits>>::value &&
-                !boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value &&
-                !boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value>::type
+                !boost::multiprecision::backends::is_trivial_cpp_int_modular<cpp_int_modular_backend<Bits>>::value>::type
             eval_complement(cpp_int_modular_backend<Bits>& result, const cpp_int_modular_backend<Bits>& o) noexcept {
                 unsigned os = o.size();
@@ -110,7 +107,7 @@ namespace boost {
                     result.limbs()[i] = ~o.limbs()[i];
                 result.normalize();
             }
-#ifndef TVM
+
             // Left shift will throw away upper bits.
             // This function must be called only when s % 8 == 0, i.e. we shift bytes.
             template<unsigned Bits>
@@ -129,7 +126,6 @@ namespace boost {
                     std::memset(pc, 0, bytes);
                 }
             }
-#endif
 
             // Left shift will throw away upper bits.
             // This function must be called only when s % limb_bits == 0, i.e. we shift limbs, which are normally 64 bit.
diff --git a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp
index 01cede1df4..27bb4ff45b 100644
--- a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp
+++ b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_functions_fixed.hpp
@@ -245,6 +245,8 @@ namespace boost {
                    initialize_modulus(m);
                    initialize_barrett_params();
                    initialize_montgomery_params();
+
+                   m_no_carry_montgomery_mul_allowed = is_applicable_for_no_carry_montgomery_mul();
                }

            public:
@@ -417,9 +419,27 @@ namespace boost {
                    barrett_reduce(result, tmp);
                }

-                template<typename Backend1>
-                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul(Backend1 &result, const Backend1 &y) const {
-                    return montgomery_mul_impl(result, y, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend1>::value>());
+                // Delegates Montgomery multiplication to one of the corresponding algorithms.
+                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul(
+                    Backend &result, const Backend &y,
+                    std::integral_constant<bool, false> const&) const {
+
+                    if (m_no_carry_montgomery_mul_allowed)
+                        montgomery_mul_no_carry_impl(result, y);
+                    else
+                        montgomery_mul_CIOS_impl(result, y, std::integral_constant<bool, false>());
+                }
+
+                void montgomery_mul(Backend &result, const Backend &y,
+                                    std::integral_constant<bool, true> const&) const {
+                    montgomery_mul_CIOS_impl(result, y, std::integral_constant<bool, true>());
                }
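The two overloads above pick the implementation with a `std::integral_constant` tag at compile time, and the non-trivial path additionally branches on a run-time flag. For reference, a minimal self-contained sketch of that dispatch pattern; every name here (`Dispatcher`, `mul_fast`, `mul_generic`) is illustrative, not library API:

```cpp
// Tag dispatch: the third parameter's *type* selects the overload at compile
// time; inside the non-trivial overload a run-time flag picks the algorithm.
#include <iostream>
#include <type_traits>

struct Dispatcher {
    bool fast_path_allowed = true;

    // Selected when the tag is integral_constant<bool, false> (non-trivial case).
    void mul(int& result, int y, std::integral_constant<bool, false> const&) const {
        if (fast_path_allowed)
            mul_fast(result, y);
        else
            mul_generic(result, y);
    }

    // Selected when the tag is integral_constant<bool, true> (trivial case).
    void mul(int& result, int y, std::integral_constant<bool, true> const&) const {
        mul_generic(result, y);
    }

    void mul_fast(int& result, int y) const { result *= y; }
    void mul_generic(int& result, int y) const { result *= y; }
};

int main() {
    Dispatcher d;
    int r = 6;
    // The caller passes the tag, mirroring montgomery_mul(result, y, integral_constant<...>()).
    d.mul(r, 7, std::integral_constant<bool, false>());
    std::cout << r << "\n";   // prints 42
}
```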
@@ -428,7 +448,9 @@ namespace boost {
                // A specialization for trivial cpp_int_modular types only.
                template<typename Backend1>
-                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(Backend1 &result, const Backend1 &y, std::integral_constant<bool, true> const&) const {
+                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(
+                    Backend1 &result, const Backend1 &y,
+                    std::integral_constant<bool, true> const&) const {
                    BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));

                    Backend_padded_limbs A(internal_limb_type(0u));
@@ -487,9 +509,160 @@ namespace boost {
                    result = A;
                }

+                // Given a value represented in 'double_limb_type', decomposes it into
+                // two 'limb_type' variables, based on high order bits and low order bits.
+                // Here 'a' receives the high order bits of 'X', and 'b' receives the low order bits.
+                static BOOST_MP_CXX14_CONSTEXPR void dbl_limb_to_limbs(
+                    const internal_double_limb_type& X,
+                    internal_limb_type& a,
+                    internal_limb_type& b) {
+                    b = X;
+                    a = X >> limb_bits;
+                }
+
+                // Tests if the faster implementation of Montgomery multiplication is possible.
+                // We don't need the template argument Backend1, it's just here to enable specialization.
+                template<typename Backend1 = Backend>
+                BOOST_MP_CXX14_CONSTEXPR typename boost::enable_if_c<!is_trivial_cpp_int_modular<Backend1>::value, bool>::type
+                is_applicable_for_no_carry_montgomery_mul() const {
+                    // Check that:
+                    // 1. The most significant bit of the modulus is zero, meaning we have at least 1 spare
+                    //    bit in the number. I.e. if the modulus is 255 bits, then we have 1 "unused" bit in the number.
+                    // 2. Some other bit in the modulus is 0.
+                    // 3. The number has < 12 limbs.
+                    return m_mod.internal_limb_count < 12 && (Bits % sizeof(internal_limb_type) != 0) &&
+                           !eval_eq(m_mod_compliment, Backend(internal_limb_type(1u)));
+                }
+
+                template<typename Backend1 = Backend>
+                BOOST_MP_CXX14_CONSTEXPR typename boost::enable_if_c<is_trivial_cpp_int_modular<Backend1>::value, bool>::type
+                is_applicable_for_no_carry_montgomery_mul() const {
+                    return false;
+                }
+
+                // No-carry implementation of Montgomery multiplication.
+                // Implemented from the pseudo-code at
+                // https://hackmd.io/@gnark/modular_multiplication
+                template<typename Backend1>
+                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_no_carry_impl(Backend1& c, const Backend1& b) const {
+                    BOOST_ASSERT(eval_lt(c, m_mod) && eval_lt(b, m_mod));
+                    BOOST_ASSERT(is_applicable_for_no_carry_montgomery_mul());
+
+                    // Obtain the number of limbs.
+                    constexpr int N = Backend1::internal_limb_count;
+
+                    const Backend1 a(c);    // Copy the first argument, as the implemented
+                                            // algorithm doesn't work in-place.
+
+                    // Prepare temporary variables.
+                    c = internal_limb_type(0u);
+                    internal_limb_type A(0u), C(0u);
+                    internal_double_limb_type tmp(0u);
+                    internal_limb_type dummy(0u);
+
+                    auto* a_limbs = a.limbs();
+                    auto* b_limbs = b.limbs();
+                    auto* c_limbs = c.limbs();
+                    auto* m_mod_limbs = m_mod.limbs();
+
+                    for (int i = 0; i < N; ++i) {
+                        // "(A,t[0]) := t[0] + a[0]*b[i]"
+                        tmp = a_limbs[0];
+                        tmp *= b_limbs[i];
+                        tmp += c_limbs[0];
+                        modular_functions_fixed::dbl_limb_to_limbs(tmp, A, c_limbs[0]);
+
+                        // "m := t[0]*q'[0] mod W"
+                        tmp = c_limbs[0];
+                        tmp *= m_montgomery_p_dash;
+                        // Note that m is a shorter integer, and we are taking the last bits of tmp.
+                        internal_limb_type m = tmp;
+
+                        // "(C,_) := t[0] + m*q[0]"
+                        tmp = m;
+                        tmp *= m_mod_limbs[0];
+                        tmp += c_limbs[0];
+                        modular_functions_fixed::dbl_limb_to_limbs(tmp, C, dummy);
+
+                        // The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4.
+                        std::size_t j = 1;
+                        for (; j + 3 <= N; j += 3) {
+                            // For j
+                            // "(A,t[j]) := t[j] + a[j]*b[i] + A"
+                            tmp = a_limbs[j];
+                            tmp *= b_limbs[i];
+                            tmp += c_limbs[j];
+                            tmp += A;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, A, c_limbs[j]);
+
+                            // "(C,t[j-1]) := t[j] + m*q[j] + C"
+                            tmp = m;
+                            tmp *= m_mod_limbs[j];
+                            tmp += c_limbs[j];
+                            tmp += C;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, C, c_limbs[j - 1]);
+
+                            // For j + 1
+                            // "(A,t[j]) := t[j] + a[j]*b[i] + A"
+                            tmp = a_limbs[j + 1];
+                            tmp *= b_limbs[i];
+                            tmp += c_limbs[j + 1];
+                            tmp += A;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, A, c_limbs[j + 1]);
+
+                            // "(C,t[j-1]) := t[j] + m*q[j] + C"
+                            tmp = m;
+                            tmp *= m_mod_limbs[j + 1];
+                            tmp += c_limbs[j + 1];
+                            tmp += C;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, C, c_limbs[j]);
+
+                            // For j + 2
+                            // "(A,t[j]) := t[j] + a[j]*b[i] + A"
+                            tmp = a_limbs[j + 2];
+                            tmp *= b_limbs[i];
+                            tmp += c_limbs[j + 2];
+                            tmp += A;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, A, c_limbs[j + 2]);
+
+                            // "(C,t[j-1]) := t[j] + m*q[j] + C"
+                            tmp = m;
+                            tmp *= m_mod_limbs[j + 2];
+                            tmp += c_limbs[j + 2];
+                            tmp += C;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, C, c_limbs[j + 1]);
+                        }
+
+                        for (; j < N; ++j) {
+                            // "(A,t[j]) := t[j] + a[j]*b[i] + A"
+                            tmp = a_limbs[j];
+                            tmp *= b_limbs[i];
+                            tmp += c_limbs[j];
+                            tmp += A;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, A, c_limbs[j]);
+
+                            // "(C,t[j-1]) := t[j] + m*q[j] + C"
+                            tmp = m;
+                            tmp *= m_mod_limbs[j];
+                            tmp += c_limbs[j];
+                            tmp += C;
+                            modular_functions_fixed::dbl_limb_to_limbs(tmp, C, c_limbs[j - 1]);
+                        }
+
+                        // "t[N-1] = C + A"
+                        c_limbs[N - 1] = C + A;
+                    }
+                }
+
                // A specialization for non-trivial cpp_int_modular types only.
                template<typename Backend1>
-                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_impl(Backend1 &result, const Backend1 &y,
+                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_CIOS_impl(Backend1 &result, const Backend1 &y,
                    std::integral_constant<bool, false> const&) const {
                    BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));
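The whole no-carry kernel is built from one primitive: multiply two limbs into a double limb, add the running terms, then split the result into a carry limb and a stored limb, which is exactly what `dbl_limb_to_limbs` does. A minimal sketch of that step, using 32-bit limbs and a 64-bit double limb so it runs anywhere; the names mirror the patch but nothing here is library code:

```cpp
// One multiply-accumulate step "(A, t) := t + a*b" from the gnark pseudo-code:
// compute in the double limb, then split into high (carry) and low (limb) parts.
#include <cstdint>
#include <iostream>

using limb_type = std::uint32_t;
using double_limb_type = std::uint64_t;
constexpr unsigned limb_bits = 32;

// 'a' receives the high-order bits of X, 'b' the low-order bits.
constexpr void dbl_limb_to_limbs(double_limb_type X, limb_type& a, limb_type& b) {
    b = static_cast<limb_type>(X);
    a = static_cast<limb_type>(X >> limb_bits);
}

int main() {
    limb_type t = 0xFFFFFFFFu, a = 0xFFFFFFFFu, b = 2u, A = 0;
    // a*b + t = 0x2FFFFFFFD, which no longer fits in one limb.
    double_limb_type tmp = static_cast<double_limb_type>(a) * b + t;
    dbl_limb_to_limbs(tmp, A, t);
    std::cout << std::hex << "A=" << A << " t=" << t << "\n";  // A=2 t=fffffffd
}
```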
@@ -514,8 +687,7 @@ namespace boost {
                    internal_double_limb_type z = 0;
                    internal_double_limb_type z2 = 0;

-                    std::size_t i = 0;
-                    while (i < mod_size) {
+                    for (std::size_t i = 0; i < mod_size; ++i) {
                        x_i = x_limbs[i];
                        A_0 = A_limbs[0];
                        u_i = (A_0 + x_i * y_last_limb) * m_montgomery_p_dash;
@@ -534,7 +706,7 @@ namespace boost {

                        std::size_t j = 1;

-                        // We want to do this for every 3, because normally mod_size == 4.
+                        // The lower loop is unrolled. We want to do this for every 3, because normally mod_size == 4.
                        internal_double_limb_type t = 0, t2 = 0;
                        for (; j + 3 <= mod_size; j += 3) {
                            // For j
@@ -589,7 +761,6 @@ namespace boost {
                        A_limbs[mod_size - 1] = static_cast<internal_limb_type>(tmp);
                        carry = static_cast<internal_limb_type>(
                            tmp >> std::numeric_limits<internal_limb_type>::digits);
-                        ++i;
                    }

                    if (carry) {
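Both multiplication kernels unroll their inner loop by a factor of 3 with a scalar tail loop, because the common modulus size is 4 limbs and limb 0 is handled before the loop, so `j` runs over exactly 3 limbs. The shape of that pattern, reduced to a toy sum; all names are illustrative:

```cpp
// 3-way manual unrolling: a main loop that handles three elements per
// iteration, plus a tail loop for whatever remains when N % 3 != 1.
#include <cstddef>
#include <iostream>

int sum_unrolled(const int* v, std::size_t N) {
    int acc = 0;
    std::size_t j = 1;                 // the patch starts at j = 1; element 0 is special-cased
    for (; j + 3 <= N; j += 3) {       // main loop: three steps per iteration
        acc += v[j];
        acc += v[j + 1];
        acc += v[j + 2];
    }
    for (; j < N; ++j)                 // tail loop: leftover elements
        acc += v[j];
    return acc;
}

int main() {
    int v[4] = {10, 1, 2, 3};
    std::cout << sum_unrolled(v, 4) << "\n";  // 6: one unrolled pass, empty tail
}
```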
@@ -602,6 +773,75 @@ namespace boost {
                    result = A;
                }

+                //
+                // WARNING: there could be errors here due to the trivial backend -- more tests needed.
+                // TODO(martun): optimize this function, it obviously does not need to be this long.
+                //
+                // A specialization for trivial cpp_int_modular types only.
+                template<typename Backend1>
+                BOOST_MP_CXX14_CONSTEXPR void montgomery_mul_CIOS_impl(
+                    Backend1& result,
+                    const Backend1& y,
+                    std::integral_constant<bool, true> const&) const {
+
+                    BOOST_ASSERT(eval_lt(result, m_mod) && eval_lt(y, m_mod));
+
+                    Backend_padded_limbs A(internal_limb_type(0u));
+                    const size_t mod_size = m_mod.size();
+                    auto mod_last_limb = static_cast<internal_double_limb_type>(get_limb_value(m_mod, 0));
+                    auto y_last_limb = get_limb_value(y, 0);
+
+                    for (size_t i = 0; i < mod_size; i++) {
+                        auto x_i = get_limb_value(result, i);
+                        auto A_0 = A.limbs()[0];
+                        internal_limb_type u_i = (A_0 + x_i * y_last_limb) * m_montgomery_p_dash;
+
+                        // A += x[i] * y + u_i * m, followed by a 1 limb shift to the right.
+                        internal_limb_type k = 0;
+                        internal_limb_type k2 = 0;
+
+                        internal_double_limb_type z = static_cast<internal_double_limb_type>(y_last_limb) *
+                                                          static_cast<internal_double_limb_type>(x_i) +
+                                                      A_0 + k;
+                        internal_double_limb_type z2 = mod_last_limb * static_cast<internal_double_limb_type>(u_i) +
+                                                       static_cast<internal_limb_type>(z) + k2;
+                        k = static_cast<internal_limb_type>(z >> std::numeric_limits<internal_limb_type>::digits);
+                        k2 = static_cast<internal_limb_type>(z2 >> std::numeric_limits<internal_limb_type>::digits);
+
+                        for (size_t j = 1; j < mod_size; ++j) {
+                            internal_double_limb_type t =
+                                static_cast<internal_double_limb_type>(get_limb_value(y, j)) *
+                                    static_cast<internal_double_limb_type>(x_i) +
+                                A.limbs()[j] + k;
+                            internal_double_limb_type t2 =
+                                static_cast<internal_double_limb_type>(get_limb_value(m_mod, j)) *
+                                    static_cast<internal_double_limb_type>(u_i) +
+                                static_cast<internal_limb_type>(t) + k2;
+                            A.limbs()[j - 1] = static_cast<internal_limb_type>(t2);
+                            k = static_cast<internal_limb_type>(t >> std::numeric_limits<internal_limb_type>::digits);
+                            k2 = static_cast<internal_limb_type>(t2 >> std::numeric_limits<internal_limb_type>::digits);
+                        }
+                        internal_double_limb_type tmp =
+                            static_cast<internal_double_limb_type>(custom_get_limb_value(A, mod_size)) + k + k2;
+                        custom_set_limb_value(A, mod_size - 1, static_cast<internal_limb_type>(tmp));
+                        custom_set_limb_value(A, mod_size,
+                            static_cast<internal_limb_type>(tmp >> std::numeric_limits<internal_limb_type>::digits));
+                    }
+
+                    if (!eval_lt(A, m_mod)) {
+                        eval_subtract(A, m_mod);
+                    }
+
+                    result = A;
+                }
+
                template<typename Backend1, typename Backend2,
                         typename = typename boost::enable_if_c<boost::multiprecision::backends::max_precision<Backend1>::value >=
@@ -670,12 +910,12 @@ namespace boost {
                        internal_limb_type lsb = exp.limbs()[0] & 1u;
                        custom_right_shift(exp, static_cast<internal_limb_type>(1u));
                        if (lsb) {
-                            montgomery_mul(R_mod_m, base);
+                            montgomery_mul(R_mod_m, base, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend>::value>());
                            if (eval_eq(exp, static_cast<internal_limb_type>(0u))) {
                                break;
                            }
                        }
-                        montgomery_mul(base, base);
+                        montgomery_mul(base, base, std::integral_constant<bool, is_trivial_cpp_int_modular<Backend>::value>());
                    }
                    result = R_mod_m;
                }
@@ -686,6 +926,7 @@ namespace boost {
                    m_montgomery_r2 = o.get_r2();
                    m_montgomery_p_dash = o.get_p_dash();
                    m_mod_compliment = o.get_mod_compliment();
+                    m_no_carry_montgomery_mul_allowed = is_applicable_for_no_carry_montgomery_mul();

                    return *this;
                }
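The exponentiation hunk above drives `montgomery_mul` through a right-to-left square-and-multiply loop: test the low bit of the exponent, multiply into the accumulator when it is set, square the base, shift the exponent. A compact sketch of the same control flow over plain 64-bit integers, assuming a compiler with `__int128`; `mul_mod` stands in for `montgomery_mul` and `result` plays the role of `R_mod_m`:

```cpp
// Right-to-left binary (square-and-multiply) modular exponentiation.
#include <cstdint>
#include <iostream>

std::uint64_t mul_mod(std::uint64_t a, std::uint64_t b, std::uint64_t m) {
    return static_cast<std::uint64_t>((unsigned __int128)a * b % m);
}

std::uint64_t pow_mod(std::uint64_t base, std::uint64_t exp, std::uint64_t m) {
    std::uint64_t result = 1 % m;
    while (exp != 0) {
        std::uint64_t lsb = exp & 1u;   // mirrors "exp.limbs()[0] & 1u"
        exp >>= 1;                      // mirrors custom_right_shift(exp, 1)
        if (lsb) {
            result = mul_mod(result, base, m);
            if (exp == 0)
                break;                  // same early exit as the patch
        }
        base = mul_mod(base, base, m);  // square the base every round
    }
    return result;
}

int main() {
    // 561 = 3 * 11 * 17 is a Carmichael number, so 7^560 mod 561 == 1.
    std::cout << pow_mod(7, 560, 561) << "\n";  // prints 1
}
```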
@@ -703,9 +944,14 @@ namespace boost {
                Backend_doubled_1 m_barrett_mu;
                Backend m_montgomery_r2;
                internal_limb_type m_montgomery_p_dash = 0;
+
+                // If set, the no-carry optimization is allowed. Must be re-initialized via
+                // is_applicable_for_no_carry_montgomery_mul() whenever the modulus is set.
+                bool m_no_carry_montgomery_mul_allowed = false;
            };

        }    // namespace backends
    }    // namespace multiprecision
}    // namespace boost

 #endif    // CRYPTO3_MULTIPRECISION_MODULAR_FUNCTIONS_FIXED_PRECISION_HPP
+
diff --git a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_params_fixed.hpp b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_params_fixed.hpp
index eca3e708c6..57746a9832 100644
--- a/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_params_fixed.hpp
+++ b/libs/multiprecision/include/nil/crypto3/multiprecision/modular/modular_params_fixed.hpp
@@ -122,10 +122,11 @@ namespace boost {
                }
            }

-            template<typename Backend1, typename Backend2>
-            BOOST_MP_CXX14_CONSTEXPR void mod_mul(Backend1 &result, const Backend2 &y) const {
+            template<typename Backend1>
+            BOOST_MP_CXX14_CONSTEXPR void mod_mul(Backend1 &result, const Backend1 &y) const {
                if (is_odd_mod) {
-                    m_mod_obj.montgomery_mul(result, y);
+                    m_mod_obj.montgomery_mul(result, y,
+                        std::integral_constant<bool, boost::multiprecision::backends::is_trivial_cpp_int_modular<Backend1>::value>());
                } else {
                    m_mod_obj.regular_mul(result, y);
                }
diff --git a/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp b/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp
index 783d353e87..02ba419759 100644
--- a/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp
+++ b/libs/multiprecision/test/bench_test/bench_test_modular_adaptor_fixed.cpp
@@ -64,13 +64,14 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_montgomery_mult_perf_test) {
    auto mod_object = x_modular.mod_data().get_mod_obj();
    auto base_data = x_modular.base_data();
    for (int i = 0; i < SAMPLES; ++i) {
-        mod_object.montgomery_mul(base_data, res_modular.base_data());
+        mod_object.montgomery_mul(base_data, res_modular.base_data(),
+            std::integral_constant<bool, boost::multiprecision::backends::is_trivial_cpp_int_modular<Backend>::value>());
    }
    std::cout << base_data << std::endl;

    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::high_resolution_clock::now() - start);
-    std::cout << "Multiplication time: " << std::fixed << std::setprecision(3)
+    std::cout << "Multiplication time (when montgomery_mul is called directly): " << std::fixed << std::setprecision(3)
              << std::dec << elapsed.count() / SAMPLES << " ns" << std::endl;
 }

@@ -162,7 +163,7 @@ BOOST_AUTO_TEST_CASE(modular_adaptor_backend_mult_perf_test) {

    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::high_resolution_clock::now() - start);
-    std::cout << "Multiplication time: " << std::fixed << std::setprecision(3)
+    std::cout << "Multiplication time (when called from modular adaptor): " << std::fixed << std::setprecision(3)
              << elapsed.count() / SAMPLES << " ns" << std::endl;

    // Print something so the whole computation is not optimized out.
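The benchmark edits only change the reported labels and the call signature; the timing itself is the usual chrono loop. A self-contained sketch of that pattern, with a trivial multiply standing in for `montgomery_mul` and `volatile` as a cheap stand-in for a "do not optimize" barrier:

```cpp
// Micro-benchmark skeleton: run the operation SAMPLES times, time the loop
// with high_resolution_clock, report the average in nanoseconds, and print
// the result so the compiler cannot discard the whole computation.
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>

int main() {
    constexpr int SAMPLES = 1000000;
    volatile std::uint64_t x = 12345;   // volatile read keeps the loop body live
    std::uint64_t acc = 1;

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < SAMPLES; ++i)
        acc *= x;                       // stand-in for montgomery_mul
    auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::high_resolution_clock::now() - start);

    std::cout << "Multiplication time: " << std::fixed << std::setprecision(3)
              << static_cast<double>(elapsed.count()) / SAMPLES << " ns" << std::endl;
    std::cout << acc << std::endl;      // print so the computation is kept
}
```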