Skip to content

Commit

Permalink
Merge BoringSSL '13c9d5c': Always end BN_mod_exp_mont_consttime with …
Browse files Browse the repository at this point in the history
…normal Montgomery reduction.

The impact analysis is the same as BoringSSL's. There is further
protection provided by the fact that in the RSA computation we always
double-check the result against faults ("Verify the result" in
`private_exponentiate`). In the other use, the result is immediately
followed by a multiplication that would fully reduce the result.

All the test vectors from BoringSSL were imported except the ones with
moduli with bit sizes that are not a multiple of 512, since that is an
enforced preresquisite in *ring*. Since *ring* doesn't use RSAZ the
test vector that BoringSSL had temporarily commented out is also
enabled.

```
git diff \
      13c9d5c:crypto/fipsmodule/bn/bn_tests.txt \
      src/arithmetic/bigint_elem_exp_consttime_tests.txt
```
  • Loading branch information
briansmith committed Sep 15, 2023
2 parents fd3f3d5 + 30ad170 commit c857cb1
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 245 deletions.
1 change: 0 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -952,7 +952,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
"aes_nohw_set_encrypt_key",
"aesni_gcm_decrypt",
"aesni_gcm_encrypt",
"bn_from_montgomery",
"bn_from_montgomery_in_place",
"bn_gather5",
"bn_mul_mont",
Expand Down
197 changes: 0 additions & 197 deletions crypto/fipsmodule/bn/asm/x86_64-mont5.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2088,194 +2088,6 @@
.size __bn_post4x_internal,.-__bn_post4x_internal
___
}
{
$code.=<<___;
.globl bn_from_montgomery
.type bn_from_montgomery,\@abi-omnipotent
.align 32
bn_from_montgomery:
.cfi_startproc
testl \$7,`($win64?"48(%rsp)":"%r9d")`
jz bn_from_mont8x
xor %eax,%eax
ret
.cfi_endproc
.size bn_from_montgomery,.-bn_from_montgomery

.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
.cfi_startproc
.byte 0x67
mov %rsp,%rax
.cfi_def_cfa_register %rax
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lfrom_prologue:

shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num
mov ($n0),$n0 # *n0

##############################################################
# Ensure that stack frame doesn't alias with $rptr+3*$num
# modulo 4096, which covers ret[num], am[num] and n[num]
# (see bn_exp.c). The stack is allocated to aligned with
# bn_power5's frame, and as bn_from_montgomery happens to be
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
sub %r11,%rbp # align with $aptr
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done

.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
.Lfrom_sp_done:
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
jmp .Lfrom_page_walk_done

.Lfrom_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lfrom_page_walk
.Lfrom_page_walk_done:

mov $num,%r10
neg $num

##############################################################
# Stack layout
#
# +0 saved $num, used in reduction section
# +8 &t[2*$num], used in reduction section
# +32 saved *n0
# +40 saved %rsp
# +48 t[2*$num]
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
.cfi_cfa_expression %rsp+40,deref,+8
.Lfrom_body:
mov $num,%r11
lea 48(%rsp),%rax
pxor %xmm0,%xmm0
jmp .Lmul_by_1

.align 32
.Lmul_by_1:
movdqu ($aptr),%xmm1
movdqu 16($aptr),%xmm2
movdqu 32($aptr),%xmm3
movdqa %xmm0,(%rax,$num)
movdqu 48($aptr),%xmm4
movdqa %xmm0,16(%rax,$num)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
movdqa %xmm1,(%rax)
movdqa %xmm0,32(%rax,$num)
movdqa %xmm2,16(%rax)
movdqa %xmm0,48(%rax,$num)
movdqa %xmm3,32(%rax)
movdqa %xmm4,48(%rax)
lea 64(%rax),%rax
sub \$64,%r11
jnz .Lmul_by_1

movq $rptr,%xmm1
movq $nptr,%xmm2
.byte 0x67
mov $nptr,%rbp
movq %r10, %xmm3 # -num
___
$code.=<<___ if ($addx);
leaq OPENSSL_ia32cap_P(%rip),%r11
mov 8(%r11),%r11d
and \$0x80108,%r11d
cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
jne .Lfrom_mont_nox

lea (%rax,$num),$rptr
call __bn_sqrx8x_reduction
call __bn_postx4x_internal

pxor %xmm0,%xmm0
lea 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_nox:
___
$code.=<<___;
call __bn_sqr8x_reduction
call __bn_post4x_internal

pxor %xmm0,%xmm0
lea 48(%rsp),%rax
jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_zero:
mov 40(%rsp),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
movdqa %xmm0,16*0(%rax)
movdqa %xmm0,16*1(%rax)
movdqa %xmm0,16*2(%rax)
movdqa %xmm0,16*3(%rax)
lea 16*4(%rax),%rax
sub \$32,$num
jnz .Lfrom_mont_zero

mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
ret
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
___
}
}}}
if ($addx) {{{
Expand Down Expand Up @@ -3864,10 +3676,6 @@
.rva .LSEH_begin_bn_power5
.rva .LSEH_end_bn_power5
.rva .LSEH_info_bn_power5

.rva .LSEH_begin_bn_from_mont8x
.rva .LSEH_end_bn_from_mont8x
.rva .LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
.rva .LSEH_begin_bn_mulx4x_mont_gather5
Expand Down Expand Up @@ -3899,11 +3707,6 @@
.byte 9,0,0,0
.rva mul_handler
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
.rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
Expand Down
77 changes: 30 additions & 47 deletions src/arithmetic/bigint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,29 +146,28 @@ impl<M, E> Elem<M, E> {
}
}

impl<M, E: ReductionEncoding> Elem<M, E> {
fn decode_once(self, m: &Modulus<M>) -> Elem<M, <E as ReductionEncoding>::Output> {
// A multiplication isn't required since we're multiplying by the
// unencoded value one (1); only a Montgomery reduction is needed.
// However the only non-multiplication Montgomery reduction function we
// have requires the input to be large, so we avoid using it here.
let mut limbs = self.limbs;
let num_limbs = m.width().num_limbs;
let mut one = [0; MODULUS_MAX_LIMBS];
one[0] = 1;
let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
Elem {
limbs,
encoding: PhantomData,
}
/// Does a Montgomery reduction on `limbs` assuming they are Montgomery-encoded ('R') and assuming
/// they are the same size as `m`, but perhaps not reduced mod `m`. The result will be
/// fully reduced mod `m`.
fn from_montgomery_amm<M>(limbs: BoxedLimbs<M>, m: &Modulus<M>) -> Elem<M, Unencoded> {
debug_assert_eq!(limbs.len(), m.limbs().len());

let mut limbs = limbs;
let num_limbs = m.width().num_limbs;
let mut one = [0; MODULUS_MAX_LIMBS];
one[0] = 1;
let one = &one[..num_limbs]; // assert!(num_limbs <= MODULUS_MAX_LIMBS);
limbs_mont_mul(&mut limbs, one, m.limbs(), m.n0(), m.cpu_features());
Elem {
limbs,
encoding: PhantomData,
}
}

impl<M> Elem<M, R> {
#[inline]
pub fn into_unencoded(self, m: &Modulus<M>) -> Elem<M, Unencoded> {
self.decode_once(m)
from_montgomery_amm(self.limbs, m)
}
}

Expand Down Expand Up @@ -623,7 +622,13 @@ pub fn elem_exp_consttime<M>(
limbs_mont_square(acc, m, n0, cpu_features);
}

fn gather_mul_base(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
fn gather_mul_base_amm(
table: &[Limb],
state: &mut [Limb],
n0: &N0,
i: Window,
num_limbs: usize,
) {
prefixed_extern! {
fn bn_mul_mont_gather5(
rp: *mut Limb,
Expand All @@ -648,7 +653,7 @@ pub fn elem_exp_consttime<M>(
}
}

fn power(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
fn power_amm(table: &[Limb], state: &mut [Limb], n0: &N0, i: Window, num_limbs: usize) {
prefixed_extern! {
fn bn_power5(
r: *mut Limb,
Expand Down Expand Up @@ -690,7 +695,7 @@ pub fn elem_exp_consttime<M>(
// TODO: Optimize this to avoid gathering
gather_square(table, state, m.n0(), i / 2, num_limbs, cpu_features);
} else {
gather_mul_base(table, state, m.n0(), i - 1, num_limbs)
gather_mul_base_amm(table, state, m.n0(), i - 1, num_limbs)
};
scatter(table, state, i, num_limbs);
}
Expand All @@ -702,37 +707,15 @@ pub fn elem_exp_consttime<M>(
state
},
|state, window| {
power(table, state, m.n0(), window, num_limbs);
power_amm(table, state, m.n0(), window, num_limbs);
state
},
);

prefixed_extern! {
fn bn_from_montgomery(
r: *mut Limb,
a: *const Limb,
not_used: *const Limb,
n: *const Limb,
n0: &N0,
num: c::size_t,
) -> bssl::Result;
}
Result::from(unsafe {
bn_from_montgomery(
entry_mut(state, ACC, num_limbs).as_mut_ptr(),
entry(state, ACC, num_limbs).as_ptr(),
core::ptr::null(),
entry(state, M, num_limbs).as_ptr(),
m.n0(),
num_limbs,
)
})?;
let mut r = Elem {
limbs: base.limbs,
encoding: PhantomData,
};
r.limbs.copy_from_slice(entry(state, ACC, num_limbs));
Ok(r)
let mut r_amm = base.limbs;
r_amm.copy_from_slice(entry(state, ACC, num_limbs));

Ok(from_montgomery_amm(r_amm, m))
}

/// Verified a == b**-1 (mod m), i.e. a**-1 == b (mod m).
Expand Down
Loading

0 comments on commit c857cb1

Please sign in to comment.