From f9de43b82a925c33b658fb36c5c169726b6de266 Mon Sep 17 00:00:00 2001 From: Dumitrel Loghin Date: Thu, 6 Jun 2024 17:23:49 +0800 Subject: [PATCH 1/7] poseidon avx and avx512 --- .../src/hash/arch/x86_64/goldilocks_avx2.rs | 79 +- .../src/hash/arch/x86_64/goldilocks_avx512.rs | 150 + plonky2/src/hash/arch/x86_64/mod.rs | 13 +- .../hash/arch/x86_64/poseidon_bn128_avx2.rs | 1155 ++++ .../arch/x86_64/poseidon_goldilocks_avx2.rs | 1082 +++- .../arch/x86_64/poseidon_goldilocks_avx512.rs | 366 ++ plonky2/src/hash/mod.rs | 1 + plonky2/src/hash/poseidon.rs | 18 +- plonky2/src/hash/poseidon_bn128.rs | 54 +- plonky2/src/hash/poseidon_bn128_ops.rs | 4630 +++++++++++++++++ plonky2/src/hash/poseidon_goldilocks.rs | 4 +- plonky2/src/lib.rs | 1 + 12 files changed, 7483 insertions(+), 70 deletions(-) create mode 100644 plonky2/src/hash/arch/x86_64/goldilocks_avx512.rs create mode 100644 plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs create mode 100644 plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs create mode 100644 plonky2/src/hash/poseidon_bn128_ops.rs diff --git a/plonky2/src/hash/arch/x86_64/goldilocks_avx2.rs b/plonky2/src/hash/arch/x86_64/goldilocks_avx2.rs index 6d4e679d13..95921cae4a 100644 --- a/plonky2/src/hash/arch/x86_64/goldilocks_avx2.rs +++ b/plonky2/src/hash/arch/x86_64/goldilocks_avx2.rs @@ -3,26 +3,26 @@ use core::arch::x86_64::*; use crate::hash::hash_types::RichField; -const MSB_: i64 = 0x8000000000000000u64 as i64; -const P_s_: i64 = 0x7FFFFFFF00000001u64 as i64; -const P_n_: i64 = 0xFFFFFFFF; +const MSB_1: i64 = 0x8000000000000000u64 as i64; +const P_S_1: i64 = 0x7FFFFFFF00000001u64 as i64; +const P_N_1: i64 = 0xFFFFFFFF; #[inline(always)] pub fn shift_avx(a: &__m256i) -> __m256i { unsafe { - let MSB = _mm256_set_epi64x(MSB_, MSB_, MSB_, MSB_); - _mm256_xor_si256(*a, MSB) + let msb = _mm256_set_epi64x(MSB_1, MSB_1, MSB_1, MSB_1); + _mm256_xor_si256(*a, msb) } } #[allow(dead_code)] #[inline(always)] -pub fn toCanonical_avx_s(a_s: &__m256i) -> 
__m256i { +pub fn to_canonical_avx_s(a_s: &__m256i) -> __m256i { unsafe { - let P_s = _mm256_set_epi64x(P_s_, P_s_, P_s_, P_s_); - let P_n = _mm256_set_epi64x(P_n_, P_n_, P_n_, P_n_); - let mask1_ = _mm256_cmpgt_epi64(P_s, *a_s); - let corr1_ = _mm256_andnot_si256(mask1_, P_n); + let p_s = _mm256_set_epi64x(P_S_1, P_S_1, P_S_1, P_S_1); + let p_n = _mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1); + let mask1_ = _mm256_cmpgt_epi64(p_s, *a_s); + let corr1_ = _mm256_andnot_si256(mask1_, p_n); _mm256_add_epi64(*a_s, corr1_) } } @@ -31,9 +31,9 @@ pub fn toCanonical_avx_s(a_s: &__m256i) -> __m256i { pub fn add_avx_a_sc(a_sc: &__m256i, b: &__m256i) -> __m256i { unsafe { let c0_s = _mm256_add_epi64(*a_sc, *b); - let P_n = _mm256_set_epi64x(P_n_, P_n_, P_n_, P_n_); + let p_n = _mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1); let mask_ = _mm256_cmpgt_epi64(*a_sc, c0_s); - let corr_ = _mm256_and_si256(mask_, P_n); + let corr_ = _mm256_and_si256(mask_, p_n); let c_s = _mm256_add_epi64(c0_s, corr_); shift_avx(&c_s) } @@ -69,14 +69,27 @@ pub fn sub_avx_s_b_small(a_s: &__m256i, b: &__m256i) -> __m256i { #[inline(always)] pub fn reduce_avx_128_64(c_h: &__m256i, c_l: &__m256i) -> __m256i { unsafe { - let MSB = _mm256_set_epi64x(MSB_, MSB_, MSB_, MSB_); + let msb = _mm256_set_epi64x(MSB_1, MSB_1, MSB_1, MSB_1); let c_hh = _mm256_srli_epi64(*c_h, 32); - let c_ls = _mm256_xor_si256(*c_l, MSB); + let c_ls = _mm256_xor_si256(*c_l, msb); let c1_s = sub_avx_s_b_small(&c_ls, &c_hh); - let P_n = _mm256_set_epi64x(P_n_, P_n_, P_n_, P_n_); - let c2 = _mm256_mul_epu32(*c_h, P_n); + let p_n = _mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1); + let c2 = _mm256_mul_epu32(*c_h, p_n); let c_s = add_avx_s_b_small(&c1_s, &c2); - _mm256_xor_si256(c_s, MSB) + _mm256_xor_si256(c_s, msb) + } +} + +// Here we suppose c_h < 2^32 +#[inline(always)] +pub fn reduce_avx_96_64(c_h: &__m256i, c_l: &__m256i) -> __m256i { + unsafe { + let msb = _mm256_set_epi64x(MSB_1, MSB_1, MSB_1, MSB_1); + let p_n = 
_mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1); + let c_ls = _mm256_xor_si256(*c_l, msb); + let c2 = _mm256_mul_epu32(*c_h, p_n); + let c_s = add_avx_s_b_small(&c_ls, &c2); + _mm256_xor_si256(c_s, msb) } } @@ -128,8 +141,8 @@ pub fn mult_avx_128(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { let c_ll = _mm256_mul_epu32(*a, *b); let c_ll_h = _mm256_srli_epi64(c_ll, 32); let r0 = _mm256_add_epi64(c_hl, c_ll_h); - let P_n = _mm256_set_epi64x(P_n_, P_n_, P_n_, P_n_); - let r0_l = _mm256_and_si256(r0, P_n); + let p_n = _mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1); + let r0_l = _mm256_and_si256(r0, p_n); let r0_h = _mm256_srli_epi64(r0, 32); let r1 = _mm256_add_epi64(c_lh, r0_l); // let r1_l = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(r1))); @@ -148,6 +161,21 @@ pub fn mult_avx(a: &__m256i, b: &__m256i) -> __m256i { reduce_avx_128_64(&c_h, &c_l) } +// Multiply two 64bit numbers with the assumption that the product does not overflow. +#[inline] +pub unsafe fn mul64_no_overflow(a: &__m256i, b: &__m256i) -> __m256i { + let r = _mm256_mul_epu32(*a, *b); + let ah = _mm256_srli_epi64(*a, 32); + let bh = _mm256_srli_epi64(*b, 32); + let r1 = _mm256_mul_epu32(*a, bh); + let r1 = _mm256_slli_epi64(r1, 32); + let r = _mm256_add_epi64(r, r1); + let r1 = _mm256_mul_epu32(ah, *b); + let r1 = _mm256_slli_epi64(r1, 32); + let r = _mm256_add_epi64(r, r1); + r +} + /* #[inline(always)] pub fn mult_avx_v2(a: &__m256i, b: &__m256i) -> __m256i { @@ -275,3 +303,16 @@ pub fn sbox_avx_m256i(s0: &__m256i, s1: &__m256i, s2: &__m256i) -> (__m256i, __m (r0, r1, r2) } + +#[allow(dead_code)] +#[inline(always)] +pub fn sbox_avx_one(s0: &__m256i) -> __m256i { + // x^2 + let p10 = sqr_avx(s0); + // x^3 + let p30 = mult_avx(&p10, s0); + // x^4 = (x^2)^2 + let p40 = sqr_avx(&p10); + // x^7 + mult_avx(&p40, &p30) +} diff --git a/plonky2/src/hash/arch/x86_64/goldilocks_avx512.rs b/plonky2/src/hash/arch/x86_64/goldilocks_avx512.rs new file mode 100644 index 0000000000..e223fa2fb3 --- 
/dev/null +++ b/plonky2/src/hash/arch/x86_64/goldilocks_avx512.rs @@ -0,0 +1,150 @@ +// use core::arch::asm; +use core::arch::x86_64::*; + +use crate::hash::hash_types::RichField; + +const MSB_: i64 = 0x8000000000000000u64 as i64; +const P8_: i64 = 0xFFFFFFFF00000001u64 as i64; +const P8_n_: i64 = 0xFFFFFFFF; + +#[allow(dead_code)] +#[inline(always)] +pub fn shift_avx512(a: &__m512i) -> __m512i { + unsafe { + let MSB = _mm512_set_epi64(MSB_, MSB_, MSB_, MSB_, MSB_, MSB_, MSB_, MSB_); + _mm512_xor_si512(*a, MSB) + } +} + +#[allow(dead_code)] +#[inline(always)] +pub fn toCanonical_avx512(a: &__m512i) -> __m512i { + unsafe { + let P8 = _mm512_set_epi64(P8_, P8_, P8_, P8_, P8_, P8_, P8_, P8_); + let P8_n = _mm512_set_epi64(P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_); + let result_mask = _mm512_cmpge_epu64_mask(*a, P8); + _mm512_mask_add_epi64(*a, result_mask, *a, P8_n) + } +} + +#[inline(always)] +pub fn add_avx512_b_c(a: &__m512i, b: &__m512i) -> __m512i { + unsafe { + let P8_n = _mm512_set_epi64(P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_); + let c0 = _mm512_add_epi64(*a, *b); + let result_mask = _mm512_cmpgt_epu64_mask(*a, c0); + _mm512_mask_add_epi64(c0, result_mask, c0, P8_n) + } +} + +#[inline(always)] +pub fn sub_avx512_b_c(a: &__m512i, b: &__m512i) -> __m512i { + unsafe { + let P8 = _mm512_set_epi64(P8_, P8_, P8_, P8_, P8_, P8_, P8_, P8_); + let c0 = _mm512_sub_epi64(*a, *b); + let result_mask = _mm512_cmpgt_epu64_mask(*b, *a); + _mm512_mask_add_epi64(c0, result_mask, c0, P8) + } +} + +#[inline(always)] +pub fn reduce_avx512_128_64(c_h: &__m512i, c_l: &__m512i) -> __m512i { + unsafe { + let P8_n = _mm512_set_epi64(P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_); + let c_hh = _mm512_srli_epi64(*c_h, 32); + let c1 = sub_avx512_b_c(c_l, &c_hh); + let c2 = _mm512_mul_epu32(*c_h, P8_n); + add_avx512_b_c(&c1, &c2) + } +} + +#[inline(always)] +pub fn mult_avx512_128(a: &__m512i, b: &__m512i) -> (__m512i, __m512i) { + unsafe { + let a_h = 
_mm512_srli_epi64(*a, 32); + let b_h = _mm512_srli_epi64(*b, 32); + let c_hh = _mm512_mul_epu32(a_h, b_h); + let c_hl = _mm512_mul_epu32(a_h, *b); + let c_lh = _mm512_mul_epu32(*a, b_h); + let c_ll = _mm512_mul_epu32(*a, *b); + let c_ll_h = _mm512_srli_epi64(c_ll, 32); + let r0 = _mm512_add_epi64(c_hl, c_ll_h); + let P8_n = _mm512_set_epi64(P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_, P8_n_); + let r0_l = _mm512_and_si512(r0, P8_n); + let r0_h = _mm512_srli_epi64(r0, 32); + let r1 = _mm512_add_epi64(c_lh, r0_l); + let r1_l = _mm512_slli_epi64(r1, 32); + let mask = 0xAAAAu16; + let c_l = _mm512_mask_blend_epi32(mask, c_ll, r1_l); + let r2 = _mm512_add_epi64(c_hh, r0_h); + let r1_h = _mm512_srli_epi64(r1, 32); + let c_h = _mm512_add_epi64(r2, r1_h); + (c_h, c_l) + } +} + +#[inline(always)] +pub fn mult_avx512(a: &__m512i, b: &__m512i) -> __m512i { + let (c_h, c_l) = mult_avx512_128(a, b); + reduce_avx512_128_64(&c_h, &c_l) +} + +#[inline(always)] +pub fn sqr_avx512_128(a: &__m512i) -> (__m512i, __m512i) { + unsafe { + let a_h = _mm512_srli_epi64(*a, 32); + let c_ll = _mm512_mul_epu32(*a, *a); + let c_lh = _mm512_mul_epu32(*a, a_h); + let c_hh = _mm512_mul_epu32(a_h, a_h); + let c_ll_hi = _mm512_srli_epi64(c_ll, 33); + let t0 = _mm512_add_epi64(c_lh, c_ll_hi); + let t0_hi = _mm512_srli_epi64(t0, 31); + let res_hi = _mm512_add_epi64(c_hh, t0_hi); + let c_lh_lo = _mm512_slli_epi64(c_lh, 33); + let res_lo = _mm512_add_epi64(c_ll, c_lh_lo); + (res_hi, res_lo) + } +} + +#[inline(always)] +pub fn sqr_avx512(a: &__m512i) -> __m512i { + let (c_h, c_l) = sqr_avx512_128(a); + reduce_avx512_128_64(&c_h, &c_l) +} + +#[inline(always)] +pub fn sbox_avx512<F>(state: &mut [F; 16]) +where + F: RichField, +{ + unsafe { + let s0 = _mm512_loadu_si512((&state[0..8]).as_ptr().cast::<i32>()); + let s1 = _mm512_loadu_si512((&state[8..16]).as_ptr().cast::<i32>()); + // x^2 + let p10 = sqr_avx512(&s0); + let p11 = sqr_avx512(&s1); + // x^3 + let p20 = mult_avx512(&p10, &s0); + let p21 = 
mult_avx512(&p11, &s1); + // x^4 = (x^2)^2 + let s0 = sqr_avx512(&p10); + let s1 = sqr_avx512(&p11); + // x^7 + let p10 = mult_avx512(&s0, &p20); + let p11 = mult_avx512(&s1, &p21); + _mm512_storeu_si512((&mut state[0..8]).as_mut_ptr().cast::<i32>(), p10); + _mm512_storeu_si512((&mut state[8..16]).as_mut_ptr().cast::<i32>(), p11); + } +} + +#[inline(always)] +pub fn sbox_avx512_one(s0: &__m512i) -> __m512i { + // x^2 + let p10 = sqr_avx512(s0); + // x^3 + let p30 = mult_avx512(&p10, s0); + // x^4 = (x^2)^2 + let p40 = sqr_avx512(&p10); + // x^7 + mult_avx512(&p40, &p30) +} diff --git a/plonky2/src/hash/arch/x86_64/mod.rs b/plonky2/src/hash/arch/x86_64/mod.rs index 7229fc94fd..bfd9da3359 100644 --- a/plonky2/src/hash/arch/x86_64/mod.rs +++ b/plonky2/src/hash/arch/x86_64/mod.rs @@ -2,9 +2,16 @@ // // - AVX2 // // - BMI2 (for MULX and SHRX) // #[cfg(all(target_feature = "avx2", target_feature = "bmi2"))] -#[cfg(target_feature = "avx2")] -pub mod goldilocks_avx2; +#[cfg(all(target_feature = "avx2",not(target_feature = "avx512dq")))] +pub mod poseidon_goldilocks_avx2; +#[cfg(all(target_feature = "avx2",target_feature = "avx512dq"))] +pub mod poseidon_goldilocks_avx512; #[cfg(target_feature = "avx2")] pub mod poseidon2_goldilocks_avx2; #[cfg(target_feature = "avx2")] -pub mod poseidon_goldilocks_avx2; +pub mod goldilocks_avx2; +#[cfg(target_feature = "avx512dq")] +pub mod goldilocks_avx512; +#[cfg(target_feature = "avx2")] +pub mod poseidon_bn128_avx2; + diff --git a/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs new file mode 100644 index 0000000000..50c177d39f --- /dev/null +++ b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs @@ -0,0 +1,1155 @@ +use core::arch::x86_64::*; + +use crate::hash::poseidon_bn128_ops::{ElementBN128, C, M, P, S}; + +#[allow(dead_code)] +#[inline] +unsafe fn set_zero() -> __m256i { + _mm256_set_epi64x(0, 0, 0, 0) +} + +#[allow(dead_code)] +#[inline] +unsafe fn set_one() -> __m256i { + 
_mm256_set_epi64x( + 1011752739694698287i64, + 7381016538464732718i64, + 3962172157175319849i64, + 12436184717236109307u64 as i64, + ) +} + +#[inline] +pub unsafe fn add64_no_carry(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { + /* + * a and b are signed 4 x i64. Suppose a and b represent only one i64, then: + * - (test 1): if a < 2^63 and b < 2^63 (this means a >= 0 and b >= 0) => sum does not overflow => cout = 0 + * - if a >= 2^63 and b >= 2^63 => sum overflows so sum = a + b and cout = 1 + * - (test 2): if (a < 2^63 and b >= 2^63) or (a >= 2^63 and b < 2^63) + * - (test 3): if a + b < 2^64 (this means a + b is negative in signed representation) => no overflow so cout = 0 + * - (test 3): if a + b >= 2^64 (this means a + b becomes positive in signed representation, that is, a + b >= 0) => there is overflow so cout = 1 + */ + let ones = _mm256_set_epi64x(1, 1, 1, 1); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let r = _mm256_add_epi64(*a, *b); + let ma = _mm256_cmpgt_epi64(zeros, *a); + let mb = _mm256_cmpgt_epi64(zeros, *b); + let m1 = _mm256_and_si256(ma, mb); // test 1 + let m21 = _mm256_andnot_si256(ma, mb); + let m22 = _mm256_andnot_si256(mb, ma); + let m2 = _mm256_or_si256(m21, m22); // test 2 + let m23 = _mm256_cmpgt_epi64(zeros, r); // test 3 + let m2 = _mm256_andnot_si256(m23, m2); + let m = _mm256_or_si256(m1, m2); + let co = _mm256_and_si256(m, ones); + (r, co) +} + +// cin is carry in and must be 0 or 1 +#[inline] +pub unsafe fn add64(a: &__m256i, b: &__m256i, cin: &__m256i) -> (__m256i, __m256i) { + let (r1, c1) = add64_no_carry(a, b); + let max = _mm256_set_epi64x(-1, -1, -1, -1); + let m = _mm256_cmpeq_epi64(r1, max); + let r = _mm256_add_epi64(r1, *cin); + let m = _mm256_and_si256(*cin, m); + let co = _mm256_or_si256(m, c1); + (r, co) +} + +#[inline] +unsafe fn sub64_no_borrow(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { + let ones = _mm256_set_epi64x(1, 1, 1, 1); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let r = 
_mm256_sub_epi64(*a, *b); + let m1 = _mm256_cmpgt_epi64(zeros, *a); // a < 0 ? + let m2 = _mm256_cmpgt_epi64(zeros, *b); // b < 0 ? + let m3 = _mm256_cmpgt_epi64(*b, *a); // a < b ? + let m4 = _mm256_or_si256(m2, m3); + let m5 = _mm256_andnot_si256(m1, m4); + let m6 = _mm256_and_si256(m2, m3); + let m7 = _mm256_and_si256(m1, m6); + let m = _mm256_or_si256(m5, m7); + let bo = _mm256_and_si256(m, ones); + (r, bo) +} + +// bin is borrow in and must be 0 or 1 +// TODO: revise +#[inline] +unsafe fn sub64(a: &__m256i, b: &__m256i, bin: &__m256i) -> (__m256i, __m256i) { + let ones = _mm256_set_epi64x(1, 1, 1, 1); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (r1, b1) = sub64_no_borrow(a, b); + + // TODO - delete + /* + let mut v = [0i64; 4]; + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), *a); + println!("a: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), *b); + println!("b: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), r1); + println!("r: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b1); + println!("b: {:?}", v); + */ + + let m1 = _mm256_cmpeq_epi64(*bin, ones); + let m2 = _mm256_cmpeq_epi64(r1, zeros); + let m = _mm256_and_si256(m1, m2); + let bo = _mm256_and_si256(m, ones); + let r = _mm256_sub_epi64(r1, *bin); + let bo = _mm256_or_si256(bo, b1); + + // TODO - delete + /* + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), r); + println!("r: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), bo); + println!("b: {:?}", v); + */ + + (r, bo) +} + +#[inline] +unsafe fn mul64(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { + let mut av: [u64; 4] = [0; 4]; + let mut bv: [u64; 4] = [0; 4]; + let mut hv: [u64; 4] = [0; 4]; + let mut lv: [u64; 4] = [0; 4]; + _mm256_storeu_si256(av.as_mut_ptr().cast::<__m256i>(), *a); + _mm256_storeu_si256(bv.as_mut_ptr().cast::<__m256i>(), *b); + let c0 = (av[0] as u128) * (bv[0] as u128); + let c1 = (av[1] as u128) * (bv[1] as u128); + let c2 = 
(av[2] as u128) * (bv[2] as u128); + let c3 = (av[3] as u128) * (bv[3] as u128); + (hv[0], lv[0]) = ((c0 >> 64) as u64, c0 as u64); + (hv[1], lv[1]) = ((c1 >> 64) as u64, c1 as u64); + (hv[2], lv[2]) = ((c2 >> 64) as u64, c2 as u64); + (hv[3], lv[3]) = ((c3 >> 64) as u64, c3 as u64); + let h = _mm256_loadu_si256(hv.as_mut_ptr().cast::<__m256i>()); + let l = _mm256_loadu_si256(lv.as_mut_ptr().cast::<__m256i>()); + (h, l) +} + +// madd0 hi = a*b + c (discards lo bits) +#[inline] +unsafe fn madd0(a: &__m256i, b: &__m256i, c: &__m256i) -> __m256i { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (hi, lo) = mul64(a, b); + let (_, cr) = add64(&lo, c, &zeros); + let (hi, _) = add64(&hi, &zeros, &cr); + hi +} + +// madd1 hi, lo = a * b + c +#[inline] +unsafe fn madd1(a: &__m256i, b: &__m256i, c: &__m256i) -> (__m256i, __m256i) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (hi, lo) = mul64(a, b); + let (lo, cr) = add64(&lo, c, &zeros); + let (hi, _) = add64(&hi, &zeros, &cr); + (hi, lo) +} + +// madd2 hi, lo = a * b + c + d +#[inline] +unsafe fn madd2(a: &__m256i, b: &__m256i, c: &__m256i, d: &__m256i) -> (__m256i, __m256i) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (hi, lo) = mul64(a, b); + let (c, cr) = add64(c, d, &zeros); + let (hi, _) = add64(&hi, &zeros, &cr); + let (lo, cr) = add64(&lo, &c, &zeros); + let (hi, _) = add64(&hi, &zeros, &cr); + (hi, lo) +} + +#[inline] +unsafe fn madd3( + a: &__m256i, + b: &__m256i, + c: &__m256i, + d: &__m256i, + e: &__m256i, +) -> (__m256i, __m256i) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (hi, lo) = mul64(a, b); + let (c, cr) = add64(c, d, &zeros); + let (hi, _) = add64(&hi, &zeros, &cr); + let (lo, cr) = add64(&lo, &c, &zeros); + let (hi, _) = add64(&hi, e, &cr); + (hi, lo) +} + +#[inline] +pub unsafe fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { + let mut av: [u64; 4] = [0; 4]; + let mut bv: [u64; 4] = [0; 4]; + _mm256_storeu_si256(av.as_mut_ptr().cast::<__m256i>(), a); + 
_mm256_storeu_si256(bv.as_mut_ptr().cast::<__m256i>(), b); + /* + asm!( + "mov rax, [rdi]", + "mov rdx, [rsi]", + "mul rdx", + "mov [rdi], rax", + "mov rax, [rdi+8]", + "mov rdx, [rsi+8]", + "mul rdx", + "mov [rdi+8], rax", + "mov rax, [rdi+16]", + "mov rdx, [rsi+16]", + "mul rdx", + "mov [rdi+16], rax", + "mov rax, [rdi+24]", + "mov rdx, [rsi+24]", + "mul rdx", + "mov [rdi+24], rax", + in("rdi") &av, + in("rsi") &bv, + ); + */ + for i in 0..4 { + av[i] = ((av[i] as u128) * (bv[i] as u128)) as u64; + } + _mm256_loadu_si256(av.as_ptr().cast::<__m256i>()) +} + +#[inline] +unsafe fn _mul_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { + let mut z: [__m256i; 4] = [_mm256_set_epi64x(0, 0, 0, 0); 4]; + let mut t: [__m256i; 4] = [_mm256_set_epi64x(0, 0, 0, 0); 4]; + let mut c: [__m256i; 3] = [_mm256_set_epi64x(0, 0, 0, 0); 3]; + + let ct0 = _mm256_set_epi64x( + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + ); + let ct1 = _mm256_set_epi64x( + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + ); + let ct2 = _mm256_set_epi64x( + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + ); + let ct3 = _mm256_set_epi64x( + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + ); + let ct4 = _mm256_set_epi64x( + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + ); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + + // round 0 + let mut v = x[0]; + (c[1], c[0]) = mul64(&v, &y[0]); + let m = _mm256_mullo_epi64(c[0], ct4); + c[2] = madd0(&m, &ct0, &c[0]); + (c[1], c[0]) = madd1(&v, &y[1], &c[1]); + (c[2], t[0]) = madd2(&m, &ct1, &c[2], &c[0]); + (c[1], c[0]) = madd1(&v, &y[2], &c[1]); + (c[2], t[1]) = madd2(&m, &ct2, &c[2], &c[0]); + (c[1], c[0]) = 
madd1(&v, &y[3], &c[1]); + (t[3], t[2]) = madd3(&m, &ct3, &c[0], &c[2], &c[1]); + + // round 1 + v = x[1]; + (c[1], c[0]) = madd1(&v, &y[0], &t[0]); + let m = _mm256_mullo_epi64(c[0], ct4); + c[2] = madd0(&m, &ct0, &c[0]); + (c[1], c[0]) = madd2(&v, &y[1], &c[1], &t[1]); + (c[2], t[0]) = madd2(&m, &ct1, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[2], &c[1], &t[2]); + (c[2], t[1]) = madd2(&m, &ct2, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[3], &c[1], &t[3]); + (t[3], t[2]) = madd3(&m, &ct3, &c[0], &c[2], &c[1]); + + // round 2 + v = x[2]; + (c[1], c[0]) = madd1(&v, &y[0], &t[0]); + let m = _mm256_mullo_epi64(c[0], ct4); + c[2] = madd0(&m, &ct0, &c[0]); + (c[1], c[0]) = madd2(&v, &y[1], &c[1], &t[1]); + (c[2], t[0]) = madd2(&m, &ct1, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[2], &c[1], &t[2]); + (c[2], t[1]) = madd2(&m, &ct2, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[3], &c[1], &t[3]); + (t[3], t[2]) = madd3(&m, &ct3, &c[0], &c[2], &c[1]); + + // round 3 + v = x[3]; + (c[1], c[0]) = madd1(&v, &y[0], &t[0]); + let m = _mm256_mullo_epi64(c[0], ct4); + c[2] = madd0(&m, &ct0, &c[0]); + (c[1], c[0]) = madd2(&v, &y[1], &c[1], &t[1]); + (c[2], z[0]) = madd2(&m, &ct1, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[2], &c[1], &t[2]); + (c[2], z[1]) = madd2(&m, &ct2, &c[2], &c[0]); + (c[1], c[0]) = madd2(&v, &y[3], &c[1], &t[3]); + (z[3], z[2]) = madd3(&m, &ct3, &c[0], &c[2], &c[1]); + + // if z > q --> z -= q + let cmp0 = _mm256_cmpgt_epi64(ct0, z[0]); + let cmp1 = _mm256_cmpeq_epi64(ct1, z[1]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct1, z[1]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct2, z[2]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct2, z[2]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct3, z[3]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct3, z[3]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let 
st0 = _mm256_andnot_si256(cmp0, ct0); + let st1 = _mm256_andnot_si256(cmp0, ct1); + let st2 = _mm256_andnot_si256(cmp0, ct2); + let st3 = _mm256_andnot_si256(cmp0, ct3); + let mut b; + (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[1], b) = sub64(&z[1], &st1, &b); + (z[2], b) = sub64(&z[2], &st2, &b); + (z[3], _) = sub64(&z[3], &st3, &b); + + z +} + +#[inline] +fn exp5state(state: &mut [__m256i; 8]) { + let s: [__m256i; 4] = [state[0], state[1], state[2], state[3]]; + unsafe { + let s2 = _mul_generic(s, s); + let s4 = _mul_generic(s2, s2); + let s5 = _mul_generic(s, s4); + state[0] = s5[0]; + state[1] = s5[1]; + state[2] = s5[2]; + state[3] = s5[3]; + } + let s: [__m256i; 4] = [state[4], state[5], state[6], state[7]]; + unsafe { + let s2 = _mul_generic(s, s); + let s4 = _mul_generic(s2, s2); + let s5 = _mul_generic(s, s4); + state[4] = s5[0]; + state[5] = s5[1]; + state[6] = s5[2]; + state[7] = s5[3]; + } +} + +#[inline] +unsafe fn _add_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { + let mut z: [__m256i; 4] = [_mm256_set_epi64x(0, 0, 0, 0); 4]; + let mut cr = _mm256_set_epi64x(0, 0, 0, 0); + + // TODO - delete + /* + let mut v: [u64; 4] = [0; 4]; + for i in 0..4 { + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), x[i]); + println!("x{:?}: {:?}", i, v); + } + for i in 0..4 { + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), y[i]); + println!("y{:?}: {:?}", i, v); + } + */ + + (z[0], cr) = add64(&x[0], &y[0], &cr); + (z[1], cr) = add64(&x[1], &y[1], &cr); + (z[2], cr) = add64(&x[2], &y[2], &cr); + (z[3], _) = add64(&x[3], &y[3], &cr); + + // if z > q --> z -= q + let ct0 = _mm256_set_epi64x( + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + ); + let ct1 = _mm256_set_epi64x( + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + ); + let ct2 = _mm256_set_epi64x( + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 
13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + ); + let ct3 = _mm256_set_epi64x( + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + ); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + + // if z > q --> z -= q + let cmp0 = _mm256_cmpgt_epi64(ct0, z[0]); + let cmp1 = _mm256_cmpeq_epi64(ct1, z[1]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct1, z[1]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct2, z[2]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct2, z[2]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct3, z[3]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct3, z[3]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let st0 = _mm256_andnot_si256(cmp0, ct0); + let st1 = _mm256_andnot_si256(cmp0, ct1); + let st2 = _mm256_andnot_si256(cmp0, ct2); + let st3 = _mm256_andnot_si256(cmp0, ct3); + let mut b; + (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[1], b) = sub64(&z[1], &st1, &b); + (z[2], b) = sub64(&z[2], &st2, &b); + (z[3], _) = sub64(&z[3], &st3, &b); + + // TODO - delete + /* + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[2]); + println!("z2: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), st2); + println!("ct: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); + println!("bi: {:?}", v); + + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[2]); + println!("z2: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); + println!("bo: {:?}", v); + + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[3]); + println!("z3: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), st3); + println!("ct: {:?}", v); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); + println!("bi: {:?}", v); + + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[3]); + 
println!("z3: {:?}", v); + // for i in 0..4 { + // _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[i]); + // println!("z{:?}: {:?}", i, v); + //} + */ + + z +} + +#[inline] +unsafe fn to_mont(a: [__m256i; 4]) -> [__m256i; 4] { + let r_square_0 = _mm256_set_epi64x( + 1997599621687373223u64 as i64, + 1997599621687373223u64 as i64, + 1997599621687373223u64 as i64, + 1997599621687373223u64 as i64, + ); + let r_square_1 = _mm256_set_epi64x( + 6052339484930628067u64 as i64, + 6052339484930628067u64 as i64, + 6052339484930628067u64 as i64, + 6052339484930628067u64 as i64, + ); + let r_square_2 = _mm256_set_epi64x( + 10108755138030829701u64 as i64, + 10108755138030829701u64 as i64, + 10108755138030829701u64 as i64, + 10108755138030829701u64 as i64, + ); + let r_square_3 = _mm256_set_epi64x( + 150537098327114917u64 as i64, + 150537098327114917u64 as i64, + 150537098327114917u64 as i64, + 150537098327114917u64 as i64, + ); + let r: [__m256i; 4] = [r_square_0, r_square_1, r_square_2, r_square_3]; + _mul_generic(a, r) +} + +#[inline] +unsafe fn from_mont(a: [__m256i; 4]) -> [__m256i; 4] { + let ct0 = _mm256_set_epi64x( + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + 4891460686036598785i64, + ); + let ct1 = _mm256_set_epi64x( + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + ); + let ct2 = _mm256_set_epi64x( + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + ); + let ct3 = _mm256_set_epi64x( + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + 3486998266802970665i64, + ); + let ct4 = _mm256_set_epi64x( + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + 14042775128853446655u64 as i64, + ); + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + + let mut z: [__m256i; 4] = a; + + // m = z[0]n'[0] mod W + let m = 
_mm256_mullo_epi64(z[0], ct4); + let mut c = madd0(&m, &ct0, &z[0]); + (c, z[0]) = madd2(&m, &ct1, &z[1], &c); + (c, z[1]) = madd2(&m, &ct2, &z[2], &c); + (c, z[2]) = madd2(&m, &ct3, &z[3], &c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = _mm256_mullo_epi64(z[0], ct4); + let mut c = madd0(&m, &ct0, &z[0]); + (c, z[0]) = madd2(&m, &ct1, &z[1], &c); + (c, z[1]) = madd2(&m, &ct2, &z[2], &c); + (c, z[2]) = madd2(&m, &ct3, &z[3], &c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = _mm256_mullo_epi64(z[0], ct4); + let mut c = madd0(&m, &ct0, &z[0]); + (c, z[0]) = madd2(&m, &ct1, &z[1], &c); + (c, z[1]) = madd2(&m, &ct2, &z[2], &c); + (c, z[2]) = madd2(&m, &ct3, &z[3], &c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = _mm256_mullo_epi64(z[0], ct4); + let mut c = madd0(&m, &ct0, &z[0]); + (c, z[0]) = madd2(&m, &ct1, &z[1], &c); + (c, z[1]) = madd2(&m, &ct2, &z[2], &c); + (c, z[2]) = madd2(&m, &ct3, &z[3], &c); + z[3] = c; + + // if z > q --> z -= q + let cmp0 = _mm256_cmpgt_epi64(ct0, z[0]); + let cmp1 = _mm256_cmpeq_epi64(ct1, z[1]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct1, z[1]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct2, z[2]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct2, z[2]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpeq_epi64(ct3, z[3]); + let cmp0 = _mm256_and_si256(cmp0, cmp1); + let cmp1 = _mm256_cmpgt_epi64(ct3, z[3]); + let cmp0 = _mm256_or_si256(cmp0, cmp1); + let st0 = _mm256_andnot_si256(cmp0, ct0); + let st1 = _mm256_andnot_si256(cmp0, ct1); + let st2 = _mm256_andnot_si256(cmp0, ct2); + let st3 = _mm256_andnot_si256(cmp0, ct3); + let mut b; + (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[1], b) = sub64(&z[1], &st1, &b); + (z[2], b) = sub64(&z[2], &st2, &b); + (z[3], _) = sub64(&z[3], &st3, &b); + + z +} + +#[inline] +unsafe fn ark(state: &mut [__m256i; 8], c: [[u64; 4]; 100], it: usize) { + // 1st element + let cc: 
[__m256i; 4] = [ + _mm256_set_epi64x(0, 0, 0, c[it][0] as i64), + _mm256_set_epi64x(0, 0, 0, c[it][1] as i64), + _mm256_set_epi64x(0, 0, 0, c[it][2] as i64), + _mm256_set_epi64x(0, 0, 0, c[it][3] as i64), + ]; + let mut ss: [__m256i; 4] = [state[0], state[1], state[2], state[3]]; + ss = _add_generic(ss, cc); + state[0] = ss[0]; + state[1] = ss[1]; + state[2] = ss[2]; + state[3] = ss[3]; + + // next 4 elements + let cc: [__m256i; 4] = [ + _mm256_set_epi64x( + c[it + 4][0] as i64, + c[it + 3][0] as i64, + c[it + 2][0] as i64, + c[it + 1][0] as i64, + ), + _mm256_set_epi64x( + c[it + 4][1] as i64, + c[it + 3][1] as i64, + c[it + 2][1] as i64, + c[it + 1][1] as i64, + ), + _mm256_set_epi64x( + c[it + 4][2] as i64, + c[it + 3][2] as i64, + c[it + 2][2] as i64, + c[it + 1][2] as i64, + ), + _mm256_set_epi64x( + c[it + 4][3] as i64, + c[it + 3][3] as i64, + c[it + 2][3] as i64, + c[it + 1][3] as i64, + ), + ]; + let mut ss: [__m256i; 4] = [state[4], state[5], state[6], state[7]]; + ss = _add_generic(ss, cc); + state[4] = ss[0]; + state[5] = ss[1]; + state[6] = ss[2]; + state[7] = ss[3]; +} + +#[inline] +unsafe fn mix(state: &mut [__m256i; 8], m: [[[u64; 4]; 5]; 5]) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let mut new_state: [__m256i; 8] = [zeros; 8]; + + // s[0] -> new_state[0] + let ss: [__m256i; 4] = [state[0], state[1], state[2], state[3]]; + let mm: [__m256i; 4] = [ + _mm256_set_epi64x(0, 0, 0, m[0][0][0] as i64), + _mm256_set_epi64x(0, 0, 0, m[0][0][1] as i64), + _mm256_set_epi64x(0, 0, 0, m[0][0][2] as i64), + _mm256_set_epi64x(0, 0, 0, m[0][0][3] as i64), + ]; + let mul = _mul_generic(mm, ss); + let rr = _add_generic(new_state[0..4].try_into().unwrap(), mul); + new_state[0] = rr[0]; + new_state[1] = rr[1]; + new_state[2] = rr[2]; + new_state[3] = rr[3]; + + // s[1..4] -> new_state[0] + let mut ss = [state[4], state[5], state[6], state[7]]; + for j in 1..5 { + if j > 1 { + ss[0] = _mm256_permute4x64_epi64(ss[0], 0x39); + ss[1] = 
_mm256_permute4x64_epi64(ss[1], 0x39); + ss[2] = _mm256_permute4x64_epi64(ss[2], 0x39); + ss[3] = _mm256_permute4x64_epi64(ss[3], 0x39); + } + let mm: [__m256i; 4] = [ + _mm256_set_epi64x(0, 0, 0, m[j][0][0] as i64), + _mm256_set_epi64x(0, 0, 0, m[j][0][1] as i64), + _mm256_set_epi64x(0, 0, 0, m[j][0][2] as i64), + _mm256_set_epi64x(0, 0, 0, m[j][0][3] as i64), + ]; + let mul = _mul_generic(mm, ss); + let rr = _add_generic(new_state[0..4].try_into().unwrap(), mul); + new_state[0] = rr[0]; + new_state[1] = rr[1]; + new_state[2] = rr[2]; + new_state[3] = rr[3]; + } + + // s[0] -> new_state[1..4] + let mm: [__m256i; 4] = [ + _mm256_set_epi64x( + m[0][4][0] as i64, + m[0][3][0] as i64, + m[0][2][0] as i64, + m[0][1][0] as i64, + ), + _mm256_set_epi64x( + m[0][4][1] as i64, + m[0][3][1] as i64, + m[0][2][1] as i64, + m[0][1][1] as i64, + ), + _mm256_set_epi64x( + m[0][4][2] as i64, + m[0][3][2] as i64, + m[0][2][2] as i64, + m[0][1][2] as i64, + ), + _mm256_set_epi64x( + m[0][4][3] as i64, + m[0][3][3] as i64, + m[0][2][3] as i64, + m[0][1][3] as i64, + ), + ]; + let mut ss: [__m256i; 4] = [state[0], state[1], state[2], state[3]]; + for j in 1..5 { + if j > 1 { + ss[0] = _mm256_permute4x64_epi64(ss[0], 0x93); + ss[1] = _mm256_permute4x64_epi64(ss[1], 0x93); + ss[2] = _mm256_permute4x64_epi64(ss[2], 0x93); + ss[3] = _mm256_permute4x64_epi64(ss[3], 0x93); + } + let mul = _mul_generic(mm, ss); + let rr = _add_generic(new_state[4..8].try_into().unwrap(), mul); + new_state[4] = rr[0]; + new_state[5] = rr[1]; + new_state[6] = rr[2]; + new_state[7] = rr[3]; + } + + // s[1..4] -> new_state[1..4] + for j in 1..5 { + let mut sv4: [i64; 4] = [0; 4]; + let mut sv5: [i64; 4] = [0; 4]; + let mut sv6: [i64; 4] = [0; 4]; + let mut sv7: [i64; 4] = [0; 4]; + _mm256_storeu_si256(sv4.as_mut_ptr().cast::<__m256i>(), state[4]); + _mm256_storeu_si256(sv5.as_mut_ptr().cast::<__m256i>(), state[5]); + _mm256_storeu_si256(sv6.as_mut_ptr().cast::<__m256i>(), state[6]); + 
_mm256_storeu_si256(sv7.as_mut_ptr().cast::<__m256i>(), state[7]); + let k = j - 1; + let ss = [ + _mm256_set_epi64x(sv4[k], sv4[k], sv4[k], sv4[k]), + _mm256_set_epi64x(sv5[k], sv5[k], sv5[k], sv5[k]), + _mm256_set_epi64x(sv6[k], sv6[k], sv6[k], sv6[k]), + _mm256_set_epi64x(sv7[k], sv7[k], sv7[k], sv7[k]), + ]; + let mm: [__m256i; 4] = [ + _mm256_set_epi64x( + m[j][4][0] as i64, + m[j][3][0] as i64, + m[j][2][0] as i64, + m[j][1][0] as i64, + ), + _mm256_set_epi64x( + m[j][4][1] as i64, + m[j][3][1] as i64, + m[j][2][1] as i64, + m[j][1][1] as i64, + ), + _mm256_set_epi64x( + m[j][4][2] as i64, + m[j][3][2] as i64, + m[j][2][2] as i64, + m[j][1][2] as i64, + ), + _mm256_set_epi64x( + m[j][4][3] as i64, + m[j][3][3] as i64, + m[j][2][3] as i64, + m[j][1][3] as i64, + ), + ]; + let mul = _mul_generic(mm, ss); + let rr = _add_generic(new_state[4..8].try_into().unwrap(), mul); + new_state[4] = rr[0]; + new_state[5] = rr[1]; + new_state[6] = rr[2]; + new_state[7] = rr[3]; + } + + for i in 0..8 { + state[i] = new_state[i]; + } +} + +#[allow(dead_code)] +fn print_state3(state: &[__m256i; 3]) { + let mut a: [u64; 4] = [0; 4]; + println!("State3:"); + unsafe { + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[0]); + println!("{:?}", a); + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[1]); + println!("{:?}", a); + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[2]); + println!("{:?}", a); + } +} + +#[allow(dead_code)] +fn print_state4(state: &[__m256i; 4]) { + let mut a: [u64; 4] = [0; 4]; + println!("State4:"); + unsafe { + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[0]); + println!("{:?}", a); + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[1]); + println!("{:?}", a); + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[2]); + println!("{:?}", a); + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[3]); + println!("{:?}", a); + } +} + +#[allow(dead_code)] +fn print_state8(state: 
&[__m256i; 8]) { + let mut a: [u64; 4] = [0; 4]; + println!("State8:"); + unsafe { + for i in 0..8 { + _mm256_storeu_si256(a.as_mut_ptr().cast::<__m256i>(), state[i]); + println!("{:?}", a); + } + } +} + +#[allow(dead_code)] +fn print_state(state: &[ElementBN128; 5]) { + println!("{:?}", state[0]); + println!("{:?}", state[1]); + println!("{:?}", state[2]); + println!("{:?}", state[3]); + println!("{:?}", state[4]); + println!(); +} + +pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { + let st64: Vec = input.into_iter().map(|x| x as i64).collect(); + + const CT: usize = 5; + const N_ROUNDS_F: usize = 8; + const N_ROUNDS_P: usize = 60; + + unsafe { + // load states + let mut inp: [__m256i; 4] = [ + _mm256_set_epi64x(st64[11], st64[8], st64[5], st64[2]), + _mm256_set_epi64x(st64[10], st64[7], st64[4], st64[1]), + _mm256_set_epi64x(st64[9], st64[6], st64[3], st64[0]), + _mm256_set_epi64x(0i64, 0i64, 0i64, 0i64), + ]; + + // to mont + inp = to_mont(inp); + + // start rounds + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let mut state: [__m256i; 8] = [zeros, zeros, zeros, zeros, inp[0], inp[1], inp[2], inp[3]]; + + ark(&mut state, C, 0); + + /* + let mut z = [0u64; 4]; + let z1 = [3650884469251175381u64, 0, 0, 0]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[0]); + assert_eq!(z1, z); + let z2 = [4312995929451917048u64, 0, 0, 0]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[1]); + assert_eq!(z2, z); + let z3 = [14528712943685515188u64, 0, 0, 0]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[2]); + assert_eq!(z3, z); + let z4 = [804645480652767018u64, 0, 0, 0]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[3]); + assert_eq!(z4, z); + let z5 = [14462745598712311877u64, 9965481966597437291u64, 5916123222076323011u64, 14423924459958803780u64]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[4]); + assert_eq!(z5, z); + let z6 = [7570161332469838584u64, 18440159137561926521u64, 
7248986691198917743u64, 16755156072218033775u64]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[5]); + assert_eq!(z6, z); + let z7 = [12421518753342417017u64, 966430971004801851u64, 13841309536587625009u64, 14460935863064733763u64]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[6]); + assert_eq!(z7, z); + let z8 = [1527580982755995308u64, 1452659775731263630u64, 3308699589782081186u64, 2575827241589250587u64]; + _mm256_storeu_si256(z.as_mut_ptr().cast::<__m256i>(), state[7]); + assert_eq!(z8, z); + */ + + for i in 0..(N_ROUNDS_F / 2 - 1) { + exp5state(&mut state); + ark(&mut state, C, (i + 1) * CT); + mix(&mut state, M); + } + + exp5state(&mut state); + ark(&mut state, C, (N_ROUNDS_F / 2) * CT); + mix(&mut state, P); + + // println!("After 1st rounds:"); + // print_state8(&state); + + // switch to classic representation + let mut cstate = [ElementBN128::zero(); 5]; + let mut tmps = [[0u64; 4]; 4]; + let mut tmpv = [0u64; 4]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[0]); + tmps[0][0] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[1]); + tmps[0][1] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[2]); + tmps[0][2] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[3]); + tmps[0][3] = tmpv[0]; + cstate[0] = ElementBN128::new(tmps[0]); + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[4]); + tmps[0][0] = tmpv[0]; + tmps[1][0] = tmpv[1]; + tmps[2][0] = tmpv[2]; + tmps[3][0] = tmpv[3]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[5]); + tmps[0][1] = tmpv[0]; + tmps[1][1] = tmpv[1]; + tmps[2][1] = tmpv[2]; + tmps[3][1] = tmpv[3]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[6]); + tmps[0][2] = tmpv[0]; + tmps[1][2] = tmpv[1]; + tmps[2][2] = tmpv[2]; + tmps[3][2] = tmpv[3]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), state[7]); + tmps[0][3] = tmpv[0]; + tmps[1][3] = 
tmpv[1]; + tmps[2][3] = tmpv[2]; + tmps[3][3] = tmpv[3]; + cstate[1] = ElementBN128::new(tmps[0]); + cstate[2] = ElementBN128::new(tmps[1]); + cstate[3] = ElementBN128::new(tmps[2]); + cstate[4] = ElementBN128::new(tmps[3]); + + // println!("After 1st rounds:"); + // print_state(&cstate); + + for i in 0..N_ROUNDS_P { + cstate[0].exp5(); + let cc = ElementBN128::new(C[(N_ROUNDS_F / 2 + 1) * CT + i]); + cstate[0].add(cstate[0], cc); + + let mut mul = ElementBN128::zero(); + let mut new_state0 = ElementBN128::zero(); + for j in 0..CT { + let ss = ElementBN128::new(S[(CT * 2 - 1) * i + j]); + mul.mul(ss, cstate[j]); + new_state0.add(new_state0, mul); + } + + for k in 1..CT { + let ss = ElementBN128::new(S[(CT * 2 - 1) * i + CT + k - 1]); + mul.set_zero(); + mul.mul(cstate[0], ss); + cstate[k].add(cstate[k], mul); + } + cstate[0] = new_state0; + } + + // println!("After middle rounds:"); + // print_state(&cstate); + + // switch to AVX + state = [ + _mm256_set_epi64x(0i64, 0i64, 0i64, cstate[0].z[0] as i64), + _mm256_set_epi64x(0i64, 0i64, 0i64, cstate[0].z[1] as i64), + _mm256_set_epi64x(0i64, 0i64, 0i64, cstate[0].z[2] as i64), + _mm256_set_epi64x(0i64, 0i64, 0i64, cstate[0].z[3] as i64), + _mm256_set_epi64x( + cstate[4].z[0] as i64, + cstate[3].z[0] as i64, + cstate[2].z[0] as i64, + cstate[1].z[0] as i64, + ), + _mm256_set_epi64x( + cstate[4].z[1] as i64, + cstate[3].z[1] as i64, + cstate[2].z[1] as i64, + cstate[1].z[1] as i64, + ), + _mm256_set_epi64x( + cstate[4].z[2] as i64, + cstate[3].z[2] as i64, + cstate[2].z[2] as i64, + cstate[1].z[2] as i64, + ), + _mm256_set_epi64x( + cstate[4].z[3] as i64, + cstate[3].z[3] as i64, + cstate[2].z[3] as i64, + cstate[1].z[3] as i64, + ), + ]; + + // println!("After middle rounds:"); + // print_state8(&state); + + for i in 0..(N_ROUNDS_F / 2 - 1) { + exp5state(&mut state); + ark( + &mut state, + C, + (N_ROUNDS_F / 2 + 1) * CT + N_ROUNDS_P + i * CT, + ); + mix(&mut state, M); + } + exp5state(&mut state); + mix(&mut state, M); 
+ + // println!("After all rounds:"); + // print_state8(&state); + + let ss0 = from_mont(state[0..4].try_into().unwrap()); + let ss1 = from_mont(state[4..8].try_into().unwrap()); + + // println!("After from_mont rounds:"); + // print_state4(&ss0); + // print_state4(&ss1); + + let mut out: [u64; 12] = [0; 12]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss0[0]); + out[2] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss0[1]); + out[1] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss0[2]); + out[0] = tmpv[0]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss1[0]); + out[5] = tmpv[0]; + out[8] = tmpv[1]; + out[11] = tmpv[2]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss1[1]); + out[4] = tmpv[0]; + out[7] = tmpv[1]; + out[10] = tmpv[2]; + _mm256_storeu_si256(tmpv.as_mut_ptr().cast::<__m256i>(), ss1[2]); + out[3] = tmpv[0]; + out[6] = tmpv[1]; + out[9] = tmpv[2]; + for i in 0..12 { + if out[i] >= 0xFFFFFFFF00000001u64 { + out[i] = out[i] - 0xFFFFFFFF00000001u64; + } + } + + out + } +} + +#[cfg(test)] +mod tests { + use core::arch::x86_64::*; + + use anyhow::Result; + + use super::{add64, sub64}; + + #[test] + fn test_bn128_avx() -> Result<()> { + unsafe { + let ct1 = _mm256_set_epi64x( + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + 2896914383306846353i64, + ); + let ct2 = _mm256_set_epi64x( + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + 13281191951274694749u64 as i64, + ); + + let r = _mm256_add_epi64(ct1, ct2); + let mut a: [u64; 4] = [0; 4]; + _mm256_store_si256(a.as_mut_ptr().cast::<__m256i>(), r); + println!("{:?}", a); + let x = 2896914383306846353u64 + 13281191951274694749u64; + println!("{:?}", x); + } + Ok(()) + } + + #[test] + fn test_bn128_add64() -> Result<()> { + unsafe { + let a = _mm256_set_epi64x( + 0xFFFFFFFFFFFFFFFFu64 as i64, + 0x0FFFFFFFFFFFFFFF as i64, + 
0xFFFFFFFFFFFFFFFFu64 as i64, + 0xFFFFFFFFFFFFFFFFu64 as i64, + ); + let b = _mm256_set_epi64x( + 0xFFFFFFFFFFFFFFFFu64 as i64, + 0x0FFFFFFFFFFFFFFF as i64, + 0x0i64, + 0x1i64, + ); + let cin = _mm256_set_epi64x(0, 0, 0, 0); + let res = [ + 0u64, + 0xFFFFFFFFFFFFFFFFu64, + 0x1FFFFFFFFFFFFFFEu64, + 0xFFFFFFFFFFFFFFFEu64, + ]; + + let cout = [1u64, 0u64, 0u64, 1u64]; + + let mut v: [u64; 4] = [0; 4]; + let (r, c) = add64(&a, &b, &cin); + + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), r); + println!(" Res: {:X?}", v); + assert_eq!(v, res); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), c); + println!("Cout: {:X?}", v); + assert_eq!(v, cout); + } + Ok(()) + } + + #[test] + fn test_bn128_sub64() -> Result<()> { + unsafe { + let a = _mm256_set_epi64x( + 4i64, + 7i64, + 0xFFFFFFFFFFFFFFFFu64 as i64, + 4291643747455737684u64 as i64, + ); + let b = _mm256_set_epi64x(7i64, 4i64, 0x0i64, 3486998266802970665u64 as i64); + let bin = _mm256_set_epi64x(0, 0, 0, 0); + + let res = [ + 0xFFFFFFFFFFFFFFFFu64, + 0xFFFFFFFFFFFFFFFFu64, + 3u64, + 0xFFFFFFFFFFFFFFFDu64, + ]; + + let bout = [1u64, 0u64, 0u64, 1u64]; + + let mut v: [u64; 4] = [0; 4]; + let (c1, c2) = sub64(&a, &b, &bin); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), c1); + println!(" Res: {:?}", v); + assert_eq!(v, res); + _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), c2); + println!("Cout: {:X?}", v); + assert_eq!(v, bout); + } + Ok(()) + } +} diff --git a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2.rs b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2.rs index 485915eaaf..0564a73de8 100644 --- a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2.rs +++ b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx2.rs @@ -2,13 +2,17 @@ use core::arch::x86_64::*; use unroll::unroll_for_loops; +use super::goldilocks_avx2::{mul64_no_overflow, mult_avx_128, reduce_avx_96_64}; +use super::poseidon_bn128_avx2::add64_no_carry; use crate::field::types::PrimeField64; use 
crate::hash::arch::x86_64::goldilocks_avx2::{ add_avx, mult_avx, reduce_avx_128_64, sbox_avx_m256i, }; use crate::hash::poseidon::{ - Poseidon, ALL_ROUND_CONSTANTS, HALF_N_FULL_ROUNDS, N_PARTIAL_ROUNDS, N_ROUNDS, SPONGE_WIDTH, + add_u160_u128, reduce_u160, Poseidon, ALL_ROUND_CONSTANTS, HALF_N_FULL_ROUNDS, + N_PARTIAL_ROUNDS, N_ROUNDS, SPONGE_WIDTH, }; +use crate::hash::poseidon_goldilocks::poseidon12_mds::block2; #[allow(dead_code)] const MDS_MATRIX_CIRC: [u64; 12] = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20]; @@ -214,7 +218,613 @@ const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 12]; 12] = [ ], ]; +const FAST_PARTIAL_ROUND_W_HATS: [[u64; 12 - 1]; N_PARTIAL_ROUNDS] = [ + [ + 0x3d999c961b7c63b0, + 0x814e82efcd172529, + 0x2421e5d236704588, + 0x887af7d4dd482328, + 0xa5e9c291f6119b27, + 0xbdc52b2676a4b4aa, + 0x64832009d29bcf57, + 0x09c4155174a552cc, + 0x463f9ee03d290810, + 0xc810936e64982542, + 0x043b1c289f7bc3ac, + ], + [ + 0x673655aae8be5a8b, + 0xd510fe714f39fa10, + 0x2c68a099b51c9e73, + 0xa667bfa9aa96999d, + 0x4d67e72f063e2108, + 0xf84dde3e6acda179, + 0x40f9cc8c08f80981, + 0x5ead032050097142, + 0x6591b02092d671bb, + 0x00e18c71963dd1b7, + 0x8a21bcd24a14218a, + ], + [ + 0x202800f4addbdc87, + 0xe4b5bdb1cc3504ff, + 0xbe32b32a825596e7, + 0x8e0f68c5dc223b9a, + 0x58022d9e1c256ce3, + 0x584d29227aa073ac, + 0x8b9352ad04bef9e7, + 0xaead42a3f445ecbf, + 0x3c667a1d833a3cca, + 0xda6f61838efa1ffe, + 0xe8f749470bd7c446, + ], + [ + 0xc5b85bab9e5b3869, + 0x45245258aec51cf7, + 0x16e6b8e68b931830, + 0xe2ae0f051418112c, + 0x0470e26a0093a65b, + 0x6bef71973a8146ed, + 0x119265be51812daf, + 0xb0be7356254bea2e, + 0x8584defff7589bd7, + 0x3c5fe4aeb1fb52ba, + 0x9e7cd88acf543a5e, + ], + [ + 0x179be4bba87f0a8c, + 0xacf63d95d8887355, + 0x6696670196b0074f, + 0xd99ddf1fe75085f9, + 0xc2597881fef0283b, + 0xcf48395ee6c54f14, + 0x15226a8e4cd8d3b6, + 0xc053297389af5d3b, + 0x2c08893f0d1580e2, + 0x0ed3cbcff6fcc5ba, + 0xc82f510ecf81f6d0, + ], + [ + 0x94b06183acb715cc, + 0x500392ed0d431137, + 
0x861cc95ad5c86323, + 0x05830a443f86c4ac, + 0x3b68225874a20a7c, + 0x10b3309838e236fb, + 0x9b77fc8bcd559e2c, + 0xbdecf5e0cb9cb213, + 0x30276f1221ace5fa, + 0x7935dd342764a144, + 0xeac6db520bb03708, + ], + [ + 0x7186a80551025f8f, + 0x622247557e9b5371, + 0xc4cbe326d1ad9742, + 0x55f1523ac6a23ea2, + 0xa13dfe77a3d52f53, + 0xe30750b6301c0452, + 0x08bd488070a3a32b, + 0xcd800caef5b72ae3, + 0x83329c90f04233ce, + 0xb5b99e6664a0a3ee, + 0x6b0731849e200a7f, + ], + [ + 0xec3fabc192b01799, + 0x382b38cee8ee5375, + 0x3bfb6c3f0e616572, + 0x514abd0cf6c7bc86, + 0x47521b1361dcc546, + 0x178093843f863d14, + 0xad1003c5d28918e7, + 0x738450e42495bc81, + 0xaf947c59af5e4047, + 0x4653fb0685084ef2, + 0x057fde2062ae35bf, + ], + [ + 0xe376678d843ce55e, + 0x66f3860d7514e7fc, + 0x7817f3dfff8b4ffa, + 0x3929624a9def725b, + 0x0126ca37f215a80a, + 0xfce2f5d02762a303, + 0x1bc927375febbad7, + 0x85b481e5243f60bf, + 0x2d3c5f42a39c91a0, + 0x0811719919351ae8, + 0xf669de0add993131, + ], + [ + 0x7de38bae084da92d, + 0x5b848442237e8a9b, + 0xf6c705da84d57310, + 0x31e6a4bdb6a49017, + 0x889489706e5c5c0f, + 0x0e4a205459692a1b, + 0xbac3fa75ee26f299, + 0x5f5894f4057d755e, + 0xb0dc3ecd724bb076, + 0x5e34d8554a6452ba, + 0x04f78fd8c1fdcc5f, + ], + [ + 0x4dd19c38779512ea, + 0xdb79ba02704620e9, + 0x92a29a3675a5d2be, + 0xd5177029fe495166, + 0xd32b3298a13330c1, + 0x251c4a3eb2c5f8fd, + 0xe1c48b26e0d98825, + 0x3301d3362a4ffccb, + 0x09bb6c88de8cd178, + 0xdc05b676564f538a, + 0x60192d883e473fee, + ], + [ + 0x16b9774801ac44a0, + 0x3cb8411e786d3c8e, + 0xa86e9cf505072491, + 0x0178928152e109ae, + 0x5317b905a6e1ab7b, + 0xda20b3be7f53d59f, + 0xcb97dedecebee9ad, + 0x4bd545218c59f58d, + 0x77dc8d856c05a44a, + 0x87948589e4f243fd, + 0x7e5217af969952c2, + ], + [ + 0xbc58987d06a84e4d, + 0x0b5d420244c9cae3, + 0xa3c4711b938c02c0, + 0x3aace640a3e03990, + 0x865a0f3249aacd8a, + 0x8d00b2a7dbed06c7, + 0x6eacb905beb7e2f8, + 0x045322b216ec3ec7, + 0xeb9de00d594828e6, + 0x088c5f20df9e5c26, + 0xf555f4112b19781f, + ], + [ + 0xa8cedbff1813d3a7, + 
0x50dcaee0fd27d164, + 0xf1cb02417e23bd82, + 0xfaf322786e2abe8b, + 0x937a4315beb5d9b6, + 0x1b18992921a11d85, + 0x7d66c4368b3c497b, + 0x0e7946317a6b4e99, + 0xbe4430134182978b, + 0x3771e82493ab262d, + 0xa671690d8095ce82, + ], + [ + 0xb035585f6e929d9d, + 0xba1579c7e219b954, + 0xcb201cf846db4ba3, + 0x287bf9177372cf45, + 0xa350e4f61147d0a6, + 0xd5d0ecfb50bcff99, + 0x2e166aa6c776ed21, + 0xe1e66c991990e282, + 0x662b329b01e7bb38, + 0x8aa674b36144d9a9, + 0xcbabf78f97f95e65, + ], + [ + 0xeec24b15a06b53fe, + 0xc8a7aa07c5633533, + 0xefe9c6fa4311ad51, + 0xb9173f13977109a1, + 0x69ce43c9cc94aedc, + 0xecf623c9cd118815, + 0x28625def198c33c7, + 0xccfc5f7de5c3636a, + 0xf5e6c40f1621c299, + 0xcec0e58c34cb64b1, + 0xa868ea113387939f, + ], + [ + 0xd8dddbdc5ce4ef45, + 0xacfc51de8131458c, + 0x146bb3c0fe499ac0, + 0x9e65309f15943903, + 0x80d0ad980773aa70, + 0xf97817d4ddbf0607, + 0xe4626620a75ba276, + 0x0dfdc7fd6fc74f66, + 0xf464864ad6f2bb93, + 0x02d55e52a5d44414, + 0xdd8de62487c40925, + ], + [ + 0xc15acf44759545a3, + 0xcbfdcf39869719d4, + 0x33f62042e2f80225, + 0x2599c5ead81d8fa3, + 0x0b306cb6c1d7c8d0, + 0x658c80d3df3729b1, + 0xe8d1b2b21b41429c, + 0xa1b67f09d4b3ccb8, + 0x0e1adf8b84437180, + 0x0d593a5e584af47b, + 0xa023d94c56e151c7, + ], + [ + 0x49026cc3a4afc5a6, + 0xe06dff00ab25b91b, + 0x0ab38c561e8850ff, + 0x92c3c8275e105eeb, + 0xb65256e546889bd0, + 0x3c0468236ea142f6, + 0xee61766b889e18f2, + 0xa206f41b12c30415, + 0x02fe9d756c9f12d1, + 0xe9633210630cbf12, + 0x1ffea9fe85a0b0b1, + ], + [ + 0x81d1ae8cc50240f3, + 0xf4c77a079a4607d7, + 0xed446b2315e3efc1, + 0x0b0a6b70915178c3, + 0xb11ff3e089f15d9a, + 0x1d4dba0b7ae9cc18, + 0x65d74e2f43b48d05, + 0xa2df8c6b8ae0804a, + 0xa4e6f0a8c33348a6, + 0xc0a26efc7be5669b, + 0xa6b6582c547d0d60, + ], + [ + 0x84afc741f1c13213, + 0x2f8f43734fc906f3, + 0xde682d72da0a02d9, + 0x0bb005236adb9ef2, + 0x5bdf35c10a8b5624, + 0x0739a8a343950010, + 0x52f515f44785cfbc, + 0xcbaf4e5d82856c60, + 0xac9ea09074e3e150, + 0x8f0fa011a2035fb0, + 0x1a37905d8450904a, + ], + [ + 
0x3abeb80def61cc85, + 0x9d19c9dd4eac4133, + 0x075a652d9641a985, + 0x9daf69ae1b67e667, + 0x364f71da77920a18, + 0x50bd769f745c95b1, + 0xf223d1180dbbf3fc, + 0x2f885e584e04aa99, + 0xb69a0fa70aea684a, + 0x09584acaa6e062a0, + 0x0bc051640145b19b, + ], +]; + +const FAST_PARTIAL_ROUND_VS: [[u64; 12]; N_PARTIAL_ROUNDS] = [ + [ + 0x0, + 0x94877900674181c3, + 0xc6c67cc37a2a2bbd, + 0xd667c2055387940f, + 0x0ba63a63e94b5ff0, + 0x99460cc41b8f079f, + 0x7ff02375ed524bb3, + 0xea0870b47a8caf0e, + 0xabcad82633b7bc9d, + 0x3b8d135261052241, + 0xfb4515f5e5b0d539, + 0x3ee8011c2b37f77c, + ], + [ + 0x0, + 0x0adef3740e71c726, + 0xa37bf67c6f986559, + 0xc6b16f7ed4fa1b00, + 0x6a065da88d8bfc3c, + 0x4cabc0916844b46f, + 0x407faac0f02e78d1, + 0x07a786d9cf0852cf, + 0x42433fb6949a629a, + 0x891682a147ce43b0, + 0x26cfd58e7b003b55, + 0x2bbf0ed7b657acb3, + ], + [ + 0x0, + 0x481ac7746b159c67, + 0xe367de32f108e278, + 0x73f260087ad28bec, + 0x5cfc82216bc1bdca, + 0xcaccc870a2663a0e, + 0xdb69cd7b4298c45d, + 0x7bc9e0c57243e62d, + 0x3cc51c5d368693ae, + 0x366b4e8cc068895b, + 0x2bd18715cdabbca4, + 0xa752061c4f33b8cf, + ], + [ + 0x0, + 0xb22d2432b72d5098, + 0x9e18a487f44d2fe4, + 0x4b39e14ce22abd3c, + 0x9e77fde2eb315e0d, + 0xca5e0385fe67014d, + 0x0c2cb99bf1b6bddb, + 0x99ec1cd2a4460bfe, + 0x8577a815a2ff843f, + 0x7d80a6b4fd6518a5, + 0xeb6c67123eab62cb, + 0x8f7851650eca21a5, + ], + [ + 0x0, + 0x11ba9a1b81718c2a, + 0x9f7d798a3323410c, + 0xa821855c8c1cf5e5, + 0x535e8d6fac0031b2, + 0x404e7c751b634320, + 0xa729353f6e55d354, + 0x4db97d92e58bb831, + 0xb53926c27897bf7d, + 0x965040d52fe115c5, + 0x9565fa41ebd31fd7, + 0xaae4438c877ea8f4, + ], + [ + 0x0, + 0x37f4e36af6073c6e, + 0x4edc0918210800e9, + 0xc44998e99eae4188, + 0x9f4310d05d068338, + 0x9ec7fe4350680f29, + 0xc5b2c1fdc0b50874, + 0xa01920c5ef8b2ebe, + 0x59fa6f8bd91d58ba, + 0x8bfc9eb89b515a82, + 0xbe86a7a2555ae775, + 0xcbb8bbaa3810babf, + ], + [ + 0x0, + 0x577f9a9e7ee3f9c2, + 0x88c522b949ace7b1, + 0x82f07007c8b72106, + 0x8283d37c6675b50e, + 0x98b074d9bbac1123, + 
0x75c56fb7758317c1, + 0xfed24e206052bc72, + 0x26d7c3d1bc07dae5, + 0xf88c5e441e28dbb4, + 0x4fe27f9f96615270, + 0x514d4ba49c2b14fe, + ], + [ + 0x0, + 0xf02a3ac068ee110b, + 0x0a3630dafb8ae2d7, + 0xce0dc874eaf9b55c, + 0x9a95f6cff5b55c7e, + 0x626d76abfed00c7b, + 0xa0c1cf1251c204ad, + 0xdaebd3006321052c, + 0x3d4bd48b625a8065, + 0x7f1e584e071f6ed2, + 0x720574f0501caed3, + 0xe3260ba93d23540a, + ], + [ + 0x0, + 0xab1cbd41d8c1e335, + 0x9322ed4c0bc2df01, + 0x51c3c0983d4284e5, + 0x94178e291145c231, + 0xfd0f1a973d6b2085, + 0xd427ad96e2b39719, + 0x8a52437fecaac06b, + 0xdc20ee4b8c4c9a80, + 0xa2c98e9549da2100, + 0x1603fe12613db5b6, + 0x0e174929433c5505, + ], + [ + 0x0, + 0x3d4eab2b8ef5f796, + 0xcfff421583896e22, + 0x4143cb32d39ac3d9, + 0x22365051b78a5b65, + 0x6f7fd010d027c9b6, + 0xd9dd36fba77522ab, + 0xa44cf1cb33e37165, + 0x3fc83d3038c86417, + 0xc4588d418e88d270, + 0xce1320f10ab80fe2, + 0xdb5eadbbec18de5d, + ], + [ + 0x0, + 0x1183dfce7c454afd, + 0x21cea4aa3d3ed949, + 0x0fce6f70303f2304, + 0x19557d34b55551be, + 0x4c56f689afc5bbc9, + 0xa1e920844334f944, + 0xbad66d423d2ec861, + 0xf318c785dc9e0479, + 0x99e2032e765ddd81, + 0x400ccc9906d66f45, + 0xe1197454db2e0dd9, + ], + [ + 0x0, + 0x84d1ecc4d53d2ff1, + 0xd8af8b9ceb4e11b6, + 0x335856bb527b52f4, + 0xc756f17fb59be595, + 0xc0654e4ea5553a78, + 0x9e9a46b61f2ea942, + 0x14fc8b5b3b809127, + 0xd7009f0f103be413, + 0x3e0ee7b7a9fb4601, + 0xa74e888922085ed7, + 0xe80a7cde3d4ac526, + ], + [ + 0x0, + 0x238aa6daa612186d, + 0x9137a5c630bad4b4, + 0xc7db3817870c5eda, + 0x217e4f04e5718dc9, + 0xcae814e2817bd99d, + 0xe3292e7ab770a8ba, + 0x7bb36ef70b6b9482, + 0x3c7835fb85bca2d3, + 0xfe2cdf8ee3c25e86, + 0x61b3915ad7274b20, + 0xeab75ca7c918e4ef, + ], + [ + 0x0, + 0xd6e15ffc055e154e, + 0xec67881f381a32bf, + 0xfbb1196092bf409c, + 0xdc9d2e07830ba226, + 0x0698ef3245ff7988, + 0x194fae2974f8b576, + 0x7a5d9bea6ca4910e, + 0x7aebfea95ccdd1c9, + 0xf9bd38a67d5f0e86, + 0xfa65539de65492d8, + 0xf0dfcbe7653ff787, + ], + [ + 0x0, + 0x0bd87ad390420258, + 0x0ad8617bca9e33c8, + 
0x0c00ad377a1e2666, + 0x0ac6fc58b3f0518f, + 0x0c0cc8a892cc4173, + 0x0c210accb117bc21, + 0x0b73630dbb46ca18, + 0x0c8be4920cbd4a54, + 0x0bfe877a21be1690, + 0x0ae790559b0ded81, + 0x0bf50db2f8d6ce31, + ], + [ + 0x0, + 0x000cf29427ff7c58, + 0x000bd9b3cf49eec8, + 0x000d1dc8aa81fb26, + 0x000bc792d5c394ef, + 0x000d2ae0b2266453, + 0x000d413f12c496c1, + 0x000c84128cfed618, + 0x000db5ebd48fc0d4, + 0x000d1b77326dcb90, + 0x000beb0ccc145421, + 0x000d10e5b22b11d1, + ], + [ + 0x0, + 0x00000e24c99adad8, + 0x00000cf389ed4bc8, + 0x00000e580cbf6966, + 0x00000cde5fd7e04f, + 0x00000e63628041b3, + 0x00000e7e81a87361, + 0x00000dabe78f6d98, + 0x00000efb14cac554, + 0x00000e5574743b10, + 0x00000d05709f42c1, + 0x00000e4690c96af1, + ], + [ + 0x0, + 0x0000000f7157bc98, + 0x0000000e3006d948, + 0x0000000fa65811e6, + 0x0000000e0d127e2f, + 0x0000000fc18bfe53, + 0x0000000fd002d901, + 0x0000000eed6461d8, + 0x0000001068562754, + 0x0000000fa0236f50, + 0x0000000e3af13ee1, + 0x0000000fa460f6d1, + ], + [ + 0x0, + 0x0000000011131738, + 0x000000000f56d588, + 0x0000000011050f86, + 0x000000000f848f4f, + 0x00000000111527d3, + 0x00000000114369a1, + 0x00000000106f2f38, + 0x0000000011e2ca94, + 0x00000000110a29f0, + 0x000000000fa9f5c1, + 0x0000000010f625d1, + ], + [ + 0x0, + 0x000000000011f718, + 0x000000000010b6c8, + 0x0000000000134a96, + 0x000000000010cf7f, + 0x0000000000124d03, + 0x000000000013f8a1, + 0x0000000000117c58, + 0x0000000000132c94, + 0x0000000000134fc0, + 0x000000000010a091, + 0x0000000000128961, + ], + [ + 0x0, + 0x0000000000001300, + 0x0000000000001750, + 0x000000000000114e, + 0x000000000000131f, + 0x000000000000167b, + 0x0000000000001371, + 0x0000000000001230, + 0x000000000000182c, + 0x0000000000001368, + 0x0000000000000f31, + 0x00000000000015c9, + ], + [ + 0x0, + 0x0000000000000014, + 0x0000000000000022, + 0x0000000000000012, + 0x0000000000000027, + 0x000000000000000d, + 0x000000000000000d, + 0x000000000000001c, + 0x0000000000000002, + 0x0000000000000010, + 0x0000000000000029, + 
0x000000000000000f, + ], +]; + +const MDS_FREQ_BLOCK_ONE: [i64; 3] = [16, 32, 16]; +const MDS_FREQ_BLOCK_TWO: [(i64, i64); 3] = [(2, -1), (-4, 1), (16, 1)]; +const MDS_FREQ_BLOCK_THREE: [i64; 3] = [-1, -8, 2]; + #[allow(dead_code)] +#[inline(always)] +#[unroll_for_loops] fn mds_row_shf(r: usize, v: &[u64; SPONGE_WIDTH]) -> (u64, u64) { let mut res = 0u128; @@ -232,7 +842,11 @@ fn mds_row_shf(r: usize, v: &[u64; SPONGE_WIDTH]) -> (u64, u64) { #[allow(dead_code)] #[inline(always)] #[unroll_for_loops] -unsafe fn mds_layer_avx(s0: &__m256i, s1: &__m256i, s2: &__m256i) -> (__m256i, __m256i, __m256i) { +unsafe fn mds_layer_avx_v1( + s0: &__m256i, + s1: &__m256i, + s2: &__m256i, +) -> (__m256i, __m256i, __m256i) { let mut st64 = [0u64; SPONGE_WIDTH]; _mm256_storeu_si256((&mut st64[0..4]).as_mut_ptr().cast::<__m256i>(), *s0); @@ -293,6 +907,280 @@ where (r0, r1, r2) } +#[inline(always)] +unsafe fn block1_avx(x: &__m256i, y: [i64; 3]) -> __m256i { + let x0 = _mm256_permute4x64_epi64(*x, 0x0); + let x1 = _mm256_permute4x64_epi64(*x, 0x55); + let x2 = _mm256_permute4x64_epi64(*x, 0xAA); + + let f0 = _mm256_set_epi64x(0, y[2], y[1], y[0]); + let f1 = _mm256_set_epi64x(0, y[1], y[0], y[2]); + let f2 = _mm256_set_epi64x(0, y[0], y[2], y[1]); + + let t0 = mul64_no_overflow(&x0, &f0); + let t1 = mul64_no_overflow(&x1, &f1); + let t2 = mul64_no_overflow(&x2, &f2); + + let t0 = _mm256_add_epi64(t0, t1); + _mm256_add_epi64(t0, t2) +} + +#[allow(dead_code)] +#[inline(always)] +unsafe fn block2_full_avx(xr: &__m256i, xi: &__m256i, y: [(i64, i64); 3]) -> (__m256i, __m256i) { + let yr = _mm256_set_epi64x(0, y[2].0, y[1].0, y[0].0); + let yi = _mm256_set_epi64x(0, y[2].1, y[1].1, y[0].1); + let ys = _mm256_add_epi64(yr, yi); + let xs = _mm256_add_epi64(*xr, *xi); + + // z0 + // z0r = dif2[0] + prod[1] - sum[1] + prod[2] - sum[2] + // z0i = prod[0] - sum[0] + dif1[1] + dif1[2] + let yy = _mm256_permute4x64_epi64(yr, 0x18); + let mr_z0 = mul64_no_overflow(xr, &yy); + let yy = 
_mm256_permute4x64_epi64(yi, 0x18); + let mi_z0 = mul64_no_overflow(xi, &yy); + let sum = _mm256_add_epi64(mr_z0, mi_z0); + let dif1 = _mm256_sub_epi64(mi_z0, mr_z0); + let dif2 = _mm256_sub_epi64(mr_z0, mi_z0); + let yy = _mm256_permute4x64_epi64(ys, 0x18); + let prod = mul64_no_overflow(&xs, &yy); + let dif3 = _mm256_sub_epi64(prod, sum); + let dif3perm1 = _mm256_permute4x64_epi64(dif3, 0x1); + let dif3perm2 = _mm256_permute4x64_epi64(dif3, 0x2); + let z0r = _mm256_add_epi64(dif2, dif3perm1); + let z0r = _mm256_add_epi64(z0r, dif3perm2); + let dif1perm1 = _mm256_permute4x64_epi64(dif1, 0x1); + let dif1perm2 = _mm256_permute4x64_epi64(dif1, 0x2); + let z0i = _mm256_add_epi64(dif3, dif1perm1); + let z0i = _mm256_add_epi64(z0i, dif1perm2); + let mask = _mm256_set_epi64x(0, 0, 0, 0xFFFFFFFFFFFFFFFFu64 as i64); + let z0r = _mm256_and_si256(z0r, mask); + let z0i = _mm256_and_si256(z0i, mask); + + // z1 + // z1r = dif2[0] + dif2[1] + prod[2] - sum[2]; + // z1i = prod[0] - sum[0] + prod[1] - sum[1] + dif1[2]; + let yy = _mm256_permute4x64_epi64(yr, 0x21); + let mr_z1 = mul64_no_overflow(xr, &yy); + let yy = _mm256_permute4x64_epi64(yi, 0x21); + let mi_z1 = mul64_no_overflow(xi, &yy); + let sum = _mm256_add_epi64(mr_z1, mi_z1); + let dif1 = _mm256_sub_epi64(mi_z1, mr_z1); + let dif2 = _mm256_sub_epi64(mr_z1, mi_z1); + let yy = _mm256_permute4x64_epi64(ys, 0x21); + let prod = mul64_no_overflow(&xs, &yy); + let dif3 = _mm256_sub_epi64(prod, sum); + let dif2perm = _mm256_permute4x64_epi64(dif2, 0x0); + let dif3perm = _mm256_permute4x64_epi64(dif3, 0x8); + let z1r = _mm256_add_epi64(dif2, dif2perm); + let z1r = _mm256_add_epi64(z1r, dif3perm); + let dif3perm = _mm256_permute4x64_epi64(dif3, 0x0); + let dif1perm = _mm256_permute4x64_epi64(dif1, 0x8); + let z1i = _mm256_add_epi64(dif3, dif3perm); + let z1i = _mm256_add_epi64(z1i, dif1perm); + let mask = _mm256_set_epi64x(0, 0, 0xFFFFFFFFFFFFFFFFu64 as i64, 0); + let z1r = _mm256_and_si256(z1r, mask); + let z1i = 
_mm256_and_si256(z1i, mask); + + // z2 + // z2r = dif2[0] + dif2[1] + dif2[2]; + // z2i = prod[0] - sum[0] + prod[1] - sum[1] + prod[2] - sum[2] + let yy = _mm256_permute4x64_epi64(yr, 0x6); + let mr_z2 = mul64_no_overflow(xr, &yy); + let yy = _mm256_permute4x64_epi64(yi, 0x6); + let mi_z2 = mul64_no_overflow(xi, &yy); + let sum = _mm256_add_epi64(mr_z2, mi_z2); + let dif2 = _mm256_sub_epi64(mr_z2, mi_z2); + let yy = _mm256_permute4x64_epi64(ys, 0x6); + let prod = mul64_no_overflow(&xs, &yy); + let dif3 = _mm256_sub_epi64(prod, sum); + let dif2perm1 = _mm256_permute4x64_epi64(dif2, 0x0); + let dif2perm2 = _mm256_permute4x64_epi64(dif2, 0x10); + let z2r = _mm256_add_epi64(dif2, dif2perm1); + let z2r = _mm256_add_epi64(z2r, dif2perm2); + let dif3perm1 = _mm256_permute4x64_epi64(dif3, 0x0); + let dif3perm2 = _mm256_permute4x64_epi64(dif3, 0x10); + let z2i = _mm256_add_epi64(dif3, dif3perm1); + let z2i = _mm256_add_epi64(z2i, dif3perm2); + let mask = _mm256_set_epi64x(0, 0xFFFFFFFFFFFFFFFFu64 as i64, 0, 0); + let z2r = _mm256_and_si256(z2r, mask); + let z2i = _mm256_and_si256(z2i, mask); + + let zr = _mm256_or_si256(z0r, z1r); + let zr = _mm256_or_si256(zr, z2r); + let zi = _mm256_or_si256(z0i, z1i); + let zi = _mm256_or_si256(zi, z2i); + (zr, zi) +} + +#[inline(always)] +unsafe fn block2_avx(xr: &__m256i, xi: &__m256i, y: [(i64, i64); 3]) -> (__m256i, __m256i) { + let mut vxr: [i64; 4] = [0; 4]; + let mut vxi: [i64; 4] = [0; 4]; + _mm256_storeu_si256(vxr.as_mut_ptr().cast::<__m256i>(), *xr); + _mm256_storeu_si256(vxi.as_mut_ptr().cast::<__m256i>(), *xi); + let x: [(i64, i64); 3] = [(vxr[0], vxi[0]), (vxr[1], vxi[1]), (vxr[2], vxi[2])]; + let b = block2(x, y); + vxr = [b[0].0, b[1].0, b[2].0, 0]; + vxi = [b[0].1, b[1].1, b[2].1, 0]; + let rr = _mm256_loadu_si256(vxr.as_ptr().cast::<__m256i>()); + let ri = _mm256_loadu_si256(vxi.as_ptr().cast::<__m256i>()); + (rr, ri) +} + +#[inline(always)] +unsafe fn block3_avx(x: &__m256i, y: [i64; 3]) -> __m256i { + let x0 = 
_mm256_permute4x64_epi64(*x, 0x0); + let x1 = _mm256_permute4x64_epi64(*x, 0x55); + let x2 = _mm256_permute4x64_epi64(*x, 0xAA); + + let f0 = _mm256_set_epi64x(0, y[2], y[1], y[0]); + let f1 = _mm256_set_epi64x(0, y[1], y[0], -y[2]); + let f2 = _mm256_set_epi64x(0, y[0], -y[2], -y[1]); + + let t0 = mul64_no_overflow(&x0, &f0); + let t1 = mul64_no_overflow(&x1, &f1); + let t2 = mul64_no_overflow(&x2, &f2); + + let t0 = _mm256_add_epi64(t0, t1); + _mm256_add_epi64(t0, t2) +} + +#[inline(always)] +unsafe fn fft2_real_avx(x0: &__m256i, x1: &__m256i) -> (__m256i, __m256i) { + let y0 = _mm256_add_epi64(*x0, *x1); + let y1 = _mm256_sub_epi64(*x0, *x1); + (y0, y1) +} + +#[inline(always)] +unsafe fn fft4_real_avx( + x0: &__m256i, + x1: &__m256i, + x2: &__m256i, + x3: &__m256i, +) -> (__m256i, __m256i, __m256i, __m256i) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let (z0, z2) = fft2_real_avx(x0, x2); + let (z1, z3) = fft2_real_avx(x1, x3); + let y0 = _mm256_add_epi64(z0, z1); + let y2 = _mm256_sub_epi64(z0, z1); + let y3 = _mm256_sub_epi64(zeros, z3); + (y0, z2, y3, y2) +} + +#[inline(always)] +unsafe fn ifft2_real_unreduced_avx(y0: &__m256i, y1: &__m256i) -> (__m256i, __m256i) { + let x0 = _mm256_add_epi64(*y0, *y1); + let x1 = _mm256_sub_epi64(*y0, *y1); + (x0, x1) +} + +#[inline(always)] +unsafe fn ifft4_real_unreduced_avx( + y: (__m256i, (__m256i, __m256i), __m256i), +) -> (__m256i, __m256i, __m256i, __m256i) { + let zeros = _mm256_set_epi64x(0, 0, 0, 0); + let z0 = _mm256_add_epi64(y.0, y.2); + let z1 = _mm256_sub_epi64(y.0, y.2); + let z2 = y.1 .0; + let z3 = _mm256_sub_epi64(zeros, y.1 .1); + let (x0, x2) = ifft2_real_unreduced_avx(&z0, &z2); + let (x1, x3) = ifft2_real_unreduced_avx(&z1, &z3); + (x0, x1, x2, x3) +} + +#[inline] +unsafe fn mds_multiply_freq_avx(s0: &mut __m256i, s1: &mut __m256i, s2: &mut __m256i) { + /* + // Alternative code using store and set. 
+ let mut s: [i64; 12] = [0; 12]; + _mm256_storeu_si256(s[0..4].as_mut_ptr().cast::<__m256i>(), *s0); + _mm256_storeu_si256(s[4..8].as_mut_ptr().cast::<__m256i>(), *s1); + _mm256_storeu_si256(s[8..12].as_mut_ptr().cast::<__m256i>(), *s2); + let f0 = _mm256_set_epi64x(0, s[2], s[1], s[0]); + let f1 = _mm256_set_epi64x(0, s[5], s[4], s[3]); + let f2 = _mm256_set_epi64x(0, s[8], s[7], s[6]); + let f3 = _mm256_set_epi64x(0, s[11], s[10], s[9]); + */ + + // Alternative code using permute and blend (it is faster). + let f0 = *s0; + let f11 = _mm256_permute4x64_epi64(*s0, 0x3); + let f12 = _mm256_permute4x64_epi64(*s1, 0x10); + let f1 = _mm256_blend_epi32(f11, f12, 0x3C); + let f21 = _mm256_permute4x64_epi64(*s1, 0xE); + let f22 = _mm256_permute4x64_epi64(*s2, 0x0); + let f2 = _mm256_blend_epi32(f21, f22, 0x30); + let f3 = _mm256_permute4x64_epi64(*s2, 0x39); + + let (u0, u1, u2, u3) = fft4_real_avx(&f0, &f1, &f2, &f3); + + // let [v0, v4, v8] = block1_avx([u[0], u[1], u[2]], MDS_FREQ_BLOCK_ONE); + // [u[0], u[1], u[2]] are all in u0 + let f0 = block1_avx(&u0, MDS_FREQ_BLOCK_ONE); + + // let [v1, v5, v9] = block2([(u[0], v[0]), (u[1], v[1]), (u[2], v[2])], MDS_FREQ_BLOCK_TWO); + let (f1, f2) = block2_avx(&u1, &u2, MDS_FREQ_BLOCK_TWO); + + // let [v2, v6, v10] = block3_avx([u[0], u[1], u[2]], MDS_FREQ_BLOCK_ONE); + // [u[0], u[1], u[2]] are all in u3 + let f3 = block3_avx(&u3, MDS_FREQ_BLOCK_THREE); + + let (r0, r3, r6, r9) = ifft4_real_unreduced_avx((f0, (f1, f2), f3)); + let t = _mm256_permute4x64_epi64(r3, 0x0); + *s0 = _mm256_blend_epi32(r0, t, 0xC0); + let t1 = _mm256_permute4x64_epi64(r3, 0x9); + let t2 = _mm256_permute4x64_epi64(r6, 0x40); + *s1 = _mm256_blend_epi32(t1, t2, 0xF0); + let t1 = _mm256_permute4x64_epi64(r6, 0x2); + let t2 = _mm256_permute4x64_epi64(r9, 0x90); + *s2 = _mm256_blend_epi32(t1, t2, 0xFC); +} + +#[allow(dead_code)] +#[inline(always)] +#[unroll_for_loops] +unsafe fn mds_layer_avx(s0: &mut __m256i, s1: &mut __m256i, s2: &mut __m256i) { + let 
mask = _mm256_set_epi64x(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); + let mut sl0 = _mm256_and_si256(*s0, mask); + let mut sl1 = _mm256_and_si256(*s1, mask); + let mut sl2 = _mm256_and_si256(*s2, mask); + let mut sh0 = _mm256_srli_epi64(*s0, 32); + let mut sh1 = _mm256_srli_epi64(*s1, 32); + let mut sh2 = _mm256_srli_epi64(*s2, 32); + + mds_multiply_freq_avx(&mut sl0, &mut sl1, &mut sl2); + mds_multiply_freq_avx(&mut sh0, &mut sh1, &mut sh2); + + let shl0 = _mm256_slli_epi64(sh0, 32); + let shl1 = _mm256_slli_epi64(sh1, 32); + let shl2 = _mm256_slli_epi64(sh2, 32); + let shh0 = _mm256_srli_epi64(sh0, 32); + let shh1 = _mm256_srli_epi64(sh1, 32); + let shh2 = _mm256_srli_epi64(sh2, 32); + + let (rl0, c0) = add64_no_carry(&sl0, &shl0); + let (rh0, _) = add64_no_carry(&shh0, &c0); + let r0 = reduce_avx_128_64(&rh0, &rl0); + + let (rl1, c1) = add64_no_carry(&sl1, &shl1); + let (rh1, _) = add64_no_carry(&shh1, &c1); + *s1 = reduce_avx_128_64(&rh1, &rl1); + + let (rl2, c2) = add64_no_carry(&sl2, &shl2); + let (rh2, _) = add64_no_carry(&shh2, &c2); + *s2 = reduce_avx_128_64(&rh2, &rl2); + + let rl = _mm256_slli_epi64(*s0, 3); // * 8 (low part) + let rh = _mm256_srli_epi64(*s0, 61); // * 8 (high part, only 3 bits) + let rx = reduce_avx_96_64(&rh, &rl); + let rx = add_avx(&r0, &rx); + *s0 = _mm256_blend_epi32(r0, rx, 0x3); +} + +#[allow(dead_code)] #[inline(always)] #[unroll_for_loops] fn mds_partial_layer_init_avx(state: &mut [F; SPONGE_WIDTH]) @@ -341,6 +1229,118 @@ where } } +#[inline(always)] +#[unroll_for_loops] +unsafe fn mds_partial_layer_fast_avx( + s0: &mut __m256i, + s1: &mut __m256i, + s2: &mut __m256i, + state: &mut [F; SPONGE_WIDTH], + r: usize, +) where + F: PrimeField64, +{ + let mut d_sum = (0u128, 0u32); // u160 accumulator + for i in 1..12 { + if i < SPONGE_WIDTH { + let t = FAST_PARTIAL_ROUND_W_HATS[r][i - 1] as u128; + let si = state[i].to_noncanonical_u64() as u128; + d_sum = add_u160_u128(d_sum, si * t); + } + } + let x0 = 
state[0].to_noncanonical_u64() as u128; + let mds0to0 = (MDS_MATRIX_CIRC[0] + MDS_MATRIX_DIAG[0]) as u128; + d_sum = add_u160_u128(d_sum, x0 * mds0to0); + let d = reduce_u160::(d_sum); + + // result = [d] concat [state[0] * v + state[shift up by 1]] + let ss0 = _mm256_set_epi64x( + state[0].to_noncanonical_u64() as i64, + state[0].to_noncanonical_u64() as i64, + state[0].to_noncanonical_u64() as i64, + state[0].to_noncanonical_u64() as i64, + ); + let rc0 = _mm256_loadu_si256((&FAST_PARTIAL_ROUND_VS[r][0..4]).as_ptr().cast::<__m256i>()); + let rc1 = _mm256_loadu_si256((&FAST_PARTIAL_ROUND_VS[r][4..8]).as_ptr().cast::<__m256i>()); + let rc2 = _mm256_loadu_si256( + (&FAST_PARTIAL_ROUND_VS[r][8..12]) + .as_ptr() + .cast::<__m256i>(), + ); + let (mh, ml) = mult_avx_128(&ss0, &rc0); + let m = reduce_avx_128_64(&mh, &ml); + let r0 = add_avx(s0, &m); + let d0 = _mm256_set_epi64x(0, 0, 0, d.to_canonical_u64() as i64); + *s0 = _mm256_blend_epi32(r0, d0, 0x3); + + let (mh, ml) = mult_avx_128(&ss0, &rc1); + let m = reduce_avx_128_64(&mh, &ml); + *s1 = add_avx(s1, &m); + + let (mh, ml) = mult_avx_128(&ss0, &rc2); + let m = reduce_avx_128_64(&mh, &ml); + *s2 = add_avx(s2, &m); + + _mm256_storeu_si256((state[0..4]).as_mut_ptr().cast::<__m256i>(), *s0); + _mm256_storeu_si256((state[4..8]).as_mut_ptr().cast::<__m256i>(), *s1); + _mm256_storeu_si256((state[8..12]).as_mut_ptr().cast::<__m256i>(), *s2); +} + +#[inline(always)] +#[unroll_for_loops] +unsafe fn mds_partial_layer_init_avx_m256i(s0: &mut __m256i, s1: &mut __m256i, s2: &mut __m256i) +where + F: PrimeField64, +{ + let mut result = [F::ZERO; SPONGE_WIDTH]; + let res0 = *s0; + + let mut r0 = _mm256_loadu_si256((&mut result[0..4]).as_mut_ptr().cast::<__m256i>()); + let mut r1 = _mm256_loadu_si256((&mut result[0..4]).as_mut_ptr().cast::<__m256i>()); + let mut r2 = _mm256_loadu_si256((&mut result[0..4]).as_mut_ptr().cast::<__m256i>()); + for r in 1..12 { + let sr = match r { + 1 => _mm256_permutex_epi64(*s0, 0x55), + 2 => 
_mm256_permutex_epi64(*s0, 0xAA), + 3 => _mm256_permutex_epi64(*s0, 0xFF), + 4 => _mm256_permutex_epi64(*s1, 0x0), + 5 => _mm256_permutex_epi64(*s1, 0x55), + 6 => _mm256_permutex_epi64(*s1, 0xAA), + 7 => _mm256_permutex_epi64(*s1, 0xFF), + 8 => _mm256_permutex_epi64(*s2, 0x0), + 9 => _mm256_permutex_epi64(*s2, 0x55), + 10 => _mm256_permutex_epi64(*s2, 0xAA), + 11 => _mm256_permutex_epi64(*s2, 0xFF), + _ => _mm256_permutex_epi64(*s0, 0x55), + }; + let t0 = _mm256_loadu_si256( + (&FAST_PARTIAL_ROUND_INITIAL_MATRIX[r][0..4]) + .as_ptr() + .cast::<__m256i>(), + ); + let t1 = _mm256_loadu_si256( + (&FAST_PARTIAL_ROUND_INITIAL_MATRIX[r][4..8]) + .as_ptr() + .cast::<__m256i>(), + ); + let t2 = _mm256_loadu_si256( + (&FAST_PARTIAL_ROUND_INITIAL_MATRIX[r][8..12]) + .as_ptr() + .cast::<__m256i>(), + ); + let m0 = mult_avx(&sr, &t0); + let m1 = mult_avx(&sr, &t1); + let m2 = mult_avx(&sr, &t2); + r0 = add_avx(&r0, &m0); + r1 = add_avx(&r1, &m1); + r2 = add_avx(&r2, &m2); + } + *s0 = _mm256_blend_epi32(r0, res0, 0x3); + *s1 = r1; + *s2 = r2; +} + +#[allow(dead_code)] #[inline(always)] #[unroll_for_loops] fn partial_first_constant_layer_avx(state: &mut [F; SPONGE_WIDTH]) @@ -396,13 +1396,12 @@ where let mut round_ctr = 0; unsafe { - // Self::full_rounds(&mut state, &mut round_ctr); - for _ in 0..HALF_N_FULL_ROUNDS { - // load state - let s0 = _mm256_loadu_si256((&state[0..4]).as_ptr().cast::<__m256i>()); - let s1 = _mm256_loadu_si256((&state[4..8]).as_ptr().cast::<__m256i>()); - let s2 = _mm256_loadu_si256((&state[8..12]).as_ptr().cast::<__m256i>()); + // load state + let mut s0 = _mm256_loadu_si256((&state[0..4]).as_ptr().cast::<__m256i>()); + let mut s1 = _mm256_loadu_si256((&state[4..8]).as_ptr().cast::<__m256i>()); + let mut s2 = _mm256_loadu_si256((&state[8..12]).as_ptr().cast::<__m256i>()); + for _ in 0..HALF_N_FULL_ROUNDS { let rc: &[u64; 12] = &ALL_ROUND_CONSTANTS[SPONGE_WIDTH * round_ctr..][..SPONGE_WIDTH] .try_into() .unwrap(); @@ -412,38 +1411,47 @@ where let ss0 = 
add_avx(&s0, &rc0); let ss1 = add_avx(&s1, &rc1); let ss2 = add_avx(&s2, &rc2); - let (r0, r1, r2) = sbox_avx_m256i(&ss0, &ss1, &ss2); - // let (s0, s1, s2) = mds_layer_avx(&r0, &r1, &r2); - // let (s0, s1, s2) = mds_layer_avx_v2::(&r0, &r1, &r2); - - // store state - _mm256_storeu_si256((state[0..4]).as_mut_ptr().cast::<__m256i>(), r0); - _mm256_storeu_si256((state[4..8]).as_mut_ptr().cast::<__m256i>(), r1); - _mm256_storeu_si256((state[8..12]).as_mut_ptr().cast::<__m256i>(), r2); - - *state = ::mds_layer(&state); - // mds_layer_avx::(&mut s0, &mut s1, &mut s2); + (s0, s1, s2) = sbox_avx_m256i(&ss0, &ss1, &ss2); + mds_layer_avx(&mut s0, &mut s1, &mut s2); round_ctr += 1; } - // Self::partial_rounds(&mut state, &mut round_ctr); - partial_first_constant_layer_avx(&mut state); - mds_partial_layer_init_avx(&mut state); + // this does partial_first_constant_layer_avx(&mut state); + let c0 = _mm256_loadu_si256( + (&FAST_PARTIAL_FIRST_ROUND_CONSTANT[0..4]) + .as_ptr() + .cast::<__m256i>(), + ); + let c1 = _mm256_loadu_si256( + (&FAST_PARTIAL_FIRST_ROUND_CONSTANT[4..8]) + .as_ptr() + .cast::<__m256i>(), + ); + let c2 = _mm256_loadu_si256( + (&FAST_PARTIAL_FIRST_ROUND_CONSTANT[8..12]) + .as_ptr() + .cast::<__m256i>(), + ); + s0 = add_avx(&s0, &c0); + s1 = add_avx(&s1, &c1); + s2 = add_avx(&s2, &c2); + + mds_partial_layer_init_avx_m256i::(&mut s0, &mut s1, &mut s2); + + _mm256_storeu_si256((state[0..4]).as_mut_ptr().cast::<__m256i>(), s0); + _mm256_storeu_si256((state[4..8]).as_mut_ptr().cast::<__m256i>(), s1); + _mm256_storeu_si256((state[8..12]).as_mut_ptr().cast::<__m256i>(), s2); for i in 0..N_PARTIAL_ROUNDS { state[0] = sbox_monomial(state[0]); state[0] = state[0].add_canonical_u64(FAST_PARTIAL_ROUND_CONSTANTS[i]); - *state = ::mds_partial_layer_fast(&state, i); + mds_partial_layer_fast_avx(&mut s0, &mut s1, &mut s2, &mut state, i); } round_ctr += N_PARTIAL_ROUNDS; + // here state is already loaded in s0, s1, s2 // Self::full_rounds(&mut state, &mut round_ctr); for _ 
in 0..HALF_N_FULL_ROUNDS { - // load state - let s0 = _mm256_loadu_si256((&state[0..4]).as_ptr().cast::<__m256i>()); - let s1 = _mm256_loadu_si256((&state[4..8]).as_ptr().cast::<__m256i>()); - let s2 = _mm256_loadu_si256((&state[8..12]).as_ptr().cast::<__m256i>()); - let rc: &[u64; 12] = &ALL_ROUND_CONSTANTS[SPONGE_WIDTH * round_ctr..][..SPONGE_WIDTH] .try_into() .unwrap(); @@ -453,20 +1461,16 @@ where let ss0 = add_avx(&s0, &rc0); let ss1 = add_avx(&s1, &rc1); let ss2 = add_avx(&s2, &rc2); - let (r0, r1, r2) = sbox_avx_m256i(&ss0, &ss1, &ss2); - // let (s0, s1, s2) = mds_layer_avx(&r0, &r1, &r2); - // let (s0, s1, s2) = mds_layer_avx_v2::(&r0, &r1, &r2); - - // store state - _mm256_storeu_si256((state[0..4]).as_mut_ptr().cast::<__m256i>(), r0); - _mm256_storeu_si256((state[4..8]).as_mut_ptr().cast::<__m256i>(), r1); - _mm256_storeu_si256((state[8..12]).as_mut_ptr().cast::<__m256i>(), r2); - - *state = ::mds_layer(&state); - // mds_layer_avx::(&mut s0, &mut s1, &mut s2); + (s0, s1, s2) = sbox_avx_m256i(&ss0, &ss1, &ss2); + mds_layer_avx(&mut s0, &mut s1, &mut s2); round_ctr += 1; } + // store state + _mm256_storeu_si256((state[0..4]).as_mut_ptr().cast::<__m256i>(), s0); + _mm256_storeu_si256((state[4..8]).as_mut_ptr().cast::<__m256i>(), s1); + _mm256_storeu_si256((state[8..12]).as_mut_ptr().cast::<__m256i>(), s2); + debug_assert_eq!(round_ctr, N_ROUNDS); }; *state diff --git a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs new file mode 100644 index 0000000000..fdaf321681 --- /dev/null +++ b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs @@ -0,0 +1,366 @@ +use core::arch::x86_64::*; + +use unroll::unroll_for_loops; + +use crate::field::types::PrimeField64; +use crate::hash::arch::x86_64::goldilocks_avx512::*; +use crate::hash::poseidon::{ + Poseidon, ALL_ROUND_CONSTANTS, HALF_N_FULL_ROUNDS, N_PARTIAL_ROUNDS, N_ROUNDS, SPONGE_WIDTH, +}; + +#[allow(dead_code)] +const MDS_MATRIX_CIRC: 
[u64; 12] = [17, 15, 41, 16, 2, 28, 13, 13, 39, 18, 34, 20]; + +#[allow(dead_code)] +const MDS_MATRIX_DIAG: [u64; 12] = [8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + +const FAST_PARTIAL_FIRST_ROUND_CONSTANT: [u64; 12] = [ + 0x3cc3f892184df408, + 0xe993fd841e7e97f1, + 0xf2831d3575f0f3af, + 0xd2500e0a350994ca, + 0xc5571f35d7288633, + 0x91d89c5184109a02, + 0xf37f925d04e5667b, + 0x2d6e448371955a69, + 0x740ef19ce01398a1, + 0x694d24c0752fdf45, + 0x60936af96ee2f148, + 0xc33448feadc78f0c, +]; + +const FAST_PARTIAL_ROUND_CONSTANTS: [u64; N_PARTIAL_ROUNDS] = [ + 0x74cb2e819ae421ab, + 0xd2559d2370e7f663, + 0x62bf78acf843d17c, + 0xd5ab7b67e14d1fb4, + 0xb9fe2ae6e0969bdc, + 0xe33fdf79f92a10e8, + 0x0ea2bb4c2b25989b, + 0xca9121fbf9d38f06, + 0xbdd9b0aa81f58fa4, + 0x83079fa4ecf20d7e, + 0x650b838edfcc4ad3, + 0x77180c88583c76ac, + 0xaf8c20753143a180, + 0xb8ccfe9989a39175, + 0x954a1729f60cc9c5, + 0xdeb5b550c4dca53b, + 0xf01bb0b00f77011e, + 0xa1ebb404b676afd9, + 0x860b6e1597a0173e, + 0x308bb65a036acbce, + 0x1aca78f31c97c876, + 0x0, +]; + +const FAST_PARTIAL_ROUND_INITIAL_MATRIX: [[u64; 12]; 12] = [ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ + 0, + 0x80772dc2645b280b, + 0xdc927721da922cf8, + 0xc1978156516879ad, + 0x90e80c591f48b603, + 0x3a2432625475e3ae, + 0x00a2d4321cca94fe, + 0x77736f524010c932, + 0x904d3f2804a36c54, + 0xbf9b39e28a16f354, + 0x3a1ded54a6cd058b, + 0x42392870da5737cf, + ], + [ + 0, + 0xe796d293a47a64cb, + 0xb124c33152a2421a, + 0x0ee5dc0ce131268a, + 0xa9032a52f930fae6, + 0x7e33ca8c814280de, + 0xad11180f69a8c29e, + 0xc75ac6d5b5a10ff3, + 0xf0674a8dc5a387ec, + 0xb36d43120eaa5e2b, + 0x6f232aab4b533a25, + 0x3a1ded54a6cd058b, + ], + [ + 0, + 0xdcedab70f40718ba, + 0x14a4a64da0b2668f, + 0x4715b8e5ab34653b, + 0x1e8916a99c93a88e, + 0xbba4b5d86b9a3b2c, + 0xe76649f9bd5d5c2e, + 0xaf8e2518a1ece54d, + 0xdcda1344cdca873f, + 0xcd080204256088e5, + 0xb36d43120eaa5e2b, + 0xbf9b39e28a16f354, + ], + [ + 0, + 0xf4a437f2888ae909, + 0xc537d44dc2875403, + 0x7f68007619fd8ba9, + 0xa4911db6a32612da, + 
0x2f7e9aade3fdaec1, + 0xe7ffd578da4ea43d, + 0x43a608e7afa6b5c2, + 0xca46546aa99e1575, + 0xdcda1344cdca873f, + 0xf0674a8dc5a387ec, + 0x904d3f2804a36c54, + ], + [ + 0, + 0xf97abba0dffb6c50, + 0x5e40f0c9bb82aab5, + 0x5996a80497e24a6b, + 0x07084430a7307c9a, + 0xad2f570a5b8545aa, + 0xab7f81fef4274770, + 0xcb81f535cf98c9e9, + 0x43a608e7afa6b5c2, + 0xaf8e2518a1ece54d, + 0xc75ac6d5b5a10ff3, + 0x77736f524010c932, + ], + [ + 0, + 0x7f8e41e0b0a6cdff, + 0x4b1ba8d40afca97d, + 0x623708f28fca70e8, + 0xbf150dc4914d380f, + 0xc26a083554767106, + 0x753b8b1126665c22, + 0xab7f81fef4274770, + 0xe7ffd578da4ea43d, + 0xe76649f9bd5d5c2e, + 0xad11180f69a8c29e, + 0x00a2d4321cca94fe, + ], + [ + 0, + 0x726af914971c1374, + 0x1d7f8a2cce1a9d00, + 0x18737784700c75cd, + 0x7fb45d605dd82838, + 0x862361aeab0f9b6e, + 0xc26a083554767106, + 0xad2f570a5b8545aa, + 0x2f7e9aade3fdaec1, + 0xbba4b5d86b9a3b2c, + 0x7e33ca8c814280de, + 0x3a2432625475e3ae, + ], + [ + 0, + 0x64dd936da878404d, + 0x4db9a2ead2bd7262, + 0xbe2e19f6d07f1a83, + 0x02290fe23c20351a, + 0x7fb45d605dd82838, + 0xbf150dc4914d380f, + 0x07084430a7307c9a, + 0xa4911db6a32612da, + 0x1e8916a99c93a88e, + 0xa9032a52f930fae6, + 0x90e80c591f48b603, + ], + [ + 0, + 0x85418a9fef8a9890, + 0xd8a2eb7ef5e707ad, + 0xbfe85ababed2d882, + 0xbe2e19f6d07f1a83, + 0x18737784700c75cd, + 0x623708f28fca70e8, + 0x5996a80497e24a6b, + 0x7f68007619fd8ba9, + 0x4715b8e5ab34653b, + 0x0ee5dc0ce131268a, + 0xc1978156516879ad, + ], + [ + 0, + 0x156048ee7a738154, + 0x91f7562377e81df5, + 0xd8a2eb7ef5e707ad, + 0x4db9a2ead2bd7262, + 0x1d7f8a2cce1a9d00, + 0x4b1ba8d40afca97d, + 0x5e40f0c9bb82aab5, + 0xc537d44dc2875403, + 0x14a4a64da0b2668f, + 0xb124c33152a2421a, + 0xdc927721da922cf8, + ], + [ + 0, + 0xd841e8ef9dde8ba0, + 0x156048ee7a738154, + 0x85418a9fef8a9890, + 0x64dd936da878404d, + 0x726af914971c1374, + 0x7f8e41e0b0a6cdff, + 0xf97abba0dffb6c50, + 0xf4a437f2888ae909, + 0xdcedab70f40718ba, + 0xe796d293a47a64cb, + 0x80772dc2645b280b, + ], +]; + +#[inline(always)] +#[unroll_for_loops] +fn 
mds_partial_layer_init_avx(state: &mut [F; SPONGE_WIDTH]) +where + F: PrimeField64, +{ + let mut result = [F::ZERO; SPONGE_WIDTH]; + let res0 = state[0]; + unsafe { + let mut r0 = _mm512_loadu_si512((&mut result[0..8]).as_mut_ptr().cast::()); + let mut r1 = _mm512_loadu_si512((&mut result[4..12]).as_mut_ptr().cast::()); + + for r in 1..12 { + let sr512 = _mm512_set_epi64( + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + state[r].to_canonical_u64() as i64, + ); + let t0 = _mm512_loadu_si512( + (&FAST_PARTIAL_ROUND_INITIAL_MATRIX[r][0..8]) + .as_ptr() + .cast::(), + ); + let t1 = _mm512_loadu_si512( + (&FAST_PARTIAL_ROUND_INITIAL_MATRIX[r][4..12]) + .as_ptr() + .cast::(), + ); + let m0 = mult_avx512(&sr512, &t0); + let m1 = mult_avx512(&sr512, &t1); + r0 = add_avx512_b_c(&r0, &m0); + r1 = add_avx512_b_c(&r1, &m1); + } + _mm512_storeu_si512((state[0..8]).as_mut_ptr().cast::(), r0); + _mm512_storeu_si512((state[4..12]).as_mut_ptr().cast::(), r1); + state[0] = res0; + } +} + +#[inline(always)] +#[unroll_for_loops] +fn partial_first_constant_layer_avx(state: &mut [F; SPONGE_WIDTH]) +where + F: PrimeField64, +{ + unsafe { + let c0 = _mm512_loadu_si512( + (&FAST_PARTIAL_FIRST_ROUND_CONSTANT[0..8]) + .as_ptr() + .cast::(), + ); + let c1 = _mm512_loadu_si512( + (&FAST_PARTIAL_FIRST_ROUND_CONSTANT[4..12]) + .as_ptr() + .cast::(), + ); + let mut s0 = _mm512_loadu_si512((state[0..8]).as_ptr().cast::()); + let mut s1 = _mm512_loadu_si512((state[4..12]).as_ptr().cast::()); + s0 = add_avx512_b_c(&s0, &c0); + s1 = add_avx512_b_c(&s1, &c1); + _mm512_storeu_si512((state[0..8]).as_mut_ptr().cast::(), s0); + _mm512_storeu_si512((state[4..12]).as_mut_ptr().cast::(), s1); + } +} + +#[inline(always)] +fn sbox_monomial(x: F) -> F +where + F: PrimeField64, +{ + // x |--> x^7 
+ let x2 = x.square(); + let x4 = x2.square(); + let x3 = x * x2; + x3 * x4 +} + +pub fn poseidon_avx512(input: &[F; SPONGE_WIDTH]) -> [F; SPONGE_WIDTH] +where + F: PrimeField64 + Poseidon, +{ + let mut state = &mut input.clone(); + let mut round_ctr = 0; + + unsafe { + // Self::full_rounds(&mut state, &mut round_ctr); + for _ in 0..HALF_N_FULL_ROUNDS { + // load state + let s0 = _mm512_loadu_si512((&state[0..8]).as_ptr().cast::()); + let s1 = _mm512_loadu_si512((&state[4..12]).as_ptr().cast::()); + + let rc: &[u64; 12] = &ALL_ROUND_CONSTANTS[SPONGE_WIDTH * round_ctr..][..SPONGE_WIDTH] + .try_into() + .unwrap(); + let rc0 = _mm512_loadu_si512((&rc[0..8]).as_ptr().cast::()); + let rc1 = _mm512_loadu_si512((&rc[4..12]).as_ptr().cast::()); + let ss0 = add_avx512_b_c(&s0, &rc0); + let ss1 = add_avx512_b_c(&s1, &rc1); + let r0 = sbox_avx512_one(&ss0); + let r1 = sbox_avx512_one(&ss1); + + // store state + _mm512_storeu_si512((state[0..8]).as_mut_ptr().cast::(), r0); + _mm512_storeu_si512((state[4..12]).as_mut_ptr().cast::(), r1); + + *state = ::mds_layer(&state); + round_ctr += 1; + } + partial_first_constant_layer_avx(&mut state); + mds_partial_layer_init_avx(&mut state); + + for i in 0..N_PARTIAL_ROUNDS { + state[0] = sbox_monomial(state[0]); + state[0] = state[0].add_canonical_u64(FAST_PARTIAL_ROUND_CONSTANTS[i]); + *state = ::mds_partial_layer_fast(&state, i); + } + round_ctr += N_PARTIAL_ROUNDS; + + // Self::full_rounds(&mut state, &mut round_ctr); + for _ in 0..HALF_N_FULL_ROUNDS { + // load state + let s0 = _mm512_loadu_si512((&state[0..8]).as_ptr().cast::()); + let s1 = _mm512_loadu_si512((&state[4..12]).as_ptr().cast::()); + + let rc: &[u64; 12] = &ALL_ROUND_CONSTANTS[SPONGE_WIDTH * round_ctr..][..SPONGE_WIDTH] + .try_into() + .unwrap(); + let rc0 = _mm512_loadu_si512((&rc[0..8]).as_ptr().cast::()); + let rc1 = _mm512_loadu_si512((&rc[4..12]).as_ptr().cast::()); + let ss0 = add_avx512_b_c(&s0, &rc0); + let ss1 = add_avx512_b_c(&s1, &rc1); + let r0 = 
sbox_avx512_one(&ss0); + let r1 = sbox_avx512_one(&ss1); + + // store state + _mm512_storeu_si512((state[0..8]).as_mut_ptr().cast::(), r0); + _mm512_storeu_si512((state[4..12]).as_mut_ptr().cast::(), r1); + + *state = ::mds_layer(&state); + // mds_layer_avx::(&mut s0, &mut s1, &mut s2); + round_ctr += 1; + } + + debug_assert_eq!(round_ctr, N_ROUNDS); + }; + *state +} diff --git a/plonky2/src/hash/mod.rs b/plonky2/src/hash/mod.rs index 1a91d38960..edcfced4df 100644 --- a/plonky2/src/hash/mod.rs +++ b/plonky2/src/hash/mod.rs @@ -11,4 +11,5 @@ pub mod path_compression; pub mod poseidon; pub mod poseidon2; pub mod poseidon_bn128; +mod poseidon_bn128_ops; pub mod poseidon_goldilocks; diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index 03591217c4..410e6d2094 100644 --- a/plonky2/src/hash/poseidon.rs +++ b/plonky2/src/hash/poseidon.rs @@ -8,8 +8,6 @@ use core::fmt::Debug; use plonky2_field::packed::PackedField; use unroll::unroll_for_loops; -#[cfg(target_feature = "avx2")] -use super::arch::x86_64::poseidon_goldilocks_avx2::poseidon_avx; use super::hash_types::HashOutTarget; use crate::field::extension::{Extendable, FieldExtension}; use crate::field::types::{Field, PrimeField64}; @@ -22,6 +20,10 @@ use crate::iop::ext_target::ExtensionTarget; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; +#[cfg(all(target_feature = "avx2", not(target_feature = "avx512dq")))] +use super::arch::x86_64::poseidon_goldilocks_avx2::poseidon_avx; +#[cfg(all(target_feature = "avx2", target_feature = "avx512dq"))] +use super::arch::x86_64::poseidon_goldilocks_avx512::poseidon_avx512; pub const SPONGE_RATE: usize = 8; pub const SPONGE_CAPACITY: usize = 4; @@ -40,14 +42,14 @@ pub const N_ROUNDS: usize = N_FULL_ROUNDS_TOTAL + N_PARTIAL_ROUNDS; const MAX_WIDTH: usize = 12; // we only have width 8 and 12, and 12 is bigger. 
:) #[inline(always)] -const fn add_u160_u128((x_lo, x_hi): (u128, u32), y: u128) -> (u128, u32) { +pub(crate) const fn add_u160_u128((x_lo, x_hi): (u128, u32), y: u128) -> (u128, u32) { let (res_lo, over) = x_lo.overflowing_add(y); let res_hi = x_hi + (over as u32); (res_lo, res_hi) } #[inline(always)] -fn reduce_u160((n_lo, n_hi): (u128, u32)) -> F { +pub(crate) fn reduce_u160((n_lo, n_hi): (u128, u32)) -> F { let n_lo_hi = (n_lo >> 64) as u64; let n_lo_lo = n_lo as u64; let reduced_hi: u64 = F::from_noncanonical_u96((n_lo_hi, n_hi)).to_noncanonical_u64(); @@ -781,11 +783,17 @@ pub trait Poseidon: PrimeField64 { } #[inline] - #[cfg(target_feature = "avx2")] + #[cfg(all(target_feature = "avx2", not(target_feature = "avx512dq")))] fn poseidon(input: [Self; SPONGE_WIDTH]) -> [Self; SPONGE_WIDTH] { poseidon_avx(&input) } + #[inline] + #[cfg(all(target_feature = "avx2", target_feature = "avx512dq"))] + fn poseidon(input: [Self; SPONGE_WIDTH]) -> [Self; SPONGE_WIDTH] { + poseidon_avx512(&input) + } + // For testing only, to ensure that various tricks are correct. 
#[inline] fn partial_rounds_naive(state: &mut [Self; SPONGE_WIDTH], round_ctr: &mut usize) { diff --git a/plonky2/src/hash/poseidon_bn128.rs b/plonky2/src/hash/poseidon_bn128.rs index 4f2b89d9e1..e1012f8b5c 100644 --- a/plonky2/src/hash/poseidon_bn128.rs +++ b/plonky2/src/hash/poseidon_bn128.rs @@ -14,6 +14,10 @@ use crate::hash::poseidon::{PoseidonHash, SPONGE_RATE, SPONGE_WIDTH}; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; use crate::plonk::config::{AlgebraicHasher, GenericConfig, Hasher, HasherType}; +#[cfg(not(target_feature = "avx2"))] +use crate::hash::poseidon_bn128_ops::PoseidonBN128NativePermutation; +#[cfg(target_feature = "avx2")] +use crate::hash::arch::x86_64::poseidon_bn128_avx2::permute_bn128_avx; #[derive(Copy, Clone, Default, Debug, PartialEq)] pub struct PoseidonBN128Permutation { @@ -56,6 +60,8 @@ impl PlonkyPermutation for PoseidonBN128Permutation { } } + /* + // Go Wrapper - 33% slower than Rust version below fn permute(&mut self) { assert_eq!(SPONGE_WIDTH, 12); // println!("start permute............"); @@ -140,6 +146,49 @@ impl PlonkyPermutation for PoseidonBN128Permutation { self.set_from_slice(&permute_output, 0) } } + */ + + fn permute(&mut self) { + assert_eq!(SPONGE_WIDTH, 12); + let su64: [u64; 12] = [ + self.state[0].to_canonical_u64(), + self.state[1].to_canonical_u64(), + self.state[2].to_canonical_u64(), + self.state[3].to_canonical_u64(), + self.state[4].to_canonical_u64(), + self.state[5].to_canonical_u64(), + self.state[6].to_canonical_u64(), + self.state[7].to_canonical_u64(), + self.state[8].to_canonical_u64(), + self.state[9].to_canonical_u64(), + self.state[10].to_canonical_u64(), + self.state[11].to_canonical_u64(), + ]; + + #[cfg(not(target_feature = "avx2"))] + let p: PoseidonBN128NativePermutation = Default::default(); + #[cfg(not(target_feature = "avx2"))] + let out = p.permute_fn(su64); + #[cfg(target_feature = "avx2")] + let out = permute_bn128_avx(su64); + + let 
permute_output = [ + F::from_canonical_u64(out[0]), + F::from_canonical_u64(out[1]), + F::from_canonical_u64(out[2]), + F::from_canonical_u64(out[3]), + F::from_canonical_u64(out[4]), + F::from_canonical_u64(out[5]), + F::from_canonical_u64(out[6]), + F::from_canonical_u64(out[7]), + F::from_canonical_u64(out[8]), + F::from_canonical_u64(out[9]), + F::from_canonical_u64(out[10]), + F::from_canonical_u64(out[11]), + ]; + + self.set_from_slice(&permute_output, 0) + } fn squeeze(&self) -> &[F] { &self.state[..Self::RATE] @@ -211,9 +260,10 @@ impl GenericConfig<2> for PoseidonBN128GoldilocksConfig { mod tests { use anyhow::Result; use plonky2_field::types::Field; - use super::PoseidonBN128Hash; - use crate::plonk::config::{GenericConfig, GenericHashOut, Hasher, PoseidonGoldilocksConfig}; + use crate::plonk::config::{ + GenericConfig, GenericHashOut, Hasher, PoseidonGoldilocksConfig, + }; #[test] fn test_poseidon_bn128_hash_no_pad() -> Result<()> { diff --git a/plonky2/src/hash/poseidon_bn128_ops.rs b/plonky2/src/hash/poseidon_bn128_ops.rs new file mode 100644 index 0000000000..48e770a507 --- /dev/null +++ b/plonky2/src/hash/poseidon_bn128_ops.rs @@ -0,0 +1,4630 @@ +use super::hash_types::RichField; +use super::poseidon::SPONGE_WIDTH; + +#[allow(dead_code)] +pub const RSQUARE: [u64; 4] = [ + 1997599621687373223u64, + 6052339484930628067u64, + 10108755138030829701u64, + 150537098327114917u64, +]; + +pub const C: [[u64; 4]; 100] = [ + [ + 0x878a9569334498e4, + 0x4641e4a29d08274f, + 0xf2713820fea6f0c4, + 0x898c94bd2c76331, + ], + [ + 0xd6dec67b3646bdbc, + 0x626a9e071b154f27, + 0x71a61cb1f9d90cbe, + 0x134dd09bc5dffaa7, + ], + [ + 0xc24d9503f8682c8c, + 0x9cf5f5abe19fedff, + 0x125f8816cdb2d9f1, + 0x5954a7a4436fd78, + ], + [ + 0xc306f8ed4ba6732d, + 0x5b187030689573d0, + 0xb0a9df5b5120771d, + 0x5513e9e64511461, + ], + [ + 0x84b301dccd446ff0, + 0x59d0332079fd0d4c, + 0xcb69fbff03ebf775, + 0x1582477fe7736802, + ], + [ + 0xd4cba791193dd512, + 0xc07dddce6dba21d5, + 
0x79391672a0b6ecd2, + 0x2b13399d4308ec41, + ], + [ + 0x3eb7b07418da854d, + 0x18df0397a244d7e3, + 0x983c1a1e41a858c5, + 0x14a4dc22dbbaf6f9, + ], + [ + 0x317311a626a4e71c, + 0x8bfc4d5753b69402, + 0x8147d97f129bca1c, + 0x1779b47e3a5bfab, + ], + [ + 0x969e97b2d9029781, + 0x6da6b49c2cc91cd2, + 0xf1779eeb56dc1b36, + 0x24e67809f1c36f1c, + ], + [ + 0x96be623f30e5dab1, + 0x45d644353b9ff9af, + 0x5173775702777781, + 0x177bbab6eef5c2cc, + ], + [ + 0x766c8f5d09003723, + 0xc35a793f1c4ef16d, + 0x1ccbcc21f8416aba, + 0xda62e07998b986d, + ], + [ + 0x50d495b1c8b1cce2, + 0x8973e470121c3a76, + 0x1b4c8afdbe808a92, + 0x26cc2ec9d51be4d3, + ], + [ + 0xfc8703f33d12bad0, + 0x6544a99005e01916, + 0x3e3149839d1a658e, + 0xd29900eab66c55b, + ], + [ + 0x508ee4483fab787d, + 0x9557c4fb953244fa, + 0x150c0c527c7b0292, + 0x1c62a2de197849db, + ], + [ + 0x9fc486f53774e8a, + 0xe31561abbcefb5cf, + 0x3d34f5bdbc156014, + 0x195b4ee8457027e1, + ], + [ + 0xbd2efe1961e1b3c7, + 0x2b534ac8bc6e4ecb, + 0x501233ac87899cb3, + 0x1653eb7e017f9132, + ], + [ + 0x77753115159d25f2, + 0x151334a5631b7216, + 0x16d209ff858f73eb, + 0x7c2d28738436409, + ], + [ + 0x4ca3db03d44c586e, + 0x79fa80602ef44da6, + 0x5c89f9827c970f1d, + 0x2b2a0e263583b992, + ], + [ + 0xe6ecdb864b836a8d, + 0x26eaf84ab4176c3c, + 0xa71eafecc6f4bc70, + 0xc1c26366f225554, + ], + [ + 0x4239d17551cfdc1a, + 0x8d5f0841c18e1c3b, + 0x2fc1d2d58d15e408, + 0x1bae097b713e008e, + ], + [ + 0x5a6dd19ca0fc3fa7, + 0xd51682cf90294707, + 0x18017b89769b9c26, + 0x9cd442dc7580835, + ], + [ + 0xdb3686d6c12abc6, + 0x7c6b8216046da6eb, + 0xab8cba80ce31bd0f, + 0x723c7da54840864, + ], + [ + 0xf9d771ae57719136, + 0xe871941125ccd77c, + 0x5877371eeb756c92, + 0x236db3bc6f7b868c, + ], + [ + 0x1991f3a9ab1a8a1e, + 0xf1bc34fa7390b469, + 0x7a500a6fd5b5e601, + 0x47c1bff1d838d9a, + ], + [ + 0x77d255c68da3371e, + 0x1c3d1d80af7c8849, + 0xf11e46404a393db7, + 0x244c9872ae7424f8, + ], + [ + 0x54fc7ffde14dda4e, + 0x27f1da61c05e3c9b, + 0x2f48569498b2fc16, + 0x1bbead80194032f0, + ], + [ + 
0x5ed3dcfee7bbec96, + 0x4afe5ef835e3cbe8, + 0xc2616076444869a1, + 0x61ac6cf5b2a8fd4, + ], + [ + 0x2792e1b29658268, + 0x45fe50fa6556f4b6, + 0xeb5a0b8c0f389162, + 0x2035c4acf7c80c91, + ], + [ + 0x3d62bde5f5805cbf, + 0x530062cfd7a99c80, + 0xeb76c38ed028bf7e, + 0x19f568f69be678c4, + ], + [ + 0x8aeb08756964c799, + 0x9a26e9a4a3f659ca, + 0x6dfff103f8546e1c, + 0x25952ae4d0199104, + ], + [ + 0xcb5ba73726983c80, + 0xb63c2258f907c40f, + 0x234a8b16fa31bbe, + 0x65fb4ddea2dcdc3, + ], + [ + 0xb7ca530b0c46cb0a, + 0x8b4b30ec102b9498, + 0x89232a1702f21d35, + 0x17b61f3f4482891a, + ], + [ + 0x21ef0c8ef817f0b3, + 0x37e59fcfc295b10a, + 0xba190e69dcd371f0, + 0x8fc5d938192794a, + ], + [ + 0x2490ca6341905e10, + 0x33a54aa9db48a5b6, + 0xe4f933be9632ef11, + 0x285221bf1f69c861, + ], + [ + 0xb07dbb0743de3c5b, + 0xe81ab68bfa32ebf0, + 0x2a79628405d61365, + 0x3042b01a44f9123a, + ], + [ + 0x941e8310bd2deb09, + 0x35e821e30b453bef, + 0xcee259921cbe111c, + 0xc06807889f9682, + ], + [ + 0x66ff167e37db96ea, + 0xcb78ccd38eeeffdc, + 0x6f1e2690f1c90f59, + 0x1bed12cb4798cbdf, + ], + [ + 0x70df149a76545cf3, + 0x9719c66a8f620da4, + 0x4cb01f0dc2c27a3, + 0x102abc29d3c94951, + ], + [ + 0xfb61ac9547363bb6, + 0xb085490027d4e8eb, + 0xebb13083e19d63dd, + 0x12b63d6b9afd9a12, + ], + [ + 0x39c5ef6deff4e0f7, + 0x38ae71161204910c, + 0xca0005aec936c88c, + 0x111cc546cbc138c5, + ], + [ + 0x6d516cfdc4b4cd1e, + 0xf45f63ef6f96f930, + 0x83bea60478b1b76d, + 0x2f98a006fb13bcb5, + ], + [ + 0x99dd58d1e9d818ff, + 0x7d060b0ed4121a0f, + 0x65252326690ab5e5, + 0x2e81a5ee340447bd, + ], + [ + 0x1a1fc00d3e8c6709, + 0xbf8ae07ffb30ba1d, + 0x70e3044d7c1dbdbc, + 0x757ea27942cedfc, + ], + [ + 0xaebdbb67c8ea3466, + 0x957ad9399fb6f98f, + 0xea7bde5d161c2f91, + 0x26cd837861787ed6, + ], + [ + 0x43536d705f42cbd1, + 0x5ffaa914d6c6a273, + 0x23100f56d0b51cd2, + 0xd341ec358dcd032, + ], + [ + 0x8783c40007d6d526, + 0xa4e28f55dbcc78b0, + 0x2a55553286590a39, + 0x2dc47e212043f68b, + ], + [ + 0x818ce67df3bb08, + 0x39077bf1e2576b, + 0xd23b0bcf26fdd456, + 
0x256a6d82275993a7, + ], + [ + 0x14dbf2fcd6c39de1, + 0x5ddb69184942ab82, + 0x1767d433553e8d44, + 0xf8c85df551b034f, + ], + [ + 0x86f32353e026f84c, + 0xddd0abee29343b82, + 0xf58b940b21fd9251, + 0xed6e71fa5025d70, + ], + [ + 0xd4b29e5118ab86de, + 0x568b41bf5f9b4090, + 0xdb458bc7b18059ce, + 0x19e04bd20505ddb7, + ], + [ + 0xaf52d4cbcea88c02, + 0x5286de716e60b0a3, + 0x4d508ff9fa524102, + 0x10b5f643ecbf138e, + ], + [ + 0xc7fa4464b8e0f44d, + 0xe0b8537d6313e95e, + 0xf97182422dc7bc05, + 0x21ea8852d764b486, + ], + [ + 0xeb27dfb49ab04f65, + 0xea9b98fcb4cc1f21, + 0xf746e8891fdacb02, + 0x270a5fd7486e7e0b, + ], + [ + 0x20a199b1adac97d, + 0x68fcb2fb4e651e8b, + 0x58382af7cc3ec1b0, + 0x70f3ad69698f1b9, + ], + [ + 0xa597882c006bd71a, + 0x551ef9f559c1d0a8, + 0xf5f5386fba24ceb, + 0x2f8375d466d028f4, + ], + [ + 0xc7b6d1ffde7a0955, + 0xeec8e3f476fa8462, + 0xf65b7c76a5fb2c79, + 0x1dcf609a551f3770, + ], + [ + 0xb0fa63e8782c4f49, + 0x48534388ab5d42d1, + 0xb337f2a513c9cc34, + 0xa6c8c859bf435d4, + ], + [ + 0x1228104210e268a4, + 0x655b3afd73247f79, + 0x2691c84e84b36768, + 0x1f23ce44898ad585, + ], + [ + 0xd3239dade1abbe9a, + 0xc52eddf7a1ece350, + 0x5c30277d426bc665, + 0x2f2920065a9d2af9, + ], + [ + 0x70a36f47f2aff6c, + 0xfdb85e88e4550844, + 0x9627ed9978424e6a, + 0x21b6e1114fc62df0, + ], + [ + 0x99935b823db6717e, + 0x9bc1842e83d4e5e3, + 0xe2189f767a11dc2, + 0xbe3b9ec8e2ca437, + ], + [ + 0x45b7d695f757c594, + 0xb2a6d390594226d0, + 0xf9ebe18eb37f5259, + 0xa3769a486f411e7, + ], + [ + 0x7026984e0085dc44, + 0x24da82e0203bb6cc, + 0x6766d00070e380c7, + 0xfbf3bfedf380cc1, + ], + [ + 0x8d830bef3c888f69, + 0xe7837340530306, + 0xb93becc667d41b84, + 0x1ab671517c659364, + ], + [ + 0x47413450eaa79fe9, + 0x43668c726d781e27, + 0xbfa1f603a306338f, + 0x181e1094e48f2844, + ], + [ + 0xb2fd433efbc00cb6, + 0x799b6311847008a, + 0x4f93d06eb357bcc8, + 0x186a5be67a6d8a5e, + ], + [ + 0xa6706ab88cc66075, + 0xa1df6bdd4a52a3db, + 0x16f12be46cefbfb, + 0x217f69e8c16f7db4, + ], + [ + 0x841021ca9bd86897, + 0x22c8b22051a6d56d, + 
0xf5cc67c845bfb18c, + 0x4df0afa22f4f3e5, + ], + [ + 0x5dacef67b5c8145c, + 0xc33b3b9ad06681d7, + 0x3a7de3e9e0d45caf, + 0x1bbc33136e620c2f, + ], + [ + 0x3e8327f808e15fb, + 0x9e49afcbc23c9572, + 0x7ae61434c7fecaf8, + 0x25c980ba66ee7aff, + ], + [ + 0x61524ccb01ca9b9e, + 0xfce05819a5a0f2f4, + 0x28d68a3060c4dc3f, + 0x2c1b4cffec884b89, + ], + [ + 0x88f7f5fed6230e90, + 0x1a311b161f04f8ee, + 0x412d5efe72759335, + 0x2b99045bdc52f7d0, + ], + [ + 0x74410cc4ebe2e7c1, + 0xc5f3eb89c9781264, + 0xfb7297857779b613, + 0x27dc40000a0d3c73, + ], + [ + 0x1c574f23928dbf5b, + 0x14d8a86f73ad59b9, + 0xffe26f3a19e8a96f, + 0xc5e96c0f330e5a2, + ], + [ + 0xaf2690a49f664be7, + 0xa88a16989e37acb5, + 0x48eeb8b8e66860b0, + 0x17c44e9d385ff670, + ], + [ + 0x155bddc3ff596b6f, + 0xa57aa664606d8a4b, + 0x5daaed84e6d0b6d1, + 0x2a4a43fd4227426, + ], + [ + 0x125b97e5debe8526, + 0x71fe8d28ed156dec, + 0xaafd8804faba65cb, + 0x198f3617e3b5e480, + ], + [ + 0x5aedecd0c58a1b4, + 0xf4ecdc5014f4ca99, + 0x2913aaa6ea41d52e, + 0x18b1799115d6d38d, + ], + [ + 0x5f1869e3535ce098, + 0xa0aa7550b031d82b, + 0xcd093399c25e5e14, + 0x29ef9dcf6e2bef6f, + ], + [ + 0x6fe70cce749aeef3, + 0x8afe7caafe4123e0, + 0x9fcd8b228f2344c9, + 0x5e600c65e33cdee, + ], + [ + 0xa851c8caf84c9dce, + 0x79213f3a745fa85, + 0x561e877c4b7223b6, + 0xd7085ac8021cb56, + ], + [ + 0xb85f4383ae556156, + 0x10d1afd8ac169c5, + 0xe40c294e792ade32, + 0x2522b4968520ccd9, + ], + [ + 0x794be50448306390, + 0xcd33805ed0157321, + 0x9ddcafd32c22b4e6, + 0x119f97064698b03a, + ], + [ + 0x30863344bed9e91f, + 0xa191078ba28f526f, + 0x220d96bb84ec07cd, + 0x24373331db348a2, + ], + [ + 0xbbe999ed9c275bd8, + 0x2ebe3bb2c1ff920d, + 0xce1795c2cc4de1bf, + 0xdc18ee1fb665b16, + ], + [ + 0x63267907e2dc6e73, + 0x7aef5ef2a5bad828, + 0xc360a5b6b54a39c2, + 0x2828b8c806976f, + ], + [ + 0x88509c8adde817a6, + 0xf08e4d7764fbeda, + 0x3caff9074054604c, + 0x2835ff01d970e85f, + ], + [ + 0x395e8f84d5f7fe8c, + 0x6552af119280cbc3, + 0x433cfe24cee250b6, + 0x24c1dbc8e4ca5479, + ], + [ + 0xf93ba49aeb18e5d0, 
+ 0xe4a0b527def68c97, + 0x15f69bc15e3c87bf, + 0xf6b491044f9bd36, + ], + [ + 0xcba1c5843fef9248, + 0xe43ca0602cf98cd3, + 0x37610f6f73cc7c5, + 0x13e9643942b162ab, + ], + [ + 0xd2e25df8630f6b7b, + 0xc55d2d12b8c7a34c, + 0xc63fe1090e68b8be, + 0x12c84fab76ec5b64, + ], + [ + 0xadada4921aa4585, + 0x6a85fbc947dfe9dd, + 0xda29c744c80b0aa2, + 0x1cec39e596c498a, + ], + [ + 0x1bca6e7ca22e3bd4, + 0xa66284b3e9f3dd54, + 0xab279edbaece0bf0, + 0x52670f132690604, + ], + [ + 0xd366cac98bffbaf, + 0x3ec680e324945f78, + 0xf8e5e615b395de9f, + 0xaff7f3fd26f8df2, + ], + [ + 0xb0f696b16cf1f098, + 0x9b6cbdda6e9e236f, + 0xa8c2de72d5fb302, + 0x2ff786d2518f5bf4, + ], + [ + 0x2b232229c663a101, + 0x1b93fd6e28f5164c, + 0xe0be4257697a48c1, + 0x2f89bd024d6a2f9, + ], + [ + 0xf73da0cfccbb8672, + 0xd389d107249c26e, + 0x30e4dac54b9c4e50, + 0x1d738b08ab28a9b2, + ], + [ + 0xaf93bbae969c6038, + 0x50cd80c7820760a7, + 0xcf7460fd8dde6e02, + 0x2d862490c21d01a2, + ], + [ + 0xb9f01904b48fa6dc, + 0xc3b7151b000e2bd5, + 0xa155d1cdb09c4a10, + 0xfa2d959db0307f5, + ], + [ + 0xd87ddb4fc32fef17, + 0x12a63bb4a327005e, + 0x175e6fbfebdbba71, + 0x4bd0d58ae73a5ce, + ], +]; + +pub const M: [[[u64; 4]; 5]; 5] = [ + [ + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x6907e36200995439, + 0xb9f80b5666c65169, + 0x7ba328f07ebc2640, + 0x152d921c334deb59, + ], + [ + 0x235bc3071b88c57f, + 0x1edd9e8b512a928b, + 0x4eba9db9a285a5db, + 0x208c85cecd6e86b2, + ], + [ + 0xd7e96fada4cc7131, + 0xe05eeb104bdd4f26, + 0xd629a31acc8b39c6, + 0x292e987009256cb4, + ], + [ + 0x9337ce2160d27631, + 0xb7603b2e38f0d93e, + 0xba04b96b55dfec38, + 0x25c45b9bb527b189, + ], + ], + [ + [ + 0x9d7560eab0fe4046, + 0x35aebb7e1cbabfde, + 0x46f4c2b5ffaab98, + 0x10c9d5b18c43b9ea, + ], + [ + 0x9de26ee0faaa6230, + 0x8b3cedd3678272c4, + 0xbf689106033676ec, + 0xa4f014b431ef663, + ], + [ + 0x8b7a04145ef1d11a, + 0xed5ccb60d2f55df9, + 0xc0463074d5d84b7c, + 0xfc883bdcf417770, + ], + [ + 0xc7a0f540e19091eb, + 
0xd6b9fc0427f1efb4, + 0xd709082fce71505b, + 0x2c2f39bf3fb689c1, + ], + [ + 0x570517f8d7bf3625, + 0x6f64bcced634daf, + 0x85747cad8e788981, + 0x240f49cb93d117d5, + ], + ], + [ + [ + 0xb866652e4f26da85, + 0xb9e2d4c767608cb5, + 0x7266982acf0812ff, + 0x1075bbdae372b70d, + ], + [ + 0xabe2754c2279be8, + 0xf34d6acdb0ef8be1, + 0x638c985fb12509f5, + 0xce4a0756717cd0d, + ], + [ + 0x16ef19d92023860d, + 0x97313a990cdaa693, + 0xfa536002a38deb76, + 0x157c584bf12b5fc2, + ], + [ + 0x32ec79c4fa39b5e0, + 0x7e1d8f6dc66882f, + 0xdafcf6f32b1b7f1f, + 0xb80626e4af5efe5, + ], + [ + 0x74572ba3822678b6, + 0x1178400143204c5f, + 0x46e8e28cd12c3a6f, + 0x10b1d99213e5666e, + ], + ], + [ + [ + 0x6190b23770183886, + 0x101d044302cb2858, + 0xecd03dccfbeaf617, + 0xb084598422035a5, + ], + [ + 0x4ff66343628de773, + 0x8669e3967283e9d5, + 0xdbdb4492fd9478a1, + 0x2a172f4971297058, + ], + [ + 0x26b36d6f81141445, + 0x46db4e5f5c0c0592, + 0x1c8ff6641950ef7f, + 0x3831bb3c0404ec0, + ], + [ + 0x48268958c0294633, + 0xe32eaddae7cd0cfb, + 0x83f515af535c5f73, + 0xeb68faa42851083, + ], + [ + 0x1c641486ade67a7a, + 0x4b50719a5e10222c, + 0x9f5dd44f4cc1d827, + 0x1b5b9eef181679f, + ], + ], + [ + [ + 0x71d451ca47c3e06f, + 0x1a4dc1da0d245f85, + 0x4812497a20f7afce, + 0x2d1c2ecb1969e4b, + ], + [ + 0xa96b93484bd7274b, + 0xb6ffb6120bbc6f39, + 0x4f8cc3b20738a669, + 0x26d0dab233956299, + ], + [ + 0xe693b6e9a4a622a4, + 0xd3c7b489ce3e9706, + 0x97a65d65e20440eb, + 0x1c50a5a391d3e7f, + ], + [ + 0xde28a4428ec83e3a, + 0xc302d6eb2a211388, + 0x78e5ca7195aeb86e, + 0x1f159c9528951410, + ], + [ + 0xfeb302a5110d9eb0, + 0xc251af52f6c4abc6, + 0xff454cd9ef575da7, + 0x1ab6f8eace913fdb, + ], + ], +]; + +pub const P: [[[u64; 4]; 5]; 5] = [ + [ + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xce4acf526c3274f2, + 0x972b38360d5fd556, + 0xa1a97c8aad5da8a0, + 0x2be88e1419bbf1d1, + ], + [ + 0xc7d3895b727eedec, + 0x9c9ed3e16476b8b6, + 0xa9e5c73aed214353, + 0xcedc2b424714991, + ], + [ + 
0xb0e375ee354211bc, + 0xe1992ccf53eb2ba, + 0xf55443898f891f17, + 0x24b90e300d60fb3, + ], + [ + 0xd35616d7cd4bd73c, + 0x1bd40038d7c8631c, + 0xe90a36846119186f, + 0x2e21bbb03e588b76, + ], + ], + [ + [ + 0x9d7560eab0fe4046, + 0x35aebb7e1cbabfde, + 0x46f4c2b5ffaab98, + 0x10c9d5b18c43b9ea, + ], + [ + 0xca9781aa6ae87fde, + 0xd1c4a4d087c2a2b9, + 0xd6437422dca6093e, + 0x28162016274897b, + ], + [ + 0x52db09e853a3e166, + 0x7487b59c278a20d2, + 0x1dfdb672fd18aa1f, + 0x28ff78245cf7f81d, + ], + [ + 0xadd151974076d23d, + 0x5f8492e905ae02a8, + 0xb3b4196e642f4f18, + 0xe89ac1de8eef3b6, + ], + [ + 0x2df964cfa7dc18b7, + 0x1d3af8b6bebbaf98, + 0x51d6285ac8606f65, + 0x166c6108fca1229e, + ], + ], + [ + [ + 0xb866652e4f26da85, + 0xb9e2d4c767608cb5, + 0x7266982acf0812ff, + 0x1075bbdae372b70d, + ], + [ + 0x467fea194d48444d, + 0x6aba7d733bbe47d0, + 0xff534616dc9ed70c, + 0x2a4d50030a08e8ed, + ], + [ + 0x21309082d0680bc8, + 0xa845a6803b1661e5, + 0x644dfdf7fdb83a6c, + 0x1a32fe76699539a1, + ], + [ + 0xef6bc2668234d6c, + 0x7da53b65314fb93a, + 0xeb96475131d7f47c, + 0x2ad247267cee7e3b, + ], + [ + 0x5861343a190743da, + 0x49af2332fe815c68, + 0x967af596d4244272, + 0xf550256a143b287, + ], + ], + [ + [ + 0x6190b23770183886, + 0x101d044302cb2858, + 0xecd03dccfbeaf617, + 0xb084598422035a5, + ], + [ + 0x5a2bf485692a7cb9, + 0x4f5108a0020a4931, + 0x28d29db230c101a5, + 0x4bcb517bb201cd8, + ], + [ + 0x1f870b694aecfb71, + 0x25625b7d2015039, + 0x6d03980a21d13e7, + 0x304f0cdee0897900, + ], + [ + 0xef5edf21ab8bbee8, + 0x3bd7cc6f038e5d15, + 0x571360cfbb9e7c82, + 0x157eeca7b34ef66, + ], + [ + 0x78581ee2de8b718b, + 0x8ce7c831855c6337, + 0x7a97345b7b11aa0d, + 0x24595c5b3a27c0cc, + ], + ], + [ + [ + 0x71d451ca47c3e06f, + 0x1a4dc1da0d245f85, + 0x4812497a20f7afce, + 0x2d1c2ecb1969e4b, + ], + [ + 0x6dd65b61a058785b, + 0x397956e306004c9, + 0xb1f877d6e048113c, + 0x2417e33f665b5776, + ], + [ + 0x2e90398e8f6ddcd9, + 0xd29e3b9dd925fc46, + 0xdd99d1f604fda03, + 0x2d450ff72cfdf452, + ], + [ + 0x6d45c0f2ccb58ae2, + 
0x4d281cf644c142da, + 0x9b2d8cd3db913490, + 0x2b07b87649647799, + ], + [ + 0x59bf9b9a45ad5268, + 0xbaf9bb8902915e9b, + 0x2759360ee75d5c78, + 0x1281668c47613685, + ], + ], +]; + +pub const S: [[u64; 4]; 540] = [ + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xbf11a04b622deab2, + 0x2551cd2c3634c305, + 0x7d6c9625d1d5d6ce, + 0xc01a573a0d5c0da, + ], + [ + 0xb4deae83dcae7248, + 0xd9c0a6d372d959d4, + 0xef0de8470b523633, + 0x1518497d31dc7d3b, + ], + [ + 0x272125eb8df6b56b, + 0xe111e3227487e58, + 0xe8dab3635fa1a86e, + 0x299ad906d957ba19, + ], + [ + 0x544530838e246a86, + 0xe293be8211932040, + 0x73e41b39e02fea72, + 0x219580ceb0215cb9, + ], + [ + 0xa5ed06b6e49aa39f, + 0x215bb2b82158e97f, + 0x264db96304e8567f, + 0x25920d1ccf9c7285, + ], + [ + 0x9ac018a2cfd37a50, + 0xa918469993d26127, + 0xbdf452bf2056ff92, + 0x1c05f14ec94e3fe7, + ], + [ + 0x57ec39da626d588a, + 0x27b7eb7e2fc9b337, + 0x48a69067e5ae1558, + 0xc274a824bbef1e7, + ], + [ + 0xd189672638464d82, + 0x833ee377da1a9bb4, + 0xd5486ceee3e2946a, + 0x24e31f9dc4e2a59a, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x586c65c93756fbc4, + 0x65ee70d136b2be19, + 0xb8722ed61863bcc9, + 0x22fa7f5d5cd64e6a, + ], + [ + 0x73f1b37cd5be8473, + 0x9253604876145cd4, + 0xe1b077489e85fafa, + 0x680266ab1e1325f, + ], + [ + 0x2b02af3cd7441648, + 0xa40e068244a6631e, + 0x7774a8db3ab2efb0, + 0x1d64fc9a934a2ebd, + ], + [ + 0xadb56623821d948, + 0xb1f47ff0cca09ab8, + 0xcbe2a39f207fb7b5, + 0x301e38a62406b6e0, + ], + [ + 0xe351a61e71607681, + 0xb5c237b53116c93a, + 0x3340e786aa3003dc, + 0x2322c896ae71d2ce, + ], + [ + 0x8cfca919901dc51f, + 0xb4e50e871d53df16, + 0x7acbd8f012103d00, + 0x26aa00cb25dc862c, + ], + [ + 0x1ae9225d0b16f71c, + 0x595ff45b3c426524, + 0x5505a62dfaf6f493, + 0x11b8f7283cfc2a5f, + ], + [ + 0x71d5e2f0d972952, + 0xb84cc2f6ad1233c2, + 0xacd6893225accbd1, + 0x1bfbe13e534494b2, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, 
+ 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x257778bcebcdba7c, + 0x60deedd7c09d6466, + 0xebb9c6be708fa912, + 0xac6b975ac48b584, + ], + [ + 0x92f54018e47c00f6, + 0x7450956cb58793e0, + 0xc2e5a40d4c8563e7, + 0x2988ff8d1b886279, + ], + [ + 0xd4811cbb14f200ab, + 0xa8d8b8fb30573c5b, + 0x543014b7436934af, + 0x115403e0995f3402, + ], + [ + 0x153becd107032529, + 0x5e68ee8314012b, + 0xcb203eb61c5914ea, + 0x109bbccb9fb14f06, + ], + [ + 0xaf0e0e19935abbcb, + 0xb005c672f3dc0133, + 0xf1913d505d198754, + 0x1648cff6869e93e7, + ], + [ + 0xd1fb2933bae6f537, + 0x972013c298fe99aa, + 0xd3d38f07d1d6da4, + 0x148ebc961a711fad, + ], + [ + 0x8d3c4c9bd1d39edd, + 0x65996783da1d5d60, + 0xab2ccb33784afa0d, + 0x192ace199fbb9bdc, + ], + [ + 0x1011d66765ae6d67, + 0xc3b6691782d17904, + 0xe8380f69696ac20d, + 0x2279691aebca447d, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xf3f148374dfca65a, + 0x3f3ad15e6dec8383, + 0xa5bcf217c9561b58, + 0x108675d995d4b8aa, + ], + [ + 0x12252baae20ddffb, + 0x7a92e3fc52f50a54, + 0xb03eb8376efccb07, + 0x2c06537ece26b1b0, + ], + [ + 0x99ec59e887883236, + 0xf2cb762cb28a6af5, + 0xe8b0f66e26e7f8a7, + 0x982d6f9c60850ec, + ], + [ + 0xb8a15cf6dcbf7b2, + 0xd91bedc08c13ac02, + 0xc8b8ab89820428c9, + 0x1eb0084f37fca447, + ], + [ + 0xaefb48c038861699, + 0x2ad7074b2ae20c77, + 0x354178ff342cce27, + 0x17b6e61f442d307f, + ], + [ + 0x6f3bf6547e9eaf04, + 0x111f4c15195b780d, + 0xf7f70e840f538e0b, + 0x1d9bd1e4a338f998, + ], + [ + 0x9d1ce7cb7fe01808, + 0x953e34f80bb25773, + 0xf84743c8c14a842b, + 0xfddc59a3d118837, + ], + [ + 0x7966bf15e5df7a25, + 0x4d586e7d5b87eab0, + 0xc318ae69198aaf9b, + 0x97b4e6b14f60ee6, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x6147c4817ee1e62c, + 0xa83c4ef836b34020, + 0x12695fe71bc813fd, + 0x60858c56feebac4, + ], + [ + 0x9a3ab8323c9552fd, + 0xc4ff414e5a4854e3, + 0x9ea8a21355e83c35, + 0xdda6ed6361d1587, + ], + [ + 
0xabc55ce594f60f33, + 0xa87e8db2b369e817, + 0x387f17baedb2f6dd, + 0x2b1b901928231075, + ], + [ + 0x773cf10b164f2933, + 0x378e1d67ecb7887e, + 0x519fda0300f27cae, + 0x1700adbf53cffd61, + ], + [ + 0x46b1ed46294e4bd7, + 0xeb6b709664306f87, + 0xfa79e75e40ba2e3b, + 0x103f865f84b8ab7a, + ], + [ + 0xfe6ef280c28030c5, + 0x1f7aa0efe4b98b35, + 0xb68627ee03e7834a, + 0x260ed8d34914f449, + ], + [ + 0x2c93872aa8553006, + 0xeb3b351f5335f5ef, + 0xb924ff2d9fa98b79, + 0x217ec6cc068ae4ee, + ], + [ + 0x6663eb1c16af082b, + 0xffb4d3e568d3c8bd, + 0x676f3832cf67e1bd, + 0x3f33ac0134c1318, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xe9125cf016a3f6b4, + 0xe39835227fc7c09b, + 0xdcbc17260db9053b, + 0x198e8807d1415ee1, + ], + [ + 0xa45dfb0d6bba9285, + 0xb0ee7605e15cbaf6, + 0x8e0c52f69086d7b5, + 0x530d12103a6ddb3, + ], + [ + 0xa1efc1b75222b549, + 0x1499bed8d3e6e1c4, + 0x76bb206a1bf9d2e, + 0x63e564e9686ec83, + ], + [ + 0xec01f1f97b6d750b, + 0xe3be3a33a2ffe4a9, + 0x79311afa3eef777a, + 0x2b3676ebe7d5bd3e, + ], + [ + 0x2302186e25fce1a7, + 0x54c382a1e8502479, + 0x1b7d84c7e7dcc27f, + 0x6bf1afe285a0238, + ], + [ + 0xaace91512857df31, + 0xe45c8a8213236049, + 0x3458791c287e7353, + 0x2e1310c703b10c5a, + ], + [ + 0xd1612e6fd8188708, + 0xfdc5b75f6abbd1a8, + 0x9c0fd87d516ea875, + 0x1ec4f396c48b2c00, + ], + [ + 0x50235cc68fd154f1, + 0xa46c1564730dad8, + 0x66ff22920b126d00, + 0x6a7d44d81ac449b, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x1d03106697350658, + 0x6a7b8effe6303e57, + 0x8c827728c74b18d5, + 0x191445cff29c6df3, + ], + [ + 0x3999af744d5c5334, + 0xdfb8f1dafcde99a1, + 0x52fdcb3ccb360f53, + 0x1ed732608683dacf, + ], + [ + 0xe73824b10c081633, + 0x9f85f7cd8a66cd1f, + 0x640bf731d6ffeec3, + 0x215db24ca86956d9, + ], + [ + 0xa62ab0a2639d52a1, + 0x5fc4a59698ffc426, + 0xbe570c35c926d02b, + 0x2a5d49f2ba3181, + ], + [ + 0x825465638486f6c8, + 0x733a1a5647a900a9, + 0xaf696d543455c86e, + 
0x17151cc7eeb8ba97, + ], + [ + 0xaaf9c3d3412fcc23, + 0x29b08c97eecbdbfe, + 0x9222543fdd170f05, + 0x16dd416d91759f74, + ], + [ + 0x84b49cc2d348bfd5, + 0xa02b37592fcc8025, + 0x19b04bd513a6f705, + 0x2c4decc143e11111, + ], + [ + 0x410976261fbec77, + 0x385f41fea626291c, + 0x7834346ca364e14a, + 0x268c617ef12f6977, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x13cc46e420309046, + 0x68087a560719ed6e, + 0xb17f8863faaad244, + 0x2cf99f791627bff6, + ], + [ + 0x194566d109f40caa, + 0x3bbd0da50252e95f, + 0x7bd343406384cd10, + 0xf4fa94a4fc9c358, + ], + [ + 0x925528723f34dfc, + 0x4a116ad45941d02e, + 0x7781f8eb3443854b, + 0x1054fab981f474e8, + ], + [ + 0xf2eb6b62d5665151, + 0xc13828c0970f0261, + 0x7f663c08845cc5c3, + 0x13d0797fff2db7b3, + ], + [ + 0xdbe579130dee1510, + 0x9a7d2342d8bc867, + 0xe019a0bdbb49cdb4, + 0x6a66ed76a78ace7, + ], + [ + 0x1319d660e22f461f, + 0xe08a8dc6fbd869ee, + 0x9a0a598d8452d435, + 0x914e6125339ed02, + ], + [ + 0x87036fa9ad4dabb7, + 0x5a3e0f35601445a3, + 0x6884b664ccad6371, + 0x2e9fa8e59c054870, + ], + [ + 0x1c91785629dad242, + 0x12657b8f6dbc371, + 0x9afc3ae592fd64db, + 0x18426f6dafd57e64, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xa88b192f77be83cf, + 0x41abf098d347adf4, + 0x2017dd25516e087b, + 0xfadeeb9947f671b, + ], + [ + 0xdf0a09646851c7a3, + 0x3c0398fa77eecbe0, + 0x667b67a54b355ff6, + 0xc858ebd80a2611d, + ], + [ + 0x8213ccfc6aa9f175, + 0x45c56895be7c209d, + 0x787f4e7e480d74a8, + 0x74251e210cdbf1a, + ], + [ + 0xf3a1ec058f629912, + 0x2e5cbda164847b51, + 0xb84abdccbb22571a, + 0x251ed8f5001e05cb, + ], + [ + 0x9b4b1e32bc2ed421, + 0xa3cb18a755a4ac3b, + 0xd37bfc2f2184efc9, + 0x1d145af4c6842b71, + ], + [ + 0xf037ca6f310ac0d0, + 0x845bd6e4dec0d09d, + 0x3b21af56753e1ff7, + 0x1f51fb65fcf0892f, + ], + [ + 0xc4078e5ba867b481, + 0x2cc10f2f620797a1, + 0x72c34382e64cb498, + 0x28c386b4e693fd8e, + ], + [ + 0xfdf5875983183561, + 
0x35c1cf0d71caa997, + 0x8f910c532b9e69f5, + 0x965219d546e66c3, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x7e7816b4cb734aad, + 0x53d9e55cf599424e, + 0x6942a55e0f76b32, + 0xa8581db097c1fa5, + ], + [ + 0xf926b7c35e153a40, + 0xf6cb47c6917f8f22, + 0x6fef17bbd925b415, + 0x1d6462ed00a4d6f, + ], + [ + 0x91749635b3f33754, + 0x5b38fa8a6a84ba6d, + 0x3589eca62c93c329, + 0x5c37c18afc7441b, + ], + [ + 0x68b338eb84fe468e, + 0x12cec9c5f1740e50, + 0xa1c586897b6889f8, + 0x29dfe101d3a300ea, + ], + [ + 0x916513e134d51220, + 0x1e6b2475e0a1ec00, + 0x640d711ab63433bd, + 0x855058bdd9d6e1c, + ], + [ + 0xc098b9f1790afca8, + 0x4f590be7bcdf78da, + 0xe5fc1daabb1fcd5, + 0x2a0e9b63cfb69fd7, + ], + [ + 0xc94527b6e8b014cb, + 0xbff4fc3906f6ac47, + 0x5cbabdbf0f10dabc, + 0x153816b1f081152c, + ], + [ + 0x7155f16ed60a08dc, + 0x3dff7c4631246a29, + 0x7ae9a052af08de4, + 0x181f1dbe559d5263, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xe5b09fabb14f0335, + 0x2c2c3c25f6a21119, + 0xffb80cb3fd9d066e, + 0x132ba9c6126b0b02, + ], + [ + 0x6d58e925fed416c, + 0x8c19b3fce5c74b8f, + 0xae3956674e907870, + 0x17a3c256e4db41d6, + ], + [ + 0x1fe5e8145d1bfcef, + 0xc978e8d6e669bc7c, + 0xe1bce7f0777118a1, + 0x2a51fb315927bc07, + ], + [ + 0x77b53f1475ddbc4a, + 0x2efd31b2dbac83ef, + 0x1acd1e00ce58297e, + 0x3f3f60b104b69bd, + ], + [ + 0x3adfc22da869c8aa, + 0x9cc1adb11ef9766b, + 0x773007d11d6c52c4, + 0x269d070e3a78fc1c, + ], + [ + 0xbcbd986ded026d5f, + 0x86b1eade91905046, + 0x283889aa7927b71c, + 0x1a0f518d9141e2af, + ], + [ + 0x374ea888b8a2cc50, + 0xf99003afbf13c0c8, + 0x8ecae501b671014d, + 0x301732eaa67d0310, + ], + [ + 0xcdea9c245e0e663f, + 0xb55f2231a9df57c5, + 0x45edd9d9bd4eea2a, + 0x1fc928223cc43981, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x78861d0fb2d65ba5, + 0xfa9b6f3a34eb7c2, + 0x436a982a9d2428a6, + 0x1eea555bbbec1bc5, + ], + 
[ + 0xcacc4ac1953ea7d6, + 0x22132694fe16da03, + 0xcbd5a6884945fb38, + 0x2397a3209a449f02, + ], + [ + 0x41da06f90a327518, + 0x7a1a1d66343222d, + 0xe63bcc80bb34d5d2, + 0x16be76eb34f43a97, + ], + [ + 0x8f130f83401cd598, + 0xb165ade60ccaed2, + 0x6235be7fb36734de, + 0x168f981d069a2553, + ], + [ + 0xc9d62e518205bb0a, + 0x52cf6c816793c05b, + 0x78ac0ba0ac3d7bb6, + 0x2563183f49723d41, + ], + [ + 0xb529a3c5ab915ae0, + 0xee07da25f5ac745b, + 0xf7eb6f187c69df91, + 0x20d3ddc908f1cbb9, + ], + [ + 0x5744e4e22afaa12b, + 0xe449c3e1912e8566, + 0xb97dffe2e87792a7, + 0x25885b6af656afcc, + ], + [ + 0x41a840d8381fd4e3, + 0xab22387b3f1c1fb9, + 0x94511f88d5bee00d, + 0xf7a550e1febb2aa, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x3f683f7eb7c76669, + 0xa94d54bde704c040, + 0xffbc6368c0b18847, + 0x214cb5bcca169388, + ], + [ + 0xb45f405fd464ea9a, + 0xd3cb1150369665b6, + 0xfcaea4091a63fa57, + 0x226d94c99feaa346, + ], + [ + 0x848a548f21e6125d, + 0x3d17f4697a61dcda, + 0x6afb37f73a46d17c, + 0x49fdd79a2ba15ca, + ], + [ + 0x1bac0fe2ecf34e4d, + 0xb2d5ea6c49552e16, + 0x16c8cb4c4207fba3, + 0x9bd70d7f184ecec, + ], + [ + 0xa2d67a07292a4551, + 0xa4d8fb895e1e4b83, + 0xa7b5f1d172734ab0, + 0x13651ab8ab431e2c, + ], + [ + 0x33780a67702b0441, + 0x8e684d206df37d23, + 0x2ff72f5ff521d319, + 0x190c0d41cc7b2db8, + ], + [ + 0xb142ce59a2db575a, + 0x1e7bf843ed05c4ff, + 0xc9b5f01b5928f3bd, + 0x12880105ec1994e4, + ], + [ + 0x965e7e9fad17522e, + 0xfa709388e82c8b46, + 0xbfb5566c4319c149, + 0x62d0ef299da3739, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xbbaf731fee96bb9, + 0xbbc66a471df54ade, + 0xd57d53e4c13a2e5, + 0x23e514402dff7a67, + ], + [ + 0x5f77c5385259d99d, + 0x89f959a4eb0b28b4, + 0x40590480880b5052, + 0x8f4f47c470e593d, + ], + [ + 0x499670101c1d60d3, + 0xb4439ee8bd90bbc2, + 0x8b0203247153a225, + 0x1e52cff5367afeb4, + ], + [ + 0x7dfef0fced959bdb, + 0xc78ebe5e8fba41ef, + 0x4be2bbfd1060d94d, 
+ 0x25a9561001f0fa23, + ], + [ + 0xd76b8fd2973cd3a, + 0x7cae65e332f60c41, + 0xfa99a89d9e2d785e, + 0x2204dea705c836, + ], + [ + 0x46338c1735c049b0, + 0x5bbf568bd0debb0a, + 0x7aee175f45e6b207, + 0xddcd3bda702ebc4, + ], + [ + 0xe77ea01cc086ab16, + 0x69c4298ba6b7423b, + 0xea3e659b4229f081, + 0x5adcd0faf17af7e, + ], + [ + 0xd7e9184c2f144778, + 0x502daef1ba26bd05, + 0xcd105e8373c7663c, + 0x2534db20ada3d7c7, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x8cfa6c3c383333ef, + 0xd012440b38a7bab, + 0x72895946af13d0ce, + 0x15d7375fdfbaf8e0, + ], + [ + 0xea0c4ed918e573a6, + 0xc6b0c03494a87600, + 0x65a8d2a3a2744d8b, + 0x2b837c75f202d1f9, + ], + [ + 0x857ab4539145d55a, + 0x70e662b98fc61179, + 0x899aa7a5c21c628e, + 0x27584ef80bf3d908, + ], + [ + 0xc2731a3acaada59f, + 0x2f1d62452316c7b8, + 0x4885ba0d7c5dc695, + 0x108d8b9596410f17, + ], + [ + 0xf71e7919ea13fde9, + 0xd85689d59d0e1bdb, + 0xe185059043bbd8c4, + 0x218c3858097257e, + ], + [ + 0x594aa31bf0096833, + 0xf7df01486a673d0c, + 0x6de3e20d9bd0a74c, + 0x27e32f99704268e3, + ], + [ + 0x601ca56534a98d4e, + 0x69b3f4939797cd75, + 0xa13d3dfb9594634a, + 0x2cf55a67e36306c5, + ], + [ + 0xd1e2fa8c12122664, + 0x3affb4d5407d39e7, + 0x2f916d5ea6601566, + 0x6c4afa0f721536, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x432ac458d187b75e, + 0xff51bcf3ae9fd8c4, + 0xef9bbe558824a8ad, + 0x159006059ab9d28b, + ], + [ + 0x765c28053dccc104, + 0x3afc9369159808a7, + 0x15b9b2368f06a7aa, + 0x119e710f6d47e65c, + ], + [ + 0x64893c3f578773bf, + 0xb8420d7c2144a124, + 0xbf0d41a2cc8a03e4, + 0x1a4f88f4dec10a50, + ], + [ + 0x4ee622c45efa5701, + 0xe1a47b9c67fe60b5, + 0xb10ef406ec781ee8, + 0x918282018b4e85b, + ], + [ + 0x5360ab88e3a44969, + 0x6c486b8969f48f53, + 0xf618b139d9da2bbc, + 0x191cadd39b880ab4, + ], + [ + 0x1b522860950d45d8, + 0xbf019da4de97d689, + 0x93d30bfd3e0f340e, + 0x2968316a507518b6, + ], + [ + 0x32c076474e598039, + 
0xef07f7e384cc57a7, + 0x95a596eedf9fb893, + 0x192f6132d3671c81, + ], + [ + 0x9282cf5a619c071f, + 0xb497bc89acea043f, + 0x70b5d546244e78a5, + 0x249401ca772a7926, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xc6d904918f2478a6, + 0xb0db5675d74217d8, + 0xe5c774f7eaa612a2, + 0x7814465578ad907, + ], + [ + 0x2f156a93c26fc997, + 0xabf36f313ff309e0, + 0x6acf466e38017008, + 0x1a99e06aa8d18313, + ], + [ + 0x7ff07cff6d3f586c, + 0x5ee0ce55161a0ad8, + 0x6d11b926046896d9, + 0xe8bd1b9aca1e30f, + ], + [ + 0x35c5efd30637c8c3, + 0x3cbe1e3c6f504e93, + 0xecbd77fd1dfac792, + 0x1649b284b0f7e724, + ], + [ + 0xe2451c5c72c8a6c9, + 0xf72ce6a4f93ef45f, + 0x607795ce57182831, + 0xb7ed552cc8084a7, + ], + [ + 0xfb2632ddf9403311, + 0x98eeeb8b9472a51a, + 0x44207fd291ac5c4e, + 0x191138b433d0511a, + ], + [ + 0x7fd728134256d5d8, + 0x655c23b7729f1423, + 0x2d8266dfe7517e95, + 0x277dded352b5f91c, + ], + [ + 0xb6faf64c2ae139d9, + 0x39952af7cc8b0313, + 0xb639d9c9235f394b, + 0x9faa1f3f4568855, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x47ed4bec448d5c0d, + 0x137b4b7d6c739cd9, + 0x749442540c7c93c, + 0x1119d766b3d158f9, + ], + [ + 0x545131c703f7f81d, + 0x7c421b2eeba16014, + 0xefb3e8dac4271a85, + 0xae153a0154d60bf, + ], + [ + 0xd0485132c20691c7, + 0xa4bb97fc6f89d885, + 0x48e9bfc19eaf1ade, + 0x24c3092271992db6, + ], + [ + 0x5cac12e544121f4c, + 0x776620c5f75785f9, + 0x94f97ca99fd0590b, + 0x5d054d668e47f0c, + ], + [ + 0x977594b89dae0cd0, + 0x9629d58fc00fd8c, + 0x30c22936be7c80d3, + 0x234699f23ad18e08, + ], + [ + 0x7597709068a19af7, + 0xe24e8ddda6c5c508, + 0xc3ed3859d4dfb5e4, + 0x33e56d7d9ddabd4, + ], + [ + 0xc7dc2f4739ba9e5d, + 0xc93932ac1745b32f, + 0xba0d3a1231fea8d5, + 0x28da5ade1a72ced2, + ], + [ + 0x9dde392555e219d5, + 0xaf68246535bf554, + 0xcf867a555cda7eb9, + 0x26f99190e1d74042, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], 
+ [ + 0x6206ad4f14690f0f, + 0x8d7ba2ea6afc8cff, + 0x17bc3f4d81d861db, + 0x664a318383b0fe, + ], + [ + 0x7f82816ddda55079, + 0x4146141ebcbe9571, + 0xc95f15f87975d82f, + 0x22cca78c436d0569, + ], + [ + 0x5e8be9bbf183ded6, + 0x45bdc7311842b746, + 0x3b7f72de3e54a397, + 0x25bc0a8fde96077b, + ], + [ + 0x83a0e0222fe40fd7, + 0xc376ebca9a4e1560, + 0x2beab0e78f03b14d, + 0x2d3338fd463cba51, + ], + [ + 0xd7beaff2afe2dc8c, + 0x7bbe264ee2b9d60c, + 0xea7a4353f52cfbd2, + 0x1f67f314857365c9, + ], + [ + 0xe8ee86597c1b2c77, + 0x8bc05a06f28bf6e2, + 0x8d68e277cf80c87d, + 0x2f62acc8e9009c48, + ], + [ + 0x8fe7d76f80470d2, + 0xa61e3dc8bd6211ba, + 0x1ae0de6a3c3069a6, + 0x1bdaf3fb4710dc08, + ], + [ + 0xc5876d2283c0b793, + 0xc46d2efeb304938f, + 0xad1d1b9ef539b87a, + 0x221a2b263397630a, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x756a7472b5f5b8fe, + 0xb45786494b54474c, + 0x22d32d90dd40e79e, + 0x173f49548bf7cb94, + ], + [ + 0xa432a9d75f4aed13, + 0x2564adb3f5068ade, + 0x381d959b6f6bf268, + 0x100a7d9c96b60393, + ], + [ + 0xd451ae2c2ae8d1cc, + 0xa39c47486d9281fa, + 0xc68a2dfb2b9d334a, + 0x6570c3ab52f49a2, + ], + [ + 0x2e133dc9be5bfde2, + 0x8d31df384a31f8ae, + 0x96a6922f5b2d1843, + 0x676ca0ee3440a63, + ], + [ + 0xa3df1bec096f6d3f, + 0xe99cd747f9f894c1, + 0x6fe7c4687a96df9a, + 0x2f59b539bb0659df, + ], + [ + 0x968cfb7bad640b18, + 0xd3b187e389f95031, + 0x14776fadda574833, + 0x271825757c313ff4, + ], + [ + 0x3b936fd711c02d08, + 0x8d0094c7547520ce, + 0xb61e2869d8979202, + 0x206765c2219a8e66, + ], + [ + 0x45cbd07f95ea81cc, + 0x59eb4750ecc35811, + 0xe63b08c05f9aa5e, + 0x778a92f260d0c5f, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x87999025970a0c3f, + 0x7f3d4746f0859dae, + 0x72454236230c3483, + 0xb1ebb20eb490787, + ], + [ + 0x5421411eac090249, + 0x2c7029d708416d0d, + 0x3af13a4a5a53bd0c, + 0xdc65b56f4f4e7f6, + ], + [ + 0x2910f3c3da771162, + 0xb869b1ddb884ae0e, + 
0xebda7e048e8829d1, + 0x68eb4cf6dd488c9, + ], + [ + 0x8916433a5fb55c4c, + 0x6cac059208b72d2e, + 0x9ccfc60be18db1b8, + 0xb09821f4a2105c4, + ], + [ + 0x8e8184c025bc550a, + 0x7936853eaeb949b2, + 0x72cce12f2e7e9a00, + 0xe02566080867901, + ], + [ + 0x8c74722377453d2, + 0x145217e3962f6fc3, + 0x9a3e8735acb1b265, + 0x148856b588dc1599, + ], + [ + 0x656bcfcb5b550840, + 0x5b77e4ffc4aedb2c, + 0xe808f8df600ed67b, + 0x1cb70321f7afb8e7, + ], + [ + 0xd4641c5c46d6af61, + 0xc8a436a41c854022, + 0xfc45d92a219dee88, + 0xf66c27776151f5a, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x8a6b6a0b39dfd1dc, + 0x1fe49bbadbb242be, + 0xb125c45fdd01cdf8, + 0x1b5a1fbbec2f076c, + ], + [ + 0x226e45baecc4b6b6, + 0xcc42e41c4e3e821b, + 0x2c65bd930d12c769, + 0x5837967004ac000, + ], + [ + 0x5cca116d480987d1, + 0xd7a3fd1043e8a736, + 0x6bb6fffe124bac05, + 0x2f10f8b0f5a55430, + ], + [ + 0xa1b86c9143e23765, + 0x3827eef4dbef003f, + 0x80d20afd24a4606b, + 0x1225ce8737e2db07, + ], + [ + 0x4f6c0342b2392010, + 0x23e3c193c29fdb8, + 0x8c1a69a794e52c6, + 0x1575656b189f0373, + ], + [ + 0x4233c6287b96d8f5, + 0xa31fe84dea5883fa, + 0x96c611041688f291, + 0x14afb1c20320a7d6, + ], + [ + 0x6f3a8ef4faf6ef27, + 0xec3517bd2d785cb1, + 0xc6f5f5a456727c82, + 0x13d02ff2b0e8b10f, + ], + [ + 0x4da1614c7aa3292d, + 0x747fcf003c810d76, + 0x5159fce86bdb6b63, + 0x88d9f994105af79, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x832a2b824028e37a, + 0xed73351c243ae965, + 0xf06ab1d853e03040, + 0x2dd81e0d40c3bff2, + ], + [ + 0xe547911f5510da18, + 0x7aef313a3f1fdacf, + 0x8ab10e6625febd9c, + 0x31c69e7710aa97f, + ], + [ + 0x7413ab54a40b7ba, + 0x8e212008b30f4d65, + 0x13cf7abdc3cb96a8, + 0x1a2257c854855d6, + ], + [ + 0x25653bb1d3b79736, + 0xc85e93e0ce6884c2, + 0xe1335658ddcb7bf4, + 0x686786ab92162f1, + ], + [ + 0x9aee2f849970d10, + 0x69f47d214abb1301, + 0x3e4eb4aea26b2a, + 0xe22535fe9d273c3, + ], + [ + 0x842bdc73813647da, + 
0x7eb0b1791df87c6c, + 0xede27a0dc55e241d, + 0x25b93f711c0d7066, + ], + [ + 0xd8c784d184cf78dd, + 0x9e34d897ff5b12e5, + 0xe05ba20da522f40c, + 0x410a330600de737, + ], + [ + 0x40d6ceeaec00a7b8, + 0xa905a05995086e8d, + 0x9876cf3d4957f8c6, + 0x161fc6204e497137, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xadaa468dc344f2e, + 0x3985b918347c8f5e, + 0xa91ca671a50f11bf, + 0x1efc4d056b476256, + ], + [ + 0x340e6a2d1bf142a4, + 0xa06ea3148612f264, + 0x81df40c20a52f020, + 0x2648fe050b0c1a9c, + ], + [ + 0xafcbcf04f317cff8, + 0x5e21a002a90d4a12, + 0x785b01784790726c, + 0x2b082c67696fea7, + ], + [ + 0x35d06c881562e78, + 0x54882ecd7f3bba9a, + 0xbbbadb1636188258, + 0x132b5109eabbe4b0, + ], + [ + 0xb7a2ee13fa50f540, + 0xa421097f3c0be2d9, + 0x9661faf296d35a8f, + 0x282068432027f888, + ], + [ + 0x4959a735a1bf20ee, + 0x1de1cbb9a15330a4, + 0x6668a2b6ff1e5c50, + 0x145be66f28ed8d68, + ], + [ + 0xc4ef609886f04936, + 0xe9312fa3805f0316, + 0x117eaf4a54a482ff, + 0xa1183aad6af28c3, + ], + [ + 0xd603ade200efb2a, + 0x390019833cfe33c2, + 0x99d1b9e4690b86eb, + 0x16b6733264196afe, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xe4d0fba08ce11be0, + 0x95996e30ee54cd0b, + 0x54e5afa62d068d6a, + 0xd49a833080fd464, + ], + [ + 0xe8d542bdd238f535, + 0x7d90bbfa3c1fe8db, + 0x8e48c480b342126b, + 0x2dc6203d28134e88, + ], + [ + 0x49c4babd2d6c66d0, + 0x4730c2d77afcc727, + 0x783f29475dac0926, + 0x2129fba4b0b016ca, + ], + [ + 0x958c3dcdd2a239d3, + 0x3445838a3a0588cf, + 0x591c5946c8e99c33, + 0x2c2dd38cd2a274e9, + ], + [ + 0x3489b7d5b310d7, + 0x53f4249466051200, + 0x873bde04340b8c53, + 0x30bb5c0c1461b88, + ], + [ + 0x9e3129ad9d3c425b, + 0x5da52bc2204ae2f5, + 0x128181f1c02ace13, + 0x2e0000c15b87a9d4, + ], + [ + 0xa4011d3998e1549f, + 0x112d8edf167ba1c2, + 0x3e420a535702fcde, + 0x74c861122b3874a, + ], + [ + 0x66274c8d60f6cda3, + 0x88743ce93be16c99, + 0x33ece716e0b06c5b, + 0x18cd6e8cada5d14b, + ], + 
[ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x87082b99e907d0d9, + 0xe99bdb0ee6836c56, + 0x69035f5dffb4c992, + 0x245625b4e06993c6, + ], + [ + 0x83d0701119c02973, + 0x5586c3909c2dbc70, + 0x9c6ec3d1e29bebe8, + 0x95286b3c83d024e, + ], + [ + 0xe6f2fe150d400c58, + 0x6e67497ec774e682, + 0xb8f218fbd0c228e0, + 0x3028cfa753c0a1da, + ], + [ + 0x79bb6c80876956a, + 0xbd867f7d3890bda1, + 0x2678ac7801717d04, + 0xa6844d941e09e60, + ], + [ + 0xbfe46a2aa0fda05f, + 0x9573d4ccc92bc5ed, + 0xd80c181467cc041b, + 0xfacc618a8e81fcd, + ], + [ + 0xe4ac11f0a6d0f27e, + 0x43a9bc4d9161e2da, + 0xcf97c33daacb8a59, + 0x1aee71f254a8ec41, + ], + [ + 0x6d576646a22c7230, + 0x8838b445aa1b8660, + 0x59de2b3ce45f133c, + 0x1f9f4f6964dc9122, + ], + [ + 0x53123ffb647dd162, + 0x5db98571a478ffd5, + 0x3334ed1e52753a69, + 0x756fe455e81df13, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xcd4c8060d1d39a19, + 0x2cefde14f0a6f029, + 0x21dda0ee47690f83, + 0x12407d7d640e0690, + ], + [ + 0x579727d9b54cb07a, + 0x9fa4d3eb70be0c33, + 0xafb758d31a287e31, + 0xdd98bbe539107fb, + ], + [ + 0x8ea9caa9eb81c7b3, + 0x5bf978cc70af0e97, + 0x64955b8e208d9687, + 0xc773b3a772cb9cb, + ], + [ + 0x4858b3aa33a586b4, + 0x252cd3c3e805d7ff, + 0x8f17ef4387c06363, + 0xb7a2390140f2c60, + ], + [ + 0x81a63dcb3128b64f, + 0x92bda8e2a4dc10a6, + 0x7f5a9d10e16a2a15, + 0x24c368237920f42c, + ], + [ + 0x8ed747d60e1240, + 0x96fdc032eca4073f, + 0x7e13bb60a39f5b38, + 0x1d227cb3ec2203a4, + ], + [ + 0x2d56a9075bb9309e, + 0x182a034a1cb26c42, + 0x31f57863379e057, + 0x1730f6452f2cc39f, + ], + [ + 0xcb041d0fd77a0bb0, + 0x3021f6b21b269e64, + 0xb2cdc16a0a4b0635, + 0x126564590a785f71, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x608b2abf1cab0e97, + 0x8ecc21a4f60dc772, + 0xc4571b1a4f163198, + 0x21bad02a85e55fba, + ], + [ + 0x9d0348f6e3470367, + 0x823201d319a69f84, + 0xeef527abb4aff794, + 
0x24d560fa8e0593fc, + ], + [ + 0xf941fd1269b791c5, + 0x843627103e98b0e2, + 0x5a3459f4f1becb3e, + 0x23509f722ee4b035, + ], + [ + 0x73a90ac7b06ee376, + 0x9e1d8f5c062dbbbb, + 0xf2fe7a5342b7b7da, + 0x2b3de611bafb881, + ], + [ + 0x3222a42ba443962d, + 0xbfb05c3e5519570a, + 0xe1592ae7059ba898, + 0x26e8640c8f116588, + ], + [ + 0x77d7a2aca72dc1a0, + 0x6a8ecc91562fc89, + 0xdb714f5003c53c8e, + 0xffc0352a75dbd65, + ], + [ + 0x1d86b3072250e914, + 0x547495fb16a3ef93, + 0x4c713d63e76e1a19, + 0x10356bc7ec7b2201, + ], + [ + 0xaa1972376f6c80c1, + 0xe61c88acc46c01ad, + 0x4377beb48daafc99, + 0xf8fc5ee4e78d6da, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xab18882ea1509d89, + 0x44a783dd450f89aa, + 0x5f75368ad3acdaa, + 0x27711e2b72c1519c, + ], + [ + 0xf0bfa5ee861c5a92, + 0x6b72131a0606cc5, + 0xf778c19a6bb1f36f, + 0x1f783d7b4d28e60d, + ], + [ + 0x71ad09e040b3a16a, + 0x2fb25339203fe03, + 0xd97dc68f2635d8c3, + 0x277fe3bba079ed0a, + ], + [ + 0x85a8f6077066a02, + 0x384d491f60e2465a, + 0xc253c5c4318446a9, + 0xb3fad658900452c, + ], + [ + 0x8d43ce7c8ab5bffe, + 0x25c94a702e8f9a62, + 0x13aa577681396300, + 0x13ec4cd549a9481b, + ], + [ + 0x6f457dbedd5344da, + 0x8303269b9c9cac95, + 0x3d84d1a865d11173, + 0x1078b96874c50367, + ], + [ + 0x77faa8e05d052904, + 0x1141e1b5bf590fcd, + 0xb4b660de73430286, + 0x37a333ee8e3b244, + ], + [ + 0x9855c4b2801dc898, + 0x6c1703c099a78e3f, + 0xf8809f7e06d695bf, + 0x3ac00f5b5e5648e, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xb914c2bb6c67a950, + 0x134bf2408dd88db9, + 0xb90b73a859bcd86, + 0xad2698e3704abd7, + ], + [ + 0x80a03a5f8479efe0, + 0xd7a7144c319f9019, + 0x9e3254e70dce2604, + 0x29ba2f224a9dbbef, + ], + [ + 0x8c3c4be05518c3, + 0xc410f03028e8b850, + 0x6c310c9e77371be9, + 0x12a6ca0955372dd5, + ], + [ + 0x4e6251426e2756a6, + 0x2066db6a6f924a20, + 0xb8e9b300763f7e6, + 0x121b96002c26f6f6, + ], + [ + 0xae25e2c71d8b8047, + 0x30b9fd49a7c9adf6, + 
0x7fd2de1a4f9ead0f, + 0xa4ce41df83152f7, + ], + [ + 0xf1d7d0fc219a5d84, + 0x8cec83ae71fc90ed, + 0xb2763868789b813f, + 0x26240b953168eed0, + ], + [ + 0x630c08cc3ecf653a, + 0xf208a114d866154e, + 0x25a8351a3c7f0072, + 0x158ac36c59b0ffc2, + ], + [ + 0x22f1156bf6d70013, + 0x268204a3185b46e2, + 0x1384431c352867be, + 0x2de6d610e89c20fc, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xff8c61e80863966, + 0x3001a88a1b912f8a, + 0x395f18e19f8063f5, + 0x2c7a5a2e6328c591, + ], + [ + 0x4fae447f49412411, + 0xa29bb4b5c2239d1a, + 0x683371f6c7717ba4, + 0x2954f47f89865a60, + ], + [ + 0x9e9f078a4cb87de9, + 0x74ef90eae950b918, + 0x1618027e78c88903, + 0x24b4b44e99e238d7, + ], + [ + 0xd0c6ac8e61794537, + 0x478422f601f074a3, + 0xae1f7c0b47f0871, + 0x25104557a7a21885, + ], + [ + 0x879de921772849aa, + 0xa654c30df13d8781, + 0xd8a8f3245beb6ab0, + 0xecc45775570acd3, + ], + [ + 0x3eaaecaf754b0e3b, + 0x47a280910be2d1ad, + 0x8b8d9f9b51c68217, + 0x115a761f1529845e, + ], + [ + 0x7b49bd824146aac4, + 0x4a4f759c98c360a5, + 0xcea9366b530c2fc0, + 0x2def5b56a312c2aa, + ], + [ + 0x4ea3efda3c826b85, + 0xa58c426d46581ff5, + 0x981ee7e9718524eb, + 0x2444fbb5c74abdcb, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xda892b4d2482d68c, + 0x5f852acc2be1bf8b, + 0xc6214ed809a0c8ef, + 0x892b9b3cbdb6526, + ], + [ + 0xa32a875db7a3460d, + 0x73148eace7cb6595, + 0xff152f04dfbd7223, + 0xc8f53bb4e244029, + ], + [ + 0xe9a46fce4d88bbc6, + 0x20d352e68d020d86, + 0xbe83102276a48dfe, + 0x1a3092d97a523e5e, + ], + [ + 0xdda3f8e1333ef64d, + 0x33e2afc89d2a58c3, + 0x307c48709dbc50a5, + 0x30055ee4add81d2c, + ], + [ + 0xd01fdebe0cf1f25e, + 0xcf4e2dbf99fdde86, + 0xc1f127df94020c74, + 0x1823274045aa5efb, + ], + [ + 0x52203a257670c4de, + 0x122986f306409e39, + 0x4842270bab03d3f6, + 0x2eb1d43a34e61191, + ], + [ + 0xf69b49e7f3d9f587, + 0xd8bbb38dcbce2a7a, + 0x8c8feb9ced41290a, + 0x17666a0e7fa67702, + ], + [ + 
0xd6203b27f04e5fa7, + 0x5c9f21f440e00ec6, + 0xee7a6372f32ec084, + 0x2c0b7dcc9d99e2b5, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xc2c9a5ac13e39e4c, + 0x51af663cc9fa40bf, + 0xd771086cde4a149c, + 0x69009ba9e522330, + ], + [ + 0x18203e2494a066d6, + 0x46a724cd3bc15e2b, + 0x5b45e7ad643f4fe1, + 0x9980ece9d30fd70, + ], + [ + 0x1413ea87be975166, + 0xfbf7fc63056a2572, + 0x846f7ebd143fc2d8, + 0x1f78e99410578155, + ], + [ + 0x6fc6aac7c80af3c7, + 0xa62c6fea23f3799f, + 0x7f37e039355de227, + 0x253825684ae0ab5d, + ], + [ + 0xca59c5a0728ffbb, + 0xdb30a5adae9ac995, + 0x7988c954ec3b781e, + 0x2992ea528e3e44ae, + ], + [ + 0x9bcb1e277b6e5d50, + 0xe2903e6cb6edf498, + 0x32fe19671e999fab, + 0x1327da37f80a88b0, + ], + [ + 0xf2fddfcd5bdfbff7, + 0xa82bbe2a9c8fa88c, + 0x21ca6415eacc540b, + 0x23a389cbba942e04, + ], + [ + 0x205654c296885be, + 0x4da933fc3fb2b89a, + 0xab443978b20e4b8f, + 0x96177061e006595, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x6949c45d954a704a, + 0x1642ae7919986811, + 0x670c3f74e433e3eb, + 0x1f9947d8dbcfbdb7, + ], + [ + 0x1b07e4bf2e9cc0e3, + 0x6d821332be5f22a, + 0xaa5be302598114d, + 0x15f86cb3419d1e04, + ], + [ + 0x131ff007b8dd0f0c, + 0xbaade2ab4282a1b9, + 0x8571cd1f6249436f, + 0x81fcced399f4250, + ], + [ + 0x67f949b1447951a4, + 0xc971a0b8f5d65a23, + 0x66e0f8299b4be8df, + 0x2200d5c954f45239, + ], + [ + 0x647c941a28c79865, + 0xb29299ae2d282bb, + 0x716f1620c903666e, + 0x1e7a63418e21f5b7, + ], + [ + 0x5294be151efecce8, + 0x6ac98f72391bb9b7, + 0x6c0b2d5a75125160, + 0x1266462ebb8080d9, + ], + [ + 0x322ea0f8fed40e2d, + 0x23b428488f5570b1, + 0x700ca0580af4a506, + 0x9e80fd601adcad2, + ], + [ + 0xb925082e4d25e542, + 0x16297c65e364e10f, + 0xcf71459fddac2aca, + 0x30a5364cc5698f1, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x2796917a04145936, + 0x9a7eafa66d9d56fa, + 0x2e6a32cd69f78d5c, + 
0x7eac18b86d54640, + ], + [ + 0x63da2d2750e51b4f, + 0xcb9263b92a97bc35, + 0x9b5c671fd63b1c58, + 0x1f581f8f23478b66, + ], + [ + 0xdd01cb8870105eb6, + 0xb4414d2673edd75, + 0x5d4dc6f0d2456770, + 0x284290ec01635309, + ], + [ + 0x82f73183c103013c, + 0xebdf63c49735e32f, + 0xc8b9732c464598da, + 0x19ab5d52dcf64a89, + ], + [ + 0x9298371c9492283a, + 0xe93bf2b360ecb448, + 0x743e1b3c1a128ce5, + 0x13320f6e84913254, + ], + [ + 0x4ddc240b6b49cafc, + 0x55382adb9f77d963, + 0x666ee17254e66cd6, + 0x1ecdf202383e948e, + ], + [ + 0xe1d8febd6b1988c4, + 0x4512b5d7015f1689, + 0xbe4d3eddc9eec7a2, + 0x58cec4ae71b92ba, + ], + [ + 0xf2ecbf4eec6b5ac9, + 0xc74ccc062795f3cf, + 0xcf64f83f0e0aa75f, + 0x208022fe288fe32d, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x53d46ded35ce1eaa, + 0x9937200e09b5a5db, + 0x560ba4bb087e0959, + 0x10a60114d223f4ca, + ], + [ + 0xdb946d097fc8413, + 0x9fd50fdd9cf8d178, + 0x835f46c89d463670, + 0x2a19a44f1204bad5, + ], + [ + 0x4553b01bfbb08b1e, + 0x67555b21d1754536, + 0x91f3d8ccf207ee26, + 0x44a07143fdd5ecd, + ], + [ + 0xb6203d1037e456f, + 0xd42035b26b26eec7, + 0x4f6101d0937143b1, + 0x2eae89eb1d3b0886, + ], + [ + 0xaca521d0ec3fe3af, + 0xa9ee4bc1d4a7e943, + 0x2f917e70f9f8ac9, + 0x41dc1765dc96878, + ], + [ + 0x919d4fe16d141d2, + 0xf00fc28dd2a4f194, + 0xff66e6b736b31b65, + 0xeabbc36b814a573, + ], + [ + 0xff778a72be40e049, + 0x66ca907cbabd0a8, + 0x2124dc5e4d83c09b, + 0x2456d05857289a3f, + ], + [ + 0xcfbdcabee4e00483, + 0x9d35c42eb53ce5d4, + 0x60f07536b6afdc18, + 0x20f83cfea9a95da5, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x6cfc05fa3b0886d2, + 0xd051ce4ec1989375, + 0x61a3fb3b60f1d21a, + 0x29a3bbbb12fcb1f1, + ], + [ + 0x9e91c0f974e66c8d, + 0xabff08b2c0a313b1, + 0x3e3be7138b874bd4, + 0x109f5b9d5573be1f, + ], + [ + 0x48ddcab89408dbf8, + 0xd9904f067ea69bd1, + 0x6c2838b800c50dc, + 0x1e657bc7725efa65, + ], + [ + 0xc6a826bc449b7332, + 
0x4662095972ca8210, + 0x354125c7e2e476d5, + 0x3734287b8bb3bd0, + ], + [ + 0x1221a61f5769ee39, + 0xd982d05c474d4874, + 0xb426c02fb2abeeda, + 0x1e99a651d1840c3a, + ], + [ + 0x9f18dfc837e25da2, + 0x57fb029e437cabf2, + 0xdf264577b597c1f7, + 0x10ff8dea8e0a7b2b, + ], + [ + 0x2a58b5931775353e, + 0xc926f972dda2f1e7, + 0xc4b81a0fab2158da, + 0x1931318a9d965f13, + ], + [ + 0xc8d4f493944b2a48, + 0xfb04bc4b14385901, + 0xbe1d411b7fcf87af, + 0x9520b83d628ba36, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xcecc2daa81040a56, + 0x16c6f77e48aefa01, + 0xfdffdbd4295bad78, + 0x213c5bb781eaaaa1, + ], + [ + 0x3acb6f07e096e093, + 0xf787fe9c9f33f81, + 0xf2ae71b44ea49dd7, + 0x2ccb91d33f6b1f65, + ], + [ + 0xac4d81503b51dfb4, + 0x28f99fc84098da87, + 0xf45afb2984dff10b, + 0x2ec1ab94e4a64474, + ], + [ + 0xcb8f9a00a88e103b, + 0x322675ebef1f2c, + 0x78af916b533744be, + 0x71cdc1ca84219da, + ], + [ + 0xe9ea6a16efd6b306, + 0x5f3312fbff7259e2, + 0x6d4b9f612e1e12c4, + 0xa7e3e5e98a0559d, + ], + [ + 0x526723b822313f1b, + 0xb7a994a7093f29f2, + 0x55fed19fb8289748, + 0x119945426f25555b, + ], + [ + 0xfc8cc498130ae613, + 0x7b07bdc20b22b86b, + 0x9ca0dea8fec1ced7, + 0xd1a495aeef35734, + ], + [ + 0xf1a0be44cad4f1b6, + 0x7197f37eebf69e2a, + 0x2017616454fd0878, + 0x1132404dced3db53, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x90b521ab55b6ff57, + 0x5c5bbe3565c152d1, + 0x4f497f515683e7cf, + 0x7b0b0294f0a056c, + ], + [ + 0x89772a0540d5d14c, + 0xbf6c27731f78f3d3, + 0x3eb726132dc96bee, + 0x104df269ce435120, + ], + [ + 0x345a66accabb905c, + 0xd98ab9f0a15134c7, + 0x1d9063c8742b688, + 0x152a78e443e1ae7f, + ], + [ + 0xdcf0fab7eef30a4f, + 0x7594016f3543431, + 0xd486292e10d0b0c3, + 0x2ca0653d1cb35845, + ], + [ + 0xdcc9992ba5b18dbc, + 0x5fd9cb4f8ce35650, + 0xd49265963bec3b9e, + 0x140c29902a26efaf, + ], + [ + 0xc0c1638e2331aaa8, + 0xca83ddfd4e8ce384, + 0x812e618cf8e45843, + 0x1f7c4ad98c3798da, + ], + 
[ + 0x49d07115df6a3813, + 0x111e660c68955b68, + 0xcf05d6713e4b0429, + 0xd5c88536a687e41, + ], + [ + 0x7cc02c14f639790e, + 0x95c234c22bf85af3, + 0xeb4cfd3803877e55, + 0x1b446a617d228bb, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x9fb12f650141555b, + 0x6b04fc09b61de230, + 0xa53375070d2d9508, + 0x3058bfdfe766e443, + ], + [ + 0xe031426f03fe7990, + 0xde44c8b5952df667, + 0xb7b142db5638e5eb, + 0x18d0673eaa7f6052, + ], + [ + 0xecd1cbc67af08391, + 0x7c5666eb253cf05d, + 0x9175a1b0dbe4d90f, + 0x22fb7ba78bcb97c4, + ], + [ + 0xc30803e5949d70f0, + 0x87914839604b03f5, + 0xc7d5c5f1e1dd7590, + 0x9af15716e3d7d08, + ], + [ + 0xc907d4a9bc3131ac, + 0xe8063043dd2d6834, + 0x3eb79c77f55446ce, + 0x1020ed859061ed00, + ], + [ + 0x1786590875d5895, + 0x79ecc38585336371, + 0xff03c1c5c1098a47, + 0x1a3a72e31ab463ba, + ], + [ + 0x2571ad1d1ce70ca3, + 0xf44ec6064434c159, + 0xd05934a14fba89fd, + 0x17d329e59b45d29, + ], + [ + 0x1036a4df3ae13270, + 0x17a8e64baa58d1ec, + 0x3a3ed60b86c3cec4, + 0xd7865c810dd85f0, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x252068c8dd50be21, + 0x2ae29a111c93f739, + 0x5087cc7e5ac10e08, + 0x24796035b3f3d7e3, + ], + [ + 0x2d4c92d08ce4465e, + 0x9f284b0cc1a6d2c4, + 0x3765c79af504ce3c, + 0x2123b8940b5f56e1, + ], + [ + 0xbcdb660cad2df70, + 0xd5606ab7ea087d0f, + 0xdbef7de1dc3bf014, + 0xf10f014e75ccdf6, + ], + [ + 0xd6bc906388e2cbc9, + 0x9e2e2eacc3e3705, + 0xda83dd4a1c9debb6, + 0x2f14a8ebd94f7144, + ], + [ + 0xe2c0dab18b19f45f, + 0xd3ee5b84dbbe3d73, + 0xa32b6dd041ecd993, + 0x131774d55e68a781, + ], + [ + 0x7c1ab4a6b14b76c3, + 0x98a80f95868ef00b, + 0xb376886b5f681712, + 0x17f3d865ed04a806, + ], + [ + 0x90e37a0848cba788, + 0xdfaf2d4054461460, + 0xcfe98f975b336e68, + 0x1ef51d65718588d4, + ], + [ + 0xe1d3e979272bd294, + 0x97eeeab9b60dd796, + 0x325035ba6a33de66, + 0xfe87d55e3669fb9, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 
0x2967c834940e37a0, + ], + [ + 0x6313789b3c944429, + 0xbee32546f878fd7a, + 0x36bf2a360e462849, + 0x2425edf764b837d0, + ], + [ + 0x4d0a85ab967ebc93, + 0xd568fba87427e7ec, + 0xb7be9f0d806667f2, + 0x102179f829dbc478, + ], + [ + 0xc02a8ebf54ec401f, + 0xfecdc5b8bf4bbdb1, + 0x8491841d82b641ae, + 0x11138abbbac34d05, + ], + [ + 0x5e07ff4757a04b2f, + 0x394cbe08aff02459, + 0xffb8b17c506cca47, + 0x24850e9d3b2d09f8, + ], + [ + 0xb852b60cc57c6ac, + 0xcca6f2e3c6543736, + 0xacb37f8a01f9b805, + 0x102f37b053151a60, + ], + [ + 0xd047f8991deec5b9, + 0x15e67de9dd1d99a3, + 0xcb7bebc503dfc8cd, + 0x2cdb3cc7031440a9, + ], + [ + 0x3130c1791e80deba, + 0xa5b2005d457e8886, + 0x5c8a0aa6a9d4475c, + 0x17cd55f89580342e, + ], + [ + 0xef65ab5b8f72c465, + 0xd3a95b4198e8d0a6, + 0xab2c13121086762d, + 0x1c45dcf724953229, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xcb9e218606a78e13, + 0x9d174c974a060029, + 0x3a27c1db3dcbd1bd, + 0x1340a91338f71156, + ], + [ + 0xfd961688c3835a1a, + 0xaef18a2322da5478, + 0x25bff46180ad9507, + 0x247585dcd95d2c4e, + ], + [ + 0x4db801dc314efacb, + 0xe6e1718a6c5e074, + 0x1adfb3d1974b338b, + 0x6e0c22b790cde9d, + ], + [ + 0xe424f7b30e1074ce, + 0x274d80d7dd4efc72, + 0x222bf42a2c1991d7, + 0x197a80ed623970db, + ], + [ + 0xf02fec561c37d1e4, + 0xc1fee6e809f2efe6, + 0x14a8da08ed26223, + 0x4711a454c91315a, + ], + [ + 0x8d597ad589b7153d, + 0xdd0149e79fbad7b5, + 0x59a4b7e5db622cc, + 0xe49cb23a3989f00, + ], + [ + 0xe8985169d1d1ab23, + 0xb4d8ab564b4a387e, + 0xc4a2447ca53c1a, + 0x20a2e720d66fa052, + ], + [ + 0xfa8cf386196f5784, + 0xe5e750955726fcae, + 0x2d32c5db59788ac8, + 0x19ddc1f347f59d59, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xd27d1dcc9ca38ab2, + 0x5e39efaa9018889, + 0x9207371596ac589b, + 0x2068cb526a99dab3, + ], + [ + 0x604ca8dfe6a6530d, + 0x21bae4266927efa8, + 0x7f6b11220fd0ac9c, + 0x24efcb7f4ef40684, + ], + [ + 0x2710d34a34c38b4b, + 
0xd6b176815d3b81be, + 0xe4417d4200f66a87, + 0x92478471e003f3d, + ], + [ + 0xcb7ea4e214b3add9, + 0x7f6c9f7e84663cb4, + 0xd16c9d6506d97444, + 0x283f6c7292c98dc9, + ], + [ + 0xd3997f5a3756c70b, + 0xfc81dc0087ecd8f4, + 0x7ac432a1202abafd, + 0x1a8d80d1f1eaa8df, + ], + [ + 0x3d98bd68a3204f24, + 0x153fda7ea79e7eba, + 0xf07d3b7d5be7f840, + 0x26f7c40357e2c54d, + ], + [ + 0xbc8cbcd2e238d70c, + 0xa0e08096a1f11001, + 0x50e2b3e30e97cfde, + 0x1e7f99ee2cfdc84a, + ], + [ + 0x63116c9c73edfb5b, + 0x1dd96002c25a1354, + 0x7ad1ddf916cf5434, + 0x2ef113e25cbc4843, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xf9208a4afda9a51d, + 0xd7382f83a1430396, + 0xe1a7db330e513ae6, + 0x1116676b69a5e766, + ], + [ + 0x6e556fdd17eb7e8a, + 0x4e48c3beaf515bee, + 0x3a4b489bc66e1724, + 0x194b78c575e0ad9f, + ], + [ + 0xc93c6d5ec4e7975, + 0x5453b12fa9bc8b2b, + 0x8ffd003a145b743f, + 0x9dc01947f53b0a3, + ], + [ + 0xafb467a5c286aee, + 0xe8ca9b1c27426db1, + 0xed80277ef0ac52d1, + 0xfd1153fe8065109, + ], + [ + 0x86f6207241180c0f, + 0x64869e24348b65ef, + 0xb3f6874c50fd3ddd, + 0x4c8b9a301f3e183, + ], + [ + 0xed0ba01f32e8bad4, + 0xe9602619d9cfb2, + 0xae52ec8f08411e05, + 0x180633a3181acf40, + ], + [ + 0x9c130c6c9ddc9c21, + 0xbf5bd658260e9ad8, + 0xe578df1e08eadc06, + 0xea01e3cff8a21a2, + ], + [ + 0x2f9c97a9c9cf655a, + 0x7a6d09f017ef656e, + 0x2fafa9616295c573, + 0xe83da970bab37bd, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x3f66d2e5892678a4, + 0xf37aa0e3fd596299, + 0x5f70a9b4aa151e0c, + 0x100957bb0326679d, + ], + [ + 0xf7ed9d4358bac80a, + 0x81e2621dcd31038d, + 0xdf9974cc70fd4ba9, + 0x22d2b2a4ab1fa37, + ], + [ + 0xb34591bed73513b4, + 0xf8cc2f8046d2ef7d, + 0x608508465065c3c2, + 0x499c92bbb300582, + ], + [ + 0xad633a24674dd124, + 0xb2b2c8ac03c9edc3, + 0x8b759eeb44bd9e1e, + 0x6417c34a2ea162f, + ], + [ + 0x48e10be5b9d00a71, + 0xd3191b62bfcfbb33, + 0xecbda5aa0a0cbc4b, + 0x281b769bab078655, + ], + [ 
+ 0x68de5373c4928e72, + 0x30ef9a3b2a3289d2, + 0xec13cd50d11a6a89, + 0x19fbb01fa5c850ec, + ], + [ + 0x5450f0c3d7288f38, + 0x201df4e15cee2617, + 0x5363a3f0a748e688, + 0x2077775cb722188b, + ], + [ + 0xfbb1e778371a3f98, + 0x2be3475680cf1a, + 0x371854cf91f014c6, + 0x2c6967978be797e2, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x53743ca725f71c77, + 0x99d0ffab90b4907b, + 0xadea33d57583a76a, + 0x53db0de329c840a, + ], + [ + 0x7dd8e66ad073e2dd, + 0x7a2600d35732035b, + 0x88f41b790d7a7044, + 0x16e68ec691b5f983, + ], + [ + 0x584f0bbd5c1beed1, + 0xc0d3e98ff863fbc5, + 0xa6b2b2b21064b7f0, + 0x77d6cd251694629, + ], + [ + 0xe60b0945d5534007, + 0x389d7ec485a852c2, + 0x1cc80721a3752164, + 0x24acb4de24d1bbf8, + ], + [ + 0x1efedba855a15590, + 0x67dcacefb049d275, + 0xb9a9b5ee83d4f761, + 0x1271781b9daf33a3, + ], + [ + 0x669319c8f20799a2, + 0x86ea0df32a6a997a, + 0x2032005bfe53039a, + 0x11002c653b883220, + ], + [ + 0xf41e6343d7d2f2a8, + 0x90fae0bd8eaa4d7e, + 0xfaec2e547a7c6e27, + 0x2904e3b5430f1e8c, + ], + [ + 0xd22e6c60b6e7b589, + 0x43e9e33b3823b124, + 0x2bf6a728f64eaaf5, + 0x2be092f1b5b8084c, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xf1301acb161d48de, + 0xc40aa06e10ad0e88, + 0x2e79b07f5a0c9b3e, + 0x1805f177d66da7b5, + ], + [ + 0xb8364a0a169bb310, + 0xe82c8d50f4a8f906, + 0x47387162bbe160a7, + 0x2198c3a9e1b3b2c3, + ], + [ + 0xe34c5895501b5a36, + 0xd2ea6e140143a43a, + 0xd033326dbec17bd3, + 0x18f3f593221ce589, + ], + [ + 0x94dcc0d0062da796, + 0x222bef8b46ed3110, + 0x8c9df30726d21ddd, + 0x239e15267ed5c68d, + ], + [ + 0x447645563b1e7f9a, + 0x693bfb58040ef5cf, + 0x4334e9171c49f49b, + 0x15b85df2a1a63c35, + ], + [ + 0x8cf0d2ee37a4745b, + 0xbdcf54042b3408cd, + 0x31ae14d7f2ac8be7, + 0x2f3b2893ae352786, + ], + [ + 0xe4c4301d4ed53f25, + 0xf110dc403894dcc5, + 0x561eacc81bafb668, + 0x1689d861f04b9f61, + ], + [ + 0x6f6d29c7f97ff9d2, + 0x78eff758d4decf42, + 
0x14230e3dc3e100e1, + 0x2a4afb5a05e3ed8b, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xf35b0c6516f90566, + 0x33cf4a639d31391e, + 0x938cbf57e802a214, + 0x219fc92fa34e1ab5, + ], + [ + 0x22e0dd319a8eb25e, + 0x91818b042aac6979, + 0x6ecdf200ae5513, + 0xf62cef531e40312, + ], + [ + 0xd9c47e521c0b5efa, + 0xbc6dce3d3c20386e, + 0xd2ded0f1b7074a74, + 0x70104b788465583, + ], + [ + 0xff358d004d84bb7d, + 0x405b43ae005dd53e, + 0xdfa602d1217427f5, + 0x14c2a963c8f0e7be, + ], + [ + 0x503f334533f93c12, + 0x6f11e90c4b43bf45, + 0x39c56d5fbb59cbc6, + 0x1b5d67b83437f083, + ], + [ + 0x559be66fc086b334, + 0xc6c0bf8e5e3f3412, + 0x9dbbda6ed7662963, + 0x28791fb32b420424, + ], + [ + 0x2b57e715955f634a, + 0x4bc410847a998a86, + 0xb1f9aff2fb3aba30, + 0x160349e5c538b71, + ], + [ + 0x5a82e66322d84bc3, + 0xac4b4a466d8416c2, + 0x740cf069191d9d3e, + 0x2a701555d73b5b7c, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x919e6e170bcef5da, + 0x41a809f0a70f3300, + 0x8180bdc08c775914, + 0x119a057fe62065dd, + ], + [ + 0xcf73e1884a46f0f2, + 0x2a43610b32a49aad, + 0xd85e702d53d77da0, + 0x6e0166ce8666d6e, + ], + [ + 0x757b4ca0867d8121, + 0x720372eeb3f4e777, + 0x5d2dddc54344d35c, + 0x267d7ef2fcec498d, + ], + [ + 0x85e7c604d4c51bce, + 0xc0f28dd99e3527bf, + 0xc7cb2801cc9dc4b7, + 0x1ad6edb119b4bbde, + ], + [ + 0x1f54b5cdd3aee9db, + 0xbf49c0a6e2116d69, + 0xf76c5201502ac12d, + 0x1f0f0b34db8f16f6, + ], + [ + 0xf9177d32dd62b882, + 0x5bb076a6ec4a7a42, + 0xcc9dedcb7e6317d2, + 0x139373c21e167883, + ], + [ + 0x974ad051563e58c1, + 0xb1155111bbc51112, + 0x352a3e7322fcb8b3, + 0x1ba59f468c3056e0, + ], + [ + 0x21e03d61479ad6c0, + 0x49a0a79d2aec0673, + 0x30431be88a8b31ab, + 0x1717fc6fc7674234, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xc2be8b5e347eb367, + 0xbd503d07bf039f31, + 0x82111a6819cc0c4e, + 0x20314524368625fd, + ], + [ + 
0xe6962cf2dfd0f099, + 0x65c2bd9f1dac0f53, + 0xd24a57e8390a4360, + 0x3941b8859077547, + ], + [ + 0xe23aad2fb13c6a17, + 0x1b4d97fa6b352068, + 0xc4fd7f7ed3de5717, + 0x1d9a5ce1856da325, + ], + [ + 0x121ceb6ebebc3572, + 0xe6d0e5e72dc6cbc0, + 0x1f22076c2242bd02, + 0xfdb553da2819360, + ], + [ + 0x35499eabea20ef87, + 0x6b2b43ccbc2d1b56, + 0x3bae61694f5fcdf2, + 0x2d70898faedc00b3, + ], + [ + 0x5c649215dfcb0957, + 0x9ff966129743f0e5, + 0x211d8c9829746dee, + 0x244b6e45294dcdbe, + ], + [ + 0x443e554cf8b47efe, + 0xde207dffefede66, + 0x81c70894ccd2ae25, + 0x311f2f060bc5e92, + ], + [ + 0x48e685171389bf7c, + 0xe30ecd7a20ac90e8, + 0xfc459b80abe02a47, + 0xa78168fa5fc46be, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x21df2d9b138fd0fa, + 0x9f5d5e2eb828c903, + 0x3a2965e7c4c1f583, + 0x9a19032ec4ba560, + ], + [ + 0x16de823896d592a4, + 0xc31dea904277bf40, + 0xdfb0179eb9bfb219, + 0x654e0476455edb6, + ], + [ + 0x53c25cb643b90593, + 0xd430014f7b893e36, + 0xead3c8dd006c4fa9, + 0x28e4cfd93ecf20c3, + ], + [ + 0x2c3cadebf8eb2e3c, + 0x4023936c03f58f53, + 0x11ea66327445e2bd, + 0x22073179f0b59024, + ], + [ + 0xaced7955f830c51a, + 0x4391f0de52af7e07, + 0xd378377f3cc77eea, + 0x13d1893757215864, + ], + [ + 0xc086a2d43594e0, + 0x20778a90154bab98, + 0x388c4e9d745b62a1, + 0x118af3941ec0578d, + ], + [ + 0x62c89da1f9de10c5, + 0xb5c542e5beb37598, + 0x5ef00d955d1bd6e6, + 0x2435258a29a09d3, + ], + [ + 0x57985a838784935e, + 0x78754327e4dc5449, + 0xc0f0f48cd49eb0e7, + 0xc815745fb063ea2, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x9a2290e575d4abeb, + 0x3b9cfc5ba00abe85, + 0x8b27aed1ccd841e2, + 0x2c141abf0417d768, + ], + [ + 0x66cdbd7c0951b1b5, + 0xf0079247e86c34d6, + 0x19066b05961f26d6, + 0x201d6669de00ee30, + ], + [ + 0xdfd3cf7e777cd28c, + 0xb171ef2b7cc5fa42, + 0xebb370540d47af0e, + 0x1d5e53eec4abb123, + ], + [ + 0x6b51ca2dd48c6118, + 0x4c53a9ecd1798c67, + 0xa9919c40ad901b33, + 
0x2d51fa592069b178, + ], + [ + 0x317e4ae3a49dd2e2, + 0x71e8f00649f6836a, + 0x94a7fd47f294880e, + 0xef2bab21a352f33, + ], + [ + 0x8634aa0227d0064a, + 0x8e8d13e2703d8aea, + 0x4183aa8fecfb7693, + 0x2d6a146eb4be93f6, + ], + [ + 0x2c7e3a32b7f0a003, + 0x48f0fdc49bbbff14, + 0xa65178834dde262c, + 0x23b7b4ba3d43f654, + ], + [ + 0xb5d361801b319d41, + 0xf730382c4bdb5b19, + 0x8d60bb6269880cc, + 0x1f4b1e426439bb, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xb522bb20659a82e8, + 0x3b5ad9ecf511ef08, + 0xa9db028e41be7676, + 0x84dc6e9d330da71, + ], + [ + 0x7c01aeb4b07cec82, + 0xde5b226e8a6373ff, + 0x687753d16c6eff27, + 0x2374e744eb1352c8, + ], + [ + 0xeb6224758bc6cf8f, + 0x52a6ab82dbc3b745, + 0x83ed78ae379b65c7, + 0x554b9d8d3b8b377, + ], + [ + 0xc8a52c5385782d2c, + 0x86ca3632a66f526a, + 0xfe9bb40530c9fc22, + 0x431ee5694d7b8f8, + ], + [ + 0x3f7e52f1312a9dcd, + 0x1b0aa1d07c43040c, + 0x9d8795cb11d4987c, + 0x2f8bd1d7961ea699, + ], + [ + 0x7ff87bdce907ebbb, + 0x79b5f3ebf31d24d8, + 0x70a3f453aa57fa06, + 0x12528735b96a94d6, + ], + [ + 0x11193eec22edeee0, + 0x4216c155c0151bbb, + 0x449fdb6b1a5e5899, + 0x2110f4266af40785, + ], + [ + 0xe60dd333c2b618c3, + 0x61dbf8d98e44bbd7, + 0x4c16f836b8462918, + 0x14b245b49d54b6be, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x1ff4dd4294eca39, + 0x8a30f1f106e156d, + 0x729ad8eed3272f6a, + 0x20fffbe3df0383b0, + ], + [ + 0x17df91cb8974e63a, + 0x3d10259b9c7ad4f6, + 0xcec874829a4d2f1d, + 0x29e3e9ea1bd995bb, + ], + [ + 0x10544ddfde48d528, + 0x3e646d99a8a516c6, + 0xf7cf7c362c545973, + 0x14075c7d2e70f60e, + ], + [ + 0xb371618b1236c4b8, + 0x18ac6fc422cddf02, + 0x99d21b60fc95a46e, + 0xf04c4c7d50a72db, + ], + [ + 0x200172dadc4dd9c2, + 0xf3ba3a772680ef1f, + 0x1e9edf8846da2082, + 0x1ce0a4ef85154ac1, + ], + [ + 0x4de9991fc700b301, + 0xd5ef550abc806d0c, + 0xa8ee79a122bb52dd, + 0x1df87a2fec57efd, + ], + [ + 0x7b318af664f75dad, + 
0xd089d4a188fdfa99, + 0xd4fe1605e818baa8, + 0x204f80b46fcf0d4b, + ], + [ + 0xda3feb673b1624c6, + 0x1d469b8aeb3427b1, + 0xe35398380432f5d4, + 0x18cec4783a06dcbc, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x9a2bf46712f5b309, + 0x2d78818cd18a50a3, + 0x1f18b98279540f8, + 0x294e1a5c9e553e23, + ], + [ + 0xceed2ff8cc76f5c9, + 0x51211836220b6746, + 0x2db5ca360dbe1ebf, + 0x20e973036c9d546f, + ], + [ + 0x6b9018afb3353b6d, + 0xa7ca111d69388518, + 0x739f58cd3cb9005f, + 0x2a21f56c1edbc530, + ], + [ + 0xe6a1ce930bdce522, + 0xf5add08fe0c71b66, + 0xe51fcf23a3e4df2d, + 0x2e3724678b3a0e43, + ], + [ + 0x72d583736efed4ee, + 0x31c1925e9081b4b5, + 0xcfcee1dcd8c1b601, + 0x1d3417a280c93cd2, + ], + [ + 0x8427160096e91695, + 0x3cfe88ddec7cbeeb, + 0xc73b8803d8742bde, + 0xf99a6adbef1a430, + ], + [ + 0xd236c15110520a4f, + 0xb054e8ecc0068411, + 0xb2837c6af5b6629e, + 0x180f8d1138de1e41, + ], + [ + 0x4504f8f9c4d8dd3d, + 0x853abb65d3422684, + 0x942544bfec6ecc28, + 0x1bb08ff1f5c8c4c5, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0xebd3b00b8c5a0d3a, + 0xd209d5a7f5f74c9e, + 0x28e4a7ad86dd7677, + 0x6bd7fa4a6900e06, + ], + [ + 0xd7d2b44f4ed3638e, + 0xef516d5d0f88b04a, + 0x44d044650b77332d, + 0x1268a2223956c1ec, + ], + [ + 0x638d36fed6be09b1, + 0xb106e9f09e5063d8, + 0x6837e63b712eeb8, + 0x1222c99b4a3d46fc, + ], + [ + 0x975cf5ff61de0021, + 0x82c23c7ea26741fb, + 0x58f9edb0cf776d50, + 0xaed216f533fbc71, + ], + [ + 0xc8c3b3dc8a787871, + 0x20bdac682a86212d, + 0xf54d8622a25dc552, + 0x1d55a1a9b66a2205, + ], + [ + 0x527a91a89d40e5a1, + 0x393281b6613f4b90, + 0x179ac694e4833955, + 0x25118d0b3cdad95c, + ], + [ + 0xa7ea2e1c9e62e353, + 0x36201d720ea0c4de, + 0x816206de3db55d96, + 0x215f666460bc76ee, + ], + [ + 0x2cf72041929f6ac4, + 0x799454aed9cf12ce, + 0x6fd9dc9c5d2f0f1e, + 0x4efeff8cea2cc9b, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + 
], + [ + 0x896d7d209a8f5528, + 0x7871313b706d5cc5, + 0x1d3c5ab49139a43a, + 0x2bb7ac0185f346ae, + ], + [ + 0x1b135bc4473a234e, + 0xb9ecdf893190e27b, + 0x68b30536a8774e37, + 0x25dc7c675c2bb44c, + ], + [ + 0xb591ce943973d1f4, + 0xcbb935202bf44ec5, + 0x9c16a5637620f5bd, + 0x2ccfc414e9da8c4f, + ], + [ + 0x5b7673c6aa29a0ed, + 0xd07e4002444ba5e0, + 0xa7d547ea9b055e4b, + 0x5af54de8f0e05b4, + ], + [ + 0x617dde87143bf2d5, + 0x47876d8804129315, + 0xaa2034ff6f2dbbfe, + 0x198e21c07650e17e, + ], + [ + 0x726e909e04387991, + 0xc7ffa3a0b15010b0, + 0x3dcbf53796014e5f, + 0x6ff975c6296a545, + ], + [ + 0x873409c6cb8f4293, + 0x7c9f0e575481511a, + 0x1abcfb86def465b5, + 0x2cce45bbe220a9c3, + ], + [ + 0x177f82b8244a8c86, + 0x6a0cb9b7180ae675, + 0xf6303980a4badcd8, + 0x19efdde7d176db1e, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x7ca525e12dcdfe7e, + 0x92d2bc34b0f668d6, + 0xd2d5a509f2ec999e, + 0x292446b4c9fd9e00, + ], + [ + 0x54d64973a1801db1, + 0x14b1b30d6b84eb85, + 0xe004e75e1d890069, + 0x14999df03eb80105, + ], + [ + 0x55b67aec4745eab5, + 0x182941437038d3b5, + 0x51bf1f0cb7ccfe30, + 0xea55e886ee27d93, + ], + [ + 0xa44d4db7e6ef8cc, + 0xc375c91ef5cb057c, + 0x28d678602a27e751, + 0x19dc4a9160444b9c, + ], + [ + 0x528bc8918d25c86c, + 0xacd10e805b45d1ac, + 0x8c4720cc56502e82, + 0x13b82223db128626, + ], + [ + 0xccbacd18f1eecaca, + 0xecbefb323ea5559d, + 0x6bb96f4a3aabdfb4, + 0x1aa43176293727d3, + ], + [ + 0xd051fd4374e8904d, + 0x1746ac4a91826db4, + 0x557925ba3ec855ba, + 0x117b12630bc1d8ae, + ], + [ + 0xe4d0619de509d6ed, + 0xe5b690b6867b247e, + 0x8b83994073adf8ec, + 0x22e48989e8002d57, + ], + [ + 0x77464b55cd95efca, + 0x68ba7a74ae0e5894, + 0xbd4dc1c2266c359d, + 0x2967c834940e37a0, + ], + [ + 0x79aa4a02b1dbe3d4, + 0xf322a2e403eea8fc, + 0x172baee67d916242, + 0x141ec3e7f70137c2, + ], + [ + 0x216607ba60d7c149, + 0x26e6cc52b461abd7, + 0x7f911ecfec23090b, + 0x30081a3826a50e1b, + ], + [ + 0x102120ed7e949d8a, + 0xb8d3f8948fb3a8bd, + 
0xd11f96163a9e50a5, + 0x14af79017ce754bc, + ], + [ + 0xf3ca9a27797eeba6, + 0x18daf636af68980a, + 0xabbe059eff309807, + 0x82fa0439a50110c, + ], + [ + 0x6907e36200995439, + 0xb9f80b5666c65169, + 0x7ba328f07ebc2640, + 0x152d921c334deb59, + ], + [ + 0x235bc3071b88c57f, + 0x1edd9e8b512a928b, + 0x4eba9db9a285a5db, + 0x208c85cecd6e86b2, + ], + [ + 0xd7e96fada4cc7131, + 0xe05eeb104bdd4f26, + 0xd629a31acc8b39c6, + 0x292e987009256cb4, + ], + [ + 0x9337ce2160d27631, + 0xb7603b2e38f0d93e, + 0xba04b96b55dfec38, + 0x25c45b9bb527b189, + ], +]; + +#[derive(Copy, Clone, Default, Debug, PartialEq)] +pub struct ElementBN128 { + pub z: [u64; 4], +} + +impl ElementBN128 { + #[inline] + fn mul64(self, a: u64, b: u64) -> (u64, u64) { + let c128: u128 = (a as u128) * (b as u128); + ((c128 >> 64) as u64, c128 as u64) + } + + #[inline] + fn mul64trunc(self, a: u64, b: u64) -> u64 { + let c128: u128 = (a as u128) * (b as u128); + c128 as u64 + } + + #[inline] + fn add64(self, a: u64, b: u64, cin: u64) -> (u64, u64) { + debug_assert!(cin == 0 || cin == 1); + + let mut r; + let mut cout; + + if a > 0xFFFFFFFFFFFFFFFF - b { + r = a.wrapping_add(b); + cout = 1; + } else { + r = a + b; + cout = 0; + } + if cin == 1 && r == 0xFFFFFFFFFFFFFFFF { + r = 0; + cout = 1; + } else { + r += cin; + } + (r, cout) + } + + #[inline] + fn sub64(self, a: u64, b: u64, bin: u64) -> (u64, u64) { + debug_assert!(bin == 0 || bin == 1); + + let mut r: u64; + let mut bout; + + if a < b { + // r = 0xFFFFFFFFFFFFFFFF - b + a + 1; + r = a.wrapping_sub(b); + bout = 1; + } else { + r = a - b; + bout = 0; + } + if r < bin { + // here we can only have r = 0, bin = 1 => r becomes -1 + r = 0xFFFFFFFFFFFFFFFF; + bout = 1; + } else { + r -= bin; + } + (r, bout) + } + + // madd0 hi = a*b + c (discards lo bits) + #[inline] + fn madd0(self, a: u64, b: u64, c: u64) -> u64 { + let (hi, lo) = self.mul64(a, b); + let (_, carry) = self.add64(lo, c, 0); + let (hi, _) = self.add64(hi, 0, carry); + hi + } + + // madd1 hi, lo = a*b + c + 
#[inline] + fn madd1(self, a: u64, b: u64, c: u64) -> (u64, u64) { + let (hi, lo) = self.mul64(a, b); + let (lo, carry) = self.add64(lo, c, 0); + let (hi, _) = self.add64(hi, 0, carry); + (hi, lo) + } + + // madd2 hi, lo = a*b + c + d + #[inline] + fn madd2(self, a: u64, b: u64, c: u64, d: u64) -> (u64, u64) { + let (hi, lo) = self.mul64(a, b); + let (cc, carry) = self.add64(c, d, 0); + let (hi, _) = self.add64(hi, 0, carry); + let (lo, carry) = self.add64(lo, cc, 0); + let (hi, _) = self.add64(hi, 0, carry); + (hi, lo) + } + + #[inline] + fn madd3(self, a: u64, b: u64, c: u64, d: u64, e: u64) -> (u64, u64) { + let (hi, lo) = self.mul64(a, b); + let (cc, carry) = self.add64(c, d, 0); + let (hi, _) = self.add64(hi, 0, carry); + let (lo, carry) = self.add64(lo, cc, 0); + let (hi, _) = self.add64(hi, e, carry); + (hi, lo) + } + + #[inline] + fn _mul_generic(self, x: [u64; 4], y: [u64; 4]) -> [u64; 4] { + let mut z: [u64; 4] = [0u64; 4]; + let mut t: [u64; 4] = [0u64; 4]; + let mut c: [u64; 3] = [0u64; 3]; + + // round 0 + let v = x[0]; + (c[1], c[0]) = self.mul64(v, y[0]); + let m = self.mul64trunc(c[0], 14042775128853446655u64); + c[2] = self.madd0(m, 4891460686036598785u64, c[0]); + (c[1], c[0]) = self.madd1(v, y[1], c[1]); + (c[2], t[0]) = self.madd2(m, 2896914383306846353u64, c[2], c[0]); + (c[1], c[0]) = self.madd1(v, y[2], c[1]); + (c[2], t[1]) = self.madd2(m, 13281191951274694749u64, c[2], c[0]); + (c[1], c[0]) = self.madd1(v, y[3], c[1]); + (t[3], t[2]) = self.madd3(m, 3486998266802970665u64, c[0], c[2], c[1]); + + // round 1 + let v = x[1]; + (c[1], c[0]) = self.madd1(v, y[0], t[0]); + let m = self.mul64trunc(c[0], 14042775128853446655u64); + c[2] = self.madd0(m, 4891460686036598785u64, c[0]); + (c[1], c[0]) = self.madd2(v, y[1], c[1], t[1]); + (c[2], t[0]) = self.madd2(m, 2896914383306846353u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, y[2], c[1], t[2]); + (c[2], t[1]) = self.madd2(m, 13281191951274694749u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, 
y[3], c[1], t[3]); + (t[3], t[2]) = self.madd3(m, 3486998266802970665u64, c[0], c[2], c[1]); + + // round 2 + let v = x[2]; + (c[1], c[0]) = self.madd1(v, y[0], t[0]); + let m = self.mul64trunc(c[0], 14042775128853446655u64); + c[2] = self.madd0(m, 4891460686036598785u64, c[0]); + (c[1], c[0]) = self.madd2(v, y[1], c[1], t[1]); + (c[2], t[0]) = self.madd2(m, 2896914383306846353u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, y[2], c[1], t[2]); + (c[2], t[1]) = self.madd2(m, 13281191951274694749u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, y[3], c[1], t[3]); + (t[3], t[2]) = self.madd3(m, 3486998266802970665u64, c[0], c[2], c[1]); + + // round 3 + let v = x[3]; + (c[1], c[0]) = self.madd1(v, y[0], t[0]); + let m = self.mul64trunc(c[0], 14042775128853446655u64); + c[2] = self.madd0(m, 4891460686036598785u64, c[0]); + (c[1], c[0]) = self.madd2(v, y[1], c[1], t[1]); + (c[2], z[0]) = self.madd2(m, 2896914383306846353u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, y[2], c[1], t[2]); + (c[2], z[1]) = self.madd2(m, 13281191951274694749u64, c[2], c[0]); + (c[1], c[0]) = self.madd2(v, y[3], c[1], t[3]); + (z[3], z[2]) = self.madd3(m, 3486998266802970665u64, c[0], c[2], c[1]); + + // if z > q --> z -= q + // note: this is NOT constant time + if !(z[3] < 3486998266802970665u64 + || (z[3] == 3486998266802970665u64 + && (z[2] < 13281191951274694749u64 + || (z[2] == 13281191951274694749u64 + && (z[1] < 2896914383306846353u64 + || (z[1] == 2896914383306846353u64 + && (z[0] < 4891460686036598785u64))))))) + { + let mut b; + (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); + (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); + (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + } + + z + } + + #[inline] + fn _add_generic(self, x: [u64; 4], y: [u64; 4]) -> [u64; 4] { + let mut z: [u64; 4] = [0u64; 4]; + let mut carry; + + (z[0], carry) = self.add64(x[0], y[0], 0); + (z[1], carry) = self.add64(x[1], 
y[1], carry); + (z[2], carry) = self.add64(x[2], y[2], carry); + (z[3], _) = self.add64(x[3], y[3], carry); + + // if z > q --> z -= q + // note: this is NOT constant time + if !(z[3] < 3486998266802970665u64 + || (z[3] == 3486998266802970665u64 + && (z[2] < 13281191951274694749u64 + || (z[2] == 13281191951274694749u64 + && (z[1] < 2896914383306846353u64 + || (z[1] == 2896914383306846353u64 + && (z[0] < 4891460686036598785u64))))))) + { + let mut b; + (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); + (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); + (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + } + + z + } + + #[inline] + fn _from_mont_generic(self, x: [u64; 4]) -> [u64; 4] { + let mut z: [u64; 4] = x; + + // m = z[0]n'[0] mod W + let m = self.mul64trunc(z[0], 14042775128853446655u64); + let mut c = self.madd0(m, 4891460686036598785u64, z[0]); + (c, z[0]) = self.madd2(m, 2896914383306846353u64, z[1], c); + (c, z[1]) = self.madd2(m, 13281191951274694749u64, z[2], c); + (c, z[2]) = self.madd2(m, 3486998266802970665u64, z[3], c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = self.mul64trunc(z[0], 14042775128853446655u64); + let mut c = self.madd0(m, 4891460686036598785u64, z[0]); + (c, z[0]) = self.madd2(m, 2896914383306846353u64, z[1], c); + (c, z[1]) = self.madd2(m, 13281191951274694749u64, z[2], c); + (c, z[2]) = self.madd2(m, 3486998266802970665u64, z[3], c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = self.mul64trunc(z[0], 14042775128853446655u64); + let mut c = self.madd0(m, 4891460686036598785u64, z[0]); + (c, z[0]) = self.madd2(m, 2896914383306846353u64, z[1], c); + (c, z[1]) = self.madd2(m, 13281191951274694749u64, z[2], c); + (c, z[2]) = self.madd2(m, 3486998266802970665u64, z[3], c); + z[3] = c; + + // m = z[0]n'[0] mod W + let m = self.mul64trunc(z[0], 14042775128853446655u64); + let mut c = self.madd0(m, 4891460686036598785u64, z[0]); + (c, z[0]) = 
self.madd2(m, 2896914383306846353u64, z[1], c); + (c, z[1]) = self.madd2(m, 13281191951274694749u64, z[2], c); + (c, z[2]) = self.madd2(m, 3486998266802970665u64, z[3], c); + z[3] = c; + + // if z > q --> z -= q + // note: this is NOT constant time + if !(z[3] < 3486998266802970665u64 + || (z[3] == 3486998266802970665u64 + && (z[2] < 13281191951274694749u64 + || (z[2] == 13281191951274694749u64 + && (z[1] < 2896914383306846353u64 + || (z[1] == 2896914383306846353u64 + && (z[0] < 4891460686036598785u64))))))) + { + let mut b; + (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); + (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); + (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + } + + z + } + + fn square(&mut self) { + self.z = self._mul_generic(self.z, self.z); + } + + #[inline] + pub fn exp5(&mut self) { + let x: [u64; 4] = self.z; + self.square(); + self.square(); + self.z = self._mul_generic(self.z, x); + } + + #[inline] + pub fn add(&mut self, x: ElementBN128, y: ElementBN128) { + self.z = self._add_generic(x.z, y.z); + } + + #[inline] + pub fn mul(&mut self, x: ElementBN128, y: ElementBN128) { + self.z = self._mul_generic(x.z, y.z); + } + + #[allow(dead_code)] + #[inline] + pub fn to_mont(&mut self) { + self.z = self._mul_generic(self.z, RSQUARE); + } + + #[allow(dead_code)] + #[inline] + pub fn from_mont(&mut self) { + self.z = self._from_mont_generic(self.z); + } + + pub fn new(v: [u64; 4]) -> Self { + Self { z: v } + } + + pub fn zero() -> Self { + let v: [u64; 4] = [0; 4]; + Self { z: v } + } + + #[inline] + pub fn set_zero(&mut self) { + self.z[0] = 0; + self.z[1] = 0; + self.z[2] = 0; + self.z[3] = 0; + } + + #[allow(dead_code)] + #[inline] + pub fn set_uint64(&mut self, v: u64) { + self.z[0] = v; + self.z[1] = 0; + self.z[2] = 0; + self.z[3] = 0; + + self.z = self._mul_generic(self.z, RSQUARE); + } +} + +#[derive(Copy, Clone, Default, Debug, PartialEq)] +pub struct 
PoseidonBN128NativePermutation { + state: [F; SPONGE_WIDTH], +} + +impl PoseidonBN128NativePermutation { + #[inline] + fn exp5state(self, state: &mut [ElementBN128; 5]) { + state[0].exp5(); + state[1].exp5(); + state[2].exp5(); + state[3].exp5(); + state[4].exp5(); + } + + #[inline] + fn ark(self, state: &mut [ElementBN128; 5], c: [[u64; 4]; 100], it: usize) { + for i in 0..5 { + let cc = ElementBN128::new(c[it + i]); + state[i].add(state[i], cc); + } + } + + #[inline] + fn mix(self, state: &mut [ElementBN128; 5], m: [[[u64; 4]; 5]; 5]) { + let mut new_state: [ElementBN128; 5] = [ElementBN128::zero(); 5]; + let mut mul = ElementBN128::zero(); + for i in 0..5 { + new_state[i].set_uint64(0); + for j in 0..5 { + let mm = ElementBN128::new(m[j][i]); + mul.mul(mm, state[j]); + new_state[i].add(new_state[i], mul); + } + } + for i in 0..5 { + state[i] = new_state[i]; + } + } + + pub fn permute_fn(&self, input: [u64; 12]) -> [u64; 12] { + let mut inp: [ElementBN128; 4] = [ElementBN128::zero(); 4]; + for i in 0..4 { + inp[i].z[0] = input[i * 3 + 2]; + inp[i].z[1] = input[i * 3 + 1]; + inp[i].z[2] = input[i * 3 + 0]; + inp[i].z[3] = 0; + inp[i].to_mont(); + } + + const CT: usize = 5; + const N_ROUNDS_F: usize = 8; + const N_ROUNDS_P: usize = 60; + + let mut state: [ElementBN128; 5] = [ElementBN128::zero(); 5]; + state[1] = inp[0]; + state[2] = inp[1]; + state[3] = inp[2]; + state[4] = inp[3]; + + self.ark(&mut state, C, 0); + + for i in 0..(N_ROUNDS_F / 2 - 1) { + self.exp5state(&mut state); + self.ark(&mut state, C, (i + 1) * CT); + self.mix(&mut state, M); + } + + self.exp5state(&mut state); + self.ark(&mut state, C, (N_ROUNDS_F / 2) * CT); + self.mix(&mut state, P); + + for i in 0..N_ROUNDS_P { + state[0].exp5(); + let cc = ElementBN128::new(C[(N_ROUNDS_F / 2 + 1) * CT + i]); + state[0].add(state[0], cc); + + let mut mul = ElementBN128::zero(); + let mut new_state0 = ElementBN128::zero(); + for j in 0..CT { + let ss = ElementBN128::new(S[(CT * 2 - 1) * i + j]); + 
mul.mul(ss, state[j]); + new_state0.add(new_state0, mul); + } + + for k in 1..CT { + let ss = ElementBN128::new(S[(CT * 2 - 1) * i + CT + k - 1]); + mul.set_zero(); + mul.mul(state[0], ss); + state[k].add(state[k], mul); + } + state[0] = new_state0; + } + + for i in 0..(N_ROUNDS_F / 2 - 1) { + self.exp5state(&mut state); + self.ark(&mut state, C, (N_ROUNDS_F / 2 + 1) * CT + N_ROUNDS_P + i * CT); + self.mix(&mut state, M); + } + self.exp5state(&mut state); + self.mix(&mut state, M); + + let mut out: [u64; 12] = [0; 12]; + for i in 0..4 { + let mut r_e = state[i]; + r_e.from_mont(); + out[i * 3] = r_e.z[2]; + out[i * 3 + 1] = r_e.z[1]; + out[i * 3 + 2] = r_e.z[0]; + } + for i in 0..12 { + if out[i] >= 0xFFFFFFFF00000001u64 { + out[i] = out[i] - 0xFFFFFFFF00000001u64; + } + } + + out + } +} diff --git a/plonky2/src/hash/poseidon_goldilocks.rs b/plonky2/src/hash/poseidon_goldilocks.rs index 12d061265e..300b737c56 100644 --- a/plonky2/src/hash/poseidon_goldilocks.rs +++ b/plonky2/src/hash/poseidon_goldilocks.rs @@ -308,7 +308,7 @@ impl Poseidon for GoldilocksField { // The following code has been adapted from winterfell/crypto/src/hash/mds/mds_f64_12x12.rs // located at https://github.com/facebook/winterfell. 
#[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] -mod poseidon12_mds { +pub(crate) mod poseidon12_mds { const MDS_FREQ_BLOCK_ONE: [i64; 3] = [16, 32, 16]; const MDS_FREQ_BLOCK_TWO: [(i64, i64); 3] = [(2, -1), (-4, 1), (16, 1)]; const MDS_FREQ_BLOCK_THREE: [i64; 3] = [-1, -8, 2]; @@ -354,7 +354,7 @@ mod poseidon12_mds { } #[inline(always)] - const fn block2(x: [(i64, i64); 3], y: [(i64, i64); 3]) -> [(i64, i64); 3] { + pub(crate) const fn block2(x: [(i64, i64); 3], y: [(i64, i64); 3]) -> [(i64, i64); 3] { let [(x0r, x0i), (x1r, x1i), (x2r, x2i)] = x; let [(y0r, y0i), (y1r, y1i), (y2r, y2i)] = y; let x0s = x0r + x0i; diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index 3bc266a9f5..4fc1e03841 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -3,6 +3,7 @@ #![deny(rustdoc::broken_intra_doc_links)] #![deny(missing_debug_implementations)] #![cfg_attr(not(feature = "std"), no_std)] +#![feature(stdarch_x86_avx512)] // #[cfg(not(feature = "std"))] pub extern crate alloc; From 6ef25975b06da0ae3a920dffd999f308b354fbef Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Fri, 7 Jun 2024 17:42:40 +0800 Subject: [PATCH 2/7] improvements to Poseidon BN128 implementation --- plonky2/Cargo.toml | 2 + .../hash/arch/x86_64/poseidon_bn128_avx2.rs | 208 +++++++++--------- plonky2/src/hash/poseidon_bn128.rs | 4 +- plonky2/src/hash/poseidon_bn128_ops.rs | 140 +++++++++--- plonky2/src/util/mod.rs | 3 + plonky2/src/util/papi.rs | 15 ++ 6 files changed, 247 insertions(+), 125 deletions(-) create mode 100644 plonky2/src/util/papi.rs diff --git a/plonky2/Cargo.toml b/plonky2/Cargo.toml index 23589c9443..742579eab2 100644 --- a/plonky2/Cargo.toml +++ b/plonky2/Cargo.toml @@ -25,6 +25,7 @@ cuda = ["cryptography_cuda/cuda"] no_cuda = ["cryptography_cuda/no_cuda"] batch = [] cuda_timing = [] +papi = [] [dependencies] ahash = { workspace = true } @@ -41,6 +42,7 @@ static_assertions = { workspace = true } unroll = { workspace = true } web-time = { version = "1.0.0", 
optional = true } once_cell = { version = "1.18.0" } +papi-bindings = { version = "0.5.2" } # Local dependencies plonky2_field = { version = "0.2.0", path = "../field", default-features = false } diff --git a/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs index 50c177d39f..0ca669189b 100644 --- a/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs +++ b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs @@ -2,6 +2,9 @@ use core::arch::x86_64::*; use crate::hash::poseidon_bn128_ops::{ElementBN128, C, M, P, S}; +#[cfg(feature = "papi")] +use crate::util::papi::{init_papi, stop_papi}; + #[allow(dead_code)] #[inline] unsafe fn set_zero() -> __m256i { @@ -113,8 +116,9 @@ unsafe fn sub64(a: &__m256i, b: &__m256i, bin: &__m256i) -> (__m256i, __m256i) { (r, bo) } +#[allow(dead_code)] #[inline] -unsafe fn mul64(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { +unsafe fn mul64_v1(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { let mut av: [u64; 4] = [0; 4]; let mut bv: [u64; 4] = [0; 4]; let mut hv: [u64; 4] = [0; 4]; @@ -134,35 +138,50 @@ unsafe fn mul64(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { (h, l) } +unsafe fn mul64(a: &__m256i, b: &__m256i) -> (__m256i, __m256i) { + let ah = _mm256_srli_epi64(*a, 32); + let bh = _mm256_srli_epi64(*b, 32); + let rl = _mm256_mul_epu32(*a, *b); + let ahbl = _mm256_mul_epu32(ah, *b); + let albh = _mm256_mul_epu32(*a, bh); + let rh = _mm256_mul_epu32(ah, bh); + let (rm, cm) = add64_no_carry(&ahbl, &albh); + let rm_l = _mm256_slli_epi64(rm, 32); + let rm_h = _mm256_srli_epi64(rm, 32); + let (rl, cl) = add64_no_carry(&rl, &rm_l); + let cm_s = _mm256_slli_epi64(cm, 32); + let rtmp = _mm256_add_epi64(rh, cm_s); + let (rh, _) = add64(&rtmp, &rm_h, &cl); + + (rh, rl) +} + // madd0 hi = a*b + c (discards lo bits) #[inline] unsafe fn madd0(a: &__m256i, b: &__m256i, c: &__m256i) -> __m256i { - let zeros = _mm256_set_epi64x(0, 0, 0, 0); let (hi, lo) = mul64(a, b); - let (_, cr) = 
add64(&lo, c, &zeros); - let (hi, _) = add64(&hi, &zeros, &cr); + let (_, cr) = add64_no_carry(&lo, c); + let hi = _mm256_add_epi64(hi, cr); hi } // madd1 hi, lo = a * b + c #[inline] unsafe fn madd1(a: &__m256i, b: &__m256i, c: &__m256i) -> (__m256i, __m256i) { - let zeros = _mm256_set_epi64x(0, 0, 0, 0); let (hi, lo) = mul64(a, b); - let (lo, cr) = add64(&lo, c, &zeros); - let (hi, _) = add64(&hi, &zeros, &cr); + let (lo, cr) = add64_no_carry(&lo, c); + let hi = _mm256_add_epi64(hi, cr); (hi, lo) } // madd2 hi, lo = a * b + c + d #[inline] unsafe fn madd2(a: &__m256i, b: &__m256i, c: &__m256i, d: &__m256i) -> (__m256i, __m256i) { - let zeros = _mm256_set_epi64x(0, 0, 0, 0); let (hi, lo) = mul64(a, b); - let (c, cr) = add64(c, d, &zeros); - let (hi, _) = add64(&hi, &zeros, &cr); - let (lo, cr) = add64(&lo, &c, &zeros); - let (hi, _) = add64(&hi, &zeros, &cr); + let (c, cr) = add64_no_carry(c, d); + let hi = _mm256_add_epi64(hi, cr); + let (lo, cr) = add64_no_carry(&lo, &c); + let hi = _mm256_add_epi64(hi, cr); (hi, lo) } @@ -174,43 +193,34 @@ unsafe fn madd3( d: &__m256i, e: &__m256i, ) -> (__m256i, __m256i) { - let zeros = _mm256_set_epi64x(0, 0, 0, 0); let (hi, lo) = mul64(a, b); - let (c, cr) = add64(c, d, &zeros); - let (hi, _) = add64(&hi, &zeros, &cr); - let (lo, cr) = add64(&lo, &c, &zeros); - let (hi, _) = add64(&hi, e, &cr); + let (c, cr) = add64_no_carry(c, d); + let hi = _mm256_add_epi64(hi, cr); + let (lo, cr) = add64_no_carry(&lo, &c); + let hi = _mm256_add_epi64(hi, cr); + let hi = _mm256_add_epi64(hi, *e); (hi, lo) } #[inline] pub unsafe fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { + let rl = _mm256_mul_epu32(a, b); + let ah = _mm256_srli_epi64(a, 32); + let bh = _mm256_srli_epi64(b, 32); + let rh_1 = _mm256_mul_epu32(a, bh); + let rh_2 = _mm256_mul_epu32(ah, b); + let rh = _mm256_add_epi64(rh_1, rh_2); + let rh = _mm256_slli_epi64(rh, 32); + _mm256_add_epi64(rh, rl) +} + +#[allow(dead_code)] +#[inline] +pub unsafe fn 
_mm256_mullo_epi64_v2(a: __m256i, b: __m256i) -> __m256i { let mut av: [u64; 4] = [0; 4]; let mut bv: [u64; 4] = [0; 4]; _mm256_storeu_si256(av.as_mut_ptr().cast::<__m256i>(), a); _mm256_storeu_si256(bv.as_mut_ptr().cast::<__m256i>(), b); - /* - asm!( - "mov rax, [rdi]", - "mov rdx, [rsi]", - "mul rdx", - "mov [rdi], rax", - "mov rax, [rdi+8]", - "mov rdx, [rsi+8]", - "mul rdx", - "mov [rdi+8], rax", - "mov rax, [rdi+16]", - "mov rdx, [rsi+16]", - "mul rdx", - "mov [rdi+16], rax", - "mov rax, [rdi+24]", - "mov rdx, [rsi+24]", - "mul rdx", - "mov [rdi+24], rax", - in("rdi") &av, - in("rsi") &bv, - ); - */ for i in 0..4 { av[i] = ((av[i] as u128) * (bv[i] as u128)) as u64; } @@ -253,7 +263,6 @@ unsafe fn _mul_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { 14042775128853446655u64 as i64, 14042775128853446655u64 as i64, ); - let zeros = _mm256_set_epi64x(0, 0, 0, 0); // round 0 let mut v = x[0]; @@ -322,10 +331,11 @@ unsafe fn _mul_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { let st2 = _mm256_andnot_si256(cmp0, ct2); let st3 = _mm256_andnot_si256(cmp0, ct3); let mut b; - (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[0], b) = sub64_no_borrow(&z[0], &st0); (z[1], b) = sub64(&z[1], &st1, &b); (z[2], b) = sub64(&z[2], &st2, &b); - (z[3], _) = sub64(&z[3], &st3, &b); + let tmp = _mm256_sub_epi64(z[3], st3); + z[3] = _mm256_sub_epi64(tmp, b); z } @@ -357,25 +367,13 @@ fn exp5state(state: &mut [__m256i; 8]) { #[inline] unsafe fn _add_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { let mut z: [__m256i; 4] = [_mm256_set_epi64x(0, 0, 0, 0); 4]; - let mut cr = _mm256_set_epi64x(0, 0, 0, 0); - - // TODO - delete - /* - let mut v: [u64; 4] = [0; 4]; - for i in 0..4 { - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), x[i]); - println!("x{:?}: {:?}", i, v); - } - for i in 0..4 { - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), y[i]); - println!("y{:?}: {:?}", i, v); - } - */ + let mut cr: __m256i; - (z[0], cr) = add64(&x[0], &y[0], 
&cr); + (z[0], cr) = add64_no_carry(&x[0], &y[0]); (z[1], cr) = add64(&x[1], &y[1], &cr); (z[2], cr) = add64(&x[2], &y[2], &cr); - (z[3], _) = add64(&x[3], &y[3], &cr); + let tmp = _mm256_add_epi64(x[3], y[3]); + z[3] = _mm256_add_epi64(tmp, cr); // if z > q --> z -= q let ct0 = _mm256_set_epi64x( @@ -402,7 +400,6 @@ unsafe fn _add_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { 3486998266802970665i64, 3486998266802970665i64, ); - let zeros = _mm256_set_epi64x(0, 0, 0, 0); // if z > q --> z -= q let cmp0 = _mm256_cmpgt_epi64(ct0, z[0]); @@ -423,39 +420,11 @@ unsafe fn _add_generic(x: [__m256i; 4], y: [__m256i; 4]) -> [__m256i; 4] { let st2 = _mm256_andnot_si256(cmp0, ct2); let st3 = _mm256_andnot_si256(cmp0, ct3); let mut b; - (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[0], b) = sub64_no_borrow(&z[0], &st0); (z[1], b) = sub64(&z[1], &st1, &b); (z[2], b) = sub64(&z[2], &st2, &b); - (z[3], _) = sub64(&z[3], &st3, &b); - - // TODO - delete - /* - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[2]); - println!("z2: {:?}", v); - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), st2); - println!("ct: {:?}", v); - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); - println!("bi: {:?}", v); - - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[2]); - println!("z2: {:?}", v); - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); - println!("bo: {:?}", v); - - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[3]); - println!("z3: {:?}", v); - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), st3); - println!("ct: {:?}", v); - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), b); - println!("bi: {:?}", v); - - _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[3]); - println!("z3: {:?}", v); - // for i in 0..4 { - // _mm256_storeu_si256(v.as_mut_ptr().cast::<__m256i>(), z[i]); - // println!("z{:?}: {:?}", i, v); - //} - */ + let tmp = _mm256_sub_epi64(z[3], st3); + z[3] = _mm256_sub_epi64(tmp, b); z } @@ -522,7 
+491,6 @@ unsafe fn from_mont(a: [__m256i; 4]) -> [__m256i; 4] { 14042775128853446655u64 as i64, 14042775128853446655u64 as i64, ); - let zeros = _mm256_set_epi64x(0, 0, 0, 0); let mut z: [__m256i; 4] = a; @@ -577,10 +545,11 @@ unsafe fn from_mont(a: [__m256i; 4]) -> [__m256i; 4] { let st2 = _mm256_andnot_si256(cmp0, ct2); let st3 = _mm256_andnot_si256(cmp0, ct3); let mut b; - (z[0], b) = sub64(&z[0], &st0, &zeros); + (z[0], b) = sub64_no_borrow(&z[0], &st0); (z[1], b) = sub64(&z[1], &st1, &b); (z[2], b) = sub64(&z[2], &st2, &b); - (z[3], _) = sub64(&z[3], &st3, &b); + let tmp = _mm256_sub_epi64(z[3], st3); + z[3] = _mm256_sub_epi64(tmp, b); z } @@ -830,8 +799,13 @@ fn print_state(state: &[ElementBN128; 5]) { println!(); } + + pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { - let st64: Vec = input.into_iter().map(|x| x as i64).collect(); + #[cfg(feature = "papi")] + let mut event_set = init_papi(); + #[cfg(feature = "papi")] + event_set.start().unwrap(); const CT: usize = 5; const N_ROUNDS_F: usize = 8; @@ -840,21 +814,29 @@ pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { unsafe { // load states let mut inp: [__m256i; 4] = [ - _mm256_set_epi64x(st64[11], st64[8], st64[5], st64[2]), - _mm256_set_epi64x(st64[10], st64[7], st64[4], st64[1]), - _mm256_set_epi64x(st64[9], st64[6], st64[3], st64[0]), + _mm256_set_epi64x(input[11] as i64, input[8] as i64, input[5] as i64, input[2] as i64), + _mm256_set_epi64x(input[10] as i64, input[7] as i64, input[4] as i64, input[1] as i64), + _mm256_set_epi64x(input[9] as i64, input[6] as i64, input[3] as i64, input[0] as i64), _mm256_set_epi64x(0i64, 0i64, 0i64, 0i64), ]; // to mont inp = to_mont(inp); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "to_mont"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + // start rounds let zeros = _mm256_set_epi64x(0, 0, 0, 0); let mut state: [__m256i; 8] = [zeros, zeros, zeros, zeros, inp[0], inp[1], inp[2], inp[3]]; ark(&mut state, C, 0); + #[cfg(feature = 
"papi")] + stop_papi(&mut event_set, "first ark"); + /* let mut z = [0u64; 4]; let z1 = [3650884469251175381u64, 0, 0, 0]; @@ -883,19 +865,39 @@ pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { assert_eq!(z8, z); */ + #[cfg(feature = "papi")] + event_set.start().unwrap(); for i in 0..(N_ROUNDS_F / 2 - 1) { exp5state(&mut state); ark(&mut state, C, (i + 1) * CT); mix(&mut state, M); } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "half full rounds"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); exp5state(&mut state); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "exp5state"); + + #[cfg(feature = "papi")] + event_set.start().unwrap(); ark(&mut state, C, (N_ROUNDS_F / 2) * CT); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "ark"); + + #[cfg(feature = "papi")] + event_set.start().unwrap(); mix(&mut state, P); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "mix"); // println!("After 1st rounds:"); // print_state8(&state); + #[cfg(feature = "papi")] + event_set.start().unwrap(); // switch to classic representation let mut cstate = [ElementBN128::zero(); 5]; let mut tmps = [[0u64; 4]; 4]; @@ -958,10 +960,13 @@ pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { } cstate[0] = new_state0; } - + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "partial rounds"); // println!("After middle rounds:"); // print_state(&cstate); + #[cfg(feature = "papi")] + event_set.start().unwrap(); // switch to AVX state = [ _mm256_set_epi64x(0i64, 0i64, 0i64, cstate[0].z[0] as i64), @@ -1008,10 +1013,15 @@ pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { } exp5state(&mut state); mix(&mut state, M); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "half full rounds"); // println!("After all rounds:"); // print_state8(&state); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + let ss0 = from_mont(state[0..4].try_into().unwrap()); let ss1 = from_mont(state[4..8].try_into().unwrap()); @@ -1043,6 +1053,8 @@ pub 
fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { out[i] = out[i] - 0xFFFFFFFF00000001u64; } } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "from_mont"); out } diff --git a/plonky2/src/hash/poseidon_bn128.rs b/plonky2/src/hash/poseidon_bn128.rs index e1012f8b5c..67e224d911 100644 --- a/plonky2/src/hash/poseidon_bn128.rs +++ b/plonky2/src/hash/poseidon_bn128.rs @@ -305,7 +305,7 @@ mod tests { GenericHashOut::::from_bytes(&left), GenericHashOut::::from_bytes(&right), ); - println!("output: {:?}", h); + assert_eq!(h.elements[0].0, 5894400909438531414u64); assert_eq!(h.elements[1].0, 4814851992117646301u64); assert_eq!(h.elements[2].0, 17814584260098324190u64); @@ -327,7 +327,7 @@ mod tests { v.push(F::from_canonical_u64(1441151880423231811u64)); let h = PoseidonBN128Hash::hash_public_inputs(&v); - println!("out: {:?}", h); + assert_eq!(h.elements[0].0, 2325439551141788444); assert_eq!(h.elements[1].0, 15244397589056680708); assert_eq!(h.elements[2].0, 5900587506047513594); diff --git a/plonky2/src/hash/poseidon_bn128_ops.rs b/plonky2/src/hash/poseidon_bn128_ops.rs index 48e770a507..ada239b620 100644 --- a/plonky2/src/hash/poseidon_bn128_ops.rs +++ b/plonky2/src/hash/poseidon_bn128_ops.rs @@ -1,3 +1,6 @@ +#[cfg(feature = "papi")] +use crate::util::papi::{init_papi, stop_papi}; + use super::hash_types::RichField; use super::poseidon::SPONGE_WIDTH; @@ -4195,8 +4198,9 @@ impl ElementBN128 { #[inline] fn mul64trunc(self, a: u64, b: u64) -> u64 { - let c128: u128 = (a as u128) * (b as u128); - c128 as u64 + // let c128: u128 = (a as u128) * (b as u128); + // c128 as u64 + a * b } #[inline] @@ -4222,6 +4226,21 @@ impl ElementBN128 { (r, cout) } + #[inline] + fn add64_no_carry(self, a: u64, b: u64) -> (u64, u64) { + let r; + let cout; + + if a > 0xFFFFFFFFFFFFFFFF - b { + r = a.wrapping_add(b); + cout = 1; + } else { + r = a + b; + cout = 0; + } + (r, cout) + } + #[inline] fn sub64(self, a: u64, b: u64, bin: u64) -> (u64, u64) { debug_assert!(bin == 0 || bin == 
1); @@ -4247,42 +4266,60 @@ impl ElementBN128 { (r, bout) } + #[inline] + fn sub64_no_borrow(self, a: u64, b: u64) -> (u64, u64) { + let r: u64; + let bout; + + if a < b { + // r = 0xFFFFFFFFFFFFFFFF - b + a + 1; + r = a.wrapping_sub(b); + bout = 1; + } else { + r = a - b; + bout = 0; + } + (r, bout) + } + // madd0 hi = a*b + c (discards lo bits) #[inline] fn madd0(self, a: u64, b: u64, c: u64) -> u64 { let (hi, lo) = self.mul64(a, b); - let (_, carry) = self.add64(lo, c, 0); - let (hi, _) = self.add64(hi, 0, carry); - hi + let mut carry = 0u64; + if 0xFFFFFFFFFFFFFFFF - lo < c { + carry = 1; + } + hi + carry } // madd1 hi, lo = a*b + c #[inline] fn madd1(self, a: u64, b: u64, c: u64) -> (u64, u64) { let (hi, lo) = self.mul64(a, b); - let (lo, carry) = self.add64(lo, c, 0); - let (hi, _) = self.add64(hi, 0, carry); - (hi, lo) + let (lo, carry) = self.add64_no_carry(lo, c); + (hi + carry, lo) } // madd2 hi, lo = a*b + c + d #[inline] fn madd2(self, a: u64, b: u64, c: u64, d: u64) -> (u64, u64) { let (hi, lo) = self.mul64(a, b); - let (cc, carry) = self.add64(c, d, 0); - let (hi, _) = self.add64(hi, 0, carry); - let (lo, carry) = self.add64(lo, cc, 0); - let (hi, _) = self.add64(hi, 0, carry); + let (cc, carry) = self.add64_no_carry(c, d); + let htmp = hi + carry; + let (lo, carry) = self.add64_no_carry(lo, cc); + let hi = htmp + carry; (hi, lo) } #[inline] fn madd3(self, a: u64, b: u64, c: u64, d: u64, e: u64) -> (u64, u64) { let (hi, lo) = self.mul64(a, b); - let (cc, carry) = self.add64(c, d, 0); - let (hi, _) = self.add64(hi, 0, carry); - let (lo, carry) = self.add64(lo, cc, 0); - let (hi, _) = self.add64(hi, e, carry); + let (cc, carry) = self.add64_no_carry(c, d); + let htmp = hi + carry; + let (lo, carry) = self.add64_no_carry(lo, cc); + let htmp = htmp + carry; + let hi = htmp + e; (hi, lo) } @@ -4351,10 +4388,10 @@ impl ElementBN128 { && (z[0] < 4891460686036598785u64))))))) { let mut b; - (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[0], b) 
= self.sub64_no_borrow(z[0], 4891460686036598785u64); (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); - (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + z[3] = z[3] - 3486998266802970665u64 - b; } z @@ -4365,10 +4402,10 @@ impl ElementBN128 { let mut z: [u64; 4] = [0u64; 4]; let mut carry; - (z[0], carry) = self.add64(x[0], y[0], 0); + (z[0], carry) = self.add64_no_carry(x[0], y[0]); (z[1], carry) = self.add64(x[1], y[1], carry); (z[2], carry) = self.add64(x[2], y[2], carry); - (z[3], _) = self.add64(x[3], y[3], carry); + z[3] = x[3] + y[3] + carry; // if z > q --> z -= q // note: this is NOT constant time @@ -4381,10 +4418,10 @@ impl ElementBN128 { && (z[0] < 4891460686036598785u64))))))) { let mut b; - (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[0], b) = self.sub64_no_borrow(z[0], 4891460686036598785u64); (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); - (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + z[3] = z[3] - 3486998266802970665u64 - b; } z @@ -4437,10 +4474,10 @@ impl ElementBN128 { && (z[0] < 4891460686036598785u64))))))) { let mut b; - (z[0], b) = self.sub64(z[0], 4891460686036598785u64, 0); + (z[0], b) = self.sub64_no_borrow(z[0], 4891460686036598785u64); (z[1], b) = self.sub64(z[1], 2896914383306846353u64, b); (z[2], b) = self.sub64(z[2], 13281191951274694749u64, b); - (z[3], _) = self.sub64(z[3], 3486998266802970665u64, b); + z[3] = z[3] - 3486998266802970665u64 - b; } z @@ -4550,6 +4587,11 @@ impl PoseidonBN128NativePermutation { } pub fn permute_fn(&self, input: [u64; 12]) -> [u64; 12] { + #[cfg(feature = "papi")] + let mut event_set = init_papi(); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + let mut inp: [ElementBN128; 4] = [ElementBN128::zero(); 4]; for i in 0..4 { inp[i].z[0] = input[i * 3 + 2]; @@ -4559,6 +4601,11 @@ impl PoseidonBN128NativePermutation { 
inp[i].to_mont(); } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "to_mont"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + const CT: usize = 5; const N_ROUNDS_F: usize = 8; const N_ROUNDS_P: usize = 60; @@ -4571,16 +4618,43 @@ impl PoseidonBN128NativePermutation { self.ark(&mut state, C, 0); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "first ark"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + for i in 0..(N_ROUNDS_F / 2 - 1) { self.exp5state(&mut state); self.ark(&mut state, C, (i + 1) * CT); self.mix(&mut state, M); } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "first full rounds"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + self.exp5state(&mut state); + + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "exp5state"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + self.ark(&mut state, C, (N_ROUNDS_F / 2) * CT); + + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "ark"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + self.mix(&mut state, P); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "mix"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + for i in 0..N_ROUNDS_P { state[0].exp5(); let cc = ElementBN128::new(C[(N_ROUNDS_F / 2 + 1) * CT + i]); @@ -4603,14 +4677,28 @@ impl PoseidonBN128NativePermutation { state[0] = new_state0; } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "partial rounds"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + for i in 0..(N_ROUNDS_F / 2 - 1) { self.exp5state(&mut state); - self.ark(&mut state, C, (N_ROUNDS_F / 2 + 1) * CT + N_ROUNDS_P + i * CT); + self.ark( + &mut state, + C, + (N_ROUNDS_F / 2 + 1) * CT + N_ROUNDS_P + i * CT, + ); self.mix(&mut state, M); } self.exp5state(&mut state); self.mix(&mut state, M); + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "second full rounds"); + #[cfg(feature = "papi")] + event_set.start().unwrap(); + let mut out: [u64; 12] = [0; 12]; for i in 
0..4 { let mut r_e = state[i]; @@ -4624,6 +4712,8 @@ impl PoseidonBN128NativePermutation { out[i] = out[i] - 0xFFFFFFFF00000001u64; } } + #[cfg(feature = "papi")] + stop_papi(&mut event_set, "from_mont"); out } diff --git a/plonky2/src/util/mod.rs b/plonky2/src/util/mod.rs index 8f9960034d..f286834469 100644 --- a/plonky2/src/util/mod.rs +++ b/plonky2/src/util/mod.rs @@ -17,6 +17,9 @@ pub mod serialization; pub mod strided_view; pub mod timing; +#[cfg(feature = "papi")] +pub(crate) mod papi; + pub(crate) fn transpose_poly_values(polys: Vec>) -> Vec> { let poly_values = polys.into_iter().map(|p| p.values).collect::>(); transpose(&poly_values) diff --git a/plonky2/src/util/papi.rs b/plonky2/src/util/papi.rs new file mode 100644 index 0000000000..5287379d91 --- /dev/null +++ b/plonky2/src/util/papi.rs @@ -0,0 +1,15 @@ +use papi_bindings::counter::Counter; +use papi_bindings::events_set::EventsSet; + +pub fn init_papi() -> EventsSet { + papi_bindings::initialize(true).unwrap(); + let counters = vec![Counter::from_name("instructions").unwrap()]; + EventsSet::new(&counters).unwrap() +} + +pub fn stop_papi(event_set: &mut EventsSet, msg: &str) { + let counters = event_set.stop().unwrap(); + println!( + "No. of instructions {}: {}", msg, counters[0] + ); +} \ No newline at end of file From 3ce0815038a8af87151efdfaf0fc4c0755d8e7e9 Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Mon, 10 Jun 2024 12:51:32 +0800 Subject: [PATCH 3/7] update readme --- README.md | 55 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 77b73a36ce..ac03dea19f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,58 @@ -# description -this repo is a fork of https://github.com/0xPolygonZero/plonky2. several optimizations were implemented to boost the computation speed. 
+# Description -# optimizations -- precompute of fft twiddle factors -- cuda implementation of Goldilocks Field NTT (feature `cuda`) +This repo is a fork of https://github.com/0xPolygonZero/plonky2. To boost speed, several optimizations were implemented: + +# Optimizations +- Precompute FFT twiddle factors. +- CUDA implementation of Goldilocks Field NTT (feature `cuda`). +- CUDA implementation of Poseidon (Goldilocks) and Poseidon (BN 128) (feature `cuda`). +- Fixed the AVX implementation for Poseidon (Goldilocks) (target CPU must support AVX2). +- CUDA implementation of Merkle Tree building (feature `cuda`). +- Change Merkle Tree structure from recursive to iterative (1-dimensional vector). + +# Dependencies -# dependencies ``` git submodule update --init --recursive ``` -# run examples -- cuda NTT +## Benchmarking Merkle Tree building with Poseison hash + +Set the latest Rust nightly: ``` -cargo run --release -p plonky2_field --features=cuda --example fft +rustup update +rustup override set nightly-x86_64-unknown-linux-gnu ``` +CPU, no AVX: ``cargo bench merkle`` + +CPU with AVX2: ``RUSTFLAGS="-C target-feature=+avx2" cargo bench merkle`` + +CPU with AVX512: ``RUSTFLAGS="-C target-feature=+avx512dq" cargo bench merkle`` + +GPU (CUDA): ``cargo bench merkle --features=cuda`` + +### Results + +The results in the table below represent the build time of a Merkle Tree with the indicated number of leaves (first row) using the hashing method indicated in the first column. 
The systems used for benchmarking are: + +- first three columns: AMD Ryzen Threadripper PRO 5975WX 32-Cores (only AVX2) + +NVIDIA RTX 4090 + +- last three columns: AMD Ryzen 9 7950X 16-Core (AVX2 and AVX512DQ) + + +| Number of MT Leaves | 2^13 | 2^14 | 2^15 | | 2^13 | 2^14 | 2^15 | +| --- | --- | --- | --- | --- | --- | --- | --- | +| Poseidon (no AVX) | 12.4 | 23.4 | 46.6 | | 12.8 | 25.2 | 50.3 | +| Poseidon (AVX) | 11.4 | 21.3 | 39.2 | | 10.3 | 20.3 | 40.2 | +| Poseidon (AVX512) | - | - | - | | 12.3 | 24.1 | 47.8 | +| Poseidon (GPU) | 8 | 14.3 | 26.5 | | - | - | - | +| Poseidon BN 128 (no AVX) | 111.9 | 223 | 446.3 | | 176.9 | 351 | 699.1 | +| Poseidon BN 128 (AVX) | 146.8 | 291.7 | 581.8 | | 220.1 | 433.5 | 858.8 | +| Poseidon BN 128 (AVX512) | - | - | - | | WIP | WIP | WIP | +| Poseidon BN 128 (GPU) | 37.5 | 57.6 | 92.9 | | - | - | - | + ## Running To see recursion performance, one can run this bench, which generates a chain of three recursion proofs: From 3229b837452f6cba512397b9000855783aa9a7cd Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Mon, 10 Jun 2024 12:53:02 +0800 Subject: [PATCH 4/7] update readme --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ac03dea19f..55fbed4e82 100644 --- a/README.md +++ b/README.md @@ -34,12 +34,11 @@ GPU (CUDA): ``cargo bench merkle --features=cuda`` ### Results -The results in the table below represent the build time of a Merkle Tree with the indicated number of leaves (first row) using the hashing method indicated in the first column. The systems used for benchmarking are: +The results in the table below represent the build time (in milliseconds) of a Merkle Tree with the indicated number of leaves (first row) using the hashing method indicated in the first column. 
The systems used for benchmarking are: -- first three columns: AMD Ryzen Threadripper PRO 5975WX 32-Cores (only AVX2) + -NVIDIA RTX 4090 +- first three columns: AMD Ryzen Threadripper PRO 5975WX 32-Cores (only AVX2) + NVIDIA RTX 4090 (feature `cuda`); -- last three columns: AMD Ryzen 9 7950X 16-Core (AVX2 and AVX512DQ) +- last three columns: AMD Ryzen 9 7950X 16-Core (AVX2 and AVX512DQ). | Number of MT Leaves | 2^13 | 2^14 | 2^15 | | 2^13 | 2^14 | 2^15 | From c7fff0e62ee4a321d1102e91bbee1cf7b468ba32 Mon Sep 17 00:00:00 2001 From: Dumitrel Loghin Date: Mon, 1 Jul 2024 11:59:29 +0800 Subject: [PATCH 5/7] cargo fmt --- plonky2/src/hash/arch/x86_64/mod.rs | 13 ++-- .../hash/arch/x86_64/poseidon_bn128_avx2.rs | 24 ++++++-- .../arch/x86_64/poseidon_goldilocks_avx512.rs | 14 ++--- plonky2/src/hash/poseidon.rs | 8 +-- plonky2/src/hash/poseidon_bn128.rs | 61 +++++++++---------- plonky2/src/hash/poseidon_bn128_ops.rs | 5 +- plonky2/src/util/papi.rs | 6 +- 7 files changed, 69 insertions(+), 62 deletions(-) diff --git a/plonky2/src/hash/arch/x86_64/mod.rs b/plonky2/src/hash/arch/x86_64/mod.rs index bfd9da3359..28b49ce53c 100644 --- a/plonky2/src/hash/arch/x86_64/mod.rs +++ b/plonky2/src/hash/arch/x86_64/mod.rs @@ -2,16 +2,15 @@ // // - AVX2 // // - BMI2 (for MULX and SHRX) // #[cfg(all(target_feature = "avx2", target_feature = "bmi2"))] -#[cfg(all(target_feature = "avx2",not(target_feature = "avx512dq")))] -pub mod poseidon_goldilocks_avx2; -#[cfg(all(target_feature = "avx2",target_feature = "avx512dq"))] -pub mod poseidon_goldilocks_avx512; -#[cfg(target_feature = "avx2")] -pub mod poseidon2_goldilocks_avx2; #[cfg(target_feature = "avx2")] pub mod goldilocks_avx2; #[cfg(target_feature = "avx512dq")] pub mod goldilocks_avx512; #[cfg(target_feature = "avx2")] +pub mod poseidon2_goldilocks_avx2; +#[cfg(target_feature = "avx2")] pub mod poseidon_bn128_avx2; - +#[cfg(all(target_feature = "avx2", not(target_feature = "avx512dq")))] +pub mod poseidon_goldilocks_avx2; 
+#[cfg(all(target_feature = "avx2", target_feature = "avx512dq"))] +pub mod poseidon_goldilocks_avx512; diff --git a/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs index 0ca669189b..453d41d894 100644 --- a/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs +++ b/plonky2/src/hash/arch/x86_64/poseidon_bn128_avx2.rs @@ -1,7 +1,6 @@ use core::arch::x86_64::*; use crate::hash::poseidon_bn128_ops::{ElementBN128, C, M, P, S}; - #[cfg(feature = "papi")] use crate::util::papi::{init_papi, stop_papi}; @@ -799,8 +798,6 @@ fn print_state(state: &[ElementBN128; 5]) { println!(); } - - pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { #[cfg(feature = "papi")] let mut event_set = init_papi(); @@ -814,9 +811,24 @@ pub fn permute_bn128_avx(input: [u64; 12]) -> [u64; 12] { unsafe { // load states let mut inp: [__m256i; 4] = [ - _mm256_set_epi64x(input[11] as i64, input[8] as i64, input[5] as i64, input[2] as i64), - _mm256_set_epi64x(input[10] as i64, input[7] as i64, input[4] as i64, input[1] as i64), - _mm256_set_epi64x(input[9] as i64, input[6] as i64, input[3] as i64, input[0] as i64), + _mm256_set_epi64x( + input[11] as i64, + input[8] as i64, + input[5] as i64, + input[2] as i64, + ), + _mm256_set_epi64x( + input[10] as i64, + input[7] as i64, + input[4] as i64, + input[1] as i64, + ), + _mm256_set_epi64x( + input[9] as i64, + input[6] as i64, + input[3] as i64, + input[0] as i64, + ), _mm256_set_epi64x(0i64, 0i64, 0i64, 0i64), ]; diff --git a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs index fdaf321681..0ab9dc9146 100644 --- a/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs +++ b/plonky2/src/hash/arch/x86_64/poseidon_goldilocks_avx512.rs @@ -223,8 +223,8 @@ where unsafe { let mut r0 = _mm512_loadu_si512((&mut result[0..8]).as_mut_ptr().cast::()); let mut r1 = _mm512_loadu_si512((&mut result[4..12]).as_mut_ptr().cast::()); - 
- for r in 1..12 { + + for r in 1..12 { let sr512 = _mm512_set_epi64( state[r].to_canonical_u64() as i64, state[r].to_canonical_u64() as i64, @@ -322,19 +322,19 @@ where _mm512_storeu_si512((state[0..8]).as_mut_ptr().cast::(), r0); _mm512_storeu_si512((state[4..12]).as_mut_ptr().cast::(), r1); - *state = ::mds_layer(&state); + *state = ::mds_layer(&state); round_ctr += 1; - } - partial_first_constant_layer_avx(&mut state); + } + partial_first_constant_layer_avx(&mut state); mds_partial_layer_init_avx(&mut state); - + for i in 0..N_PARTIAL_ROUNDS { state[0] = sbox_monomial(state[0]); state[0] = state[0].add_canonical_u64(FAST_PARTIAL_ROUND_CONSTANTS[i]); *state = ::mds_partial_layer_fast(&state, i); } round_ctr += N_PARTIAL_ROUNDS; - + // Self::full_rounds(&mut state, &mut round_ctr); for _ in 0..HALF_N_FULL_ROUNDS { // load state diff --git a/plonky2/src/hash/poseidon.rs b/plonky2/src/hash/poseidon.rs index 410e6d2094..b1cfb20d63 100644 --- a/plonky2/src/hash/poseidon.rs +++ b/plonky2/src/hash/poseidon.rs @@ -8,6 +8,10 @@ use core::fmt::Debug; use plonky2_field::packed::PackedField; use unroll::unroll_for_loops; +#[cfg(all(target_feature = "avx2", not(target_feature = "avx512dq")))] +use super::arch::x86_64::poseidon_goldilocks_avx2::poseidon_avx; +#[cfg(all(target_feature = "avx2", target_feature = "avx512dq"))] +use super::arch::x86_64::poseidon_goldilocks_avx512::poseidon_avx512; use super::hash_types::HashOutTarget; use crate::field::extension::{Extendable, FieldExtension}; use crate::field::types::{Field, PrimeField64}; @@ -20,10 +24,6 @@ use crate::iop::ext_target::ExtensionTarget; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; use crate::plonk::config::{AlgebraicHasher, Hasher, HasherType}; -#[cfg(all(target_feature = "avx2", not(target_feature = "avx512dq")))] -use super::arch::x86_64::poseidon_goldilocks_avx2::poseidon_avx; -#[cfg(all(target_feature = "avx2", target_feature = "avx512dq"))] -use 
super::arch::x86_64::poseidon_goldilocks_avx512::poseidon_avx512; pub const SPONGE_RATE: usize = 8; pub const SPONGE_CAPACITY: usize = 4; diff --git a/plonky2/src/hash/poseidon_bn128.rs b/plonky2/src/hash/poseidon_bn128.rs index 67e224d911..e143d53502 100644 --- a/plonky2/src/hash/poseidon_bn128.rs +++ b/plonky2/src/hash/poseidon_bn128.rs @@ -8,16 +8,16 @@ use super::poseidon::PoseidonPermutation; use crate::field::extension::quadratic::QuadraticExtension; use crate::field::extension::Extendable; use crate::field::goldilocks_field::GoldilocksField; +#[cfg(target_feature = "avx2")] +use crate::hash::arch::x86_64::poseidon_bn128_avx2::permute_bn128_avx; use crate::hash::hash_types::{HashOut, RichField}; use crate::hash::hashing::{compress, hash_n_to_hash_no_pad, PlonkyPermutation}; use crate::hash::poseidon::{PoseidonHash, SPONGE_RATE, SPONGE_WIDTH}; +#[cfg(not(target_feature = "avx2"))] +use crate::hash::poseidon_bn128_ops::PoseidonBN128NativePermutation; use crate::iop::target::{BoolTarget, Target}; use crate::plonk::circuit_builder::CircuitBuilder; use crate::plonk::config::{AlgebraicHasher, GenericConfig, Hasher, HasherType}; -#[cfg(not(target_feature = "avx2"))] -use crate::hash::poseidon_bn128_ops::PoseidonBN128NativePermutation; -#[cfg(target_feature = "avx2")] -use crate::hash::arch::x86_64::poseidon_bn128_avx2::permute_bn128_avx; #[derive(Copy, Clone, Default, Debug, PartialEq)] pub struct PoseidonBN128Permutation { @@ -151,18 +151,18 @@ impl PlonkyPermutation for PoseidonBN128Permutation { fn permute(&mut self) { assert_eq!(SPONGE_WIDTH, 12); let su64: [u64; 12] = [ - self.state[0].to_canonical_u64(), - self.state[1].to_canonical_u64(), - self.state[2].to_canonical_u64(), - self.state[3].to_canonical_u64(), - self.state[4].to_canonical_u64(), - self.state[5].to_canonical_u64(), - self.state[6].to_canonical_u64(), - self.state[7].to_canonical_u64(), - self.state[8].to_canonical_u64(), - self.state[9].to_canonical_u64(), - self.state[10].to_canonical_u64(), - 
self.state[11].to_canonical_u64(), + self.state[0].to_canonical_u64(), + self.state[1].to_canonical_u64(), + self.state[2].to_canonical_u64(), + self.state[3].to_canonical_u64(), + self.state[4].to_canonical_u64(), + self.state[5].to_canonical_u64(), + self.state[6].to_canonical_u64(), + self.state[7].to_canonical_u64(), + self.state[8].to_canonical_u64(), + self.state[9].to_canonical_u64(), + self.state[10].to_canonical_u64(), + self.state[11].to_canonical_u64(), ]; #[cfg(not(target_feature = "avx2"))] @@ -173,18 +173,18 @@ impl PlonkyPermutation for PoseidonBN128Permutation { let out = permute_bn128_avx(su64); let permute_output = [ - F::from_canonical_u64(out[0]), - F::from_canonical_u64(out[1]), - F::from_canonical_u64(out[2]), - F::from_canonical_u64(out[3]), - F::from_canonical_u64(out[4]), - F::from_canonical_u64(out[5]), - F::from_canonical_u64(out[6]), - F::from_canonical_u64(out[7]), - F::from_canonical_u64(out[8]), - F::from_canonical_u64(out[9]), - F::from_canonical_u64(out[10]), - F::from_canonical_u64(out[11]), + F::from_canonical_u64(out[0]), + F::from_canonical_u64(out[1]), + F::from_canonical_u64(out[2]), + F::from_canonical_u64(out[3]), + F::from_canonical_u64(out[4]), + F::from_canonical_u64(out[5]), + F::from_canonical_u64(out[6]), + F::from_canonical_u64(out[7]), + F::from_canonical_u64(out[8]), + F::from_canonical_u64(out[9]), + F::from_canonical_u64(out[10]), + F::from_canonical_u64(out[11]), ]; self.set_from_slice(&permute_output, 0) @@ -260,10 +260,9 @@ impl GenericConfig<2> for PoseidonBN128GoldilocksConfig { mod tests { use anyhow::Result; use plonky2_field::types::Field; + use super::PoseidonBN128Hash; - use crate::plonk::config::{ - GenericConfig, GenericHashOut, Hasher, PoseidonGoldilocksConfig, - }; + use crate::plonk::config::{GenericConfig, GenericHashOut, Hasher, PoseidonGoldilocksConfig}; #[test] fn test_poseidon_bn128_hash_no_pad() -> Result<()> { diff --git a/plonky2/src/hash/poseidon_bn128_ops.rs 
b/plonky2/src/hash/poseidon_bn128_ops.rs index ada239b620..0009ce4eb9 100644 --- a/plonky2/src/hash/poseidon_bn128_ops.rs +++ b/plonky2/src/hash/poseidon_bn128_ops.rs @@ -1,8 +1,7 @@ -#[cfg(feature = "papi")] -use crate::util::papi::{init_papi, stop_papi}; - use super::hash_types::RichField; use super::poseidon::SPONGE_WIDTH; +#[cfg(feature = "papi")] +use crate::util::papi::{init_papi, stop_papi}; #[allow(dead_code)] pub const RSQUARE: [u64; 4] = [ diff --git a/plonky2/src/util/papi.rs b/plonky2/src/util/papi.rs index 5287379d91..037479187e 100644 --- a/plonky2/src/util/papi.rs +++ b/plonky2/src/util/papi.rs @@ -9,7 +9,5 @@ pub fn init_papi() -> EventsSet { pub fn stop_papi(event_set: &mut EventsSet, msg: &str) { let counters = event_set.stop().unwrap(); - println!( - "No. of instructions {}: {}", msg, counters[0] - ); -} \ No newline at end of file + println!("No. of instructions {}: {}", msg, counters[0]); +} From 436851f1f4995f302135dcf19015a6a388f44998 Mon Sep 17 00:00:00 2001 From: Dumitrel Loghin Date: Mon, 1 Jul 2024 12:05:29 +0800 Subject: [PATCH 6/7] remove stdarch_x86_avx512 --- plonky2/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plonky2/src/lib.rs b/plonky2/src/lib.rs index 4fc1e03841..1325f51555 100644 --- a/plonky2/src/lib.rs +++ b/plonky2/src/lib.rs @@ -3,7 +3,7 @@ #![deny(rustdoc::broken_intra_doc_links)] #![deny(missing_debug_implementations)] #![cfg_attr(not(feature = "std"), no_std)] -#![feature(stdarch_x86_avx512)] +// #![feature(stdarch_x86_avx512)] // #[cfg(not(feature = "std"))] pub extern crate alloc; From beb6e1174a5cd3f79cec4d9edb19d29728538f8b Mon Sep 17 00:00:00 2001 From: Dumitrel Loghin Date: Mon, 1 Jul 2024 12:22:19 +0800 Subject: [PATCH 7/7] fix test issue --- plonky2/src/hash/poseidon_bn128_ops.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/plonky2/src/hash/poseidon_bn128_ops.rs b/plonky2/src/hash/poseidon_bn128_ops.rs index 0009ce4eb9..38b573fe98 100644 --- 
a/plonky2/src/hash/poseidon_bn128_ops.rs +++ b/plonky2/src/hash/poseidon_bn128_ops.rs @@ -4197,9 +4197,8 @@ impl ElementBN128 { #[inline] fn mul64trunc(self, a: u64, b: u64) -> u64 { - // let c128: u128 = (a as u128) * (b as u128); - // c128 as u64 - a * b + let c128: u128 = (a as u128) * (b as u128); + c128 as u64 } #[inline]