From dbae141c461f383bd1639e58426f046f52aaa017 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 21 Sep 2018 14:35:12 +0200 Subject: [PATCH 1/2] GF RING: get rid of butterfly operations --- src/gf_ring.cpp | 102 ------------------------------------------------ src/gf_ring.h | 85 ---------------------------------------- 2 files changed, 187 deletions(-) diff --git a/src/gf_ring.cpp b/src/gf_ring.cpp index 90bd8542..e5b6a29f 100644 --- a/src/gf_ring.cpp +++ b/src/gf_ring.cpp @@ -59,108 +59,6 @@ void RingModN::mul_coef_to_buf( simd::mul_coef_to_buf(a, src, dest, len, this->_card); } -template <> -void RingModN::butterfly_ct( - uint16_t coef, - uint16_t* buf1, - uint16_t* buf2, - size_t len) const -{ - unsigned ratio = simd::countof(); - size_t vec_len = len / ratio; - size_t last_len = len - vec_len * ratio; - - // perform vector operations - simd::butterfly_ct(coef, buf1, buf2, vec_len, this->_card); - - // for last elements, perform as non-SIMD method - if (last_len > 0) { - for (size_t i = vec_len * ratio; i < len; ++i) { - uint16_t a = buf1[i]; - uint16_t b = mul(coef, buf2[i]); - buf1[i] = add(a, b); - buf2[i] = sub(a, b); - } - } -} - -template <> -void RingModN::butterfly_ct( - uint32_t coef, - uint32_t* buf1, - uint32_t* buf2, - size_t len) const -{ - unsigned ratio = simd::countof(); - size_t vec_len = len / ratio; - size_t last_len = len - vec_len * ratio; - - // perform vector operations - simd::butterfly_ct(coef, buf1, buf2, vec_len, this->_card); - - // for last elements, perform as non-SIMD method - if (last_len > 0) { - for (size_t i = vec_len * ratio; i < len; ++i) { - uint32_t a = buf1[i]; - uint32_t b = mul(coef, buf2[i]); - buf1[i] = add(a, b); - buf2[i] = sub(a, b); - } - } -} - -template <> -void RingModN::butterfly_gs( - uint16_t coef, - uint16_t* buf1, - uint16_t* buf2, - size_t len) const -{ - unsigned ratio = simd::countof(); - size_t vec_len = len / ratio; - size_t last_len = len - vec_len * ratio; - - // perform vector operations - simd::butterfly_gs(coef, buf1, buf2, vec_len, this->_card); - - // for last elements, perform as non-SIMD method - if (last_len > 0) { - for (size_t i = vec_len * ratio; i < len; ++i) { - uint16_t a = buf1[i]; - uint16_t b = buf2[i]; - uint16_t c = sub(a, b); - buf1[i] = add(a, b); - buf2[i] = mul(coef, c); - } - } -} - -template <> -void RingModN::butterfly_gs( - uint32_t coef, - uint32_t* buf1, - uint32_t* buf2, - size_t len) const -{ - unsigned ratio = simd::countof(); - size_t vec_len = len / ratio; - size_t last_len = len - vec_len * ratio; - - // perform vector operations - simd::butterfly_gs(coef, buf1, buf2, vec_len, this->_card); - - // for last elements, perform as non-SIMD method - if (last_len > 0) { - for (size_t i = vec_len * ratio; i < len; ++i) { - uint32_t a = buf1[i]; - uint32_t b = buf2[i]; - uint32_t c = sub(a, b); - buf1[i] = add(a, b); - buf2[i] = mul(coef, c); - } - } -} - template <> void RingModN::add_two_bufs(uint32_t* src, uint32_t* dest, size_t len) const diff --git a/src/gf_ring.h b/src/gf_ring.h index 2fa85d1e..173cb384 100644 --- a/src/gf_ring.h +++ b/src/gf_ring.h @@ -108,8 +108,6 @@ class RingModN { vec::Buffers& veca, vec::Buffers& vecb, vec::Buffers& res) const; - virtual void butterfly_ct(T coef, T* buf1, T* buf2, size_t len) const; - virtual void butterfly_gs(T coef, T* buf1, T* buf2, size_t len) const; bool is_quadratic_residue(T q) const; virtual void compute_omegas(vec::Vector& W, int n, T w) const; void compute_omegas_cached(vec::Vector& W, int n, T w) const; @@ -464,61 +462,6 @@ inline void RingModN::sub_vecp_to_vecp( } } -/** Butterfly computation for Cooley-Tukey FFT algorithm - * - * Perform in-place oprations on two buffers `P`, `Q` with a coefficient `c` - * - * \f{eqnarray*}{ - * P_i &= P_i + c \times Q_i \\ - * Q_i &= P_i - c \times Q_i \\ - * \f} - * - * @param coef - coefficient, a finite field element - * @param buf1 - a buffer of `len` elements - * @param buf2 - a buffer of `len` elements - * @param len - number of elements per buffer - */ -template -inline void -RingModN::butterfly_ct(T coef, T* buf1, T* buf2, size_t len) const -{ - size_t i; - for (i = 0; i < len; ++i) { - T a = buf1[i]; - T b = mul(coef, buf2[i]); - buf1[i] = add(a, b); - buf2[i] = sub(a, b); - } -} - -/** Butterfly computation for Gentleman-Sande FFT algorithm - * - * Perform in-place oprations on two buffers `P`, `Q` with a coefficient `c` - * - * \f{eqnarray*}{ - * P_i &= P_i + Q_i \\ - * Q_i &= c \times (P_i - Q_i) - * \f} - * - * @param coef - coefficient, a finite field element - * @param buf1 - a buffer of `len` elements - * @param buf2 - a buffer of `len` elements - * @param len - number of elements per buffer - */ -template -inline void -RingModN::butterfly_gs(T coef, T* buf1, T* buf2, size_t len) const -{ - size_t i; - for (i = 0; i < len; ++i) { - T a = buf1[i]; - T b = buf2[i]; - T c = sub(a, b); - buf1[i] = add(a, b); - buf2[i] = mul(coef, c); - } -} - template void RingModN::compute_factors_of_order() { @@ -1005,34 +948,6 @@ void RingModN::sub_two_bufs( uint32_t* res, size_t len) const; -template <> -void RingModN::butterfly_ct( - uint16_t coef, - uint16_t* buf1, - uint16_t* buf2, - size_t len) const; - -template <> -void RingModN::butterfly_ct( - uint32_t coef, - uint32_t* buf1, - uint32_t* buf2, - size_t len) const; - -template <> -void RingModN::butterfly_gs( - uint16_t coef, - uint16_t* buf1, - uint16_t* buf2, - size_t len) const; - -template <> -void RingModN::butterfly_gs( - uint32_t coef, - uint32_t* buf1, - uint32_t* buf2, - size_t len) const; - template <> void RingModN::hadamard_mul(int n, uint16_t* x, uint16_t* y) const; template <> From af01aed761c402ea854d0eb5cde00c7e8b2cc7bc Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 15 Oct 2018 11:36:14 +0200 Subject: [PATCH 2/2] FFT2n.h: rewrite butterfly CT & GS 1. Butterfly CT 1.1. single-layer butterfly For each pair `(P, Q) = (buf[i], buf[i + m]` for `step = 2 * m` ``` coef r1 = W[start * n / (2 * m)] P = P + r1 * Q Q = P - r1 * Q ``` 1.2. 2-layer butterfly For each quadruple `(P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m])` First layer: butterfly on `(P, Q)` and` (R, S)` for `step = 2 * m` ``` coef r1 = W[start * n / (2 * m)] P = P + r1 * Q Q = P - r1 * Q R = R + r1 * S S = R - r1 * S ``` Second layer: butterfly on `(P, R)` and `(Q, S)` for `step = 4 * m` ``` coef r2 = W[start * n / (4 * m)] coef r3 = W[(start + m) * n / (4 * m)] P = P + r2 * R R = P - r2 * R Q = Q + r3 * S S = Q - r3 * S ``` 2. Butterfly GS 2.1. single-layer butterfly For each pair `(P, Q) = (buf[i], buf[i + m]` for `step = 2 * m` ``` coef r = inv_W[start * n / (2 * m)] P = P + Q Q = r * (P - Q) ``` --- src/fft_2n.h | 285 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 254 insertions(+), 31 deletions(-) diff --git a/src/fft_2n.h b/src/fft_2n.h index 17b57ade..fd39aca7 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -89,15 +89,64 @@ class Radix2 : public FourierTransform { void init_bitrev(); void bit_rev_permute(vec::Vector& vec); void bit_rev_permute(vec::Buffers& vec); + void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step); + void butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); + void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step); + void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step); + + // Only used for non-vectorized elements + void butterfly_ct_step_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset = 0); + void butterfly_gs_step_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset = 0); + void butterfly_gs_step_simple_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset = 0); unsigned data_len; // number of real input elements + T card; + T card_minus_one; T w; T inv_w; size_t pkt_size; + size_t buf_size; std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; + T* vec_W; }; /** @@ -120,12 +169,18 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) inv_w = gf.inv(w); this->pkt_size = pkt_size; this->data_len = data_len > 0 ? data_len : n; + buf_size = pkt_size * sizeof(T); W = std::unique_ptr>(new vec::Vector(gf, n)); inv_W = std::unique_ptr>(new vec::Vector(gf, n)); gf.compute_omegas(*W, n, w); gf.compute_omegas(*inv_W, n, inv_w); + vec_W = W->get_mem(); + + card = this->gf->card(); + card_minus_one = this->gf->card_minus_one(); + rev = std::unique_ptr(new T[n]); init_bitrev(); } @@ -284,39 +339,127 @@ template void Radix2::fft(vec::Buffers& output, vec::Buffers& input) { const unsigned len = this->n; - const unsigned size = this->pkt_size; const unsigned input_len = input.get_n(); // to support FFT on input vectors of length greater than from `data_len` const unsigned group_len = (input_len > data_len) ? len / input_len : len / data_len; + const std::vector& i_mem = input.get_mem(); + const std::vector& o_mem = output.get_mem(); + for (unsigned idx = 0; idx < input_len; ++idx) { // set output = scramble(input), i.e. bit reversal ordering - T* a = input.get(idx); - const unsigned end = rev[idx] + group_len; - for (unsigned i = rev[idx]; i < end; ++i) { - output.copy(i, a); + for (unsigned i = rev[idx]; i < rev[idx] + group_len; ++i) { + memcpy(o_mem[i], i_mem[idx], buf_size); } } for (unsigned idx = input_len; idx < data_len; ++idx) { // set output = scramble(input), i.e. bit reversal ordering - const unsigned end = rev[idx] + group_len; - for (unsigned i = rev[idx]; i < end; ++i) { - output.fill(i, 0); + for (unsigned i = rev[idx]; i < rev[idx] + group_len; ++i) { + memset(o_mem[i], 0, buf_size); } } - // perform butterfly operations - for (unsigned m = group_len; m < len; m *= 2) { - const unsigned doubled_m = 2 * m; + + // ---------------------- + // Two layers at a time + // ---------------------- + unsigned m = group_len; + const unsigned end = len / 2; + for (; m < end; m <<= 2) { for (unsigned j = 0; j < m; ++j) { - const T r = W->get(j * len / doubled_m); - for (unsigned i = j; i < len; i += doubled_m) { - T* a = output.get(i); - T* b = output.get(i + m); + butterfly_ct_two_layers_step(output, j, m); + } + } + if (m < len) { + assert(m == end); + // perform the last butterfly operations + for (unsigned j = 0; j < m; ++j) { + const T r = W->get(j); + butterfly_ct_step(output, r, j, m, len); + } + } +} - // perform butterfly operation for Cooley-Tukey FFT algorithm - this->gf->butterfly_ct(r, a, b, size); - } +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P + c * Q +// Q = P - c * Q +template +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step) +{ + butterfly_ct_step_slow(buf, r, start, m, step); +} + +/** + * Butterly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param start - index of buffer among `m` ones + * @param m - current group size + */ +template +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned step = m << 2; + // --------- + // First layer + // --------- + const T r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step(buf, r1, start, m, step); + // second pair + butterfly_ct_step(buf, r1, start + 2 * m, m, step); + // --------- + // Second layer + // --------- + // first pair + const T r2 = W->get(start * this->n / m / 4); + butterfly_ct_step(buf, r2, start, 2 * m, step); + // second pair + const T r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step(buf, r3, start + m, 2 * m, step); +} + +template +void Radix2::butterfly_ct_step_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset) +{ + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->mul(coef, b[j]); + b[j] = this->gf->sub(a[j], x); + a[j] = this->gf->add(a[j], x); } } } @@ -336,32 +479,45 @@ void Radix2::fft(vec::Buffers& output, vec::Buffers& input) template void Radix2::fft_inv(vec::Buffers& output, vec::Buffers& input) { - const unsigned size = this->pkt_size; const unsigned len = this->n; const unsigned input_len = input.get_n(); // 1st reversion of elements of output bit_rev_permute(output); + // copy input to output + const std::vector& i_mem = input.get_mem(); + const std::vector& o_mem = output.get_mem(); unsigned i; for (i = 0; i < input_len; ++i) { - output.copy(i, input.get(i)); + memcpy(o_mem[i], i_mem[i], buf_size); } - for (; i < len; ++i) { - output.fill(i, 0); + + unsigned m = len / 2; + + if (input_len < len) { + const unsigned input_len_power_2 = + arith::get_smallest_power_of_2(input_len); + for (; i < input_len_power_2; ++i) { + memset(o_mem[i], 0, buf_size); + } + + // For Q are zeros only => Q = c * P + for (; m >= input_len; m /= 2) { + unsigned doubled_m = 2 * m; + for (unsigned j = 0; j < m; ++j) { + const T r = inv_W->get(j * len / doubled_m); + butterfly_gs_step_simple(output, r, j, m, doubled_m); + } + } } - for (unsigned m = len / 2; m >= 1; m /= 2) { + // Next, normal butterlfy GS is performed + for (; m >= 1; m /= 2) { unsigned doubled_m = 2 * m; for (unsigned j = 0; j < m; ++j) { - T r = inv_W->get(j * len / doubled_m); - for (unsigned i = j; i < len; i += doubled_m) { - T* a = output.get(i); - T* b = output.get(i + m); - - // perform butterfly operation for Gentleman-Sande FFT algorithm - this->gf->butterfly_gs(r, a, b, size); - } + const T r = inv_W->get(j * len / doubled_m); + butterfly_gs_step(output, r, j, m, doubled_m); } } @@ -369,6 +525,73 @@ void Radix2::fft_inv(vec::Buffers& output, vec::Buffers& input) bit_rev_permute(output); } +// for each pair (P, Q) = (buf[i], buf[i + m]): +// Q = c * P +template +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step) +{ + butterfly_gs_step_simple_slow(buf, coef, start, m, step); +} + +// for each pair (P, Q) = (buf[i], buf[i + m]): +// P = P + Q +// Q = c * (P - Q) +template +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step) +{ + butterfly_gs_step_slow(buf, coef, start, m, step); +} + +template +void Radix2::butterfly_gs_step_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset) +{ + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + T x = this->gf->sub(a[j], b[j]); + a[j] = this->gf->add(a[j], b[j]); + b[j] = this->gf->mul(coef, x); + } + } +} + +template +void Radix2::butterfly_gs_step_simple_slow( + vec::Buffers& buf, + T coef, + unsigned start, + unsigned m, + unsigned step, + size_t offset) +{ + for (int i = start; i < this->n; i += step) { + T* a = buf.get(i); + T* b = buf.get(i + m); + // perform butterfly operation for Cooley-Tukey FFT algorithm + for (size_t j = offset; j < this->pkt_size; ++j) { + b[j] = this->gf->mul(coef, a[j]); + } + } +} + template void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) {