From acc1ee8890b0be3cd3c3e79346cf3dba1531bf35 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:34 +0200 Subject: [PATCH] SIMD FNT: vectorised operations for FNT --- src/simd_fnt.h | 555 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) create mode 100644 src/simd_fnt.h diff --git a/src/simd_fnt.h b/src/simd_fnt.h new file mode 100644 index 00000000..fc92b2d0 --- /dev/null +++ b/src/simd_fnt.h @@ -0,0 +1,555 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +#include + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +// butterfly CT with r == 1 +template +inline void BUTTERFLY_1(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*x, *y, q); + *x = add; +} + +// butterfly CT with r == q - 1 +template +inline void BUTTERFLY_2(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *x = SUB_MOD(*x, *y, q); + *y = add; +} + +// butterfly CT with 1 < r < q - 1 +template +inline void BUTTERFLY_3(VecType c, VecType* x, VecType* y, T q) +{ + VecType z = MUL_MOD(c, *y, q); + *y = SUB_MOD(*x, z, q); + *x = ADD_MOD(*x, z, q); +} + +template +inline void BUTTERFLY_CT(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + if (rp1 == 2) { + BUTTERFLY_1(x, y, q); + } else if (rp1 < q) { + BUTTERFLY_3(c, x, y, q); + } else { + BUTTERFLY_2(x, y, q); + } +} + +// butterfly GS w/ r = q - 1 +template +inline void BUTTERFLY_4(VecType* x, VecType* y, T q) +{ + VecType add = ADD_MOD(*x, *y, q); + *y = SUB_MOD(*y, *x, q); + *x = add; +} + +// butterfly GS w/ 1 < r < q - 1 +// x = x + y mod q +// y = z * (x - y) mod q +template +inline void BUTTERFLY_5(VecType c, VecType* x, VecType* y, T q) +{ + VecType sub = SUB_MOD(*x, *y, q); + *x = ADD_MOD(*x, *y, q); + *y = MUL_MOD(c, sub, q); +} + +template +inline void BUTTERFLY_GS(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + if (rp1 == 2) { + BUTTERFLY_1(x, y, q); + } else if (rp1 < q) { + BUTTERFLY_5(c, x, y, q); + } else { + BUTTERFLY_4(x, y, q); + } +} + +template +inline VecType BUTTERFLY_GS_SIMPLE(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return MUL_MOD(c, x, q); + } else { + return NEG_MOD(x, q); + } +} + +/** + * Vectorized butterly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_CT(rp1, c, &x1, &y1, card); + + x2 = LOAD(p + j + 1); + y2 = LOAD(q + j + 1); + + BUTTERFLY_CT(rp1, c, &x2, &y2, card); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_CT(rp1, c, &x1, &y1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = SET1(r1); + VecType c2 = SET1(r2); + VecType c3 = SET1(r3); + + VecType* __restrict p = reinterpret_cast(mem[start]); + VecType* __restrict q = reinterpret_cast(mem[start + m]); + VecType* __restrict r = reinterpret_cast(mem[start + 2 * m]); + VecType* __restrict s = reinterpret_cast(mem[start + 3 * m]); + + // #pragma omp parallel for + size_t j = 0; + const size_t end = (len > 1) ? len - 1 : 0; + // #pragma unroll + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LOAD(p); + VecType x2 = LOAD(p + 1); + VecType y1 = LOAD(q); + VecType y2 = LOAD(q + 1); + + BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); + BUTTERFLY_CT(r1p1, c1, &x2, &y2, card); + + VecType u1 = LOAD(r); + VecType u2 = LOAD(r + 1); + VecType v1 = LOAD(s); + VecType v2 = LOAD(s + 1); + + BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); + BUTTERFLY_CT(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); + BUTTERFLY_CT(r2p1, c2, &x2, &u2, card); + + BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); + BUTTERFLY_CT(r3p1, c3, &y2, &v2, card); + + // Store back to memory + STORE(p, x1); + STORE(p + 1, x2); + STORE(q, y1); + STORE(q + 1, y2); + + STORE(r, u1); + STORE(r + 1, u2); + STORE(s, v1); + STORE(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = LOAD(p + j); + VecType y1 = LOAD(q + j); + VecType u1 = LOAD(r + j); + VecType v1 = LOAD(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + BUTTERFLY_CT(r1p1, c1, &x1, &y1, card); + BUTTERFLY_CT(r1p1, c1, &u1, &v1, card); + BUTTERFLY_CT(r2p1, c2, &x1, &u1, card); + BUTTERFLY_CT(r3p1, c3, &y1, &v1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + STORE(r + j, u1); + STORE(s + j, v1); + } +} + +/** + * Vectorized butterly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 3) ? len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 4) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + x3 = LOAD(p + j + 2); + x4 = LOAD(p + j + 3); + y1 = LOAD(q + j); + y2 = LOAD(q + j + 1); + y3 = LOAD(q + j + 2); + y4 = LOAD(q + j + 3); + + BUTTERFLY_GS(rp1, c, &x1, &y1, card); + BUTTERFLY_GS(rp1, c, &x2, &y2, card); + BUTTERFLY_GS(rp1, c, &x3, &y3, card); + BUTTERFLY_GS(rp1, c, &x4, &y4, card); + + // Store back to memory + STORE(p + j, x1); + STORE(p + j + 1, x2); + STORE(p + j + 2, x3); + STORE(p + j + 3, x4); + STORE(q + j, y1); + STORE(q + j + 1, y2); + STORE(q + j + 2, y3); + STORE(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + y1 = LOAD(q + j); + + BUTTERFLY_GS(rp1, c, &x1, &y1, card); + + // Store back to memory + STORE(p + j, x1); + STORE(q + j, y1); + } + } +} + +/** + * Vectorized butterly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = SET1(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + // #pragma omp parallel for + // #pragma unroll + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* __restrict p = reinterpret_cast(mem[i]); + VecType* __restrict q = reinterpret_cast(mem[i + m]); + + // #pragma omp parallel for + size_t j = 0; + // #pragma unroll + for (; j < end; j += 2) { + x1 = LOAD(p + j); + x2 = LOAD(p + j + 1); + + y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); + y2 = BUTTERFLY_GS_SIMPLE(rp1, c, x2, card); + + // Store back to memory + STORE(q + j, y1); + STORE(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = LOAD(p + j); + + y1 = BUTTERFLY_GS_SIMPLE(rp1, c, x1, card); + + // Store back to memory + STORE(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = SET1(threshold); + const VecType mask_hi = SET1(max); + + // #pragma unroll + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* __restrict buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? vecs_nb - 3 : 0; + // #pragma unroll + for (; vec_id < end; vec_id += 4) { + VecType a1 = LOAD(buf + vec_id); + VecType a2 = LOAD(buf + vec_id + 1); + VecType a3 = LOAD(buf + vec_id + 2); + VecType a4 = LOAD(buf + vec_id + 3); + + if (TESTZ(a1, _threshold) == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (TESTZ(a2, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (TESTZ(a3, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (TESTZ(a4, _threshold) == 0) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = LOAD(buf + vec_id); + uint32_t c = TESTZ(a, _threshold); + if (c == 0) { + const off_t curr_offset = offset + vec_id * vec_size; + ADD_PROPS( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif