From d2f771bf85132b643cbd52240fd5cf7d5f3f03b0 Mon Sep 17 00:00:00 2001 From: Naoki Shibata Date: Sat, 15 Mar 2025 21:08:35 +0900 Subject: [PATCH] This patch adds more options for unrolling DFT kernels --- src/dft/CMakeLists.txt | 82 +- src/dft/dft.cpp | 97 +- src/dft/dftcommon.hpp | 18 +- src/dft/mkdispatch.c | 67 +- src/dft/mkunroll.c | 24 +- src/dft/unroll2.cpp.in | 4856 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 5074 insertions(+), 70 deletions(-) create mode 100644 src/dft/unroll2.cpp.in diff --git a/src/dft/CMakeLists.txt b/src/dft/CMakeLists.txt index 793f73fa..18f8bd36 100644 --- a/src/dft/CMakeLists.txt +++ b/src/dft/CMakeLists.txt @@ -11,12 +11,29 @@ if (SLEEFDFT_MAXBUTWIDTH GREATER 7) message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." ) endif() -set(SLEEFDFT_MAXSHIFT 2 CACHE STRING "Max hardcoded shift") +set(SLEEFDFT_MINSHIFT 1 CACHE STRING "Min hardcoded shift") +set(SLEEFDFT_MAXSHIFT 1 CACHE STRING "Max hardcoded shift") + +if ((${SLEEFDFT_MINSHIFT} GREATER ${SLEEFDFT_MAXSHIFT}) OR (${SLEEFDFT_MINSHIFT} LESS 1)) + message(FATAL_ERROR "SLEEFDFT_MINSHIFT, SLEEFDFT_MAXSHIFT range error") +endif() math(EXPR SLEEFDFT_MAXSHIFT_MINUS_1 "${SLEEFDFT_MAXSHIFT} - 1") -foreach(J RANGE 0 ${SLEEFDFT_MAXSHIFT_MINUS_1}) - list(APPEND LISTSHIFTSTR ${J}) -endforeach() +if (${SLEEFDFT_MINSHIFT} LESS ${SLEEFDFT_MAXSHIFT}) + foreach(J RANGE ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT_MINUS_1}) + list(APPEND LISTSHIFTSTR ${J}) + endforeach() +else() + set(LISTSHIFTSTR) +endif() + +if (1 LESS ${SLEEFDFT_MAXSHIFT}) + foreach(J RANGE 1 ${SLEEFDFT_MAXSHIFT_MINUS_1}) + list(APPEND LISTSHIFTSTR2 ${J}) + endforeach() +else() + set(LISTSHIFTSTR2) +endif() # Settings @@ -30,6 +47,10 @@ set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1) set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC}) set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1) set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC}) +if(CMAKE_C_COMPILER_ID MATCHES "Clang") +set(CFLAGS_purecdp 
${FLAGS_ENABLE_PUREC} -O0) +set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC} -O0) +endif() set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1) set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC}) set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1) @@ -215,7 +236,8 @@ endif() set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTHDP=${SLEEFDFT_MAXBUTWIDTH} MAXBUTWIDTHSP=${SLEEFDFT_MAXBUTWIDTH} - MAXSHIFTDP=${SLEEFDFT_MAXSHIFT} MAXSHIFTSP=${SLEEFDFT_MAXSHIFT} + MINSHIFTDP=${SLEEFDFT_MINSHIFT} MAXSHIFTDP=${SLEEFDFT_MAXSHIFT} + MINSHIFTSP=${SLEEFDFT_MINSHIFT} MAXSHIFTSP=${SLEEFDFT_MAXSHIFT} ) if (SLEEFDFT_ENABLE_STREAM) @@ -253,7 +275,7 @@ endif() add_custom_command(OUTPUT dispatchparam.h COMMENT "Generating dispatchparam.h" - COMMAND $ paramonly ALL ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MAXSHIFT} ${ISALIST_SP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h + COMMAND $ paramonly ALL ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_SP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h DEPENDS ${TARGET_MKDISPATCH} ) add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h) @@ -269,7 +291,7 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) string(CONCAT S "dispatch" ${ST} ".hpp") # S is dispatchdp.hpp add_custom_command(OUTPUT ${S} COMMENT "Generating ${S}" - COMMAND $ ${LT} ${CST} ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MAXSHIFT} ${ISALIST_${CST}} > ${S} + COMMAND $ ${LT} ${CST} ${SLEEFDFT_MAXBUTWIDTH} ${SLEEFDFT_MINSHIFT} ${SLEEFDFT_MAXSHIFT} ${ISALIST_${CST}} > ${S} DEPENDS ${TARGET_MKDISPATCH} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) @@ -305,6 +327,11 @@ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll1.cpp.in) add_custom_target(unroll1.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll1.cpp.in) +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in 
${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll2.cpp.in) +add_custom_target(unroll2.cpp.in.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll2.cpp.in) + # Target unroll*.cpp foreach(T ${LIST_SUPPORTED_FPTYPE}) @@ -323,7 +350,7 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) if(UNROLL_TARGET_${CST}) add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}} COMMENT "Generating ${UNROLL_TARGET_${CST}}" - COMMAND $ unroll0.cpp.in ${LT} ${CST} -1 ${ISALIST_${CST}} + COMMAND $ unroll0.cpp.in ${LT} ${CST} - ${ISALIST_${CST}} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} DEPENDS ${TARGET_MKUNROLL} unroll0.cpp.in.copied ) @@ -348,6 +375,26 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) ) add_custom_target(unroll_target_${ST}_${I} DEPENDS ${UNROLL_TARGET_${CST}_${I}}) endforeach() + + # + + foreach(I ${LISTSHIFTSTR2}) + foreach(E ${ISALIST_${CST}}) # E is "sse2dp" + foreach(N ${NLIST}) + string(CONCAT UC unroll2_ ${N} _ ${E} _ ${I} ".cpp") # UC is "unroll2_0_sse2dp_1.cpp" + set(UNROLL2_TARGET_${CST}_${I} ${UNROLL2_TARGET_${CST}_${I}} ${UC}) + endforeach() + endforeach() + message(STATUS "Unroll2 target for ${CST}_${I} : ${UNROLL2_TARGET_${CST}_${I}}") + + add_custom_command(OUTPUT ${UNROLL2_TARGET_${CST}_${I}} + COMMENT "Generating ${UNROLL2_TARGET_${CST}_${I}}" + COMMAND $ unroll2.cpp.in ${LT} ${CST} -${I} ${ISALIST_${CST}} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${TARGET_MKUNROLL} unroll2.cpp.in.copied + ) + add_custom_target(unroll2_target_${ST}_${I} DEPENDS ${UNROLL2_TARGET_${CST}_${I}}) + endforeach() endif() endforeach() @@ -360,7 +407,7 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) foreach(E ${ISALIST_${CST}}) # E is "sse2dp" foreach(N ${NLIST}) - string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp" + string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp" string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj" string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp.cpp" add_library(${UG} OBJECT ${UC}) @@ -372,7 +419,7 @@ foreach(T 
${LIST_SUPPORTED_FPTYPE}) list(APPEND UNROLL_OBJECTS $) foreach(I ${LISTSHIFTSTR}) - string(CONCAT U unroll_ ${N} _ ${E} _ ${I}) # U is "unroll_0_sse2dp_1" + string(CONCAT U unroll_ ${N} _ ${E} _ ${I}) # U is "unroll_0_sse2dp_1" string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_1_obj" string(CONCAT UC ${U} ".cpp") # UC is "unroll_0_sse2dp_1.cpp" add_library(${UG} OBJECT ${UC}) @@ -380,7 +427,20 @@ foreach(T ${LIST_SUPPORTED_FPTYPE}) target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}}) target_compile_options(${UG} PRIVATE ${CFLAGS_${E}}) - add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST}) + add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST}_${I}) + list(APPEND UNROLL_OBJECTS $) + endforeach() + + foreach(I ${LISTSHIFTSTR2}) + string(CONCAT U unroll2_ ${N} _ ${E} _ ${I}) # U is "unroll2_0_sse2dp_1" + string(CONCAT UG ${U} "_obj") # UG is "unroll2_0_sse2dp_1_obj" + string(CONCAT UC ${U} ".cpp") # UC is "unroll2_0_sse2dp_1.cpp" + add_library(${UG} OBJECT ${UC}) + set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES}) + target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}}) + target_compile_options(${UG} PRIVATE ${CFLAGS_${E}}) + add_dependencies(${UG} ${TARGET_HEADERS} unroll2_target_${ST}_${I}) list(APPEND UNROLL_OBJECTS $) endforeach() endforeach() diff --git a/src/dft/dft.cpp b/src/dft/dft.cpp index 90a70e47..808ccef6 100644 --- a/src/dft/dft.cpp +++ b/src/dft/dft.cpp @@ -81,49 +81,60 @@ void SleefDFTXX::dispatch(const int N, real if (level == N) { const int shift = log2len-N - log2vecwidth; if ((mode & SLEEF_MODE_BACKWARD) == 0) { - if (shift >= MAXSHIFT) { - void (*func)(real *, const real *, const int) = DFTF[config][isa][N]; - (*func)(d, s, log2len-N); - } else { + if (minshift <= shift && shift < MAXSHIFT) { void 
(*func)(real *, const real *) = DFTFS[shift][config][isa][N]; (*func)(d, s); + } else { + void (*func)(real *, const real *, const int) = DFTF[config][isa][N]; + (*func)(d, s, log2len-N); } } else { - if (shift >= MAXSHIFT) { - void (*func)(real *, const real *, const int) = DFTB[config][isa][N]; - (*func)(d, s, log2len-N); - } else { + if (minshift <= shift && shift < MAXSHIFT) { void (*func)(real *, const real *) = DFTBS[shift][config][isa][N]; (*func)(d, s); + } else { + void (*func)(real *, const real *, const int) = DFTB[config][isa][N]; + (*func)(d, s, log2len-N); } } } else if (level == (int)log2len) { assert(vecwidth <= (1 << N)); const int shift = log2len-N - log2vecwidth; if ((mode & SLEEF_MODE_BACKWARD) == 0) { - if (shift >= MAXSHIFT) { - void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTF[config][isa][N]; - (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); - } else { + if (minshift <= shift && shift < MAXSHIFT) { void (*func)(real *, uint32_t *, const real *, const real *, const int) = TBUTFS[shift][config][isa][N]; (*func)(d, perm[level], s, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTF[config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); } } else { - if (shift >= MAXSHIFT) { - void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTB[config][isa][N]; - (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); - } else { + if (minshift <= shift && shift < MAXSHIFT) { void (*func)(real *, uint32_t *, const real *, const real *, const int) = TBUTBS[shift][config][isa][N]; (*func)(d, perm[level], s, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = TBUTB[config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); } } } else { + const int inshift = log2len - level; if ((mode & SLEEF_MODE_BACKWARD) 
== 0) { - void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTF[config][isa][N]; - (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + if (inshift < MAXSHIFT) { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = BUTFS[inshift][config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTF[config][isa][N]; + (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + } } else { - void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTB[config][isa][N]; - (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + if (inshift < MAXSHIFT) { + void (*func)(real *, uint32_t *, const real *, const int, const real *, const int) = BUTBS[inshift][config][isa][N]; + (*func)(d, perm[level], s, log2len-N, tbl[N][level], K); + } else { + void (*func)(real *, uint32_t *, const int, const real *, const int, const real *, const int) = BUTB[config][isa][N]; + (*func)(d, perm[level], log2len-level, s, log2len-N, tbl[N][level], K); + } } } } @@ -1017,7 +1028,7 @@ void SleefDFT2DXX::measureTranspose() { template SleefDFTXX::SleefDFTXX(uint32_t n, const real *in_, real *out_, uint64_t mode_, const char *baseTypeString, - int BASETYPEID_, int MAGIC_, + int BASETYPEID_, int MAGIC_, int minshift_, int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), @@ -1030,12 +1041,16 @@ SleefDFTXX::SleefDFTXX(uint32_t n, const rea void (*DFTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*DFTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void 
(*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), - void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*BUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) ) : magic(MAGIC_), baseTypeID(BASETYPEID_), in(in_), out(out_), nThread(omp_thread_count()), - log2len((mode_ & SLEEF_MODE_REAL) ? ilog2(n)-1 : ilog2(n)), mode(((mode_ & SLEEF_MODE_ALT) && log2len > 1) ? mode_ ^ SLEEF_MODE_BACKWARD : mode_), - DFTF(DFTF_), DFTB(DFTB_), TBUTF(TBUTF_), TBUTB(TBUTB_), BUTF(BUTF_), BUTB(BUTB_), - REALSUB0(REALSUB0_), REALSUB1(REALSUB1_), DFTFS(DFTFS_), DFTBS(DFTBS_), TBUTFS(TBUTFS_), TBUTBS(TBUTBS_) { + log2len((mode_ & SLEEF_MODE_REAL) ? ilog2(n)-1 : ilog2(n)), + mode(((mode_ & SLEEF_MODE_ALT) && log2len > 1) ? 
mode_ ^ SLEEF_MODE_BACKWARD : mode_), + minshift(minshift_), + DFTF(DFTF_), DFTB(DFTB_), TBUTF(TBUTF_), TBUTB(TBUTB_), BUTF(BUTF_), BUTB(BUTB_), REALSUB0(REALSUB0_), REALSUB1(REALSUB1_), + DFTFS(DFTFS_), DFTBS(DFTBS_), TBUTFS(TBUTFS_), TBUTBS(TBUTBS_), BUTFS(BUTFS_), BUTBS(BUTBS_) { verboseFP = defaultVerboseFP; @@ -1165,7 +1180,7 @@ SleefDFTXX::SleefDFTXX(uint32_t n, const rea template SleefDFT2DXX::SleefDFT2DXX(uint32_t vlen_, uint32_t hlen_, const real *in_, real *out_, uint64_t mode_, const char *baseTypeString, - int BASETYPEID_, int MAGIC_, int MAGIC2D_, + int BASETYPEID_, int MAGIC_, int MAGIC2D_, int minshift_, int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), @@ -1178,7 +1193,9 @@ SleefDFT2DXX::SleefDFT2DXX(uint32_t vlen_, u void (*DFTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*DFTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), - void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*BUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) ) { magic = MAGIC2D_; baseTypeID = BASETYPEID_; @@ -1200,13 +1217,15 @@ SleefDFT2DXX::SleefDFT2DXX(uint32_t vlen_, u if ((mode & SLEEF_MODE_NO_MT) == 0) mode3 |= SLEEF_MODE3_MT2D; instH = instV = new SleefDFTXX(hlen, NULL, NULL, mode1D, baseTypeString, - BASETYPEID_, 
MAGIC_, GETINT_, GETPTR_, SINCOSPI_, + BASETYPEID_, MAGIC_, minshift_, + GETINT_, GETPTR_, SINCOSPI_, DFTF_, DFTB_, TBUTF_, TBUTB_, BUTF_, BUTB_, - REALSUB0_, REALSUB1_, DFTFS_, DFTBS_, TBUTFS_, TBUTBS_); + REALSUB0_, REALSUB1_, DFTFS_, DFTBS_, TBUTFS_, TBUTBS_, BUTFS_, BUTBS_); if (hlen != vlen) instV = new SleefDFTXX(vlen, NULL, NULL, mode1D, baseTypeString, - BASETYPEID_, MAGIC_, GETINT_, GETPTR_, SINCOSPI_, + BASETYPEID_, MAGIC_, minshift_, + GETINT_, GETPTR_, SINCOSPI_, DFTF_, DFTB_, TBUTF_, TBUTB_, BUTF_, BUTB_, - REALSUB0_, REALSUB1_, DFTFS_, DFTBS_, TBUTFS_, TBUTBS_); + REALSUB0_, REALSUB1_, DFTFS_, DFTBS_, TBUTFS_, TBUTBS_, BUTFS_, BUTBS_); tBuf = (real *)Sleef_malloc(sizeof(real)*2*hlen*vlen); @@ -1370,10 +1389,10 @@ void SleefDFT2DXX::execute(const real *s0, r EXPORT SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode) { SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); p->double_ = new SleefDFTXX(n, in, out, mode, "double", - 1, 0x27182818, getInt_double, getPtr_double, Sleef_sincospi_u05, + 1, 0x27182818, MINSHIFTDP, getInt_double, getPtr_double, Sleef_sincospi_u05, dftf_double, dftb_double, tbutf_double, tbutb_double, butf_double, butb_double, realSub0_double, realSub1_double, - dftfs_double, dftbs_double, tbutfs_double, tbutbs_double + dftfs_double, dftbs_double, tbutfs_double, tbutbs_double, butfs_double, butbs_double ); p->magic = p->double_->magic; return p; @@ -1382,10 +1401,10 @@ EXPORT SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *ou EXPORT SleefDFT *SleefDFT_double_init2d(uint32_t vlen, uint32_t hlen, const double *in, double *out, uint64_t mode) { SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); p->double2d_ = new SleefDFT2DXX(vlen, hlen, in, out, mode, "double", - 1, 0x27182818, 0x17320508, getInt_double, getPtr_double, Sleef_sincospi_u05, + 1, 0x27182818, 0x17320508, MINSHIFTDP, getInt_double, getPtr_double, Sleef_sincospi_u05, dftf_double, dftb_double, tbutf_double, 
tbutb_double, butf_double, butb_double, realSub0_double, realSub1_double, - dftfs_double, dftbs_double, tbutfs_double, tbutbs_double + dftfs_double, dftbs_double, tbutfs_double, tbutbs_double, butfs_double, butbs_double ); p->magic = p->double2d_->magic; return p; @@ -1407,10 +1426,10 @@ EXPORT void SleefDFT_double_execute(SleefDFT *p, const double *s0, double *d0) { EXPORT SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode) { SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); p->float_ = new SleefDFTXX(n, in, out, mode, "float", - 2, 0x31415926, getInt_float, getPtr_float, Sleef_sincospif_u05, + 2, 0x31415926, MINSHIFTSP, getInt_float, getPtr_float, Sleef_sincospif_u05, dftf_float, dftb_float, tbutf_float, tbutb_float, butf_float, butb_float, realSub0_float, realSub1_float, - dftfs_float, dftbs_float, tbutfs_float, tbutbs_float + dftfs_float, dftbs_float, tbutfs_float, tbutbs_float, butfs_float, butbs_float ); p->magic = p->float_->magic; return p; @@ -1419,10 +1438,10 @@ EXPORT SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, EXPORT SleefDFT *SleefDFT_float_init2d(uint32_t vlen, uint32_t hlen, const float *in, float *out, uint64_t mode) { SleefDFT *p = (SleefDFT *)calloc(1, sizeof(SleefDFT)); p->float2d_ = new SleefDFT2DXX(vlen, hlen, in, out, mode, "float", - 2, 0x31415926, 0x22360679, getInt_float, getPtr_float, Sleef_sincospif_u05, + 2, 0x31415926, 0x22360679, MINSHIFTSP, getInt_float, getPtr_float, Sleef_sincospif_u05, dftf_float, dftb_float, tbutf_float, tbutb_float, butf_float, butb_float, realSub0_float, realSub1_float, - dftfs_float, dftbs_float, tbutfs_float, tbutbs_float + dftfs_float, dftbs_float, tbutfs_float, tbutbs_float, butfs_float, butbs_float ); p->magic = p->float2d_->magic; return p; diff --git a/src/dft/dftcommon.hpp b/src/dft/dftcommon.hpp index f4fd4a6e..053b3259 100644 --- a/src/dft/dftcommon.hpp +++ b/src/dft/dftcommon.hpp @@ -22,6 +22,7 @@ struct SleefDFTXX { const int 
nThread; const uint32_t log2len; const uint64_t mode; + const int minshift; uint64_t mode2 = 0, mode3 = 0; @@ -52,13 +53,14 @@ struct SleefDFTXX { void (*(* const BUTB)[ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int); void (** const REALSUB0)(real *, const real *, const int, const real *, const real *); void (** const REALSUB1)(real *, const real *, const int, const real *, const real *, const int); - void (*(* const DFTFS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *); void (*(* const DFTBS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *); void (*(* const TBUTFS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int); void (*(* const TBUTBS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int); + void (*(* const BUTFS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int); + void (*(* const BUTBS)[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int); - SleefDFTXX(uint32_t n, const real *in, real *out, uint64_t mode, const char *baseTypeString, int BASETYPEID_, int MAGIC_, + SleefDFTXX(uint32_t n, const real *in, real *out, uint64_t mode, const char *baseTypeString, int BASETYPEID_, int MAGIC_, int minshift_, int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), @@ -68,11 +70,12 @@ struct SleefDFTXX { void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), - void 
(*DFTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*DFTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), - void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*BUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) ); ~SleefDFTXX(); @@ -111,7 +114,7 @@ struct SleefDFT2DXX { FILE *verboseFP = NULL; SleefDFT2DXX(uint32_t vlen, uint32_t hlen, const real *in, real *out, uint64_t mode, const char *baseTypeString, - int BASETYPEID_, int MAGIC_, int MAGIC2D_, + int BASETYPEID_, int MAGIC_, int MAGIC2D_, int minshift_, int (*GETINT_[16])(int), const void *(*GETPTR_[16])(int), real2 (*SINCOSPI_)(real), void (*DFTF_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), void (*DFTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *, const int), @@ -121,11 +124,12 @@ struct SleefDFT2DXX { void (*BUTB_[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const int, const real *, const int, const real *, const int), void (*REALSUB0_[ISAMAX])(real *, const real *, const int, const real *, const real *), void (*REALSUB1_[ISAMAX])(real *, const real *, const int, const real *, const real *, const int), - void (*DFTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*DFTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, const real *), void (*TBUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), - void 
(*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int) + void (*TBUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const real *, const int), + void (*BUTFS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int), + void (*BUTBS_[MAXSHIFT][CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])(real *, uint32_t *, const real *, const int, const real *, const int) ); ~SleefDFT2DXX(); diff --git a/src/dft/mkdispatch.c b/src/dft/mkdispatch.c index 802aaf4d..804e193c 100644 --- a/src/dft/mkdispatch.c +++ b/src/dft/mkdispatch.c @@ -14,15 +14,16 @@ int main(int argc, char **argv) { if (argc < 3) { - fprintf(stderr, "Usage : %s ...\n", argv[0]); + fprintf(stderr, "Usage : %s ...\n", argv[0]); exit(-1); } const char *baseType = argv[1]; const char *baseTypeID = argv[2]; const int maxbutwidth = atoi(argv[3]); - const int maxshift = atoi(argv[4]); - const int isastart = 5; + const int minshift = atoi(argv[4]); + const int maxshift = atoi(argv[5]); + const int isastart = 6; const int isamax = argc - isastart; #if ENABLE_STREAM == 1 @@ -32,6 +33,7 @@ int main(int argc, char **argv) { #endif printf("#define MAXBUTWIDTH%s %d\n", baseTypeID, maxbutwidth); + printf("#define MINSHIFT%s %d\n", baseTypeID, minshift); printf("#define MAXSHIFT%s %d\n", baseTypeID, maxshift); printf("#define CONFIGMAX 4\n"); printf("#define ISAMAX %d\n", isamax); @@ -52,12 +54,17 @@ int main(int argc, char **argv) { printf("void but%df_%d_%s(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType); printf("void but%db_%d_%s(%s *, uint32_t *, const int, const %s *, const int, const %s *, const int);\n", 1 << j, config, argv[k], baseType, baseType, baseType); - for(int s=0;s= minshift) { printf("dft%df_%d_%d_%s, ", 1 << i, s, config, argv[k]); } else { printf("NULL, "); @@ -212,7 +219,7 @@ int 
main(int argc, char **argv) { for(int k=isastart;k= minshift) { if (i == 1) { printf("dft%df_%d_%d_%s, ", 1 << i, s, config, argv[k]); } else { @@ -238,7 +245,7 @@ int main(int argc, char **argv) { for(int k=isastart;k= minshift) { printf("tbut%df_%d_%d_%s, ", 1 << i, s, config, argv[k]); } else { printf("NULL, "); @@ -260,7 +267,7 @@ int main(int argc, char **argv) { for(int k=isastart;k= minshift) { printf("tbut%db_%d_%d_%s, ", 1 << i, s, config, argv[k]); } else { printf("NULL, "); @@ -274,6 +281,50 @@ int main(int argc, char **argv) { } printf("};\n\n"); + printf("void (*butfs_%s[MAXSHIFT%s][CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseTypeID, baseType, baseType, baseType); + for(int s=0;s= 1) { + printf("but%df_%d_%d_%s, ", 1 << i, s, config, argv[k]); + } else { + printf("NULL, "); + } + } + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n\n"); + + printf("void (*butbs_%s[MAXSHIFT%s][CONFIGMAX][ISAMAX][MAXBUTWIDTH%s+1])(%s *, uint32_t *, const %s *, const int, const %s *, const int) = {\n", baseType, baseTypeID, baseTypeID, baseType, baseType, baseType); + for(int s=0;s= 1) { + printf("but%db_%d_%d_%s, ", 1 << i, s, config, argv[k]); + } else { + printf("NULL, "); + } + } + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n\n"); + // printf("void (*realSub0_%s[ISAMAX])(%s *, const %s *, const int, const %s *, const %s *) = {\n ", baseType, baseType, baseType, baseType, baseType); diff --git a/src/dft/mkunroll.c b/src/dft/mkunroll.c index 1da0e080..f6b069e8 100644 --- a/src/dft/mkunroll.c +++ b/src/dft/mkunroll.c @@ -41,15 +41,22 @@ char *replaceAll(const char *in, const char *pat, const char *replace) { char line[LEN+10]; int main(int argc, char **argv) { - if (argc < 2) { + if (argc < 5) { fprintf(stderr, "Usage : %s ...\n", argv[0]); exit(-1); } const char *fn = argv[1]; const char *baseTypeID = argv[3]; - 
const int shift = atoi(argv[4]); + int shift = atoi(argv[4]); const int isastart = 5; + int mode = 1; + if (strcmp(argv[4], "-") == 0) { + mode = 0; + } else if (shift <= 0) { + mode = 2; + shift = -shift; + } char shiftstr[21]; snprintf(shiftstr, 20, "%d", shift); @@ -65,11 +72,18 @@ int main(int argc, char **argv) { FILE *fpin = fopen(fn, "r"); - if (shift >= 0) { - sprintf(line, "unroll_%d_%s_%d.cpp", config, isaString, shift); - } else { + switch(mode) { + case 0: sprintf(line, "unroll_%d_%s.cpp", config, isaString); + break; + case 1: + sprintf(line, "unroll_%d_%s_%d.cpp", config, isaString, shift); + break; + case 2: + sprintf(line, "unroll2_%d_%s_%d.cpp", config, isaString, shift); + break; } + FILE *fpout = fopen(line, "w"); fputs("#include \"vectortype.hpp\"\n\n", fpout); diff --git a/src/dft/unroll2.cpp.in b/src/dft/unroll2.cpp.in new file mode 100644 index 00000000..6fa45e5c --- /dev/null +++ b/src/dft/unroll2.cpp.in @@ -0,0 +1,4856 @@ +// Copyright Naoki Shibata and contributors 2010 - 2025. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +ALIGNED(8192) void but2f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + store(out, (0 << %SHIFT%), plus(load(in, (0 << inShift)), load(in, (1 << inShift)))); + real2 v4 = minus(load(in, (0 << inShift)), load(in, (1 << inShift))); + store(out, (1 << %SHIFT%), ctimesminusplus(v4, tbl[0 + tbloffset], ctimes(reverse(v4), tbl[1 + tbloffset]))); + } +} + +ALIGNED(8192) void but2b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + store(out, (0 << %SHIFT%), plus(load(in, (0 << inShift)), load(in, (1 << inShift)))); + real2 v4 = minus(load(in, (0 << inShift)), load(in, (1 << inShift))); + store(out, (1 << %SHIFT%), ctimesminusplus(v4, tbl[0 + tbloffset], ctimes(reverse(v4), tbl[1 + tbloffset]))); + } +} + +ALIGNED(8192) void but4f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + real2 v5 = load(in, 3 << inShift); + real2 v3 = load(in, 1 << inShift); + real2 v7 = reverse(minus(v3, v5)); 
+ real2 v13 = plus(v3, v5); + real2 v2 = load(in, 0 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v8 = minus(v4, v2); + real2 v12 = plus(v2, v4); + store(out, 0 << %SHIFT%, plus(v12, v13)); + real2 v26 = minus(v12, v13); + store(out, 2 << %SHIFT%, ctimesminusplus(v26, tbl[0 + tbloffset], ctimes(reverse(v26), tbl[1 + tbloffset]))); + real2 v11 = minusplus(uminus(v7), v8); + real2 v9 = minusplus(v7, v8); + store(out, 1 << %SHIFT%, ctimesminusplus(reverse(v9), tbl[2 + tbloffset], ctimes(v9, tbl[3 + tbloffset]))); + store(out, 3 << %SHIFT%, ctimesminusplus(reverse(v11), tbl[4 + tbloffset], ctimes(v11, tbl[5 + tbloffset]))); + } +} + +ALIGNED(8192) void but4b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + real2 v5 = load(in, 3 << inShift); + real2 v3 = load(in, 1 << inShift); + real2 v7 = reverse(minus(v5, v3)); + real2 v13 = plus(v3, v5); + real2 v2 = load(in, 0 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v8 = minus(v4, v2); + real2 v12 = plus(v2, v4); + store(out, 0 << %SHIFT%, plus(v12, v13)); + real2 v26 = minus(v12, v13); + store(out, 2 << %SHIFT%, ctimesminusplus(v26, tbl[0 + tbloffset], ctimes(reverse(v26), tbl[1 + tbloffset]))); + real2 v11 = minusplus(uminus(v7), v8); + real2 v9 = minusplus(v7, v8); + store(out, 1 << %SHIFT%, ctimesminusplus(reverse(v9), tbl[2 + tbloffset], ctimes(v9, tbl[3 + tbloffset]))); + store(out, 3 << %SHIFT%, ctimesminusplus(reverse(v11), tbl[4 + tbloffset], ctimes(v11, tbl[5 + tbloffset]))); + } +} + +ALIGNED(8192) void but8f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const 
int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + real2 v9 = load(in, 7 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v37 = plus(v5, v9); + real2 v31 = reverse(minus(v5, v9)); + real2 v7 = load(in, 5 << inShift); + real2 v3 = load(in, 1 << inShift); + real2 v36 = plus(v3, v7); + real2 v32 = minus(v7, v3); + real2 v57 = plus(v36, v37); + real2 v51 = reverse(minus(v36, v37)); + real2 v35 = minusplus(uminus(v31), v32); + real2 v33 = minusplus(v31, v32); + real2 v43 = ctimesminusplus(reverse(v33), tbl[6 + tbloffset], ctimes(v33, tbl[7 + tbloffset])); + real2 v6 = load(in, 4 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v16 = plus(v2, v6); + real2 v12 = minus(v6, v2); + real2 v8 = load(in, 6 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v17 = plus(v4, v8); + real2 v11 = reverse(minus(v4, v8)); + real2 v52 = minus(v17, v16); + real2 v56 = plus(v16, v17); + store(out, 0 << %SHIFT%, plus(v56, v57)); + real2 v70 = minus(v56, v57); + store(out, 4 << %SHIFT%, ctimesminusplus(v70, tbl[0 + tbloffset], ctimes(reverse(v70), tbl[1 + tbloffset]))); + real2 v53 = minusplus(v51, v52); + store(out, 2 << %SHIFT%, ctimesminusplus(reverse(v53), tbl[10 + tbloffset], ctimes(v53, tbl[11 + tbloffset]))); + real2 v55 = minusplus(uminus(v51), v52); + store(out, 6 << %SHIFT%, ctimesminusplus(reverse(v55), tbl[12 + tbloffset], ctimes(v55, tbl[13 + tbloffset]))); + real2 v15 = minusplus(uminus(v11), v12); + real2 v13 = minusplus(v11, v12); + real2 v23 = ctimesminusplus(reverse(v13), tbl[2 + tbloffset], ctimes(v13, tbl[3 + tbloffset])); + store(out, 1 << %SHIFT%, plus(v23, v43)); + real2 v78 = minus(v23, v43); + store(out, 5 << %SHIFT%, ctimesminusplus(v78, tbl[0 + tbloffset], ctimes(reverse(v78), tbl[1 + tbloffset]))); + real2 v49 = ctimesminusplus(reverse(v35), tbl[8 + 
tbloffset], ctimes(v35, tbl[9 + tbloffset])); + real2 v29 = ctimesminusplus(reverse(v15), tbl[4 + tbloffset], ctimes(v15, tbl[5 + tbloffset])); + store(out, 3 << %SHIFT%, plus(v29, v49)); + real2 v84 = minus(v29, v49); + store(out, 7 << %SHIFT%, ctimesminusplus(v84, tbl[0 + tbloffset], ctimes(reverse(v84), tbl[1 + tbloffset]))); + } +} +/* but8b: kernel template that consumes 8 inputs and produces 8 outputs per loop iteration ('but' presumably means butterfly and 'b' backward; confirm against the callers). SHIFT, CONFIG and ISA written between percent signs are placeholders, presumably replaced when this .in template is expanded. The loop runs 1 << (inShift - LOG2VECWIDTH) iterations under an OpenMP 'parallel for' pragma; iteration i reads relative to in0 + 2 * (i << LOG2VECWIDTH), writes relative to out0 + q[i], and offsets tbl by K * ((i << LOG2VECWIDTH) >> SHIFT). Differs from but8f only in the operand order of the minus() calls that initialise v31, v51 and v11. */ +ALIGNED(8192) void but8b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); +/* Straight-line body: each vNN temporary is declared and initialised once. Inputs are read with load(in, j << inShift) and results written with store(out, j << SHIFT, ...) for j = 0..7; the largest tbl index used is 13 + tbloffset. */ + real2 v9 = load(in, 7 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v37 = plus(v5, v9); + real2 v31 = reverse(minus(v9, v5)); + real2 v7 = load(in, 5 << inShift); + real2 v3 = load(in, 1 << inShift); + real2 v36 = plus(v3, v7); + real2 v32 = minus(v7, v3); + real2 v57 = plus(v36, v37); + real2 v51 = reverse(minus(v37, v36)); + real2 v35 = minusplus(uminus(v31), v32); + real2 v33 = minusplus(v31, v32); + real2 v43 = ctimesminusplus(reverse(v33), tbl[6 + tbloffset], ctimes(v33, tbl[7 + tbloffset])); + real2 v6 = load(in, 4 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v16 = plus(v2, v6); + real2 v12 = minus(v6, v2); + real2 v8 = load(in, 6 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v17 = plus(v4, v8); + real2 v11 = reverse(minus(v8, v4)); + real2 v52 = minus(v17, v16); + real2 v56 = plus(v16, v17); + store(out, 0 << %SHIFT%, plus(v56, v57)); + real2 v70 = minus(v56, v57); + store(out, 4 << %SHIFT%, ctimesminusplus(v70, tbl[0 + tbloffset], ctimes(reverse(v70), tbl[1 + tbloffset]))); + real2 v53 = minusplus(v51, v52); + store(out, 2 << %SHIFT%, ctimesminusplus(reverse(v53), tbl[10 + tbloffset], ctimes(v53, tbl[11 + tbloffset]))); + real2 v55 = minusplus(uminus(v51), v52); + store(out, 6 << 
%SHIFT%, ctimesminusplus(reverse(v55), tbl[12 + tbloffset], ctimes(v55, tbl[13 + tbloffset]))); + real2 v15 = minusplus(uminus(v11), v12); + real2 v13 = minusplus(v11, v12); + real2 v23 = ctimesminusplus(reverse(v13), tbl[2 + tbloffset], ctimes(v13, tbl[3 + tbloffset])); + store(out, 1 << %SHIFT%, plus(v23, v43)); + real2 v78 = minus(v23, v43); + store(out, 5 << %SHIFT%, ctimesminusplus(v78, tbl[0 + tbloffset], ctimes(reverse(v78), tbl[1 + tbloffset]))); + real2 v49 = ctimesminusplus(reverse(v35), tbl[8 + tbloffset], ctimes(v35, tbl[9 + tbloffset])); + real2 v29 = ctimesminusplus(reverse(v15), tbl[4 + tbloffset], ctimes(v15, tbl[5 + tbloffset])); + store(out, 3 << %SHIFT%, plus(v29, v49)); + real2 v84 = minus(v29, v49); + store(out, 7 << %SHIFT%, ctimesminusplus(v84, tbl[0 + tbloffset], ctimes(reverse(v84), tbl[1 + tbloffset]))); + } +} +/* but16f: kernel template that consumes 16 inputs and produces 16 outputs per loop iteration ('but' presumably means butterfly and 'f' forward; confirm against the callers). SHIFT, CONFIG and ISA written between percent signs are placeholders, presumably replaced when this .in template is expanded. The loop runs 1 << (inShift - LOG2VECWIDTH) iterations under an OpenMP 'parallel for' pragma; iteration i reads relative to in0 + 2 * (i << LOG2VECWIDTH), writes relative to out0 + q[i], and offsets tbl by K * ((i << LOG2VECWIDTH) >> SHIFT). Appears to differ from but16b only in the operand order of the minus() calls nested directly inside reverse(). */ +ALIGNED(8192) void but16f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); +/* Straight-line body: each vNN temporary is declared and initialised once. Inputs are read with load(in, j << inShift) and results written with store(out, j << SHIFT, ...) for j = 0..15; the largest tbl index used is 37 + tbloffset. */ + real2 v15 = load(in, 13 << inShift); + real2 v7 = load(in, 5 << inShift); + real2 v45 = plus(v7, v15); + real2 v39 = reverse(minus(v7, v15)); + real2 v3 = load(in, 1 << inShift); + real2 v11 = load(in, 9 << inShift); + real2 v40 = minus(v11, v3); + real2 v44 = plus(v3, v11); + real2 v124 = plus(v44, v45); + real2 v120 = minus(v45, v44); + real2 v41 = minusplus(v39, v40); + real2 v43 = minusplus(uminus(v39), v40); + real2 v57 = ctimesminusplus(reverse(v43), tbl[8 + tbloffset], ctimes(v43, tbl[9 + tbloffset])); + real2 v13 = load(in, 11 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v84 = plus(v5, v13); + real2 v80 = minus(v13, v5); + real2 v17 = load(in, 15 << inShift); + real2 v9 = load(in, 7 << inShift); + real2 
v85 = plus(v9, v17); + real2 v79 = reverse(minus(v9, v17)); + real2 v119 = reverse(minus(v84, v85)); + real2 v125 = plus(v84, v85); + real2 v145 = plus(v124, v125); + real2 v139 = reverse(minus(v124, v125)); + real2 v121 = minusplus(v119, v120); + real2 v123 = minusplus(uminus(v119), v120); + real2 v137 = ctimesminusplus(reverse(v123), tbl[24 + tbloffset], ctimes(v123, tbl[25 + tbloffset])); + real2 v131 = ctimesminusplus(reverse(v121), tbl[22 + tbloffset], ctimes(v121, tbl[23 + tbloffset])); + real2 v4 = load(in, 2 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v64 = plus(v4, v12); + real2 v60 = minus(v12, v4); + real2 v8 = load(in, 6 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v65 = plus(v8, v16); + real2 v59 = reverse(minus(v8, v16)); + real2 v99 = reverse(minus(v64, v65)); + real2 v105 = plus(v64, v65); + real2 v14 = load(in, 12 << inShift); + real2 v6 = load(in, 4 << inShift); + real2 v25 = plus(v6, v14); + real2 v19 = reverse(minus(v6, v14)); + real2 v10 = load(in, 8 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v20 = minus(v10, v2); + real2 v24 = plus(v2, v10); + real2 v104 = plus(v24, v25); + real2 v100 = minus(v25, v24); + real2 v140 = minus(v105, v104); + real2 v144 = plus(v104, v105); + store(out, 0 << %SHIFT%, plus(v144, v145)); + real2 v158 = minus(v144, v145); + store(out, 8 << %SHIFT%, ctimesminusplus(v158, tbl[0 + tbloffset], ctimes(reverse(v158), tbl[1 + tbloffset]))); + real2 v143 = minusplus(uminus(v139), v140); + store(out, 12 << %SHIFT%, ctimesminusplus(reverse(v143), tbl[28 + tbloffset], ctimes(v143, tbl[29 + tbloffset]))); + real2 v141 = minusplus(v139, v140); + store(out, 4 << %SHIFT%, ctimesminusplus(reverse(v141), tbl[26 + tbloffset], ctimes(v141, tbl[27 + tbloffset]))); + real2 v101 = minusplus(v99, v100); + real2 v103 = minusplus(uminus(v99), v100); + real2 v117 = ctimesminusplus(reverse(v103), tbl[20 + tbloffset], ctimes(v103, tbl[21 + tbloffset])); + store(out, 6 << %SHIFT%, plus(v117, v137)); + 
real2 v172 = minus(v117, v137); + store(out, 14 << %SHIFT%, ctimesminusplus(v172, tbl[0 + tbloffset], ctimes(reverse(v172), tbl[1 + tbloffset]))); + real2 v111 = ctimesminusplus(reverse(v101), tbl[18 + tbloffset], ctimes(v101, tbl[19 + tbloffset])); + store(out, 2 << %SHIFT%, plus(v111, v131)); + real2 v166 = minus(v111, v131); + store(out, 10 << %SHIFT%, ctimesminusplus(v166, tbl[0 + tbloffset], ctimes(reverse(v166), tbl[1 + tbloffset]))); + real2 v23 = minusplus(uminus(v19), v20); + real2 v21 = minusplus(v19, v20); + real2 v81 = minusplus(v79, v80); + real2 v83 = minusplus(uminus(v79), v80); + real2 v97 = ctimesminusplus(reverse(v83), tbl[16 + tbloffset], ctimes(v83, tbl[17 + tbloffset])); + real2 v211 = plus(v57, v97); + real2 v205 = reverse(minus(v57, v97)); + real2 v61 = minusplus(v59, v60); + real2 v63 = minusplus(uminus(v59), v60); + real2 v77 = ctimesminusplus(reverse(v63), tbl[12 + tbloffset], ctimes(v63, tbl[13 + tbloffset])); + real2 v37 = ctimesminusplus(reverse(v23), tbl[4 + tbloffset], ctimes(v23, tbl[5 + tbloffset])); + real2 v210 = plus(v37, v77); + real2 v206 = minus(v77, v37); + store(out, 3 << %SHIFT%, plus(v210, v211)); + real2 v224 = minus(v210, v211); + store(out, 11 << %SHIFT%, ctimesminusplus(v224, tbl[0 + tbloffset], ctimes(reverse(v224), tbl[1 + tbloffset]))); + real2 v207 = minusplus(v205, v206); + real2 v209 = minusplus(uminus(v205), v206); + store(out, 15 << %SHIFT%, ctimesminusplus(reverse(v209), tbl[36 + tbloffset], ctimes(v209, tbl[37 + tbloffset]))); + store(out, 7 << %SHIFT%, ctimesminusplus(reverse(v207), tbl[34 + tbloffset], ctimes(v207, tbl[35 + tbloffset]))); + real2 v71 = ctimesminusplus(reverse(v61), tbl[10 + tbloffset], ctimes(v61, tbl[11 + tbloffset])); + real2 v51 = ctimesminusplus(reverse(v41), tbl[6 + tbloffset], ctimes(v41, tbl[7 + tbloffset])); + real2 v91 = ctimesminusplus(reverse(v81), tbl[14 + tbloffset], ctimes(v81, tbl[15 + tbloffset])); + real2 v185 = plus(v51, v91); + real2 v179 = reverse(minus(v51, v91)); + 
real2 v31 = ctimesminusplus(reverse(v21), tbl[2 + tbloffset], ctimes(v21, tbl[3 + tbloffset])); + real2 v184 = plus(v31, v71); + real2 v180 = minus(v71, v31); + store(out, 1 << %SHIFT%, plus(v184, v185)); + real2 v198 = minus(v184, v185); + store(out, 9 << %SHIFT%, ctimesminusplus(v198, tbl[0 + tbloffset], ctimes(reverse(v198), tbl[1 + tbloffset]))); + real2 v181 = minusplus(v179, v180); + store(out, 5 << %SHIFT%, ctimesminusplus(reverse(v181), tbl[30 + tbloffset], ctimes(v181, tbl[31 + tbloffset]))); + real2 v183 = minusplus(uminus(v179), v180); + store(out, 13 << %SHIFT%, ctimesminusplus(reverse(v183), tbl[32 + tbloffset], ctimes(v183, tbl[33 + tbloffset]))); + } +} +/* but16b: kernel template that consumes 16 inputs and produces 16 outputs per loop iteration ('but' presumably means butterfly and 'b' backward; confirm against the callers). SHIFT, CONFIG and ISA written between percent signs are placeholders, presumably replaced when this .in template is expanded. The loop runs 1 << (inShift - LOG2VECWIDTH) iterations under an OpenMP 'parallel for' pragma; iteration i reads relative to in0 + 2 * (i << LOG2VECWIDTH), writes relative to out0 + q[i], and offsets tbl by K * ((i << LOG2VECWIDTH) >> SHIFT). Appears to differ from but16f only in the operand order of the minus() calls nested directly inside reverse(). */ +ALIGNED(8192) void but16b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); +/* Straight-line body: each vNN temporary is declared and initialised once. Inputs are read with load(in, j << inShift) and results written with store(out, j << SHIFT, ...) for j = 0..15; the largest tbl index used is 37 + tbloffset. */ + real2 v15 = load(in, 13 << inShift); + real2 v7 = load(in, 5 << inShift); + real2 v45 = plus(v7, v15); + real2 v39 = reverse(minus(v15, v7)); + real2 v3 = load(in, 1 << inShift); + real2 v11 = load(in, 9 << inShift); + real2 v40 = minus(v11, v3); + real2 v44 = plus(v3, v11); + real2 v124 = plus(v44, v45); + real2 v120 = minus(v45, v44); + real2 v41 = minusplus(v39, v40); + real2 v43 = minusplus(uminus(v39), v40); + real2 v57 = ctimesminusplus(reverse(v43), tbl[8 + tbloffset], ctimes(v43, tbl[9 + tbloffset])); + real2 v13 = load(in, 11 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v84 = plus(v5, v13); + real2 v80 = minus(v13, v5); + real2 v17 = load(in, 15 << inShift); + real2 v9 = load(in, 7 << inShift); + real2 v85 = plus(v9, v17); + real2 v79 = reverse(minus(v17, v9)); + real2 v119 = reverse(minus(v85, v84)); + real2 v125 = plus(v84, v85); + real2 v145 = plus(v124, v125); + real2 
v139 = reverse(minus(v125, v124)); + real2 v121 = minusplus(v119, v120); + real2 v123 = minusplus(uminus(v119), v120); + real2 v137 = ctimesminusplus(reverse(v123), tbl[24 + tbloffset], ctimes(v123, tbl[25 + tbloffset])); + real2 v131 = ctimesminusplus(reverse(v121), tbl[22 + tbloffset], ctimes(v121, tbl[23 + tbloffset])); + real2 v4 = load(in, 2 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v64 = plus(v4, v12); + real2 v60 = minus(v12, v4); + real2 v8 = load(in, 6 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v65 = plus(v8, v16); + real2 v59 = reverse(minus(v16, v8)); + real2 v99 = reverse(minus(v65, v64)); + real2 v105 = plus(v64, v65); + real2 v14 = load(in, 12 << inShift); + real2 v6 = load(in, 4 << inShift); + real2 v25 = plus(v6, v14); + real2 v19 = reverse(minus(v14, v6)); + real2 v10 = load(in, 8 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v20 = minus(v10, v2); + real2 v24 = plus(v2, v10); + real2 v104 = plus(v24, v25); + real2 v100 = minus(v25, v24); + real2 v140 = minus(v105, v104); + real2 v144 = plus(v104, v105); + store(out, 0 << %SHIFT%, plus(v144, v145)); + real2 v158 = minus(v144, v145); + store(out, 8 << %SHIFT%, ctimesminusplus(v158, tbl[0 + tbloffset], ctimes(reverse(v158), tbl[1 + tbloffset]))); + real2 v143 = minusplus(uminus(v139), v140); + store(out, 12 << %SHIFT%, ctimesminusplus(reverse(v143), tbl[28 + tbloffset], ctimes(v143, tbl[29 + tbloffset]))); + real2 v141 = minusplus(v139, v140); + store(out, 4 << %SHIFT%, ctimesminusplus(reverse(v141), tbl[26 + tbloffset], ctimes(v141, tbl[27 + tbloffset]))); + real2 v101 = minusplus(v99, v100); + real2 v103 = minusplus(uminus(v99), v100); + real2 v117 = ctimesminusplus(reverse(v103), tbl[20 + tbloffset], ctimes(v103, tbl[21 + tbloffset])); + store(out, 6 << %SHIFT%, plus(v117, v137)); + real2 v172 = minus(v117, v137); + store(out, 14 << %SHIFT%, ctimesminusplus(v172, tbl[0 + tbloffset], ctimes(reverse(v172), tbl[1 + tbloffset]))); + real2 v111 = 
ctimesminusplus(reverse(v101), tbl[18 + tbloffset], ctimes(v101, tbl[19 + tbloffset])); + store(out, 2 << %SHIFT%, plus(v111, v131)); + real2 v166 = minus(v111, v131); + store(out, 10 << %SHIFT%, ctimesminusplus(v166, tbl[0 + tbloffset], ctimes(reverse(v166), tbl[1 + tbloffset]))); + real2 v23 = minusplus(uminus(v19), v20); + real2 v21 = minusplus(v19, v20); + real2 v81 = minusplus(v79, v80); + real2 v83 = minusplus(uminus(v79), v80); + real2 v97 = ctimesminusplus(reverse(v83), tbl[16 + tbloffset], ctimes(v83, tbl[17 + tbloffset])); + real2 v211 = plus(v57, v97); + real2 v205 = reverse(minus(v97, v57)); + real2 v61 = minusplus(v59, v60); + real2 v63 = minusplus(uminus(v59), v60); + real2 v77 = ctimesminusplus(reverse(v63), tbl[12 + tbloffset], ctimes(v63, tbl[13 + tbloffset])); + real2 v37 = ctimesminusplus(reverse(v23), tbl[4 + tbloffset], ctimes(v23, tbl[5 + tbloffset])); + real2 v210 = plus(v37, v77); + real2 v206 = minus(v77, v37); + store(out, 3 << %SHIFT%, plus(v210, v211)); + real2 v224 = minus(v210, v211); + store(out, 11 << %SHIFT%, ctimesminusplus(v224, tbl[0 + tbloffset], ctimes(reverse(v224), tbl[1 + tbloffset]))); + real2 v207 = minusplus(v205, v206); + real2 v209 = minusplus(uminus(v205), v206); + store(out, 15 << %SHIFT%, ctimesminusplus(reverse(v209), tbl[36 + tbloffset], ctimes(v209, tbl[37 + tbloffset]))); + store(out, 7 << %SHIFT%, ctimesminusplus(reverse(v207), tbl[34 + tbloffset], ctimes(v207, tbl[35 + tbloffset]))); + real2 v71 = ctimesminusplus(reverse(v61), tbl[10 + tbloffset], ctimes(v61, tbl[11 + tbloffset])); + real2 v51 = ctimesminusplus(reverse(v41), tbl[6 + tbloffset], ctimes(v41, tbl[7 + tbloffset])); + real2 v91 = ctimesminusplus(reverse(v81), tbl[14 + tbloffset], ctimes(v81, tbl[15 + tbloffset])); + real2 v185 = plus(v51, v91); + real2 v179 = reverse(minus(v91, v51)); + real2 v31 = ctimesminusplus(reverse(v21), tbl[2 + tbloffset], ctimes(v21, tbl[3 + tbloffset])); + real2 v184 = plus(v31, v71); + real2 v180 = minus(v71, v31); + 
store(out, 1 << %SHIFT%, plus(v184, v185)); + real2 v198 = minus(v184, v185); + store(out, 9 << %SHIFT%, ctimesminusplus(v198, tbl[0 + tbloffset], ctimes(reverse(v198), tbl[1 + tbloffset]))); + real2 v181 = minusplus(v179, v180); + store(out, 5 << %SHIFT%, ctimesminusplus(reverse(v181), tbl[30 + tbloffset], ctimes(v181, tbl[31 + tbloffset]))); + real2 v183 = minusplus(uminus(v179), v180); + store(out, 13 << %SHIFT%, ctimesminusplus(reverse(v183), tbl[32 + tbloffset], ctimes(v183, tbl[33 + tbloffset]))); + } +} +/* but32f: kernel template that consumes 32 inputs and produces 32 outputs per loop iteration ('but' presumably means butterfly and 'f' forward; confirm against the callers). SHIFT, CONFIG and ISA written between percent signs are placeholders, presumably replaced when this .in template is expanded. The loop runs 1 << (inShift - LOG2VECWIDTH) iterations under an OpenMP 'parallel for' pragma; iteration i reads relative to in0 + 2 * (i << LOG2VECWIDTH), writes relative to out0 + q[i], and offsets tbl by K * ((i << LOG2VECWIDTH) >> SHIFT). NOTE(review): but32b looks like the same code with the operands of the minus() calls inside reverse() swapped; confirm against the full file. */ +ALIGNED(8192) void but32f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); +/* Straight-line body: each vNN temporary is declared and initialised once. Inputs are read with load(in, j << inShift) and results written with store(out, j << SHIFT, ...) for j = 0..31; the largest tbl index used is 93 + tbloffset. */ + real2 v14 = load(in, 12 << inShift); + real2 v30 = load(in, 28 << inShift); + real2 v115 = reverse(minus(v14, v30)); + real2 v121 = plus(v14, v30); + real2 v6 = load(in, 4 << inShift); + real2 v22 = load(in, 20 << inShift); + real2 v120 = plus(v6, v22); + real2 v116 = minus(v22, v6); + real2 v201 = plus(v120, v121); + real2 v195 = reverse(minus(v120, v121)); + real2 v119 = minusplus(uminus(v115), v116); + real2 v117 = minusplus(v115, v116); + real2 v133 = ctimesminusplus(reverse(v119), tbl[20 + tbloffset], ctimes(v119, tbl[21 + tbloffset])); + real2 v127 = ctimesminusplus(reverse(v117), tbl[18 + tbloffset], ctimes(v117, tbl[19 + tbloffset])); + real2 v18 = load(in, 16 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v40 = plus(v2, v18); + real2 v36 = minus(v18, v2); + real2 v10 = load(in, 8 << inShift); + real2 v26 = load(in, 24 << inShift); + real2 v41 = plus(v10, v26); + real2 v35 = reverse(minus(v10, v26)); + real2 v200 = plus(v40, v41); + real2 v196 = minus(v41, v40); + real2 v37 = minusplus(v35, v36); + real2 v39 = minusplus(uminus(v35), 
v36); + real2 v53 = ctimesminusplus(reverse(v39), tbl[4 + tbloffset], ctimes(v39, tbl[5 + tbloffset])); + real2 v276 = minus(v201, v200); + real2 v280 = plus(v200, v201); + real2 v47 = ctimesminusplus(reverse(v37), tbl[2 + tbloffset], ctimes(v37, tbl[3 + tbloffset])); + real2 v199 = minusplus(uminus(v195), v196); + real2 v197 = minusplus(v195, v196); + real2 v486 = minus(v133, v53); + real2 v490 = plus(v53, v133); + real2 v213 = ctimesminusplus(reverse(v199), tbl[36 + tbloffset], ctimes(v199, tbl[37 + tbloffset])); + real2 v207 = ctimesminusplus(reverse(v197), tbl[34 + tbloffset], ctimes(v197, tbl[35 + tbloffset])); + real2 v28 = load(in, 26 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v81 = plus(v12, v28); + real2 v75 = reverse(minus(v12, v28)); + real2 v20 = load(in, 18 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v80 = plus(v4, v20); + real2 v76 = minus(v20, v4); + real2 v236 = minus(v81, v80); + real2 v240 = plus(v80, v81); + real2 v77 = minusplus(v75, v76); + real2 v79 = minusplus(uminus(v75), v76); + real2 v93 = ctimesminusplus(reverse(v79), tbl[12 + tbloffset], ctimes(v79, tbl[13 + tbloffset])); + real2 v32 = load(in, 30 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v155 = reverse(minus(v16, v32)); + real2 v161 = plus(v16, v32); + real2 v24 = load(in, 22 << inShift); + real2 v8 = load(in, 6 << inShift); + real2 v160 = plus(v8, v24); + real2 v156 = minus(v24, v8); + real2 v235 = reverse(minus(v160, v161)); + real2 v241 = plus(v160, v161); + real2 v157 = minusplus(v155, v156); + real2 v159 = minusplus(uminus(v155), v156); + real2 v173 = ctimesminusplus(reverse(v159), tbl[28 + tbloffset], ctimes(v159, tbl[29 + tbloffset])); + real2 v485 = reverse(minus(v93, v173)); + real2 v491 = plus(v93, v173); + real2 v489 = minusplus(uminus(v485), v486); + real2 v487 = minusplus(v485, v486); + real2 v239 = minusplus(uminus(v235), v236); + real2 v237 = minusplus(v235, v236); + real2 v253 = ctimesminusplus(reverse(v239), tbl[44 + 
tbloffset], ctimes(v239, tbl[45 + tbloffset])); + real2 v497 = ctimesminusplus(reverse(v487), tbl[82 + tbloffset], ctimes(v487, tbl[83 + tbloffset])); + real2 v530 = plus(v490, v491); + real2 v526 = minus(v491, v490); + real2 v503 = ctimesminusplus(reverse(v489), tbl[84 + tbloffset], ctimes(v489, tbl[85 + tbloffset])); + real2 v247 = ctimesminusplus(reverse(v237), tbl[42 + tbloffset], ctimes(v237, tbl[43 + tbloffset])); + real2 v356 = minus(v247, v207); + real2 v360 = plus(v207, v247); + real2 v386 = plus(v213, v253); + real2 v382 = minus(v253, v213); + real2 v17 = load(in, 15 << inShift); + real2 v33 = load(in, 31 << inShift); + real2 v175 = reverse(minus(v17, v33)); + real2 v181 = plus(v17, v33); + real2 v25 = load(in, 23 << inShift); + real2 v9 = load(in, 7 << inShift); + real2 v176 = minus(v25, v9); + real2 v180 = plus(v9, v25); + real2 v177 = minusplus(v175, v176); + real2 v179 = minusplus(uminus(v175), v176); + real2 v193 = ctimesminusplus(reverse(v179), tbl[32 + tbloffset], ctimes(v179, tbl[33 + tbloffset])); + real2 v261 = plus(v180, v181); + real2 v255 = reverse(minus(v180, v181)); + real2 v29 = load(in, 27 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v101 = plus(v13, v29); + real2 v95 = reverse(minus(v13, v29)); + real2 v21 = load(in, 19 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v100 = plus(v5, v21); + real2 v96 = minus(v21, v5); + real2 v99 = minusplus(uminus(v95), v96); + real2 v97 = minusplus(v95, v96); + real2 v260 = plus(v100, v101); + real2 v256 = minus(v101, v100); + real2 v259 = minusplus(uminus(v255), v256); + real2 v257 = minusplus(v255, v256); + real2 v273 = ctimesminusplus(reverse(v259), tbl[48 + tbloffset], ctimes(v259, tbl[49 + tbloffset])); + real2 v267 = ctimesminusplus(reverse(v257), tbl[46 + tbloffset], ctimes(v257, tbl[47 + tbloffset])); + real2 v3 = load(in, 1 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v60 = plus(v3, v19); + real2 v56 = minus(v19, v3); + real2 v27 = load(in, 25 << 
inShift); + real2 v11 = load(in, 9 << inShift); + real2 v55 = reverse(minus(v11, v27)); + real2 v61 = plus(v11, v27); + real2 v220 = plus(v60, v61); + real2 v216 = minus(v61, v60); + real2 v7 = load(in, 5 << inShift); + real2 v23 = load(in, 21 << inShift); + real2 v136 = minus(v23, v7); + real2 v140 = plus(v7, v23); + real2 v15 = load(in, 13 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v135 = reverse(minus(v15, v31)); + real2 v141 = plus(v15, v31); + real2 v215 = reverse(minus(v140, v141)); + real2 v221 = plus(v140, v141); + real2 v219 = minusplus(uminus(v215), v216); + real2 v217 = minusplus(v215, v216); + real2 v227 = ctimesminusplus(reverse(v217), tbl[38 + tbloffset], ctimes(v217, tbl[39 + tbloffset])); + real2 v355 = reverse(minus(v227, v267)); + real2 v361 = plus(v227, v267); + store(out, 2 << %SHIFT%, plus(v360, v361)); + real2 v374 = minus(v360, v361); + store(out, 18 << %SHIFT%, ctimesminusplus(v374, tbl[0 + tbloffset], ctimes(reverse(v374), tbl[1 + tbloffset]))); + real2 v357 = minusplus(v355, v356); + store(out, 10 << %SHIFT%, ctimesminusplus(reverse(v357), tbl[62 + tbloffset], ctimes(v357, tbl[63 + tbloffset]))); + real2 v359 = minusplus(uminus(v355), v356); + store(out, 26 << %SHIFT%, ctimesminusplus(reverse(v359), tbl[64 + tbloffset], ctimes(v359, tbl[65 + tbloffset]))); + real2 v233 = ctimesminusplus(reverse(v219), tbl[40 + tbloffset], ctimes(v219, tbl[41 + tbloffset])); + real2 v381 = reverse(minus(v233, v273)); + real2 v387 = plus(v233, v273); + store(out, 6 << %SHIFT%, plus(v386, v387)); + real2 v400 = minus(v386, v387); + store(out, 22 << %SHIFT%, ctimesminusplus(v400, tbl[0 + tbloffset], ctimes(reverse(v400), tbl[1 + tbloffset]))); + real2 v383 = minusplus(v381, v382); + real2 v385 = minusplus(uminus(v381), v382); + store(out, 30 << %SHIFT%, ctimesminusplus(reverse(v385), tbl[68 + tbloffset], ctimes(v385, tbl[69 + tbloffset]))); + store(out, 14 << %SHIFT%, ctimesminusplus(reverse(v383), tbl[66 + tbloffset], ctimes(v383, tbl[67 + 
tbloffset]))); + real2 v137 = minusplus(v135, v136); + real2 v139 = minusplus(uminus(v135), v136); + real2 v153 = ctimesminusplus(reverse(v139), tbl[24 + tbloffset], ctimes(v139, tbl[25 + tbloffset])); + real2 v113 = ctimesminusplus(reverse(v99), tbl[16 + tbloffset], ctimes(v99, tbl[17 + tbloffset])); + real2 v511 = plus(v113, v193); + real2 v505 = reverse(minus(v113, v193)); + real2 v57 = minusplus(v55, v56); + real2 v59 = minusplus(uminus(v55), v56); + real2 v73 = ctimesminusplus(reverse(v59), tbl[8 + tbloffset], ctimes(v59, tbl[9 + tbloffset])); + real2 v510 = plus(v73, v153); + real2 v506 = minus(v153, v73); + real2 v531 = plus(v510, v511); + real2 v525 = reverse(minus(v510, v511)); + store(out, 3 << %SHIFT%, plus(v530, v531)); + real2 v544 = minus(v530, v531); + store(out, 19 << %SHIFT%, ctimesminusplus(v544, tbl[0 + tbloffset], ctimes(reverse(v544), tbl[1 + tbloffset]))); + real2 v527 = minusplus(v525, v526); + store(out, 11 << %SHIFT%, ctimesminusplus(reverse(v527), tbl[90 + tbloffset], ctimes(v527, tbl[91 + tbloffset]))); + real2 v529 = minusplus(uminus(v525), v526); + store(out, 27 << %SHIFT%, ctimesminusplus(reverse(v529), tbl[92 + tbloffset], ctimes(v529, tbl[93 + tbloffset]))); + real2 v509 = minusplus(uminus(v505), v506); + real2 v507 = minusplus(v505, v506); + real2 v523 = ctimesminusplus(reverse(v509), tbl[88 + tbloffset], ctimes(v509, tbl[89 + tbloffset])); + store(out, 15 << %SHIFT%, plus(v503, v523)); + real2 v556 = minus(v503, v523); + store(out, 31 << %SHIFT%, ctimesminusplus(v556, tbl[0 + tbloffset], ctimes(reverse(v556), tbl[1 + tbloffset]))); + real2 v517 = ctimesminusplus(reverse(v507), tbl[86 + tbloffset], ctimes(v507, tbl[87 + tbloffset])); + store(out, 7 << %SHIFT%, plus(v497, v517)); + real2 v550 = minus(v497, v517); + store(out, 23 << %SHIFT%, ctimesminusplus(v550, tbl[0 + tbloffset], ctimes(reverse(v550), tbl[1 + tbloffset]))); + real2 v275 = reverse(minus(v240, v241)); + real2 v281 = plus(v240, v241); + real2 v320 = plus(v280, v281); 
+ real2 v316 = minus(v281, v280); + real2 v301 = plus(v260, v261); + real2 v295 = reverse(minus(v260, v261)); + real2 v300 = plus(v220, v221); + real2 v296 = minus(v221, v220); + real2 v315 = reverse(minus(v300, v301)); + real2 v321 = plus(v300, v301); + store(out, 0 << %SHIFT%, plus(v320, v321)); + real2 v334 = minus(v320, v321); + store(out, 16 << %SHIFT%, ctimesminusplus(v334, tbl[0 + tbloffset], ctimes(reverse(v334), tbl[1 + tbloffset]))); + real2 v319 = minusplus(uminus(v315), v316); + real2 v317 = minusplus(v315, v316); + store(out, 8 << %SHIFT%, ctimesminusplus(reverse(v317), tbl[58 + tbloffset], ctimes(v317, tbl[59 + tbloffset]))); + store(out, 24 << %SHIFT%, ctimesminusplus(reverse(v319), tbl[60 + tbloffset], ctimes(v319, tbl[61 + tbloffset]))); + real2 v299 = minusplus(uminus(v295), v296); + real2 v297 = minusplus(v295, v296); + real2 v279 = minusplus(uminus(v275), v276); + real2 v277 = minusplus(v275, v276); + real2 v287 = ctimesminusplus(reverse(v277), tbl[50 + tbloffset], ctimes(v277, tbl[51 + tbloffset])); + real2 v307 = ctimesminusplus(reverse(v297), tbl[54 + tbloffset], ctimes(v297, tbl[55 + tbloffset])); + store(out, 4 << %SHIFT%, plus(v287, v307)); + real2 v342 = minus(v287, v307); + store(out, 20 << %SHIFT%, ctimesminusplus(v342, tbl[0 + tbloffset], ctimes(reverse(v342), tbl[1 + tbloffset]))); + real2 v313 = ctimesminusplus(reverse(v299), tbl[56 + tbloffset], ctimes(v299, tbl[57 + tbloffset])); + real2 v293 = ctimesminusplus(reverse(v279), tbl[52 + tbloffset], ctimes(v279, tbl[53 + tbloffset])); + store(out, 12 << %SHIFT%, plus(v293, v313)); + real2 v348 = minus(v293, v313); + store(out, 28 << %SHIFT%, ctimesminusplus(v348, tbl[0 + tbloffset], ctimes(reverse(v348), tbl[1 + tbloffset]))); + real2 v87 = ctimesminusplus(reverse(v77), tbl[10 + tbloffset], ctimes(v77, tbl[11 + tbloffset])); + real2 v147 = ctimesminusplus(reverse(v137), tbl[22 + tbloffset], ctimes(v137, tbl[23 + tbloffset])); + real2 v187 = ctimesminusplus(reverse(v177), tbl[30 + 
tbloffset], ctimes(v177, tbl[31 + tbloffset])); + real2 v167 = ctimesminusplus(reverse(v157), tbl[26 + tbloffset], ctimes(v157, tbl[27 + tbloffset])); + real2 v413 = plus(v87, v167); + real2 v407 = reverse(minus(v87, v167)); + real2 v67 = ctimesminusplus(reverse(v57), tbl[6 + tbloffset], ctimes(v57, tbl[7 + tbloffset])); + real2 v107 = ctimesminusplus(reverse(v97), tbl[14 + tbloffset], ctimes(v97, tbl[15 + tbloffset])); + real2 v427 = reverse(minus(v107, v187)); + real2 v433 = plus(v107, v187); + real2 v432 = plus(v67, v147); + real2 v428 = minus(v147, v67); + real2 v453 = plus(v432, v433); + real2 v447 = reverse(minus(v432, v433)); + real2 v408 = minus(v127, v47); + real2 v412 = plus(v47, v127); + real2 v452 = plus(v412, v413); + real2 v448 = minus(v413, v412); + store(out, 1 << %SHIFT%, plus(v452, v453)); + real2 v466 = minus(v452, v453); + store(out, 17 << %SHIFT%, ctimesminusplus(v466, tbl[0 + tbloffset], ctimes(reverse(v466), tbl[1 + tbloffset]))); + real2 v451 = minusplus(uminus(v447), v448); + store(out, 25 << %SHIFT%, ctimesminusplus(reverse(v451), tbl[80 + tbloffset], ctimes(v451, tbl[81 + tbloffset]))); + real2 v449 = minusplus(v447, v448); + store(out, 9 << %SHIFT%, ctimesminusplus(reverse(v449), tbl[78 + tbloffset], ctimes(v449, tbl[79 + tbloffset]))); + real2 v429 = minusplus(v427, v428); + real2 v431 = minusplus(uminus(v427), v428); + real2 v445 = ctimesminusplus(reverse(v431), tbl[76 + tbloffset], ctimes(v431, tbl[77 + tbloffset])); + real2 v409 = minusplus(v407, v408); + real2 v411 = minusplus(uminus(v407), v408); + real2 v425 = ctimesminusplus(reverse(v411), tbl[72 + tbloffset], ctimes(v411, tbl[73 + tbloffset])); + store(out, 13 << %SHIFT%, plus(v425, v445)); + real2 v478 = minus(v425, v445); + store(out, 29 << %SHIFT%, ctimesminusplus(v478, tbl[0 + tbloffset], ctimes(reverse(v478), tbl[1 + tbloffset]))); + real2 v439 = ctimesminusplus(reverse(v429), tbl[74 + tbloffset], ctimes(v429, tbl[75 + tbloffset])); + real2 v419 = 
ctimesminusplus(reverse(v409), tbl[70 + tbloffset], ctimes(v409, tbl[71 + tbloffset])); + store(out, 5 << %SHIFT%, plus(v419, v439)); + real2 v472 = minus(v419, v439); + store(out, 21 << %SHIFT%, ctimesminusplus(v472, tbl[0 + tbloffset], ctimes(reverse(v472), tbl[1 + tbloffset]))); + } +} + /* but32b: fully unrolled 32-point butterfly kernel (the "b" suffix presumably marks the backward transform - TODO confirm against the generator). Each loop iteration issues 32 load(in, n << inShift) calls for n = 0..31, combines the values using tbl entries tbloffset+0 through tbloffset+93, and issues 32 store(out, n << %SHIFT%, ...) calls for n = 0..31. %SHIFT%, %CONFIG% and %ISA% are template placeholders, presumably substituted before compilation - verify in mkunroll.c. load, store, real2 and the arithmetic helpers are defined outside this file. */ +ALIGNED(8192) void but32b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); /* number of loop iterations */ + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; /* per-iteration output base comes from the q[] offset table */ + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); /* base index into tbl for this iteration; K is presumably the per-slice entry count - TODO confirm */ + /* Unrolled body: loads, arithmetic and stores are interleaved, and each vNNN temporary is declared once and consumed by later statements, so the statement order is significant. */ + real2 v14 = load(in, 12 << inShift); + real2 v30 = load(in, 28 << inShift); + real2 v115 = reverse(minus(v30, v14)); + real2 v121 = plus(v14, v30); + real2 v6 = load(in, 4 << inShift); + real2 v22 = load(in, 20 << inShift); + real2 v120 = plus(v6, v22); + real2 v116 = minus(v22, v6); + real2 v201 = plus(v120, v121); + real2 v195 = reverse(minus(v121, v120)); + real2 v119 = minusplus(uminus(v115), v116); + real2 v117 = minusplus(v115, v116); + real2 v133 = ctimesminusplus(reverse(v119), tbl[20 + tbloffset], ctimes(v119, tbl[21 + tbloffset])); + real2 v127 = ctimesminusplus(reverse(v117), tbl[18 + tbloffset], ctimes(v117, tbl[19 + tbloffset])); + real2 v18 = load(in, 16 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v40 = plus(v2, v18); + real2 v36 = minus(v18, v2); + real2 v10 = load(in, 8 << inShift); + real2 v26 = load(in, 24 << inShift); + real2 v41 = plus(v10, v26); + real2 v35 = reverse(minus(v26, v10)); + real2 v200 = plus(v40, v41); + real2 v196 = minus(v41, v40); + real2 v37 = minusplus(v35, v36); + real2 v39 = minusplus(uminus(v35), v36); + real2 v53 = ctimesminusplus(reverse(v39), tbl[4 + tbloffset], ctimes(v39, tbl[5 + tbloffset])); + real2 v276 = minus(v201, v200); + real2 v280 = plus(v200, v201); + real2 v47 = ctimesminusplus(reverse(v37), tbl[2 +
tbloffset], ctimes(v37, tbl[3 + tbloffset])); + real2 v199 = minusplus(uminus(v195), v196); + real2 v197 = minusplus(v195, v196); + real2 v486 = minus(v133, v53); + real2 v490 = plus(v53, v133); + real2 v213 = ctimesminusplus(reverse(v199), tbl[36 + tbloffset], ctimes(v199, tbl[37 + tbloffset])); + real2 v207 = ctimesminusplus(reverse(v197), tbl[34 + tbloffset], ctimes(v197, tbl[35 + tbloffset])); + real2 v28 = load(in, 26 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v81 = plus(v12, v28); + real2 v75 = reverse(minus(v28, v12)); + real2 v20 = load(in, 18 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v80 = plus(v4, v20); + real2 v76 = minus(v20, v4); + real2 v236 = minus(v81, v80); + real2 v240 = plus(v80, v81); + real2 v77 = minusplus(v75, v76); + real2 v79 = minusplus(uminus(v75), v76); + real2 v93 = ctimesminusplus(reverse(v79), tbl[12 + tbloffset], ctimes(v79, tbl[13 + tbloffset])); + real2 v32 = load(in, 30 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v155 = reverse(minus(v32, v16)); + real2 v161 = plus(v16, v32); + real2 v24 = load(in, 22 << inShift); + real2 v8 = load(in, 6 << inShift); + real2 v160 = plus(v8, v24); + real2 v156 = minus(v24, v8); + real2 v235 = reverse(minus(v161, v160)); + real2 v241 = plus(v160, v161); + real2 v157 = minusplus(v155, v156); + real2 v159 = minusplus(uminus(v155), v156); + real2 v173 = ctimesminusplus(reverse(v159), tbl[28 + tbloffset], ctimes(v159, tbl[29 + tbloffset])); + real2 v485 = reverse(minus(v173, v93)); + real2 v491 = plus(v93, v173); + real2 v489 = minusplus(uminus(v485), v486); + real2 v487 = minusplus(v485, v486); + real2 v239 = minusplus(uminus(v235), v236); + real2 v237 = minusplus(v235, v236); + real2 v253 = ctimesminusplus(reverse(v239), tbl[44 + tbloffset], ctimes(v239, tbl[45 + tbloffset])); + real2 v497 = ctimesminusplus(reverse(v487), tbl[82 + tbloffset], ctimes(v487, tbl[83 + tbloffset])); + real2 v530 = plus(v490, v491); + real2 v526 = minus(v491, v490); + real2
v503 = ctimesminusplus(reverse(v489), tbl[84 + tbloffset], ctimes(v489, tbl[85 + tbloffset])); + real2 v247 = ctimesminusplus(reverse(v237), tbl[42 + tbloffset], ctimes(v237, tbl[43 + tbloffset])); + real2 v356 = minus(v247, v207); + real2 v360 = plus(v207, v247); + real2 v386 = plus(v213, v253); + real2 v382 = minus(v253, v213); + real2 v17 = load(in, 15 << inShift); + real2 v33 = load(in, 31 << inShift); + real2 v175 = reverse(minus(v33, v17)); + real2 v181 = plus(v17, v33); + real2 v25 = load(in, 23 << inShift); + real2 v9 = load(in, 7 << inShift); + real2 v176 = minus(v25, v9); + real2 v180 = plus(v9, v25); + real2 v177 = minusplus(v175, v176); + real2 v179 = minusplus(uminus(v175), v176); + real2 v193 = ctimesminusplus(reverse(v179), tbl[32 + tbloffset], ctimes(v179, tbl[33 + tbloffset])); + real2 v261 = plus(v180, v181); + real2 v255 = reverse(minus(v181, v180)); + real2 v29 = load(in, 27 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v101 = plus(v13, v29); + real2 v95 = reverse(minus(v29, v13)); + real2 v21 = load(in, 19 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v100 = plus(v5, v21); + real2 v96 = minus(v21, v5); + real2 v99 = minusplus(uminus(v95), v96); + real2 v97 = minusplus(v95, v96); + real2 v260 = plus(v100, v101); + real2 v256 = minus(v101, v100); + real2 v259 = minusplus(uminus(v255), v256); + real2 v257 = minusplus(v255, v256); + real2 v273 = ctimesminusplus(reverse(v259), tbl[48 + tbloffset], ctimes(v259, tbl[49 + tbloffset])); + real2 v267 = ctimesminusplus(reverse(v257), tbl[46 + tbloffset], ctimes(v257, tbl[47 + tbloffset])); + real2 v3 = load(in, 1 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v60 = plus(v3, v19); + real2 v56 = minus(v19, v3); + real2 v27 = load(in, 25 << inShift); + real2 v11 = load(in, 9 << inShift); + real2 v55 = reverse(minus(v27, v11)); + real2 v61 = plus(v11, v27); + real2 v220 = plus(v60, v61); + real2 v216 = minus(v61, v60); + real2 v7 = load(in, 5 << inShift); + real2 v23 =
load(in, 21 << inShift); + real2 v136 = minus(v23, v7); + real2 v140 = plus(v7, v23); + real2 v15 = load(in, 13 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v135 = reverse(minus(v31, v15)); + real2 v141 = plus(v15, v31); + real2 v215 = reverse(minus(v141, v140)); + real2 v221 = plus(v140, v141); + real2 v219 = minusplus(uminus(v215), v216); + real2 v217 = minusplus(v215, v216); + real2 v227 = ctimesminusplus(reverse(v217), tbl[38 + tbloffset], ctimes(v217, tbl[39 + tbloffset])); + real2 v355 = reverse(minus(v267, v227)); + real2 v361 = plus(v227, v267); + store(out, 2 << %SHIFT%, plus(v360, v361)); + real2 v374 = minus(v360, v361); + store(out, 18 << %SHIFT%, ctimesminusplus(v374, tbl[0 + tbloffset], ctimes(reverse(v374), tbl[1 + tbloffset]))); + real2 v357 = minusplus(v355, v356); + store(out, 10 << %SHIFT%, ctimesminusplus(reverse(v357), tbl[62 + tbloffset], ctimes(v357, tbl[63 + tbloffset]))); + real2 v359 = minusplus(uminus(v355), v356); + store(out, 26 << %SHIFT%, ctimesminusplus(reverse(v359), tbl[64 + tbloffset], ctimes(v359, tbl[65 + tbloffset]))); + real2 v233 = ctimesminusplus(reverse(v219), tbl[40 + tbloffset], ctimes(v219, tbl[41 + tbloffset])); + real2 v381 = reverse(minus(v273, v233)); + real2 v387 = plus(v233, v273); + store(out, 6 << %SHIFT%, plus(v386, v387)); + real2 v400 = minus(v386, v387); + store(out, 22 << %SHIFT%, ctimesminusplus(v400, tbl[0 + tbloffset], ctimes(reverse(v400), tbl[1 + tbloffset]))); + real2 v383 = minusplus(v381, v382); + real2 v385 = minusplus(uminus(v381), v382); + store(out, 30 << %SHIFT%, ctimesminusplus(reverse(v385), tbl[68 + tbloffset], ctimes(v385, tbl[69 + tbloffset]))); + store(out, 14 << %SHIFT%, ctimesminusplus(reverse(v383), tbl[66 + tbloffset], ctimes(v383, tbl[67 + tbloffset]))); + real2 v137 = minusplus(v135, v136); + real2 v139 = minusplus(uminus(v135), v136); + real2 v153 = ctimesminusplus(reverse(v139), tbl[24 + tbloffset], ctimes(v139, tbl[25 + tbloffset])); + real2 v113 =
ctimesminusplus(reverse(v99), tbl[16 + tbloffset], ctimes(v99, tbl[17 + tbloffset])); + real2 v511 = plus(v113, v193); + real2 v505 = reverse(minus(v193, v113)); + real2 v57 = minusplus(v55, v56); + real2 v59 = minusplus(uminus(v55), v56); + real2 v73 = ctimesminusplus(reverse(v59), tbl[8 + tbloffset], ctimes(v59, tbl[9 + tbloffset])); + real2 v510 = plus(v73, v153); + real2 v506 = minus(v153, v73); + real2 v531 = plus(v510, v511); + real2 v525 = reverse(minus(v511, v510)); + store(out, 3 << %SHIFT%, plus(v530, v531)); + real2 v544 = minus(v530, v531); + store(out, 19 << %SHIFT%, ctimesminusplus(v544, tbl[0 + tbloffset], ctimes(reverse(v544), tbl[1 + tbloffset]))); + real2 v527 = minusplus(v525, v526); + store(out, 11 << %SHIFT%, ctimesminusplus(reverse(v527), tbl[90 + tbloffset], ctimes(v527, tbl[91 + tbloffset]))); + real2 v529 = minusplus(uminus(v525), v526); + store(out, 27 << %SHIFT%, ctimesminusplus(reverse(v529), tbl[92 + tbloffset], ctimes(v529, tbl[93 + tbloffset]))); + real2 v509 = minusplus(uminus(v505), v506); + real2 v507 = minusplus(v505, v506); + real2 v523 = ctimesminusplus(reverse(v509), tbl[88 + tbloffset], ctimes(v509, tbl[89 + tbloffset])); + store(out, 15 << %SHIFT%, plus(v503, v523)); + real2 v556 = minus(v503, v523); + store(out, 31 << %SHIFT%, ctimesminusplus(v556, tbl[0 + tbloffset], ctimes(reverse(v556), tbl[1 + tbloffset]))); + real2 v517 = ctimesminusplus(reverse(v507), tbl[86 + tbloffset], ctimes(v507, tbl[87 + tbloffset])); + store(out, 7 << %SHIFT%, plus(v497, v517)); + real2 v550 = minus(v497, v517); + store(out, 23 << %SHIFT%, ctimesminusplus(v550, tbl[0 + tbloffset], ctimes(reverse(v550), tbl[1 + tbloffset]))); + real2 v275 = reverse(minus(v241, v240)); + real2 v281 = plus(v240, v241); + real2 v320 = plus(v280, v281); + real2 v316 = minus(v281, v280); + real2 v301 = plus(v260, v261); + real2 v295 = reverse(minus(v261, v260)); + real2 v300 = plus(v220, v221); + real2 v296 = minus(v221, v220); + real2 v315 = reverse(minus(v301,
v300)); + real2 v321 = plus(v300, v301); + store(out, 0 << %SHIFT%, plus(v320, v321)); + real2 v334 = minus(v320, v321); + store(out, 16 << %SHIFT%, ctimesminusplus(v334, tbl[0 + tbloffset], ctimes(reverse(v334), tbl[1 + tbloffset]))); + real2 v319 = minusplus(uminus(v315), v316); + real2 v317 = minusplus(v315, v316); + store(out, 8 << %SHIFT%, ctimesminusplus(reverse(v317), tbl[58 + tbloffset], ctimes(v317, tbl[59 + tbloffset]))); + store(out, 24 << %SHIFT%, ctimesminusplus(reverse(v319), tbl[60 + tbloffset], ctimes(v319, tbl[61 + tbloffset]))); + real2 v299 = minusplus(uminus(v295), v296); + real2 v297 = minusplus(v295, v296); + real2 v279 = minusplus(uminus(v275), v276); + real2 v277 = minusplus(v275, v276); + real2 v287 = ctimesminusplus(reverse(v277), tbl[50 + tbloffset], ctimes(v277, tbl[51 + tbloffset])); + real2 v307 = ctimesminusplus(reverse(v297), tbl[54 + tbloffset], ctimes(v297, tbl[55 + tbloffset])); + store(out, 4 << %SHIFT%, plus(v287, v307)); + real2 v342 = minus(v287, v307); + store(out, 20 << %SHIFT%, ctimesminusplus(v342, tbl[0 + tbloffset], ctimes(reverse(v342), tbl[1 + tbloffset]))); + real2 v313 = ctimesminusplus(reverse(v299), tbl[56 + tbloffset], ctimes(v299, tbl[57 + tbloffset])); + real2 v293 = ctimesminusplus(reverse(v279), tbl[52 + tbloffset], ctimes(v279, tbl[53 + tbloffset])); + store(out, 12 << %SHIFT%, plus(v293, v313)); + real2 v348 = minus(v293, v313); + store(out, 28 << %SHIFT%, ctimesminusplus(v348, tbl[0 + tbloffset], ctimes(reverse(v348), tbl[1 + tbloffset]))); + real2 v87 = ctimesminusplus(reverse(v77), tbl[10 + tbloffset], ctimes(v77, tbl[11 + tbloffset])); + real2 v147 = ctimesminusplus(reverse(v137), tbl[22 + tbloffset], ctimes(v137, tbl[23 + tbloffset])); + real2 v187 = ctimesminusplus(reverse(v177), tbl[30 + tbloffset], ctimes(v177, tbl[31 + tbloffset])); + real2 v167 = ctimesminusplus(reverse(v157), tbl[26 + tbloffset], ctimes(v157, tbl[27 + tbloffset])); + real2 v413 = plus(v87, v167); + real2 v407 = reverse(minus(v167,
v87)); + real2 v67 = ctimesminusplus(reverse(v57), tbl[6 + tbloffset], ctimes(v57, tbl[7 + tbloffset])); + real2 v107 = ctimesminusplus(reverse(v97), tbl[14 + tbloffset], ctimes(v97, tbl[15 + tbloffset])); + real2 v427 = reverse(minus(v187, v107)); + real2 v433 = plus(v107, v187); + real2 v432 = plus(v67, v147); + real2 v428 = minus(v147, v67); + real2 v453 = plus(v432, v433); + real2 v447 = reverse(minus(v433, v432)); + real2 v408 = minus(v127, v47); + real2 v412 = plus(v47, v127); + real2 v452 = plus(v412, v413); + real2 v448 = minus(v413, v412); + store(out, 1 << %SHIFT%, plus(v452, v453)); + real2 v466 = minus(v452, v453); + store(out, 17 << %SHIFT%, ctimesminusplus(v466, tbl[0 + tbloffset], ctimes(reverse(v466), tbl[1 + tbloffset]))); + real2 v451 = minusplus(uminus(v447), v448); + store(out, 25 << %SHIFT%, ctimesminusplus(reverse(v451), tbl[80 + tbloffset], ctimes(v451, tbl[81 + tbloffset]))); + real2 v449 = minusplus(v447, v448); + store(out, 9 << %SHIFT%, ctimesminusplus(reverse(v449), tbl[78 + tbloffset], ctimes(v449, tbl[79 + tbloffset]))); + real2 v429 = minusplus(v427, v428); + real2 v431 = minusplus(uminus(v427), v428); + real2 v445 = ctimesminusplus(reverse(v431), tbl[76 + tbloffset], ctimes(v431, tbl[77 + tbloffset])); + real2 v409 = minusplus(v407, v408); + real2 v411 = minusplus(uminus(v407), v408); + real2 v425 = ctimesminusplus(reverse(v411), tbl[72 + tbloffset], ctimes(v411, tbl[73 + tbloffset])); + store(out, 13 << %SHIFT%, plus(v425, v445)); + real2 v478 = minus(v425, v445); + store(out, 29 << %SHIFT%, ctimesminusplus(v478, tbl[0 + tbloffset], ctimes(reverse(v478), tbl[1 + tbloffset]))); + real2 v439 = ctimesminusplus(reverse(v429), tbl[74 + tbloffset], ctimes(v429, tbl[75 + tbloffset])); + real2 v419 = ctimesminusplus(reverse(v409), tbl[70 + tbloffset], ctimes(v409, tbl[71 + tbloffset])); + store(out, 5 << %SHIFT%, plus(v419, v439)); + real2 v472 = minus(v419, v439); + store(out, 21 << %SHIFT%, ctimesminusplus(v472, tbl[0 + tbloffset],
ctimes(reverse(v472), tbl[1 + tbloffset]))); + } +} + +ALIGNED(8192) void but64f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + // Pres : 30254 + real2 v37 = load(in, 35 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v132 = plus(v5, v37); + real2 v128 = minus(v37, v5); + real2 v21 = load(in, 19 << inShift); + real2 v53 = load(in, 51 << inShift); + real2 v133 = plus(v21, v53); + real2 v127 = reverse(minus(v21, v53)); + real2 v131 = minusplus(uminus(v127), v128); + real2 v129 = minusplus(v127, v128); + real2 v139 = ctimesminusplus(reverse(v129), tbl[14 + tbloffset], ctimes(v129, tbl[15 + tbloffset])); + real2 v145 = ctimesminusplus(reverse(v131), tbl[16 + tbloffset], ctimes(v131, tbl[17 + tbloffset])); + real2 v448 = minus(v133, v132); + real2 v452 = plus(v132, v133); + real2 v45 = load(in, 43 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v292 = plus(v13, v45); + real2 v288 = minus(v45, v13); + real2 v29 = load(in, 27 << inShift); + real2 v61 = load(in, 59 << inShift); + real2 v293 = plus(v29, v61); + real2 v287 = reverse(minus(v29, v61)); + real2 v291 = minusplus(uminus(v287), v288); + real2 v289 = minusplus(v287, v288); + real2 v299 = ctimesminusplus(reverse(v289), tbl[46 + tbloffset], ctimes(v289, tbl[47 + tbloffset])); + real2 v453 = plus(v292, v293); + real2 v447 = reverse(minus(v292, v293)); + real2 v608 = minus(v453, v452); + real2 v612 = plus(v452, v453); + real2 v980 = plus(v139, v299); + real2 v976 = minus(v299, v139); + real2 v449 = minusplus(v447, v448); + real2 v451 = minusplus(uminus(v447), v448); + real2 v465 = ctimesminusplus(reverse(v451), tbl[80 + tbloffset], ctimes(v451, tbl[81 +
tbloffset])); + real2 v305 = ctimesminusplus(reverse(v291), tbl[48 + tbloffset], ctimes(v291, tbl[49 + tbloffset])); + real2 v1186 = minus(v305, v145); + real2 v1190 = plus(v145, v305); + real2 v459 = ctimesminusplus(reverse(v449), tbl[78 + tbloffset], ctimes(v449, tbl[79 + tbloffset])); + real2 v25 = load(in, 23 << inShift); + real2 v57 = load(in, 55 << inShift); + real2 v207 = reverse(minus(v25, v57)); + real2 v213 = plus(v25, v57); + real2 v9 = load(in, 7 << inShift); + real2 v41 = load(in, 39 << inShift); + real2 v212 = plus(v9, v41); + real2 v208 = minus(v41, v9); + real2 v528 = minus(v213, v212); + real2 v532 = plus(v212, v213); + real2 v209 = minusplus(v207, v208); + real2 v211 = minusplus(uminus(v207), v208); + real2 v225 = ctimesminusplus(reverse(v211), tbl[32 + tbloffset], ctimes(v211, tbl[33 + tbloffset])); + real2 v219 = ctimesminusplus(reverse(v209), tbl[30 + tbloffset], ctimes(v209, tbl[31 + tbloffset])); + real2 v17 = load(in, 15 << inShift); + real2 v49 = load(in, 47 << inShift); + real2 v368 = minus(v49, v17); + real2 v372 = plus(v17, v49); + real2 v33 = load(in, 31 << inShift); + real2 v65 = load(in, 63 << inShift); + real2 v367 = reverse(minus(v33, v65)); + real2 v373 = plus(v33, v65); + real2 v369 = minusplus(v367, v368); + real2 v371 = minusplus(uminus(v367), v368); + real2 v533 = plus(v372, v373); + real2 v527 = reverse(minus(v372, v373)); + real2 v607 = reverse(minus(v532, v533)); + real2 v613 = plus(v532, v533); + real2 v529 = minusplus(v527, v528); + real2 v531 = minusplus(uminus(v527), v528); + real2 v545 = ctimesminusplus(reverse(v531), tbl[96 + tbloffset], ctimes(v531, tbl[97 + tbloffset])); + real2 v653 = plus(v612, v613); + real2 v647 = reverse(minus(v612, v613)); + real2 v609 = minusplus(v607, v608); + real2 v611 = minusplus(uminus(v607), v608); + real2 v863 = plus(v465, v545); + real2 v857 = reverse(minus(v465, v545)); + real2 v539 = ctimesminusplus(reverse(v529), tbl[94 + tbloffset], ctimes(v529, tbl[95 + tbloffset])); + real2 v385 
= ctimesminusplus(reverse(v371), tbl[64 + tbloffset], ctimes(v371, tbl[65 + tbloffset])); + real2 v619 = ctimesminusplus(reverse(v609), tbl[110 + tbloffset], ctimes(v609, tbl[111 + tbloffset])); + real2 v1191 = plus(v225, v385); + real2 v1185 = reverse(minus(v225, v385)); + real2 v779 = reverse(minus(v459, v539)); + real2 v785 = plus(v459, v539); + real2 v625 = ctimesminusplus(reverse(v611), tbl[112 + tbloffset], ctimes(v611, tbl[113 + tbloffset])); + real2 v379 = ctimesminusplus(reverse(v369), tbl[62 + tbloffset], ctimes(v369, tbl[63 + tbloffset])); + real2 v975 = reverse(minus(v219, v379)); + real2 v981 = plus(v219, v379); + real2 v977 = minusplus(v975, v976); + real2 v979 = minusplus(uminus(v975), v976); + real2 v987 = ctimesminusplus(reverse(v977), tbl[170 + tbloffset], ctimes(v977, tbl[171 + tbloffset])); + real2 v993 = ctimesminusplus(reverse(v979), tbl[172 + tbloffset], ctimes(v979, tbl[173 + tbloffset])); + real2 v1015 = reverse(minus(v980, v981)); + real2 v1021 = plus(v980, v981); + real2 v11 = load(in, 9 << inShift); + real2 v43 = load(in, 41 << inShift); + real2 v248 = minus(v43, v11); + real2 v252 = plus(v11, v43); + real2 v59 = load(in, 57 << inShift); + real2 v27 = load(in, 25 << inShift); + real2 v253 = plus(v27, v59); + real2 v247 = reverse(minus(v27, v59)); + real2 v413 = plus(v252, v253); + real2 v407 = reverse(minus(v252, v253)); + real2 v249 = minusplus(v247, v248); + real2 v251 = minusplus(uminus(v247), v248); + real2 v259 = ctimesminusplus(reverse(v249), tbl[38 + tbloffset], ctimes(v249, tbl[39 + tbloffset])); + real2 v35 = load(in, 33 << inShift); + real2 v3 = load(in, 1 << inShift); + real2 v92 = plus(v3, v35); + real2 v88 = minus(v35, v3); + real2 v51 = load(in, 49 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v87 = reverse(minus(v19, v51)); + real2 v93 = plus(v19, v51); + real2 v412 = plus(v92, v93); + real2 v408 = minus(v93, v92); + real2 v411 = minusplus(uminus(v407), v408); + real2 v409 = minusplus(v407, v408); + real2 v91 
= minusplus(uminus(v87), v88); + real2 v89 = minusplus(v87, v88); + real2 v99 = ctimesminusplus(reverse(v89), tbl[6 + tbloffset], ctimes(v89, tbl[7 + tbloffset])); + real2 v425 = ctimesminusplus(reverse(v411), tbl[72 + tbloffset], ctimes(v411, tbl[73 + tbloffset])); + real2 v568 = minus(v413, v412); + real2 v572 = plus(v412, v413); + real2 v940 = plus(v99, v259); + real2 v936 = minus(v259, v99); + real2 v419 = ctimesminusplus(reverse(v409), tbl[70 + tbloffset], ctimes(v409, tbl[71 + tbloffset])); + real2 v47 = load(in, 45 << inShift); + real2 v15 = load(in, 13 << inShift); + real2 v332 = plus(v15, v47); + real2 v328 = minus(v47, v15); + real2 v63 = load(in, 61 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v327 = reverse(minus(v31, v63)); + real2 v333 = plus(v31, v63); + real2 v329 = minusplus(v327, v328); + real2 v331 = minusplus(uminus(v327), v328); + real2 v339 = ctimesminusplus(reverse(v329), tbl[54 + tbloffset], ctimes(v329, tbl[55 + tbloffset])); + real2 v487 = reverse(minus(v332, v333)); + real2 v493 = plus(v332, v333); + real2 v7 = load(in, 5 << inShift); + real2 v39 = load(in, 37 << inShift); + real2 v172 = plus(v7, v39); + real2 v168 = minus(v39, v7); + real2 v55 = load(in, 53 << inShift); + real2 v23 = load(in, 21 << inShift); + real2 v173 = plus(v23, v55); + real2 v167 = reverse(minus(v23, v55)); + real2 v488 = minus(v173, v172); + real2 v492 = plus(v172, v173); + real2 v491 = minusplus(uminus(v487), v488); + real2 v489 = minusplus(v487, v488); + real2 v499 = ctimesminusplus(reverse(v489), tbl[86 + tbloffset], ctimes(v489, tbl[87 + tbloffset])); + real2 v505 = ctimesminusplus(reverse(v491), tbl[88 + tbloffset], ctimes(v491, tbl[89 + tbloffset])); + real2 v567 = reverse(minus(v492, v493)); + real2 v573 = plus(v492, v493); + real2 v571 = minusplus(uminus(v567), v568); + real2 v569 = minusplus(v567, v568); + real2 v579 = ctimesminusplus(reverse(v569), tbl[102 + tbloffset], ctimes(v569, tbl[103 + tbloffset])); + real2 v585 = 
ctimesminusplus(reverse(v571), tbl[104 + tbloffset], ctimes(v571, tbl[105 + tbloffset])); + real2 v739 = plus(v585, v625); + real2 v733 = reverse(minus(v585, v625)); + real2 v707 = reverse(minus(v579, v619)); + real2 v713 = plus(v579, v619); + real2 v648 = minus(v573, v572); + real2 v652 = plus(v572, v573); + real2 v673 = plus(v652, v653); + real2 v667 = reverse(minus(v652, v653)); + real2 v651 = minusplus(uminus(v647), v648); + real2 v649 = minusplus(v647, v648); + real2 v659 = ctimesminusplus(reverse(v649), tbl[118 + tbloffset], ctimes(v649, tbl[119 + tbloffset])); + real2 v665 = ctimesminusplus(reverse(v651), tbl[120 + tbloffset], ctimes(v651, tbl[121 + tbloffset])); + real2 v780 = minus(v499, v419); + real2 v784 = plus(v419, v499); + real2 v781 = minusplus(v779, v780); + real2 v783 = minusplus(uminus(v779), v780); + real2 v805 = plus(v784, v785); + real2 v799 = reverse(minus(v784, v785)); + real2 v862 = plus(v425, v505); + real2 v858 = minus(v505, v425); + real2 v859 = minusplus(v857, v858); + real2 v861 = minusplus(uminus(v857), v858); + real2 v875 = ctimesminusplus(reverse(v861), tbl[152 + tbloffset], ctimes(v861, tbl[153 + tbloffset])); + real2 v791 = ctimesminusplus(reverse(v781), tbl[138 + tbloffset], ctimes(v781, tbl[139 + tbloffset])); + real2 v797 = ctimesminusplus(reverse(v783), tbl[140 + tbloffset], ctimes(v783, tbl[141 + tbloffset])); + real2 v883 = plus(v862, v863); + real2 v877 = reverse(minus(v862, v863)); + real2 v869 = ctimesminusplus(reverse(v859), tbl[150 + tbloffset], ctimes(v859, tbl[151 + tbloffset])); + real2 v36 = load(in, 34 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v108 = minus(v36, v4); + real2 v112 = plus(v4, v36); + real2 v52 = load(in, 50 << inShift); + real2 v20 = load(in, 18 << inShift); + real2 v113 = plus(v20, v52); + real2 v107 = reverse(minus(v20, v52)); + real2 v428 = minus(v113, v112); + real2 v432 = plus(v112, v113); + real2 v12 = load(in, 10 << inShift); + real2 v44 = load(in, 42 << inShift); + real2 v268 = 
minus(v44, v12); + real2 v272 = plus(v12, v44); + real2 v28 = load(in, 26 << inShift); + real2 v60 = load(in, 58 << inShift); + real2 v267 = reverse(minus(v28, v60)); + real2 v273 = plus(v28, v60); + real2 v427 = reverse(minus(v272, v273)); + real2 v433 = plus(v272, v273); + real2 v431 = minusplus(uminus(v427), v428); + real2 v429 = minusplus(v427, v428); + real2 v439 = ctimesminusplus(reverse(v429), tbl[74 + tbloffset], ctimes(v429, tbl[75 + tbloffset])); + real2 v588 = minus(v433, v432); + real2 v592 = plus(v432, v433); + real2 v40 = load(in, 38 << inShift); + real2 v8 = load(in, 6 << inShift); + real2 v188 = minus(v40, v8); + real2 v192 = plus(v8, v40); + real2 v24 = load(in, 22 << inShift); + real2 v56 = load(in, 54 << inShift); + real2 v187 = reverse(minus(v24, v56)); + real2 v193 = plus(v24, v56); + real2 v512 = plus(v192, v193); + real2 v508 = minus(v193, v192); + real2 v32 = load(in, 30 << inShift); + real2 v64 = load(in, 62 << inShift); + real2 v347 = reverse(minus(v32, v64)); + real2 v353 = plus(v32, v64); + real2 v48 = load(in, 46 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v348 = minus(v48, v16); + real2 v352 = plus(v16, v48); + real2 v513 = plus(v352, v353); + real2 v507 = reverse(minus(v352, v353)); + real2 v587 = reverse(minus(v512, v513)); + real2 v593 = plus(v512, v513); + real2 v633 = plus(v592, v593); + real2 v627 = reverse(minus(v592, v593)); + real2 v591 = minusplus(uminus(v587), v588); + real2 v589 = minusplus(v587, v588); + real2 v605 = ctimesminusplus(reverse(v591), tbl[108 + tbloffset], ctimes(v591, tbl[109 + tbloffset])); + real2 v599 = ctimesminusplus(reverse(v589), tbl[106 + tbloffset], ctimes(v589, tbl[107 + tbloffset])); + real2 v46 = load(in, 44 << inShift); + real2 v14 = load(in, 12 << inShift); + real2 v312 = plus(v14, v46); + real2 v308 = minus(v46, v14); + real2 v62 = load(in, 60 << inShift); + real2 v30 = load(in, 28 << inShift); + real2 v313 = plus(v30, v62); + real2 v307 = reverse(minus(v30, v62)); + real2 v467 
= reverse(minus(v312, v313)); + real2 v473 = plus(v312, v313); + real2 v22 = load(in, 20 << inShift); + real2 v54 = load(in, 52 << inShift); + real2 v147 = reverse(minus(v22, v54)); + real2 v153 = plus(v22, v54); + real2 v6 = load(in, 4 << inShift); + real2 v38 = load(in, 36 << inShift); + real2 v148 = minus(v38, v6); + real2 v152 = plus(v6, v38); + real2 v472 = plus(v152, v153); + real2 v468 = minus(v153, v152); + real2 v547 = reverse(minus(v472, v473)); + real2 v553 = plus(v472, v473); + real2 v10 = load(in, 8 << inShift); + real2 v42 = load(in, 40 << inShift); + real2 v232 = plus(v10, v42); + real2 v228 = minus(v42, v10); + real2 v58 = load(in, 56 << inShift); + real2 v26 = load(in, 24 << inShift); + real2 v233 = plus(v26, v58); + real2 v227 = reverse(minus(v26, v58)); + real2 v393 = plus(v232, v233); + real2 v387 = reverse(minus(v232, v233)); + real2 v2 = load(in, 0 << inShift); + real2 v34 = load(in, 32 << inShift); + real2 v72 = plus(v2, v34); + real2 v68 = minus(v34, v2); + real2 v18 = load(in, 16 << inShift); + real2 v50 = load(in, 48 << inShift); + real2 v73 = plus(v18, v50); + real2 v67 = reverse(minus(v18, v50)); + real2 v388 = minus(v73, v72); + real2 v392 = plus(v72, v73); + real2 v548 = minus(v393, v392); + real2 v552 = plus(v392, v393); + real2 v628 = minus(v553, v552); + real2 v632 = plus(v552, v553); + real2 v672 = plus(v632, v633); + real2 v668 = minus(v633, v632); + store(out, 0 << %SHIFT%, plus(v672, v673)); + real2 v686 = minus(v672, v673); + store(out, 32 << %SHIFT%, ctimesminusplus(v686, tbl[0 + tbloffset], ctimes(reverse(v686), tbl[1 + tbloffset]))); + real2 v669 = minusplus(v667, v668); + real2 v671 = minusplus(uminus(v667), v668); + store(out, 48 << %SHIFT%, ctimesminusplus(reverse(v671), tbl[124 + tbloffset], ctimes(v671, tbl[125 + tbloffset]))); + store(out, 16 << %SHIFT%, ctimesminusplus(reverse(v669), tbl[122 + tbloffset], ctimes(v669, tbl[123 + tbloffset]))); + real2 v631 = minusplus(uminus(v627), v628); + real2 v629 = minusplus(v627, 
v628); + real2 v639 = ctimesminusplus(reverse(v629), tbl[114 + tbloffset], ctimes(v629, tbl[115 + tbloffset])); + store(out, 8 << %SHIFT%, plus(v639, v659)); + real2 v694 = minus(v639, v659); + store(out, 40 << %SHIFT%, ctimesminusplus(v694, tbl[0 + tbloffset], ctimes(reverse(v694), tbl[1 + tbloffset]))); + real2 v645 = ctimesminusplus(reverse(v631), tbl[116 + tbloffset], ctimes(v631, tbl[117 + tbloffset])); + store(out, 24 << %SHIFT%, plus(v645, v665)); + real2 v700 = minus(v645, v665); + store(out, 56 << %SHIFT%, ctimesminusplus(v700, tbl[0 + tbloffset], ctimes(reverse(v700), tbl[1 + tbloffset]))); + real2 v549 = minusplus(v547, v548); + real2 v551 = minusplus(uminus(v547), v548); + real2 v559 = ctimesminusplus(reverse(v549), tbl[98 + tbloffset], ctimes(v549, tbl[99 + tbloffset])); + real2 v708 = minus(v599, v559); + real2 v712 = plus(v559, v599); + store(out, 4 << %SHIFT%, plus(v712, v713)); + real2 v726 = minus(v712, v713); + store(out, 36 << %SHIFT%, ctimesminusplus(v726, tbl[0 + tbloffset], ctimes(reverse(v726), tbl[1 + tbloffset]))); + real2 v711 = minusplus(uminus(v707), v708); + real2 v709 = minusplus(v707, v708); + store(out, 20 << %SHIFT%, ctimesminusplus(reverse(v709), tbl[126 + tbloffset], ctimes(v709, tbl[127 + tbloffset]))); + store(out, 52 << %SHIFT%, ctimesminusplus(reverse(v711), tbl[128 + tbloffset], ctimes(v711, tbl[129 + tbloffset]))); + real2 v565 = ctimesminusplus(reverse(v551), tbl[100 + tbloffset], ctimes(v551, tbl[101 + tbloffset])); + real2 v738 = plus(v565, v605); + real2 v734 = minus(v605, v565); + store(out, 12 << %SHIFT%, plus(v738, v739)); + real2 v752 = minus(v738, v739); + store(out, 44 << %SHIFT%, ctimesminusplus(v752, tbl[0 + tbloffset], ctimes(reverse(v752), tbl[1 + tbloffset]))); + real2 v737 = minusplus(uminus(v733), v734); + store(out, 60 << %SHIFT%, ctimesminusplus(reverse(v737), tbl[132 + tbloffset], ctimes(v737, tbl[133 + tbloffset]))); + real2 v735 = minusplus(v733, v734); + store(out, 28 << %SHIFT%, 
ctimesminusplus(reverse(v735), tbl[130 + tbloffset], ctimes(v735, tbl[131 + tbloffset]))); + real2 v471 = minusplus(uminus(v467), v468); + real2 v469 = minusplus(v467, v468); + real2 v479 = ctimesminusplus(reverse(v469), tbl[82 + tbloffset], ctimes(v469, tbl[83 + tbloffset])); + real2 v511 = minusplus(uminus(v507), v508); + real2 v509 = minusplus(v507, v508); + real2 v519 = ctimesminusplus(reverse(v509), tbl[90 + tbloffset], ctimes(v509, tbl[91 + tbloffset])); + real2 v765 = plus(v439, v519); + real2 v759 = reverse(minus(v439, v519)); + real2 v389 = minusplus(v387, v388); + real2 v391 = minusplus(uminus(v387), v388); + real2 v399 = ctimesminusplus(reverse(v389), tbl[66 + tbloffset], ctimes(v389, tbl[67 + tbloffset])); + real2 v764 = plus(v399, v479); + real2 v760 = minus(v479, v399); + real2 v804 = plus(v764, v765); + real2 v800 = minus(v765, v764); + store(out, 2 << %SHIFT%, plus(v804, v805)); + real2 v818 = minus(v804, v805); + store(out, 34 << %SHIFT%, ctimesminusplus(v818, tbl[0 + tbloffset], ctimes(reverse(v818), tbl[1 + tbloffset]))); + real2 v803 = minusplus(uminus(v799), v800); + store(out, 50 << %SHIFT%, ctimesminusplus(reverse(v803), tbl[144 + tbloffset], ctimes(v803, tbl[145 + tbloffset]))); + real2 v801 = minusplus(v799, v800); + store(out, 18 << %SHIFT%, ctimesminusplus(reverse(v801), tbl[142 + tbloffset], ctimes(v801, tbl[143 + tbloffset]))); + real2 v763 = minusplus(uminus(v759), v760); + real2 v761 = minusplus(v759, v760); + real2 v777 = ctimesminusplus(reverse(v763), tbl[136 + tbloffset], ctimes(v763, tbl[137 + tbloffset])); + store(out, 26 << %SHIFT%, plus(v777, v797)); + real2 v830 = minus(v777, v797); + store(out, 58 << %SHIFT%, ctimesminusplus(v830, tbl[0 + tbloffset], ctimes(reverse(v830), tbl[1 + tbloffset]))); + real2 v771 = ctimesminusplus(reverse(v761), tbl[134 + tbloffset], ctimes(v761, tbl[135 + tbloffset])); + store(out, 10 << %SHIFT%, plus(v771, v791)); + real2 v824 = minus(v771, v791); + store(out, 42 << %SHIFT%, ctimesminusplus(v824, 
tbl[0 + tbloffset], ctimes(reverse(v824), tbl[1 + tbloffset]))); + real2 v445 = ctimesminusplus(reverse(v431), tbl[76 + tbloffset], ctimes(v431, tbl[77 + tbloffset])); + real2 v525 = ctimesminusplus(reverse(v511), tbl[92 + tbloffset], ctimes(v511, tbl[93 + tbloffset])); + real2 v837 = reverse(minus(v445, v525)); + real2 v843 = plus(v445, v525); + real2 v485 = ctimesminusplus(reverse(v471), tbl[84 + tbloffset], ctimes(v471, tbl[85 + tbloffset])); + real2 v405 = ctimesminusplus(reverse(v391), tbl[68 + tbloffset], ctimes(v391, tbl[69 + tbloffset])); + real2 v838 = minus(v485, v405); + real2 v842 = plus(v405, v485); + real2 v878 = minus(v843, v842); + real2 v882 = plus(v842, v843); + store(out, 6 << %SHIFT%, plus(v882, v883)); + real2 v896 = minus(v882, v883); + store(out, 38 << %SHIFT%, ctimesminusplus(v896, tbl[0 + tbloffset], ctimes(reverse(v896), tbl[1 + tbloffset]))); + real2 v881 = minusplus(uminus(v877), v878); + store(out, 54 << %SHIFT%, ctimesminusplus(reverse(v881), tbl[156 + tbloffset], ctimes(v881, tbl[157 + tbloffset]))); + real2 v879 = minusplus(v877, v878); + store(out, 22 << %SHIFT%, ctimesminusplus(reverse(v879), tbl[154 + tbloffset], ctimes(v879, tbl[155 + tbloffset]))); + real2 v841 = minusplus(uminus(v837), v838); + real2 v839 = minusplus(v837, v838); + real2 v855 = ctimesminusplus(reverse(v841), tbl[148 + tbloffset], ctimes(v841, tbl[149 + tbloffset])); + store(out, 30 << %SHIFT%, plus(v855, v875)); + real2 v908 = minus(v855, v875); + store(out, 62 << %SHIFT%, ctimesminusplus(v908, tbl[0 + tbloffset], ctimes(reverse(v908), tbl[1 + tbloffset]))); + real2 v849 = ctimesminusplus(reverse(v839), tbl[146 + tbloffset], ctimes(v839, tbl[147 + tbloffset])); + store(out, 14 << %SHIFT%, plus(v849, v869)); + real2 v902 = minus(v849, v869); + store(out, 46 << %SHIFT%, ctimesminusplus(v902, tbl[0 + tbloffset], ctimes(reverse(v902), tbl[1 + tbloffset]))); + real2 v151 = minusplus(uminus(v147), v148); + real2 v149 = minusplus(v147, v148); + real2 v311 = 
minusplus(uminus(v307), v308); + real2 v309 = minusplus(v307, v308); + real2 v109 = minusplus(v107, v108); + real2 v111 = minusplus(uminus(v107), v108); + real2 v119 = ctimesminusplus(reverse(v109), tbl[10 + tbloffset], ctimes(v109, tbl[11 + tbloffset])); + real2 v269 = minusplus(v267, v268); + real2 v271 = minusplus(uminus(v267), v268); + real2 v279 = ctimesminusplus(reverse(v269), tbl[42 + tbloffset], ctimes(v269, tbl[43 + tbloffset])); + real2 v960 = plus(v119, v279); + real2 v956 = minus(v279, v119); + real2 v169 = minusplus(v167, v168); + real2 v171 = minusplus(uminus(v167), v168); + real2 v159 = ctimesminusplus(reverse(v149), tbl[18 + tbloffset], ctimes(v149, tbl[19 + tbloffset])); + real2 v319 = ctimesminusplus(reverse(v309), tbl[50 + tbloffset], ctimes(v309, tbl[51 + tbloffset])); + real2 v921 = plus(v159, v319); + real2 v915 = reverse(minus(v159, v319)); + real2 v351 = minusplus(uminus(v347), v348); + real2 v349 = minusplus(v347, v348); + real2 v359 = ctimesminusplus(reverse(v349), tbl[58 + tbloffset], ctimes(v349, tbl[59 + tbloffset])); + real2 v191 = minusplus(uminus(v187), v188); + real2 v189 = minusplus(v187, v188); + real2 v199 = ctimesminusplus(reverse(v189), tbl[26 + tbloffset], ctimes(v189, tbl[27 + tbloffset])); + real2 v961 = plus(v199, v359); + real2 v955 = reverse(minus(v199, v359)); + real2 v995 = reverse(minus(v960, v961)); + real2 v1001 = plus(v960, v961); + real2 v179 = ctimesminusplus(reverse(v169), tbl[22 + tbloffset], ctimes(v169, tbl[23 + tbloffset])); + real2 v941 = plus(v179, v339); + real2 v935 = reverse(minus(v179, v339)); + real2 v1016 = minus(v941, v940); + real2 v1020 = plus(v940, v941); + real2 v71 = minusplus(uminus(v67), v68); + real2 v69 = minusplus(v67, v68); + real2 v79 = ctimesminusplus(reverse(v69), tbl[2 + tbloffset], ctimes(v69, tbl[3 + tbloffset])); + real2 v1041 = plus(v1020, v1021); + real2 v1035 = reverse(minus(v1020, v1021)); + real2 v229 = minusplus(v227, v228); + real2 v231 = minusplus(uminus(v227), v228); + 
real2 v239 = ctimesminusplus(reverse(v229), tbl[34 + tbloffset], ctimes(v229, tbl[35 + tbloffset])); + real2 v920 = plus(v79, v239); + real2 v916 = minus(v239, v79); + real2 v996 = minus(v921, v920); + real2 v1000 = plus(v920, v921); + real2 v1040 = plus(v1000, v1001); + real2 v1036 = minus(v1001, v1000); + store(out, 1 << %SHIFT%, plus(v1040, v1041)); + real2 v1054 = minus(v1040, v1041); + store(out, 33 << %SHIFT%, ctimesminusplus(v1054, tbl[0 + tbloffset], ctimes(reverse(v1054), tbl[1 + tbloffset]))); + real2 v1037 = minusplus(v1035, v1036); + real2 v1039 = minusplus(uminus(v1035), v1036); + store(out, 49 << %SHIFT%, ctimesminusplus(reverse(v1039), tbl[184 + tbloffset], ctimes(v1039, tbl[185 + tbloffset]))); + store(out, 17 << %SHIFT%, ctimesminusplus(reverse(v1037), tbl[182 + tbloffset], ctimes(v1037, tbl[183 + tbloffset]))); + real2 v1017 = minusplus(v1015, v1016); + real2 v1019 = minusplus(uminus(v1015), v1016); + real2 v1033 = ctimesminusplus(reverse(v1019), tbl[180 + tbloffset], ctimes(v1019, tbl[181 + tbloffset])); + real2 v997 = minusplus(v995, v996); + real2 v999 = minusplus(uminus(v995), v996); + real2 v1013 = ctimesminusplus(reverse(v999), tbl[176 + tbloffset], ctimes(v999, tbl[177 + tbloffset])); + store(out, 25 << %SHIFT%, plus(v1013, v1033)); + real2 v1066 = minus(v1013, v1033); + store(out, 57 << %SHIFT%, ctimesminusplus(v1066, tbl[0 + tbloffset], ctimes(reverse(v1066), tbl[1 + tbloffset]))); + real2 v1027 = ctimesminusplus(reverse(v1017), tbl[178 + tbloffset], ctimes(v1017, tbl[179 + tbloffset])); + real2 v1007 = ctimesminusplus(reverse(v997), tbl[174 + tbloffset], ctimes(v997, tbl[175 + tbloffset])); + store(out, 9 << %SHIFT%, plus(v1007, v1027)); + real2 v1060 = minus(v1007, v1027); + store(out, 41 << %SHIFT%, ctimesminusplus(v1060, tbl[0 + tbloffset], ctimes(reverse(v1060), tbl[1 + tbloffset]))); + real2 v937 = minusplus(v935, v936); + real2 v939 = minusplus(uminus(v935), v936); + real2 v959 = minusplus(uminus(v955), v956); + real2 v957 = 
minusplus(v955, v956); + real2 v967 = ctimesminusplus(reverse(v957), tbl[166 + tbloffset], ctimes(v957, tbl[167 + tbloffset])); + real2 v947 = ctimesminusplus(reverse(v937), tbl[162 + tbloffset], ctimes(v937, tbl[163 + tbloffset])); + real2 v919 = minusplus(uminus(v915), v916); + real2 v917 = minusplus(v915, v916); + real2 v1079 = plus(v947, v987); + real2 v1073 = reverse(minus(v947, v987)); + real2 v927 = ctimesminusplus(reverse(v917), tbl[158 + tbloffset], ctimes(v917, tbl[159 + tbloffset])); + real2 v1074 = minus(v967, v927); + real2 v1078 = plus(v927, v967); + store(out, 5 << %SHIFT%, plus(v1078, v1079)); + real2 v1092 = minus(v1078, v1079); + store(out, 37 << %SHIFT%, ctimesminusplus(v1092, tbl[0 + tbloffset], ctimes(reverse(v1092), tbl[1 + tbloffset]))); + real2 v1075 = minusplus(v1073, v1074); + store(out, 21 << %SHIFT%, ctimesminusplus(reverse(v1075), tbl[186 + tbloffset], ctimes(v1075, tbl[187 + tbloffset]))); + real2 v1077 = minusplus(uminus(v1073), v1074); + store(out, 53 << %SHIFT%, ctimesminusplus(reverse(v1077), tbl[188 + tbloffset], ctimes(v1077, tbl[189 + tbloffset]))); + real2 v953 = ctimesminusplus(reverse(v939), tbl[164 + tbloffset], ctimes(v939, tbl[165 + tbloffset])); + real2 v1099 = reverse(minus(v953, v993)); + real2 v1105 = plus(v953, v993); + real2 v973 = ctimesminusplus(reverse(v959), tbl[168 + tbloffset], ctimes(v959, tbl[169 + tbloffset])); + real2 v933 = ctimesminusplus(reverse(v919), tbl[160 + tbloffset], ctimes(v919, tbl[161 + tbloffset])); + real2 v1104 = plus(v933, v973); + real2 v1100 = minus(v973, v933); + store(out, 13 << %SHIFT%, plus(v1104, v1105)); + real2 v1118 = minus(v1104, v1105); + store(out, 45 << %SHIFT%, ctimesminusplus(v1118, tbl[0 + tbloffset], ctimes(reverse(v1118), tbl[1 + tbloffset]))); + real2 v1101 = minusplus(v1099, v1100); + store(out, 29 << %SHIFT%, ctimesminusplus(reverse(v1101), tbl[190 + tbloffset], ctimes(v1101, tbl[191 + tbloffset]))); + real2 v1103 = minusplus(uminus(v1099), v1100); + store(out, 61 << 
%SHIFT%, ctimesminusplus(reverse(v1103), tbl[192 + tbloffset], ctimes(v1103, tbl[193 + tbloffset]))); + real2 v345 = ctimesminusplus(reverse(v331), tbl[56 + tbloffset], ctimes(v331, tbl[57 + tbloffset])); + real2 v325 = ctimesminusplus(reverse(v311), tbl[52 + tbloffset], ctimes(v311, tbl[53 + tbloffset])); + real2 v265 = ctimesminusplus(reverse(v251), tbl[40 + tbloffset], ctimes(v251, tbl[41 + tbloffset])); + real2 v185 = ctimesminusplus(reverse(v171), tbl[24 + tbloffset], ctimes(v171, tbl[25 + tbloffset])); + real2 v165 = ctimesminusplus(reverse(v151), tbl[20 + tbloffset], ctimes(v151, tbl[21 + tbloffset])); + real2 v1131 = plus(v165, v325); + real2 v1125 = reverse(minus(v165, v325)); + real2 v1151 = plus(v185, v345); + real2 v1145 = reverse(minus(v185, v345)); + real2 v105 = ctimesminusplus(reverse(v91), tbl[8 + tbloffset], ctimes(v91, tbl[9 + tbloffset])); + real2 v1150 = plus(v105, v265); + real2 v1146 = minus(v265, v105); + real2 v1226 = minus(v1151, v1150); + real2 v1230 = plus(v1150, v1151); + real2 v1231 = plus(v1190, v1191); + real2 v1225 = reverse(minus(v1190, v1191)); + real2 v1245 = reverse(minus(v1230, v1231)); + real2 v1251 = plus(v1230, v1231); + real2 v365 = ctimesminusplus(reverse(v351), tbl[60 + tbloffset], ctimes(v351, tbl[61 + tbloffset])); + real2 v285 = ctimesminusplus(reverse(v271), tbl[44 + tbloffset], ctimes(v271, tbl[45 + tbloffset])); + real2 v205 = ctimesminusplus(reverse(v191), tbl[28 + tbloffset], ctimes(v191, tbl[29 + tbloffset])); + real2 v1171 = plus(v205, v365); + real2 v1165 = reverse(minus(v205, v365)); + real2 v125 = ctimesminusplus(reverse(v111), tbl[12 + tbloffset], ctimes(v111, tbl[13 + tbloffset])); + real2 v85 = ctimesminusplus(reverse(v71), tbl[4 + tbloffset], ctimes(v71, tbl[5 + tbloffset])); + real2 v245 = ctimesminusplus(reverse(v231), tbl[36 + tbloffset], ctimes(v231, tbl[37 + tbloffset])); + real2 v1126 = minus(v245, v85); + real2 v1130 = plus(v85, v245); + real2 v1210 = plus(v1130, v1131); + real2 v1206 = 
minus(v1131, v1130); + real2 v1166 = minus(v285, v125); + real2 v1170 = plus(v125, v285); + real2 v1211 = plus(v1170, v1171); + real2 v1205 = reverse(minus(v1170, v1171)); + real2 v1246 = minus(v1211, v1210); + real2 v1250 = plus(v1210, v1211); + store(out, 3 << %SHIFT%, plus(v1250, v1251)); + real2 v1264 = minus(v1250, v1251); + store(out, 35 << %SHIFT%, ctimesminusplus(v1264, tbl[0 + tbloffset], ctimes(reverse(v1264), tbl[1 + tbloffset]))); + real2 v1247 = minusplus(v1245, v1246); + real2 v1249 = minusplus(uminus(v1245), v1246); + store(out, 19 << %SHIFT%, ctimesminusplus(reverse(v1247), tbl[218 + tbloffset], ctimes(v1247, tbl[219 + tbloffset]))); + store(out, 51 << %SHIFT%, ctimesminusplus(reverse(v1249), tbl[220 + tbloffset], ctimes(v1249, tbl[221 + tbloffset]))); + real2 v1229 = minusplus(uminus(v1225), v1226); + real2 v1227 = minusplus(v1225, v1226); + real2 v1207 = minusplus(v1205, v1206); + real2 v1209 = minusplus(uminus(v1205), v1206); + real2 v1237 = ctimesminusplus(reverse(v1227), tbl[214 + tbloffset], ctimes(v1227, tbl[215 + tbloffset])); + real2 v1217 = ctimesminusplus(reverse(v1207), tbl[210 + tbloffset], ctimes(v1207, tbl[211 + tbloffset])); + store(out, 11 << %SHIFT%, plus(v1217, v1237)); + real2 v1270 = minus(v1217, v1237); + store(out, 43 << %SHIFT%, ctimesminusplus(v1270, tbl[0 + tbloffset], ctimes(reverse(v1270), tbl[1 + tbloffset]))); + real2 v1223 = ctimesminusplus(reverse(v1209), tbl[212 + tbloffset], ctimes(v1209, tbl[213 + tbloffset])); + real2 v1243 = ctimesminusplus(reverse(v1229), tbl[216 + tbloffset], ctimes(v1229, tbl[217 + tbloffset])); + store(out, 27 << %SHIFT%, plus(v1223, v1243)); + real2 v1276 = minus(v1223, v1243); + store(out, 59 << %SHIFT%, ctimesminusplus(v1276, tbl[0 + tbloffset], ctimes(reverse(v1276), tbl[1 + tbloffset]))); + real2 v1189 = minusplus(uminus(v1185), v1186); + real2 v1187 = minusplus(v1185, v1186); + real2 v1129 = minusplus(uminus(v1125), v1126); + real2 v1127 = minusplus(v1125, v1126); + real2 v1147 = 
minusplus(v1145, v1146); + real2 v1149 = minusplus(uminus(v1145), v1146); + real2 v1167 = minusplus(v1165, v1166); + real2 v1169 = minusplus(uminus(v1165), v1166); + real2 v1143 = ctimesminusplus(reverse(v1129), tbl[196 + tbloffset], ctimes(v1129, tbl[197 + tbloffset])); + real2 v1163 = ctimesminusplus(reverse(v1149), tbl[200 + tbloffset], ctimes(v1149, tbl[201 + tbloffset])); + real2 v1203 = ctimesminusplus(reverse(v1189), tbl[208 + tbloffset], ctimes(v1189, tbl[209 + tbloffset])); + real2 v1315 = plus(v1163, v1203); + real2 v1309 = reverse(minus(v1163, v1203)); + real2 v1183 = ctimesminusplus(reverse(v1169), tbl[204 + tbloffset], ctimes(v1169, tbl[205 + tbloffset])); + real2 v1314 = plus(v1143, v1183); + real2 v1310 = minus(v1183, v1143); + store(out, 15 << %SHIFT%, plus(v1314, v1315)); + real2 v1328 = minus(v1314, v1315); + store(out, 47 << %SHIFT%, ctimesminusplus(v1328, tbl[0 + tbloffset], ctimes(reverse(v1328), tbl[1 + tbloffset]))); + real2 v1311 = minusplus(v1309, v1310); + store(out, 31 << %SHIFT%, ctimesminusplus(reverse(v1311), tbl[226 + tbloffset], ctimes(v1311, tbl[227 + tbloffset]))); + real2 v1313 = minusplus(uminus(v1309), v1310); + store(out, 63 << %SHIFT%, ctimesminusplus(reverse(v1313), tbl[228 + tbloffset], ctimes(v1313, tbl[229 + tbloffset]))); + real2 v1177 = ctimesminusplus(reverse(v1167), tbl[202 + tbloffset], ctimes(v1167, tbl[203 + tbloffset])); + real2 v1137 = ctimesminusplus(reverse(v1127), tbl[194 + tbloffset], ctimes(v1127, tbl[195 + tbloffset])); + real2 v1197 = ctimesminusplus(reverse(v1187), tbl[206 + tbloffset], ctimes(v1187, tbl[207 + tbloffset])); + real2 v1157 = ctimesminusplus(reverse(v1147), tbl[198 + tbloffset], ctimes(v1147, tbl[199 + tbloffset])); + real2 v1283 = reverse(minus(v1157, v1197)); + real2 v1289 = plus(v1157, v1197); + real2 v1288 = plus(v1137, v1177); + real2 v1284 = minus(v1177, v1137); + store(out, 7 << %SHIFT%, plus(v1288, v1289)); + real2 v1302 = minus(v1288, v1289); + store(out, 39 << %SHIFT%, 
ctimesminusplus(v1302, tbl[0 + tbloffset], ctimes(reverse(v1302), tbl[1 + tbloffset]))); + real2 v1285 = minusplus(v1283, v1284); + real2 v1287 = minusplus(uminus(v1283), v1284); + store(out, 55 << %SHIFT%, ctimesminusplus(reverse(v1287), tbl[224 + tbloffset], ctimes(v1287, tbl[225 + tbloffset]))); + store(out, 23 << %SHIFT%, ctimesminusplus(reverse(v1285), tbl[222 + tbloffset], ctimes(v1285, tbl[223 + tbloffset]))); + // Pres : 17339 + } +} + +ALIGNED(8192) void but64b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + // Pres : 30254 + real2 v37 = load(in, 35 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v132 = plus(v5, v37); + real2 v128 = minus(v37, v5); + real2 v21 = load(in, 19 << inShift); + real2 v53 = load(in, 51 << inShift); + real2 v133 = plus(v21, v53); + real2 v127 = reverse(minus(v53, v21)); + real2 v131 = minusplus(uminus(v127), v128); + real2 v129 = minusplus(v127, v128); + real2 v139 = ctimesminusplus(reverse(v129), tbl[14 + tbloffset], ctimes(v129, tbl[15 + tbloffset])); + real2 v145 = ctimesminusplus(reverse(v131), tbl[16 + tbloffset], ctimes(v131, tbl[17 + tbloffset])); + real2 v448 = minus(v133, v132); + real2 v452 = plus(v132, v133); + real2 v45 = load(in, 43 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v292 = plus(v13, v45); + real2 v288 = minus(v45, v13); + real2 v29 = load(in, 27 << inShift); + real2 v61 = load(in, 59 << inShift); + real2 v293 = plus(v29, v61); + real2 v287 = reverse(minus(v61, v29)); + real2 v291 = minusplus(uminus(v287), v288); + real2 v289 = minusplus(v287, v288); + real2 v299 = ctimesminusplus(reverse(v289), tbl[46 + tbloffset], ctimes(v289, tbl[47 + tbloffset])); 
+ real2 v453 = plus(v292, v293); + real2 v447 = reverse(minus(v293, v292)); + real2 v608 = minus(v453, v452); + real2 v612 = plus(v452, v453); + real2 v980 = plus(v139, v299); + real2 v976 = minus(v299, v139); + real2 v449 = minusplus(v447, v448); + real2 v451 = minusplus(uminus(v447), v448); + real2 v465 = ctimesminusplus(reverse(v451), tbl[80 + tbloffset], ctimes(v451, tbl[81 + tbloffset])); + real2 v305 = ctimesminusplus(reverse(v291), tbl[48 + tbloffset], ctimes(v291, tbl[49 + tbloffset])); + real2 v1186 = minus(v305, v145); + real2 v1190 = plus(v145, v305); + real2 v459 = ctimesminusplus(reverse(v449), tbl[78 + tbloffset], ctimes(v449, tbl[79 + tbloffset])); + real2 v25 = load(in, 23 << inShift); + real2 v57 = load(in, 55 << inShift); + real2 v207 = reverse(minus(v57, v25)); + real2 v213 = plus(v25, v57); + real2 v9 = load(in, 7 << inShift); + real2 v41 = load(in, 39 << inShift); + real2 v212 = plus(v9, v41); + real2 v208 = minus(v41, v9); + real2 v528 = minus(v213, v212); + real2 v532 = plus(v212, v213); + real2 v209 = minusplus(v207, v208); + real2 v211 = minusplus(uminus(v207), v208); + real2 v225 = ctimesminusplus(reverse(v211), tbl[32 + tbloffset], ctimes(v211, tbl[33 + tbloffset])); + real2 v219 = ctimesminusplus(reverse(v209), tbl[30 + tbloffset], ctimes(v209, tbl[31 + tbloffset])); + real2 v17 = load(in, 15 << inShift); + real2 v49 = load(in, 47 << inShift); + real2 v368 = minus(v49, v17); + real2 v372 = plus(v17, v49); + real2 v33 = load(in, 31 << inShift); + real2 v65 = load(in, 63 << inShift); + real2 v367 = reverse(minus(v65, v33)); + real2 v373 = plus(v33, v65); + real2 v369 = minusplus(v367, v368); + real2 v371 = minusplus(uminus(v367), v368); + real2 v533 = plus(v372, v373); + real2 v527 = reverse(minus(v373, v372)); + real2 v607 = reverse(minus(v533, v532)); + real2 v613 = plus(v532, v533); + real2 v529 = minusplus(v527, v528); + real2 v531 = minusplus(uminus(v527), v528); + real2 v545 = ctimesminusplus(reverse(v531), tbl[96 + tbloffset], 
ctimes(v531, tbl[97 + tbloffset])); + real2 v653 = plus(v612, v613); + real2 v647 = reverse(minus(v613, v612)); + real2 v609 = minusplus(v607, v608); + real2 v611 = minusplus(uminus(v607), v608); + real2 v863 = plus(v465, v545); + real2 v857 = reverse(minus(v545, v465)); + real2 v539 = ctimesminusplus(reverse(v529), tbl[94 + tbloffset], ctimes(v529, tbl[95 + tbloffset])); + real2 v385 = ctimesminusplus(reverse(v371), tbl[64 + tbloffset], ctimes(v371, tbl[65 + tbloffset])); + real2 v619 = ctimesminusplus(reverse(v609), tbl[110 + tbloffset], ctimes(v609, tbl[111 + tbloffset])); + real2 v1191 = plus(v225, v385); + real2 v1185 = reverse(minus(v385, v225)); + real2 v779 = reverse(minus(v539, v459)); + real2 v785 = plus(v459, v539); + real2 v625 = ctimesminusplus(reverse(v611), tbl[112 + tbloffset], ctimes(v611, tbl[113 + tbloffset])); + real2 v379 = ctimesminusplus(reverse(v369), tbl[62 + tbloffset], ctimes(v369, tbl[63 + tbloffset])); + real2 v975 = reverse(minus(v379, v219)); + real2 v981 = plus(v219, v379); + real2 v977 = minusplus(v975, v976); + real2 v979 = minusplus(uminus(v975), v976); + real2 v987 = ctimesminusplus(reverse(v977), tbl[170 + tbloffset], ctimes(v977, tbl[171 + tbloffset])); + real2 v993 = ctimesminusplus(reverse(v979), tbl[172 + tbloffset], ctimes(v979, tbl[173 + tbloffset])); + real2 v1015 = reverse(minus(v981, v980)); + real2 v1021 = plus(v980, v981); + real2 v11 = load(in, 9 << inShift); + real2 v43 = load(in, 41 << inShift); + real2 v248 = minus(v43, v11); + real2 v252 = plus(v11, v43); + real2 v59 = load(in, 57 << inShift); + real2 v27 = load(in, 25 << inShift); + real2 v253 = plus(v27, v59); + real2 v247 = reverse(minus(v59, v27)); + real2 v413 = plus(v252, v253); + real2 v407 = reverse(minus(v253, v252)); + real2 v249 = minusplus(v247, v248); + real2 v251 = minusplus(uminus(v247), v248); + real2 v259 = ctimesminusplus(reverse(v249), tbl[38 + tbloffset], ctimes(v249, tbl[39 + tbloffset])); + real2 v35 = load(in, 33 << inShift); + real2 v3 = 
load(in, 1 << inShift); + real2 v92 = plus(v3, v35); + real2 v88 = minus(v35, v3); + real2 v51 = load(in, 49 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v87 = reverse(minus(v51, v19)); + real2 v93 = plus(v19, v51); + real2 v412 = plus(v92, v93); + real2 v408 = minus(v93, v92); + real2 v411 = minusplus(uminus(v407), v408); + real2 v409 = minusplus(v407, v408); + real2 v91 = minusplus(uminus(v87), v88); + real2 v89 = minusplus(v87, v88); + real2 v99 = ctimesminusplus(reverse(v89), tbl[6 + tbloffset], ctimes(v89, tbl[7 + tbloffset])); + real2 v425 = ctimesminusplus(reverse(v411), tbl[72 + tbloffset], ctimes(v411, tbl[73 + tbloffset])); + real2 v568 = minus(v413, v412); + real2 v572 = plus(v412, v413); + real2 v940 = plus(v99, v259); + real2 v936 = minus(v259, v99); + real2 v419 = ctimesminusplus(reverse(v409), tbl[70 + tbloffset], ctimes(v409, tbl[71 + tbloffset])); + real2 v47 = load(in, 45 << inShift); + real2 v15 = load(in, 13 << inShift); + real2 v332 = plus(v15, v47); + real2 v328 = minus(v47, v15); + real2 v63 = load(in, 61 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v327 = reverse(minus(v63, v31)); + real2 v333 = plus(v31, v63); + real2 v329 = minusplus(v327, v328); + real2 v331 = minusplus(uminus(v327), v328); + real2 v339 = ctimesminusplus(reverse(v329), tbl[54 + tbloffset], ctimes(v329, tbl[55 + tbloffset])); + real2 v487 = reverse(minus(v333, v332)); + real2 v493 = plus(v332, v333); + real2 v7 = load(in, 5 << inShift); + real2 v39 = load(in, 37 << inShift); + real2 v172 = plus(v7, v39); + real2 v168 = minus(v39, v7); + real2 v55 = load(in, 53 << inShift); + real2 v23 = load(in, 21 << inShift); + real2 v173 = plus(v23, v55); + real2 v167 = reverse(minus(v55, v23)); + real2 v488 = minus(v173, v172); + real2 v492 = plus(v172, v173); + real2 v491 = minusplus(uminus(v487), v488); + real2 v489 = minusplus(v487, v488); + real2 v499 = ctimesminusplus(reverse(v489), tbl[86 + tbloffset], ctimes(v489, tbl[87 + tbloffset])); + real2 v505 
= ctimesminusplus(reverse(v491), tbl[88 + tbloffset], ctimes(v491, tbl[89 + tbloffset])); + real2 v567 = reverse(minus(v493, v492)); + real2 v573 = plus(v492, v493); + real2 v571 = minusplus(uminus(v567), v568); + real2 v569 = minusplus(v567, v568); + real2 v579 = ctimesminusplus(reverse(v569), tbl[102 + tbloffset], ctimes(v569, tbl[103 + tbloffset])); + real2 v585 = ctimesminusplus(reverse(v571), tbl[104 + tbloffset], ctimes(v571, tbl[105 + tbloffset])); + real2 v739 = plus(v585, v625); + real2 v733 = reverse(minus(v625, v585)); + real2 v707 = reverse(minus(v619, v579)); + real2 v713 = plus(v579, v619); + real2 v648 = minus(v573, v572); + real2 v652 = plus(v572, v573); + real2 v673 = plus(v652, v653); + real2 v667 = reverse(minus(v653, v652)); + real2 v651 = minusplus(uminus(v647), v648); + real2 v649 = minusplus(v647, v648); + real2 v659 = ctimesminusplus(reverse(v649), tbl[118 + tbloffset], ctimes(v649, tbl[119 + tbloffset])); + real2 v665 = ctimesminusplus(reverse(v651), tbl[120 + tbloffset], ctimes(v651, tbl[121 + tbloffset])); + real2 v780 = minus(v499, v419); + real2 v784 = plus(v419, v499); + real2 v781 = minusplus(v779, v780); + real2 v783 = minusplus(uminus(v779), v780); + real2 v805 = plus(v784, v785); + real2 v799 = reverse(minus(v785, v784)); + real2 v862 = plus(v425, v505); + real2 v858 = minus(v505, v425); + real2 v859 = minusplus(v857, v858); + real2 v861 = minusplus(uminus(v857), v858); + real2 v875 = ctimesminusplus(reverse(v861), tbl[152 + tbloffset], ctimes(v861, tbl[153 + tbloffset])); + real2 v791 = ctimesminusplus(reverse(v781), tbl[138 + tbloffset], ctimes(v781, tbl[139 + tbloffset])); + real2 v797 = ctimesminusplus(reverse(v783), tbl[140 + tbloffset], ctimes(v783, tbl[141 + tbloffset])); + real2 v883 = plus(v862, v863); + real2 v877 = reverse(minus(v863, v862)); + real2 v869 = ctimesminusplus(reverse(v859), tbl[150 + tbloffset], ctimes(v859, tbl[151 + tbloffset])); + real2 v36 = load(in, 34 << inShift); + real2 v4 = load(in, 2 << inShift); 
+ real2 v108 = minus(v36, v4); + real2 v112 = plus(v4, v36); + real2 v52 = load(in, 50 << inShift); + real2 v20 = load(in, 18 << inShift); + real2 v113 = plus(v20, v52); + real2 v107 = reverse(minus(v52, v20)); + real2 v428 = minus(v113, v112); + real2 v432 = plus(v112, v113); + real2 v12 = load(in, 10 << inShift); + real2 v44 = load(in, 42 << inShift); + real2 v268 = minus(v44, v12); + real2 v272 = plus(v12, v44); + real2 v28 = load(in, 26 << inShift); + real2 v60 = load(in, 58 << inShift); + real2 v267 = reverse(minus(v60, v28)); + real2 v273 = plus(v28, v60); + real2 v427 = reverse(minus(v273, v272)); + real2 v433 = plus(v272, v273); + real2 v431 = minusplus(uminus(v427), v428); + real2 v429 = minusplus(v427, v428); + real2 v439 = ctimesminusplus(reverse(v429), tbl[74 + tbloffset], ctimes(v429, tbl[75 + tbloffset])); + real2 v588 = minus(v433, v432); + real2 v592 = plus(v432, v433); + real2 v40 = load(in, 38 << inShift); + real2 v8 = load(in, 6 << inShift); + real2 v188 = minus(v40, v8); + real2 v192 = plus(v8, v40); + real2 v24 = load(in, 22 << inShift); + real2 v56 = load(in, 54 << inShift); + real2 v187 = reverse(minus(v56, v24)); + real2 v193 = plus(v24, v56); + real2 v512 = plus(v192, v193); + real2 v508 = minus(v193, v192); + real2 v32 = load(in, 30 << inShift); + real2 v64 = load(in, 62 << inShift); + real2 v347 = reverse(minus(v64, v32)); + real2 v353 = plus(v32, v64); + real2 v48 = load(in, 46 << inShift); + real2 v16 = load(in, 14 << inShift); + real2 v348 = minus(v48, v16); + real2 v352 = plus(v16, v48); + real2 v513 = plus(v352, v353); + real2 v507 = reverse(minus(v353, v352)); + real2 v587 = reverse(minus(v513, v512)); + real2 v593 = plus(v512, v513); + real2 v633 = plus(v592, v593); + real2 v627 = reverse(minus(v593, v592)); + real2 v591 = minusplus(uminus(v587), v588); + real2 v589 = minusplus(v587, v588); + real2 v605 = ctimesminusplus(reverse(v591), tbl[108 + tbloffset], ctimes(v591, tbl[109 + tbloffset])); + real2 v599 = 
ctimesminusplus(reverse(v589), tbl[106 + tbloffset], ctimes(v589, tbl[107 + tbloffset])); + real2 v46 = load(in, 44 << inShift); + real2 v14 = load(in, 12 << inShift); + real2 v312 = plus(v14, v46); + real2 v308 = minus(v46, v14); + real2 v62 = load(in, 60 << inShift); + real2 v30 = load(in, 28 << inShift); + real2 v313 = plus(v30, v62); + real2 v307 = reverse(minus(v62, v30)); + real2 v467 = reverse(minus(v313, v312)); + real2 v473 = plus(v312, v313); + real2 v22 = load(in, 20 << inShift); + real2 v54 = load(in, 52 << inShift); + real2 v147 = reverse(minus(v54, v22)); + real2 v153 = plus(v22, v54); + real2 v6 = load(in, 4 << inShift); + real2 v38 = load(in, 36 << inShift); + real2 v148 = minus(v38, v6); + real2 v152 = plus(v6, v38); + real2 v472 = plus(v152, v153); + real2 v468 = minus(v153, v152); + real2 v547 = reverse(minus(v473, v472)); + real2 v553 = plus(v472, v473); + real2 v10 = load(in, 8 << inShift); + real2 v42 = load(in, 40 << inShift); + real2 v232 = plus(v10, v42); + real2 v228 = minus(v42, v10); + real2 v58 = load(in, 56 << inShift); + real2 v26 = load(in, 24 << inShift); + real2 v233 = plus(v26, v58); + real2 v227 = reverse(minus(v58, v26)); + real2 v393 = plus(v232, v233); + real2 v387 = reverse(minus(v233, v232)); + real2 v2 = load(in, 0 << inShift); + real2 v34 = load(in, 32 << inShift); + real2 v72 = plus(v2, v34); + real2 v68 = minus(v34, v2); + real2 v18 = load(in, 16 << inShift); + real2 v50 = load(in, 48 << inShift); + real2 v73 = plus(v18, v50); + real2 v67 = reverse(minus(v50, v18)); + real2 v388 = minus(v73, v72); + real2 v392 = plus(v72, v73); + real2 v548 = minus(v393, v392); + real2 v552 = plus(v392, v393); + real2 v628 = minus(v553, v552); + real2 v632 = plus(v552, v553); + real2 v672 = plus(v632, v633); + real2 v668 = minus(v633, v632); + store(out, 0 << %SHIFT%, plus(v672, v673)); + real2 v686 = minus(v672, v673); + store(out, 32 << %SHIFT%, ctimesminusplus(v686, tbl[0 + tbloffset], ctimes(reverse(v686), tbl[1 + tbloffset]))); + 
real2 v669 = minusplus(v667, v668); + real2 v671 = minusplus(uminus(v667), v668); + store(out, 48 << %SHIFT%, ctimesminusplus(reverse(v671), tbl[124 + tbloffset], ctimes(v671, tbl[125 + tbloffset]))); + store(out, 16 << %SHIFT%, ctimesminusplus(reverse(v669), tbl[122 + tbloffset], ctimes(v669, tbl[123 + tbloffset]))); + real2 v631 = minusplus(uminus(v627), v628); + real2 v629 = minusplus(v627, v628); + real2 v639 = ctimesminusplus(reverse(v629), tbl[114 + tbloffset], ctimes(v629, tbl[115 + tbloffset])); + store(out, 8 << %SHIFT%, plus(v639, v659)); + real2 v694 = minus(v639, v659); + store(out, 40 << %SHIFT%, ctimesminusplus(v694, tbl[0 + tbloffset], ctimes(reverse(v694), tbl[1 + tbloffset]))); + real2 v645 = ctimesminusplus(reverse(v631), tbl[116 + tbloffset], ctimes(v631, tbl[117 + tbloffset])); + store(out, 24 << %SHIFT%, plus(v645, v665)); + real2 v700 = minus(v645, v665); + store(out, 56 << %SHIFT%, ctimesminusplus(v700, tbl[0 + tbloffset], ctimes(reverse(v700), tbl[1 + tbloffset]))); + real2 v549 = minusplus(v547, v548); + real2 v551 = minusplus(uminus(v547), v548); + real2 v559 = ctimesminusplus(reverse(v549), tbl[98 + tbloffset], ctimes(v549, tbl[99 + tbloffset])); + real2 v708 = minus(v599, v559); + real2 v712 = plus(v559, v599); + store(out, 4 << %SHIFT%, plus(v712, v713)); + real2 v726 = minus(v712, v713); + store(out, 36 << %SHIFT%, ctimesminusplus(v726, tbl[0 + tbloffset], ctimes(reverse(v726), tbl[1 + tbloffset]))); + real2 v711 = minusplus(uminus(v707), v708); + real2 v709 = minusplus(v707, v708); + store(out, 20 << %SHIFT%, ctimesminusplus(reverse(v709), tbl[126 + tbloffset], ctimes(v709, tbl[127 + tbloffset]))); + store(out, 52 << %SHIFT%, ctimesminusplus(reverse(v711), tbl[128 + tbloffset], ctimes(v711, tbl[129 + tbloffset]))); + real2 v565 = ctimesminusplus(reverse(v551), tbl[100 + tbloffset], ctimes(v551, tbl[101 + tbloffset])); + real2 v738 = plus(v565, v605); + real2 v734 = minus(v605, v565); + store(out, 12 << %SHIFT%, plus(v738, v739)); + 
real2 v752 = minus(v738, v739); + store(out, 44 << %SHIFT%, ctimesminusplus(v752, tbl[0 + tbloffset], ctimes(reverse(v752), tbl[1 + tbloffset]))); + real2 v737 = minusplus(uminus(v733), v734); + store(out, 60 << %SHIFT%, ctimesminusplus(reverse(v737), tbl[132 + tbloffset], ctimes(v737, tbl[133 + tbloffset]))); + real2 v735 = minusplus(v733, v734); + store(out, 28 << %SHIFT%, ctimesminusplus(reverse(v735), tbl[130 + tbloffset], ctimes(v735, tbl[131 + tbloffset]))); + real2 v471 = minusplus(uminus(v467), v468); + real2 v469 = minusplus(v467, v468); + real2 v479 = ctimesminusplus(reverse(v469), tbl[82 + tbloffset], ctimes(v469, tbl[83 + tbloffset])); + real2 v511 = minusplus(uminus(v507), v508); + real2 v509 = minusplus(v507, v508); + real2 v519 = ctimesminusplus(reverse(v509), tbl[90 + tbloffset], ctimes(v509, tbl[91 + tbloffset])); + real2 v765 = plus(v439, v519); + real2 v759 = reverse(minus(v519, v439)); + real2 v389 = minusplus(v387, v388); + real2 v391 = minusplus(uminus(v387), v388); + real2 v399 = ctimesminusplus(reverse(v389), tbl[66 + tbloffset], ctimes(v389, tbl[67 + tbloffset])); + real2 v764 = plus(v399, v479); + real2 v760 = minus(v479, v399); + real2 v804 = plus(v764, v765); + real2 v800 = minus(v765, v764); + store(out, 2 << %SHIFT%, plus(v804, v805)); + real2 v818 = minus(v804, v805); + store(out, 34 << %SHIFT%, ctimesminusplus(v818, tbl[0 + tbloffset], ctimes(reverse(v818), tbl[1 + tbloffset]))); + real2 v803 = minusplus(uminus(v799), v800); + store(out, 50 << %SHIFT%, ctimesminusplus(reverse(v803), tbl[144 + tbloffset], ctimes(v803, tbl[145 + tbloffset]))); + real2 v801 = minusplus(v799, v800); + store(out, 18 << %SHIFT%, ctimesminusplus(reverse(v801), tbl[142 + tbloffset], ctimes(v801, tbl[143 + tbloffset]))); + real2 v763 = minusplus(uminus(v759), v760); + real2 v761 = minusplus(v759, v760); + real2 v777 = ctimesminusplus(reverse(v763), tbl[136 + tbloffset], ctimes(v763, tbl[137 + tbloffset])); + store(out, 26 << %SHIFT%, plus(v777, v797)); + 
real2 v830 = minus(v777, v797); + store(out, 58 << %SHIFT%, ctimesminusplus(v830, tbl[0 + tbloffset], ctimes(reverse(v830), tbl[1 + tbloffset]))); + real2 v771 = ctimesminusplus(reverse(v761), tbl[134 + tbloffset], ctimes(v761, tbl[135 + tbloffset])); + store(out, 10 << %SHIFT%, plus(v771, v791)); + real2 v824 = minus(v771, v791); + store(out, 42 << %SHIFT%, ctimesminusplus(v824, tbl[0 + tbloffset], ctimes(reverse(v824), tbl[1 + tbloffset]))); + real2 v445 = ctimesminusplus(reverse(v431), tbl[76 + tbloffset], ctimes(v431, tbl[77 + tbloffset])); + real2 v525 = ctimesminusplus(reverse(v511), tbl[92 + tbloffset], ctimes(v511, tbl[93 + tbloffset])); + real2 v837 = reverse(minus(v525, v445)); + real2 v843 = plus(v445, v525); + real2 v485 = ctimesminusplus(reverse(v471), tbl[84 + tbloffset], ctimes(v471, tbl[85 + tbloffset])); + real2 v405 = ctimesminusplus(reverse(v391), tbl[68 + tbloffset], ctimes(v391, tbl[69 + tbloffset])); + real2 v838 = minus(v485, v405); + real2 v842 = plus(v405, v485); + real2 v878 = minus(v843, v842); + real2 v882 = plus(v842, v843); + store(out, 6 << %SHIFT%, plus(v882, v883)); + real2 v896 = minus(v882, v883); + store(out, 38 << %SHIFT%, ctimesminusplus(v896, tbl[0 + tbloffset], ctimes(reverse(v896), tbl[1 + tbloffset]))); + real2 v881 = minusplus(uminus(v877), v878); + store(out, 54 << %SHIFT%, ctimesminusplus(reverse(v881), tbl[156 + tbloffset], ctimes(v881, tbl[157 + tbloffset]))); + real2 v879 = minusplus(v877, v878); + store(out, 22 << %SHIFT%, ctimesminusplus(reverse(v879), tbl[154 + tbloffset], ctimes(v879, tbl[155 + tbloffset]))); + real2 v841 = minusplus(uminus(v837), v838); + real2 v839 = minusplus(v837, v838); + real2 v855 = ctimesminusplus(reverse(v841), tbl[148 + tbloffset], ctimes(v841, tbl[149 + tbloffset])); + store(out, 30 << %SHIFT%, plus(v855, v875)); + real2 v908 = minus(v855, v875); + store(out, 62 << %SHIFT%, ctimesminusplus(v908, tbl[0 + tbloffset], ctimes(reverse(v908), tbl[1 + tbloffset]))); + real2 v849 = 
ctimesminusplus(reverse(v839), tbl[146 + tbloffset], ctimes(v839, tbl[147 + tbloffset])); + store(out, 14 << %SHIFT%, plus(v849, v869)); + real2 v902 = minus(v849, v869); + store(out, 46 << %SHIFT%, ctimesminusplus(v902, tbl[0 + tbloffset], ctimes(reverse(v902), tbl[1 + tbloffset]))); + real2 v151 = minusplus(uminus(v147), v148); + real2 v149 = minusplus(v147, v148); + real2 v311 = minusplus(uminus(v307), v308); + real2 v309 = minusplus(v307, v308); + real2 v109 = minusplus(v107, v108); + real2 v111 = minusplus(uminus(v107), v108); + real2 v119 = ctimesminusplus(reverse(v109), tbl[10 + tbloffset], ctimes(v109, tbl[11 + tbloffset])); + real2 v269 = minusplus(v267, v268); + real2 v271 = minusplus(uminus(v267), v268); + real2 v279 = ctimesminusplus(reverse(v269), tbl[42 + tbloffset], ctimes(v269, tbl[43 + tbloffset])); + real2 v960 = plus(v119, v279); + real2 v956 = minus(v279, v119); + real2 v169 = minusplus(v167, v168); + real2 v171 = minusplus(uminus(v167), v168); + real2 v159 = ctimesminusplus(reverse(v149), tbl[18 + tbloffset], ctimes(v149, tbl[19 + tbloffset])); + real2 v319 = ctimesminusplus(reverse(v309), tbl[50 + tbloffset], ctimes(v309, tbl[51 + tbloffset])); + real2 v921 = plus(v159, v319); + real2 v915 = reverse(minus(v319, v159)); + real2 v351 = minusplus(uminus(v347), v348); + real2 v349 = minusplus(v347, v348); + real2 v359 = ctimesminusplus(reverse(v349), tbl[58 + tbloffset], ctimes(v349, tbl[59 + tbloffset])); + real2 v191 = minusplus(uminus(v187), v188); + real2 v189 = minusplus(v187, v188); + real2 v199 = ctimesminusplus(reverse(v189), tbl[26 + tbloffset], ctimes(v189, tbl[27 + tbloffset])); + real2 v961 = plus(v199, v359); + real2 v955 = reverse(minus(v359, v199)); + real2 v995 = reverse(minus(v961, v960)); + real2 v1001 = plus(v960, v961); + real2 v179 = ctimesminusplus(reverse(v169), tbl[22 + tbloffset], ctimes(v169, tbl[23 + tbloffset])); + real2 v941 = plus(v179, v339); + real2 v935 = reverse(minus(v339, v179)); + real2 v1016 = minus(v941, 
v940); + real2 v1020 = plus(v940, v941); + real2 v71 = minusplus(uminus(v67), v68); + real2 v69 = minusplus(v67, v68); + real2 v79 = ctimesminusplus(reverse(v69), tbl[2 + tbloffset], ctimes(v69, tbl[3 + tbloffset])); + real2 v1041 = plus(v1020, v1021); + real2 v1035 = reverse(minus(v1021, v1020)); + real2 v229 = minusplus(v227, v228); + real2 v231 = minusplus(uminus(v227), v228); + real2 v239 = ctimesminusplus(reverse(v229), tbl[34 + tbloffset], ctimes(v229, tbl[35 + tbloffset])); + real2 v920 = plus(v79, v239); + real2 v916 = minus(v239, v79); + real2 v996 = minus(v921, v920); + real2 v1000 = plus(v920, v921); + real2 v1040 = plus(v1000, v1001); + real2 v1036 = minus(v1001, v1000); + store(out, 1 << %SHIFT%, plus(v1040, v1041)); + real2 v1054 = minus(v1040, v1041); + store(out, 33 << %SHIFT%, ctimesminusplus(v1054, tbl[0 + tbloffset], ctimes(reverse(v1054), tbl[1 + tbloffset]))); + real2 v1037 = minusplus(v1035, v1036); + real2 v1039 = minusplus(uminus(v1035), v1036); + store(out, 49 << %SHIFT%, ctimesminusplus(reverse(v1039), tbl[184 + tbloffset], ctimes(v1039, tbl[185 + tbloffset]))); + store(out, 17 << %SHIFT%, ctimesminusplus(reverse(v1037), tbl[182 + tbloffset], ctimes(v1037, tbl[183 + tbloffset]))); + real2 v1017 = minusplus(v1015, v1016); + real2 v1019 = minusplus(uminus(v1015), v1016); + real2 v1033 = ctimesminusplus(reverse(v1019), tbl[180 + tbloffset], ctimes(v1019, tbl[181 + tbloffset])); + real2 v997 = minusplus(v995, v996); + real2 v999 = minusplus(uminus(v995), v996); + real2 v1013 = ctimesminusplus(reverse(v999), tbl[176 + tbloffset], ctimes(v999, tbl[177 + tbloffset])); + store(out, 25 << %SHIFT%, plus(v1013, v1033)); + real2 v1066 = minus(v1013, v1033); + store(out, 57 << %SHIFT%, ctimesminusplus(v1066, tbl[0 + tbloffset], ctimes(reverse(v1066), tbl[1 + tbloffset]))); + real2 v1027 = ctimesminusplus(reverse(v1017), tbl[178 + tbloffset], ctimes(v1017, tbl[179 + tbloffset])); + real2 v1007 = ctimesminusplus(reverse(v997), tbl[174 + tbloffset], 
ctimes(v997, tbl[175 + tbloffset])); + store(out, 9 << %SHIFT%, plus(v1007, v1027)); + real2 v1060 = minus(v1007, v1027); + store(out, 41 << %SHIFT%, ctimesminusplus(v1060, tbl[0 + tbloffset], ctimes(reverse(v1060), tbl[1 + tbloffset]))); + real2 v937 = minusplus(v935, v936); + real2 v939 = minusplus(uminus(v935), v936); + real2 v959 = minusplus(uminus(v955), v956); + real2 v957 = minusplus(v955, v956); + real2 v967 = ctimesminusplus(reverse(v957), tbl[166 + tbloffset], ctimes(v957, tbl[167 + tbloffset])); + real2 v947 = ctimesminusplus(reverse(v937), tbl[162 + tbloffset], ctimes(v937, tbl[163 + tbloffset])); + real2 v919 = minusplus(uminus(v915), v916); + real2 v917 = minusplus(v915, v916); + real2 v1079 = plus(v947, v987); + real2 v1073 = reverse(minus(v987, v947)); + real2 v927 = ctimesminusplus(reverse(v917), tbl[158 + tbloffset], ctimes(v917, tbl[159 + tbloffset])); + real2 v1074 = minus(v967, v927); + real2 v1078 = plus(v927, v967); + store(out, 5 << %SHIFT%, plus(v1078, v1079)); + real2 v1092 = minus(v1078, v1079); + store(out, 37 << %SHIFT%, ctimesminusplus(v1092, tbl[0 + tbloffset], ctimes(reverse(v1092), tbl[1 + tbloffset]))); + real2 v1075 = minusplus(v1073, v1074); + store(out, 21 << %SHIFT%, ctimesminusplus(reverse(v1075), tbl[186 + tbloffset], ctimes(v1075, tbl[187 + tbloffset]))); + real2 v1077 = minusplus(uminus(v1073), v1074); + store(out, 53 << %SHIFT%, ctimesminusplus(reverse(v1077), tbl[188 + tbloffset], ctimes(v1077, tbl[189 + tbloffset]))); + real2 v953 = ctimesminusplus(reverse(v939), tbl[164 + tbloffset], ctimes(v939, tbl[165 + tbloffset])); + real2 v1099 = reverse(minus(v993, v953)); + real2 v1105 = plus(v953, v993); + real2 v973 = ctimesminusplus(reverse(v959), tbl[168 + tbloffset], ctimes(v959, tbl[169 + tbloffset])); + real2 v933 = ctimesminusplus(reverse(v919), tbl[160 + tbloffset], ctimes(v919, tbl[161 + tbloffset])); + real2 v1104 = plus(v933, v973); + real2 v1100 = minus(v973, v933); + store(out, 13 << %SHIFT%, plus(v1104, v1105)); + 
real2 v1118 = minus(v1104, v1105); + store(out, 45 << %SHIFT%, ctimesminusplus(v1118, tbl[0 + tbloffset], ctimes(reverse(v1118), tbl[1 + tbloffset]))); + real2 v1101 = minusplus(v1099, v1100); + store(out, 29 << %SHIFT%, ctimesminusplus(reverse(v1101), tbl[190 + tbloffset], ctimes(v1101, tbl[191 + tbloffset]))); + real2 v1103 = minusplus(uminus(v1099), v1100); + store(out, 61 << %SHIFT%, ctimesminusplus(reverse(v1103), tbl[192 + tbloffset], ctimes(v1103, tbl[193 + tbloffset]))); + real2 v345 = ctimesminusplus(reverse(v331), tbl[56 + tbloffset], ctimes(v331, tbl[57 + tbloffset])); + real2 v325 = ctimesminusplus(reverse(v311), tbl[52 + tbloffset], ctimes(v311, tbl[53 + tbloffset])); + real2 v265 = ctimesminusplus(reverse(v251), tbl[40 + tbloffset], ctimes(v251, tbl[41 + tbloffset])); + real2 v185 = ctimesminusplus(reverse(v171), tbl[24 + tbloffset], ctimes(v171, tbl[25 + tbloffset])); + real2 v165 = ctimesminusplus(reverse(v151), tbl[20 + tbloffset], ctimes(v151, tbl[21 + tbloffset])); + real2 v1131 = plus(v165, v325); + real2 v1125 = reverse(minus(v325, v165)); + real2 v1151 = plus(v185, v345); + real2 v1145 = reverse(minus(v345, v185)); + real2 v105 = ctimesminusplus(reverse(v91), tbl[8 + tbloffset], ctimes(v91, tbl[9 + tbloffset])); + real2 v1150 = plus(v105, v265); + real2 v1146 = minus(v265, v105); + real2 v1226 = minus(v1151, v1150); + real2 v1230 = plus(v1150, v1151); + real2 v1231 = plus(v1190, v1191); + real2 v1225 = reverse(minus(v1191, v1190)); + real2 v1245 = reverse(minus(v1231, v1230)); + real2 v1251 = plus(v1230, v1231); + real2 v365 = ctimesminusplus(reverse(v351), tbl[60 + tbloffset], ctimes(v351, tbl[61 + tbloffset])); + real2 v285 = ctimesminusplus(reverse(v271), tbl[44 + tbloffset], ctimes(v271, tbl[45 + tbloffset])); + real2 v205 = ctimesminusplus(reverse(v191), tbl[28 + tbloffset], ctimes(v191, tbl[29 + tbloffset])); + real2 v1171 = plus(v205, v365); + real2 v1165 = reverse(minus(v365, v205)); + real2 v125 = ctimesminusplus(reverse(v111), tbl[12 
+ tbloffset], ctimes(v111, tbl[13 + tbloffset])); + real2 v85 = ctimesminusplus(reverse(v71), tbl[4 + tbloffset], ctimes(v71, tbl[5 + tbloffset])); + real2 v245 = ctimesminusplus(reverse(v231), tbl[36 + tbloffset], ctimes(v231, tbl[37 + tbloffset])); + real2 v1126 = minus(v245, v85); + real2 v1130 = plus(v85, v245); + real2 v1210 = plus(v1130, v1131); + real2 v1206 = minus(v1131, v1130); + real2 v1166 = minus(v285, v125); + real2 v1170 = plus(v125, v285); + real2 v1211 = plus(v1170, v1171); + real2 v1205 = reverse(minus(v1171, v1170)); + real2 v1246 = minus(v1211, v1210); + real2 v1250 = plus(v1210, v1211); + store(out, 3 << %SHIFT%, plus(v1250, v1251)); + real2 v1264 = minus(v1250, v1251); + store(out, 35 << %SHIFT%, ctimesminusplus(v1264, tbl[0 + tbloffset], ctimes(reverse(v1264), tbl[1 + tbloffset]))); + real2 v1247 = minusplus(v1245, v1246); + real2 v1249 = minusplus(uminus(v1245), v1246); + store(out, 19 << %SHIFT%, ctimesminusplus(reverse(v1247), tbl[218 + tbloffset], ctimes(v1247, tbl[219 + tbloffset]))); + store(out, 51 << %SHIFT%, ctimesminusplus(reverse(v1249), tbl[220 + tbloffset], ctimes(v1249, tbl[221 + tbloffset]))); + real2 v1229 = minusplus(uminus(v1225), v1226); + real2 v1227 = minusplus(v1225, v1226); + real2 v1207 = minusplus(v1205, v1206); + real2 v1209 = minusplus(uminus(v1205), v1206); + real2 v1237 = ctimesminusplus(reverse(v1227), tbl[214 + tbloffset], ctimes(v1227, tbl[215 + tbloffset])); + real2 v1217 = ctimesminusplus(reverse(v1207), tbl[210 + tbloffset], ctimes(v1207, tbl[211 + tbloffset])); + store(out, 11 << %SHIFT%, plus(v1217, v1237)); + real2 v1270 = minus(v1217, v1237); + store(out, 43 << %SHIFT%, ctimesminusplus(v1270, tbl[0 + tbloffset], ctimes(reverse(v1270), tbl[1 + tbloffset]))); + real2 v1223 = ctimesminusplus(reverse(v1209), tbl[212 + tbloffset], ctimes(v1209, tbl[213 + tbloffset])); + real2 v1243 = ctimesminusplus(reverse(v1229), tbl[216 + tbloffset], ctimes(v1229, tbl[217 + tbloffset])); + store(out, 27 << %SHIFT%, 
plus(v1223, v1243)); + real2 v1276 = minus(v1223, v1243); + store(out, 59 << %SHIFT%, ctimesminusplus(v1276, tbl[0 + tbloffset], ctimes(reverse(v1276), tbl[1 + tbloffset]))); + real2 v1189 = minusplus(uminus(v1185), v1186); + real2 v1187 = minusplus(v1185, v1186); + real2 v1129 = minusplus(uminus(v1125), v1126); + real2 v1127 = minusplus(v1125, v1126); + real2 v1147 = minusplus(v1145, v1146); + real2 v1149 = minusplus(uminus(v1145), v1146); + real2 v1167 = minusplus(v1165, v1166); + real2 v1169 = minusplus(uminus(v1165), v1166); + real2 v1143 = ctimesminusplus(reverse(v1129), tbl[196 + tbloffset], ctimes(v1129, tbl[197 + tbloffset])); + real2 v1163 = ctimesminusplus(reverse(v1149), tbl[200 + tbloffset], ctimes(v1149, tbl[201 + tbloffset])); + real2 v1203 = ctimesminusplus(reverse(v1189), tbl[208 + tbloffset], ctimes(v1189, tbl[209 + tbloffset])); + real2 v1315 = plus(v1163, v1203); + real2 v1309 = reverse(minus(v1203, v1163)); + real2 v1183 = ctimesminusplus(reverse(v1169), tbl[204 + tbloffset], ctimes(v1169, tbl[205 + tbloffset])); + real2 v1314 = plus(v1143, v1183); + real2 v1310 = minus(v1183, v1143); + store(out, 15 << %SHIFT%, plus(v1314, v1315)); + real2 v1328 = minus(v1314, v1315); + store(out, 47 << %SHIFT%, ctimesminusplus(v1328, tbl[0 + tbloffset], ctimes(reverse(v1328), tbl[1 + tbloffset]))); + real2 v1311 = minusplus(v1309, v1310); + store(out, 31 << %SHIFT%, ctimesminusplus(reverse(v1311), tbl[226 + tbloffset], ctimes(v1311, tbl[227 + tbloffset]))); + real2 v1313 = minusplus(uminus(v1309), v1310); + store(out, 63 << %SHIFT%, ctimesminusplus(reverse(v1313), tbl[228 + tbloffset], ctimes(v1313, tbl[229 + tbloffset]))); + real2 v1177 = ctimesminusplus(reverse(v1167), tbl[202 + tbloffset], ctimes(v1167, tbl[203 + tbloffset])); + real2 v1137 = ctimesminusplus(reverse(v1127), tbl[194 + tbloffset], ctimes(v1127, tbl[195 + tbloffset])); + real2 v1197 = ctimesminusplus(reverse(v1187), tbl[206 + tbloffset], ctimes(v1187, tbl[207 + tbloffset])); + real2 v1157 = 
ctimesminusplus(reverse(v1147), tbl[198 + tbloffset], ctimes(v1147, tbl[199 + tbloffset])); + real2 v1283 = reverse(minus(v1197, v1157)); + real2 v1289 = plus(v1157, v1197); + real2 v1288 = plus(v1137, v1177); + real2 v1284 = minus(v1177, v1137); + store(out, 7 << %SHIFT%, plus(v1288, v1289)); + real2 v1302 = minus(v1288, v1289); + store(out, 39 << %SHIFT%, ctimesminusplus(v1302, tbl[0 + tbloffset], ctimes(reverse(v1302), tbl[1 + tbloffset]))); + real2 v1285 = minusplus(v1283, v1284); + real2 v1287 = minusplus(uminus(v1283), v1284); + store(out, 55 << %SHIFT%, ctimesminusplus(reverse(v1287), tbl[224 + tbloffset], ctimes(v1287, tbl[225 + tbloffset]))); + store(out, 23 << %SHIFT%, ctimesminusplus(reverse(v1285), tbl[222 + tbloffset], ctimes(v1285, tbl[223 + tbloffset]))); + // Pres : 17339 + } +} + +ALIGNED(8192) void but128f_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + // Pres : 148586 + real2 v56 = load(in, 54 << inShift); + real2 v120 = load(in, 118 << inShift); + real2 v571 = reverse(minus(v56, v120)); + real2 v577 = plus(v56, v120); + real2 v24 = load(in, 22 << inShift); + real2 v88 = load(in, 86 << inShift); + real2 v576 = plus(v24, v88); + real2 v572 = minus(v88, v24); + real2 v573 = minusplus(v571, v572); + real2 v575 = minusplus(uminus(v571), v572); + real2 v589 = ctimesminusplus(reverse(v575), tbl[92 + tbloffset], ctimes(v575, tbl[93 + tbloffset])); + real2 v583 = ctimesminusplus(reverse(v573), tbl[90 + tbloffset], ctimes(v573, tbl[91 + tbloffset])); + real2 v897 = plus(v576, v577); + real2 v891 = reverse(minus(v576, v577)); + real2 v8 = load(in, 6 << inShift); + real2 v72 = load(in, 70 << inShift); + real2 v252 = 
minus(v72, v8); + real2 v256 = plus(v8, v72); + real2 v104 = load(in, 102 << inShift); + real2 v40 = load(in, 38 << inShift); + real2 v251 = reverse(minus(v40, v104)); + real2 v257 = plus(v40, v104); + real2 v255 = minusplus(uminus(v251), v252); + real2 v253 = minusplus(v251, v252); + real2 v263 = ctimesminusplus(reverse(v253), tbl[26 + tbloffset], ctimes(v253, tbl[27 + tbloffset])); + real2 v896 = plus(v256, v257); + real2 v892 = minus(v257, v256); + real2 v895 = minusplus(uminus(v891), v892); + real2 v893 = minusplus(v891, v892); + real2 v909 = ctimesminusplus(reverse(v895), tbl[156 + tbloffset], ctimes(v895, tbl[157 + tbloffset])); + real2 v903 = ctimesminusplus(reverse(v893), tbl[154 + tbloffset], ctimes(v893, tbl[155 + tbloffset])); + real2 v269 = ctimesminusplus(reverse(v255), tbl[28 + tbloffset], ctimes(v255, tbl[29 + tbloffset])); + real2 v1216 = plus(v896, v897); + real2 v1212 = minus(v897, v896); + real2 v2160 = minus(v583, v263); + real2 v2164 = plus(v263, v583); + real2 v2686 = minus(v589, v269); + real2 v2690 = plus(v269, v589); + real2 v96 = load(in, 94 << inShift); + real2 v32 = load(in, 30 << inShift); + real2 v736 = plus(v32, v96); + real2 v732 = minus(v96, v32); + real2 v64 = load(in, 62 << inShift); + real2 v128 = load(in, 126 << inShift); + real2 v737 = plus(v64, v128); + real2 v731 = reverse(minus(v64, v128)); + real2 v1057 = plus(v736, v737); + real2 v1051 = reverse(minus(v736, v737)); + real2 v733 = minusplus(v731, v732); + real2 v735 = minusplus(uminus(v731), v732); + real2 v749 = ctimesminusplus(reverse(v735), tbl[124 + tbloffset], ctimes(v735, tbl[125 + tbloffset])); + real2 v743 = ctimesminusplus(reverse(v733), tbl[122 + tbloffset], ctimes(v733, tbl[123 + tbloffset])); + real2 v16 = load(in, 14 << inShift); + real2 v80 = load(in, 78 << inShift); + real2 v412 = minus(v80, v16); + real2 v416 = plus(v16, v80); + real2 v112 = load(in, 110 << inShift); + real2 v48 = load(in, 46 << inShift); + real2 v417 = plus(v48, v112); + real2 v411 = 
reverse(minus(v48, v112)); + real2 v1056 = plus(v416, v417); + real2 v1052 = minus(v417, v416); + real2 v1055 = minusplus(uminus(v1051), v1052); + real2 v1053 = minusplus(v1051, v1052); + real2 v1063 = ctimesminusplus(reverse(v1053), tbl[186 + tbloffset], ctimes(v1053, tbl[187 + tbloffset])); + real2 v1665 = plus(v903, v1063); + real2 v1659 = reverse(minus(v903, v1063)); + real2 v1069 = ctimesminusplus(reverse(v1055), tbl[188 + tbloffset], ctimes(v1055, tbl[189 + tbloffset])); + real2 v1869 = reverse(minus(v909, v1069)); + real2 v1875 = plus(v909, v1069); + real2 v413 = minusplus(v411, v412); + real2 v415 = minusplus(uminus(v411), v412); + real2 v429 = ctimesminusplus(reverse(v415), tbl[60 + tbloffset], ctimes(v415, tbl[61 + tbloffset])); + real2 v1217 = plus(v1056, v1057); + real2 v1211 = reverse(minus(v1056, v1057)); + real2 v1297 = plus(v1216, v1217); + real2 v1291 = reverse(minus(v1216, v1217)); + real2 v2691 = plus(v429, v749); + real2 v2685 = reverse(minus(v429, v749)); + real2 v2765 = reverse(minus(v2690, v2691)); + real2 v2771 = plus(v2690, v2691); + real2 v2689 = minusplus(uminus(v2685), v2686); + real2 v2687 = minusplus(v2685, v2686); + real2 v2703 = ctimesminusplus(reverse(v2689), tbl[476 + tbloffset], ctimes(v2689, tbl[477 + tbloffset])); + real2 v2697 = ctimesminusplus(reverse(v2687), tbl[474 + tbloffset], ctimes(v2687, tbl[475 + tbloffset])); + real2 v1215 = minusplus(uminus(v1211), v1212); + real2 v1213 = minusplus(v1211, v1212); + real2 v1223 = ctimesminusplus(reverse(v1213), tbl[218 + tbloffset], ctimes(v1213, tbl[219 + tbloffset])); + real2 v1229 = ctimesminusplus(reverse(v1215), tbl[220 + tbloffset], ctimes(v1215, tbl[221 + tbloffset])); + real2 v423 = ctimesminusplus(reverse(v413), tbl[58 + tbloffset], ctimes(v413, tbl[59 + tbloffset])); + real2 v2165 = plus(v423, v743); + real2 v2159 = reverse(minus(v423, v743)); + real2 v2245 = plus(v2164, v2165); + real2 v2239 = reverse(minus(v2164, v2165)); + real2 v44 = load(in, 42 << inShift); + real2 v108 
= load(in, 106 << inShift); + real2 v331 = reverse(minus(v44, v108)); + real2 v337 = plus(v44, v108); + real2 v76 = load(in, 74 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v336 = plus(v12, v76); + real2 v332 = minus(v76, v12); + real2 v976 = plus(v336, v337); + real2 v972 = minus(v337, v336); + real2 v335 = minusplus(uminus(v331), v332); + real2 v333 = minusplus(v331, v332); + real2 v343 = ctimesminusplus(reverse(v333), tbl[42 + tbloffset], ctimes(v333, tbl[43 + tbloffset])); + real2 v349 = ctimesminusplus(reverse(v335), tbl[44 + tbloffset], ctimes(v335, tbl[45 + tbloffset])); + real2 v124 = load(in, 122 << inShift); + real2 v60 = load(in, 58 << inShift); + real2 v651 = reverse(minus(v60, v124)); + real2 v657 = plus(v60, v124); + real2 v28 = load(in, 26 << inShift); + real2 v92 = load(in, 90 << inShift); + real2 v652 = minus(v92, v28); + real2 v656 = plus(v28, v92); + real2 v977 = plus(v656, v657); + real2 v971 = reverse(minus(v656, v657)); + real2 v973 = minusplus(v971, v972); + real2 v975 = minusplus(uminus(v971), v972); + real2 v983 = ctimesminusplus(reverse(v973), tbl[170 + tbloffset], ctimes(v973, tbl[171 + tbloffset])); + real2 v1131 = reverse(minus(v976, v977)); + real2 v1137 = plus(v976, v977); + real2 v655 = minusplus(uminus(v651), v652); + real2 v653 = minusplus(v651, v652); + real2 v669 = ctimesminusplus(reverse(v655), tbl[108 + tbloffset], ctimes(v655, tbl[109 + tbloffset])); + real2 v663 = ctimesminusplus(reverse(v653), tbl[106 + tbloffset], ctimes(v653, tbl[107 + tbloffset])); + real2 v2079 = reverse(minus(v343, v663)); + real2 v2085 = plus(v343, v663); + real2 v2605 = reverse(minus(v349, v669)); + real2 v2611 = plus(v349, v669); + real2 v989 = ctimesminusplus(reverse(v975), tbl[172 + tbloffset], ctimes(v975, tbl[173 + tbloffset])); + real2 v20 = load(in, 18 << inShift); + real2 v84 = load(in, 82 << inShift); + real2 v496 = plus(v20, v84); + real2 v492 = minus(v84, v20); + real2 v52 = load(in, 50 << inShift); + real2 v116 = load(in, 
114 << inShift); + real2 v491 = reverse(minus(v52, v116)); + real2 v497 = plus(v52, v116); + real2 v817 = plus(v496, v497); + real2 v811 = reverse(minus(v496, v497)); + real2 v493 = minusplus(v491, v492); + real2 v495 = minusplus(uminus(v491), v492); + real2 v509 = ctimesminusplus(reverse(v495), tbl[76 + tbloffset], ctimes(v495, tbl[77 + tbloffset])); + real2 v503 = ctimesminusplus(reverse(v493), tbl[74 + tbloffset], ctimes(v493, tbl[75 + tbloffset])); + real2 v36 = load(in, 34 << inShift); + real2 v100 = load(in, 98 << inShift); + real2 v171 = reverse(minus(v36, v100)); + real2 v177 = plus(v36, v100); + real2 v68 = load(in, 66 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v176 = plus(v4, v68); + real2 v172 = minus(v68, v4); + real2 v816 = plus(v176, v177); + real2 v812 = minus(v177, v176); + real2 v1136 = plus(v816, v817); + real2 v1132 = minus(v817, v816); + real2 v1133 = minusplus(v1131, v1132); + real2 v1135 = minusplus(uminus(v1131), v1132); + real2 v1149 = ctimesminusplus(reverse(v1135), tbl[204 + tbloffset], ctimes(v1135, tbl[205 + tbloffset])); + real2 v1296 = plus(v1136, v1137); + real2 v1292 = minus(v1137, v1136); + real2 v1295 = minusplus(uminus(v1291), v1292); + real2 v1293 = minusplus(v1291, v1292); + real2 v1303 = ctimesminusplus(reverse(v1293), tbl[234 + tbloffset], ctimes(v1293, tbl[235 + tbloffset])); + real2 v1331 = reverse(minus(v1296, v1297)); + real2 v1337 = plus(v1296, v1297); + real2 v173 = minusplus(v171, v172); + real2 v175 = minusplus(uminus(v171), v172); + real2 v189 = ctimesminusplus(reverse(v175), tbl[12 + tbloffset], ctimes(v175, tbl[13 + tbloffset])); + real2 v1309 = ctimesminusplus(reverse(v1295), tbl[236 + tbloffset], ctimes(v1295, tbl[237 + tbloffset])); + real2 v815 = minusplus(uminus(v811), v812); + real2 v813 = minusplus(v811, v812); + real2 v1143 = ctimesminusplus(reverse(v1133), tbl[202 + tbloffset], ctimes(v1133, tbl[203 + tbloffset])); + real2 v1541 = reverse(minus(v1149, v1229)); + real2 v1547 = plus(v1149, 
v1229); + real2 v2610 = plus(v189, v509); + real2 v2606 = minus(v509, v189); + real2 v2770 = plus(v2610, v2611); + real2 v2766 = minus(v2611, v2610); + real2 v823 = ctimesminusplus(reverse(v813), tbl[138 + tbloffset], ctimes(v813, tbl[139 + tbloffset])); + real2 v829 = ctimesminusplus(reverse(v815), tbl[140 + tbloffset], ctimes(v815, tbl[141 + tbloffset])); + real2 v2811 = plus(v2770, v2771); + real2 v2805 = reverse(minus(v2770, v2771)); + real2 v2767 = minusplus(v2765, v2766); + real2 v2769 = minusplus(uminus(v2765), v2766); + real2 v2607 = minusplus(v2605, v2606); + real2 v2609 = minusplus(uminus(v2605), v2606); + real2 v2617 = ctimesminusplus(reverse(v2607), tbl[458 + tbloffset], ctimes(v2607, tbl[459 + tbloffset])); + real2 v2623 = ctimesminusplus(reverse(v2609), tbl[460 + tbloffset], ctimes(v2609, tbl[461 + tbloffset])); + real2 v3013 = reverse(minus(v2623, v2703)); + real2 v3019 = plus(v2623, v2703); + real2 v2783 = ctimesminusplus(reverse(v2769), tbl[492 + tbloffset], ctimes(v2769, tbl[493 + tbloffset])); + real2 v2941 = plus(v2617, v2697); + real2 v2935 = reverse(minus(v2617, v2697)); + real2 v2777 = ctimesminusplus(reverse(v2767), tbl[490 + tbloffset], ctimes(v2767, tbl[491 + tbloffset])); + real2 v1660 = minus(v983, v823); + real2 v1664 = plus(v823, v983); + real2 v1874 = plus(v829, v989); + real2 v1870 = minus(v989, v829); + real2 v1909 = reverse(minus(v1874, v1875)); + real2 v1915 = plus(v1874, v1875); + real2 v1663 = minusplus(uminus(v1659), v1660); + real2 v1661 = minusplus(v1659, v1660); + real2 v1677 = ctimesminusplus(reverse(v1663), tbl[296 + tbloffset], ctimes(v1663, tbl[297 + tbloffset])); + real2 v1873 = minusplus(uminus(v1869), v1870); + real2 v1871 = minusplus(v1869, v1870); + real2 v1887 = ctimesminusplus(reverse(v1873), tbl[332 + tbloffset], ctimes(v1873, tbl[333 + tbloffset])); + real2 v1705 = plus(v1664, v1665); + real2 v1699 = reverse(minus(v1664, v1665)); + real2 v1671 = ctimesminusplus(reverse(v1661), tbl[294 + tbloffset], ctimes(v1661, 
tbl[295 + tbloffset])); + real2 v1881 = ctimesminusplus(reverse(v1871), tbl[330 + tbloffset], ctimes(v1871, tbl[331 + tbloffset])); + real2 v1469 = plus(v1143, v1223); + real2 v1463 = reverse(minus(v1143, v1223)); + real2 v54 = load(in, 52 << inShift); + real2 v118 = load(in, 116 << inShift); + real2 v537 = plus(v54, v118); + real2 v531 = reverse(minus(v54, v118)); + real2 v86 = load(in, 84 << inShift); + real2 v22 = load(in, 20 << inShift); + real2 v536 = plus(v22, v86); + real2 v532 = minus(v86, v22); + real2 v851 = reverse(minus(v536, v537)); + real2 v857 = plus(v536, v537); + real2 v533 = minusplus(v531, v532); + real2 v535 = minusplus(uminus(v531), v532); + real2 v549 = ctimesminusplus(reverse(v535), tbl[84 + tbloffset], ctimes(v535, tbl[85 + tbloffset])); + real2 v102 = load(in, 100 << inShift); + real2 v38 = load(in, 36 << inShift); + real2 v217 = plus(v38, v102); + real2 v211 = reverse(minus(v38, v102)); + real2 v70 = load(in, 68 << inShift); + real2 v6 = load(in, 4 << inShift); + real2 v216 = plus(v6, v70); + real2 v212 = minus(v70, v6); + real2 v213 = minusplus(v211, v212); + real2 v215 = minusplus(uminus(v211), v212); + real2 v229 = ctimesminusplus(reverse(v215), tbl[20 + tbloffset], ctimes(v215, tbl[21 + tbloffset])); + real2 v2646 = minus(v549, v229); + real2 v2650 = plus(v229, v549); + real2 v856 = plus(v216, v217); + real2 v852 = minus(v217, v216); + real2 v853 = minusplus(v851, v852); + real2 v855 = minusplus(uminus(v851), v852); + real2 v863 = ctimesminusplus(reverse(v853), tbl[146 + tbloffset], ctimes(v853, tbl[147 + tbloffset])); + real2 v869 = ctimesminusplus(reverse(v855), tbl[148 + tbloffset], ctimes(v855, tbl[149 + tbloffset])); + real2 v1176 = plus(v856, v857); + real2 v1172 = minus(v857, v856); + real2 v110 = load(in, 108 << inShift); + real2 v46 = load(in, 44 << inShift); + real2 v377 = plus(v46, v110); + real2 v371 = reverse(minus(v46, v110)); + real2 v78 = load(in, 76 << inShift); + real2 v14 = load(in, 12 << inShift); + real2 v372 = 
minus(v78, v14); + real2 v376 = plus(v14, v78); + real2 v1012 = minus(v377, v376); + real2 v1016 = plus(v376, v377); + real2 v373 = minusplus(v371, v372); + real2 v375 = minusplus(uminus(v371), v372); + real2 v389 = ctimesminusplus(reverse(v375), tbl[52 + tbloffset], ctimes(v375, tbl[53 + tbloffset])); + real2 v30 = load(in, 28 << inShift); + real2 v94 = load(in, 92 << inShift); + real2 v696 = plus(v30, v94); + real2 v692 = minus(v94, v30); + real2 v62 = load(in, 60 << inShift); + real2 v126 = load(in, 124 << inShift); + real2 v697 = plus(v62, v126); + real2 v691 = reverse(minus(v62, v126)); + real2 v1017 = plus(v696, v697); + real2 v1011 = reverse(minus(v696, v697)); + real2 v1171 = reverse(minus(v1016, v1017)); + real2 v1177 = plus(v1016, v1017); + real2 v1013 = minusplus(v1011, v1012); + real2 v1015 = minusplus(uminus(v1011), v1012); + real2 v1175 = minusplus(uminus(v1171), v1172); + real2 v1173 = minusplus(v1171, v1172); + real2 v1183 = ctimesminusplus(reverse(v1173), tbl[210 + tbloffset], ctimes(v1173, tbl[211 + tbloffset])); + real2 v1189 = ctimesminusplus(reverse(v1175), tbl[212 + tbloffset], ctimes(v1175, tbl[213 + tbloffset])); + real2 v1029 = ctimesminusplus(reverse(v1015), tbl[180 + tbloffset], ctimes(v1015, tbl[181 + tbloffset])); + real2 v1023 = ctimesminusplus(reverse(v1013), tbl[178 + tbloffset], ctimes(v1013, tbl[179 + tbloffset])); + real2 v1625 = plus(v863, v1023); + real2 v1619 = reverse(minus(v863, v1023)); + real2 v1835 = plus(v869, v1029); + real2 v1829 = reverse(minus(v869, v1029)); + real2 v693 = minusplus(v691, v692); + real2 v695 = minusplus(uminus(v691), v692); + real2 v709 = ctimesminusplus(reverse(v695), tbl[116 + tbloffset], ctimes(v695, tbl[117 + tbloffset])); + real2 v2645 = reverse(minus(v389, v709)); + real2 v2651 = plus(v389, v709); + real2 v1257 = plus(v1176, v1177); + real2 v1251 = reverse(minus(v1176, v1177)); + real2 v2731 = plus(v2650, v2651); + real2 v2725 = reverse(minus(v2650, v2651)); + real2 v114 = load(in, 112 << 
inShift); + real2 v50 = load(in, 48 << inShift); + real2 v457 = plus(v50, v114); + real2 v451 = reverse(minus(v50, v114)); + real2 v18 = load(in, 16 << inShift); + real2 v82 = load(in, 80 << inShift); + real2 v456 = plus(v18, v82); + real2 v452 = minus(v82, v18); + real2 v771 = reverse(minus(v456, v457)); + real2 v777 = plus(v456, v457); + real2 v453 = minusplus(v451, v452); + real2 v455 = minusplus(uminus(v451), v452); + real2 v469 = ctimesminusplus(reverse(v455), tbl[68 + tbloffset], ctimes(v455, tbl[69 + tbloffset])); + real2 v66 = load(in, 64 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v132 = minus(v66, v2); + real2 v136 = plus(v2, v66); + real2 v98 = load(in, 96 << inShift); + real2 v34 = load(in, 32 << inShift); + real2 v131 = reverse(minus(v34, v98)); + real2 v137 = plus(v34, v98); + real2 v133 = minusplus(v131, v132); + real2 v135 = minusplus(uminus(v131), v132); + real2 v149 = ctimesminusplus(reverse(v135), tbl[4 + tbloffset], ctimes(v135, tbl[5 + tbloffset])); + real2 v2566 = minus(v469, v149); + real2 v2570 = plus(v149, v469); + real2 v772 = minus(v137, v136); + real2 v776 = plus(v136, v137); + real2 v1092 = minus(v777, v776); + real2 v1096 = plus(v776, v777); + real2 v773 = minusplus(v771, v772); + real2 v775 = minusplus(uminus(v771), v772); + real2 v783 = ctimesminusplus(reverse(v773), tbl[130 + tbloffset], ctimes(v773, tbl[131 + tbloffset])); + real2 v789 = ctimesminusplus(reverse(v775), tbl[132 + tbloffset], ctimes(v775, tbl[133 + tbloffset])); + real2 v74 = load(in, 72 << inShift); + real2 v10 = load(in, 8 << inShift); + real2 v296 = plus(v10, v74); + real2 v292 = minus(v74, v10); + real2 v42 = load(in, 40 << inShift); + real2 v106 = load(in, 104 << inShift); + real2 v291 = reverse(minus(v42, v106)); + real2 v297 = plus(v42, v106); + real2 v293 = minusplus(v291, v292); + real2 v295 = minusplus(uminus(v291), v292); + real2 v309 = ctimesminusplus(reverse(v295), tbl[36 + tbloffset], ctimes(v295, tbl[37 + tbloffset])); + real2 v932 = 
minus(v297, v296); + real2 v936 = plus(v296, v297); + real2 v122 = load(in, 120 << inShift); + real2 v58 = load(in, 56 << inShift); + real2 v617 = plus(v58, v122); + real2 v611 = reverse(minus(v58, v122)); + real2 v26 = load(in, 24 << inShift); + real2 v90 = load(in, 88 << inShift); + real2 v612 = minus(v90, v26); + real2 v616 = plus(v26, v90); + real2 v937 = plus(v616, v617); + real2 v931 = reverse(minus(v616, v617)); + real2 v1091 = reverse(minus(v936, v937)); + real2 v1097 = plus(v936, v937); + real2 v933 = minusplus(v931, v932); + real2 v935 = minusplus(uminus(v931), v932); + real2 v1093 = minusplus(v1091, v1092); + real2 v1095 = minusplus(uminus(v1091), v1092); + real2 v1103 = ctimesminusplus(reverse(v1093), tbl[194 + tbloffset], ctimes(v1093, tbl[195 + tbloffset])); + real2 v1468 = plus(v1103, v1183); + real2 v1464 = minus(v1183, v1103); + real2 v1508 = plus(v1468, v1469); + real2 v1504 = minus(v1469, v1468); + real2 v1252 = minus(v1097, v1096); + real2 v1256 = plus(v1096, v1097); + real2 v1336 = plus(v1256, v1257); + real2 v1332 = minus(v1257, v1256); + real2 v1335 = minusplus(uminus(v1331), v1332); + real2 v1333 = minusplus(v1331, v1332); + real2 v1343 = ctimesminusplus(reverse(v1333), tbl[242 + tbloffset], ctimes(v1333, tbl[243 + tbloffset])); + real2 v1349 = ctimesminusplus(reverse(v1335), tbl[244 + tbloffset], ctimes(v1335, tbl[245 + tbloffset])); + real2 v1376 = plus(v1336, v1337); + real2 v1372 = minus(v1337, v1336); + real2 v1465 = minusplus(v1463, v1464); + real2 v1467 = minusplus(uminus(v1463), v1464); + real2 v1255 = minusplus(uminus(v1251), v1252); + real2 v1253 = minusplus(v1251, v1252); + real2 v1481 = ctimesminusplus(reverse(v1467), tbl[264 + tbloffset], ctimes(v1467, tbl[265 + tbloffset])); + real2 v1475 = ctimesminusplus(reverse(v1465), tbl[262 + tbloffset], ctimes(v1465, tbl[263 + tbloffset])); + real2 v1109 = ctimesminusplus(reverse(v1095), tbl[196 + tbloffset], ctimes(v1095, tbl[197 + tbloffset])); + real2 v1542 = minus(v1189, v1109); + 
real2 v1546 = plus(v1109, v1189); + real2 v1545 = minusplus(uminus(v1541), v1542); + real2 v1543 = minusplus(v1541, v1542); + real2 v1553 = ctimesminusplus(reverse(v1543), tbl[274 + tbloffset], ctimes(v1543, tbl[275 + tbloffset])); + real2 v1559 = ctimesminusplus(reverse(v1545), tbl[276 + tbloffset], ctimes(v1545, tbl[277 + tbloffset])); + real2 v1582 = minus(v1547, v1546); + real2 v1586 = plus(v1546, v1547); + real2 v1269 = ctimesminusplus(reverse(v1255), tbl[228 + tbloffset], ctimes(v1255, tbl[229 + tbloffset])); + real2 v1438 = minus(v1309, v1269); + real2 v1442 = plus(v1269, v1309); + real2 v1263 = ctimesminusplus(reverse(v1253), tbl[226 + tbloffset], ctimes(v1253, tbl[227 + tbloffset])); + real2 v943 = ctimesminusplus(reverse(v933), tbl[162 + tbloffset], ctimes(v933, tbl[163 + tbloffset])); + real2 v1624 = plus(v783, v943); + real2 v1620 = minus(v943, v783); + real2 v1623 = minusplus(uminus(v1619), v1620); + real2 v1621 = minusplus(v1619, v1620); + real2 v1700 = minus(v1625, v1624); + real2 v1704 = plus(v1624, v1625); + real2 v1631 = ctimesminusplus(reverse(v1621), tbl[286 + tbloffset], ctimes(v1621, tbl[287 + tbloffset])); + real2 v949 = ctimesminusplus(reverse(v935), tbl[164 + tbloffset], ctimes(v935, tbl[165 + tbloffset])); + real2 v1830 = minus(v949, v789); + real2 v1834 = plus(v789, v949); + real2 v1782 = plus(v1631, v1671); + real2 v1778 = minus(v1671, v1631); + real2 v1910 = minus(v1835, v1834); + real2 v1914 = plus(v1834, v1835); + real2 v1950 = minus(v1915, v1914); + real2 v1954 = plus(v1914, v1915); + real2 v1913 = minusplus(uminus(v1909), v1910); + real2 v1911 = minusplus(v1909, v1910); + real2 v613 = minusplus(v611, v612); + real2 v615 = minusplus(uminus(v611), v612); + real2 v629 = ctimesminusplus(reverse(v615), tbl[100 + tbloffset], ctimes(v615, tbl[101 + tbloffset])); + real2 v1744 = plus(v1704, v1705); + real2 v1740 = minus(v1705, v1704); + real2 v1637 = ctimesminusplus(reverse(v1623), tbl[288 + tbloffset], ctimes(v1623, tbl[289 + tbloffset])); 
+ real2 v1927 = ctimesminusplus(reverse(v1913), tbl[340 + tbloffset], ctimes(v1913, tbl[341 + tbloffset])); + real2 v2571 = plus(v309, v629); + real2 v2565 = reverse(minus(v309, v629)); + real2 v1833 = minusplus(uminus(v1829), v1830); + real2 v1831 = minusplus(v1829, v1830); + real2 v1921 = ctimesminusplus(reverse(v1911), tbl[338 + tbloffset], ctimes(v1911, tbl[339 + tbloffset])); + real2 v1804 = minus(v1677, v1637); + real2 v1808 = plus(v1637, v1677); + real2 v1847 = ctimesminusplus(reverse(v1833), tbl[324 + tbloffset], ctimes(v1833, tbl[325 + tbloffset])); + real2 v2014 = minus(v1887, v1847); + real2 v2018 = plus(v1847, v1887); + real2 v1841 = ctimesminusplus(reverse(v1831), tbl[322 + tbloffset], ctimes(v1831, tbl[323 + tbloffset])); + real2 v1988 = minus(v1881, v1841); + real2 v1992 = plus(v1841, v1881); + real2 v1703 = minusplus(uminus(v1699), v1700); + real2 v1701 = minusplus(v1699, v1700); + real2 v1717 = ctimesminusplus(reverse(v1703), tbl[304 + tbloffset], ctimes(v1703, tbl[305 + tbloffset])); + real2 v1711 = ctimesminusplus(reverse(v1701), tbl[302 + tbloffset], ctimes(v1701, tbl[303 + tbloffset])); + real2 v2730 = plus(v2570, v2571); + real2 v2726 = minus(v2571, v2570); + real2 v1412 = minus(v1303, v1263); + real2 v1416 = plus(v1263, v1303); + real2 v63 = load(in, 61 << inShift); + real2 v127 = load(in, 125 << inShift); + real2 v717 = plus(v63, v127); + real2 v711 = reverse(minus(v63, v127)); + real2 v95 = load(in, 93 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v712 = minus(v95, v31); + real2 v716 = plus(v31, v95); + real2 v1037 = plus(v716, v717); + real2 v1031 = reverse(minus(v716, v717)); + real2 v79 = load(in, 77 << inShift); + real2 v15 = load(in, 13 << inShift); + real2 v396 = plus(v15, v79); + real2 v392 = minus(v79, v15); + real2 v111 = load(in, 109 << inShift); + real2 v47 = load(in, 45 << inShift); + real2 v397 = plus(v47, v111); + real2 v391 = reverse(minus(v47, v111)); + real2 v1032 = minus(v397, v396); + real2 v1036 = 
plus(v396, v397); + real2 v1033 = minusplus(v1031, v1032); + real2 v1035 = minusplus(uminus(v1031), v1032); + real2 v1049 = ctimesminusplus(reverse(v1035), tbl[184 + tbloffset], ctimes(v1035, tbl[185 + tbloffset])); + real2 v1043 = ctimesminusplus(reverse(v1033), tbl[182 + tbloffset], ctimes(v1033, tbl[183 + tbloffset])); + real2 v1197 = plus(v1036, v1037); + real2 v1191 = reverse(minus(v1036, v1037)); + real2 v23 = load(in, 21 << inShift); + real2 v87 = load(in, 85 << inShift); + real2 v556 = plus(v23, v87); + real2 v552 = minus(v87, v23); + real2 v119 = load(in, 117 << inShift); + real2 v55 = load(in, 53 << inShift); + real2 v557 = plus(v55, v119); + real2 v551 = reverse(minus(v55, v119)); + real2 v877 = plus(v556, v557); + real2 v871 = reverse(minus(v556, v557)); + real2 v7 = load(in, 5 << inShift); + real2 v71 = load(in, 69 << inShift); + real2 v232 = minus(v71, v7); + real2 v236 = plus(v7, v71); + real2 v103 = load(in, 101 << inShift); + real2 v39 = load(in, 37 << inShift); + real2 v237 = plus(v39, v103); + real2 v231 = reverse(minus(v39, v103)); + real2 v876 = plus(v236, v237); + real2 v872 = minus(v237, v236); + real2 v1192 = minus(v877, v876); + real2 v1196 = plus(v876, v877); + real2 v1271 = reverse(minus(v1196, v1197)); + real2 v1277 = plus(v1196, v1197); + real2 v875 = minusplus(uminus(v871), v872); + real2 v873 = minusplus(v871, v872); + real2 v883 = ctimesminusplus(reverse(v873), tbl[150 + tbloffset], ctimes(v873, tbl[151 + tbloffset])); + real2 v1639 = reverse(minus(v883, v1043)); + real2 v1645 = plus(v883, v1043); + real2 v1195 = minusplus(uminus(v1191), v1192); + real2 v1193 = minusplus(v1191, v1192); + real2 v1209 = ctimesminusplus(reverse(v1195), tbl[216 + tbloffset], ctimes(v1195, tbl[217 + tbloffset])); + real2 v1203 = ctimesminusplus(reverse(v1193), tbl[214 + tbloffset], ctimes(v1193, tbl[215 + tbloffset])); + real2 v83 = load(in, 81 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v476 = plus(v19, v83); + real2 v472 = minus(v83, 
v19); + real2 v51 = load(in, 49 << inShift); + real2 v115 = load(in, 113 << inShift); + real2 v477 = plus(v51, v115); + real2 v471 = reverse(minus(v51, v115)); + real2 v797 = plus(v476, v477); + real2 v791 = reverse(minus(v476, v477)); + real2 v3 = load(in, 1 << inShift); + real2 v67 = load(in, 65 << inShift); + real2 v156 = plus(v3, v67); + real2 v152 = minus(v67, v3); + real2 v35 = load(in, 33 << inShift); + real2 v99 = load(in, 97 << inShift); + real2 v157 = plus(v35, v99); + real2 v151 = reverse(minus(v35, v99)); + real2 v792 = minus(v157, v156); + real2 v796 = plus(v156, v157); + real2 v793 = minusplus(v791, v792); + real2 v795 = minusplus(uminus(v791), v792); + real2 v803 = ctimesminusplus(reverse(v793), tbl[134 + tbloffset], ctimes(v793, tbl[135 + tbloffset])); + real2 v1112 = minus(v797, v796); + real2 v1116 = plus(v796, v797); + real2 v107 = load(in, 105 << inShift); + real2 v43 = load(in, 41 << inShift); + real2 v317 = plus(v43, v107); + real2 v311 = reverse(minus(v43, v107)); + real2 v75 = load(in, 73 << inShift); + real2 v11 = load(in, 9 << inShift); + real2 v316 = plus(v11, v75); + real2 v312 = minus(v75, v11); + real2 v956 = plus(v316, v317); + real2 v952 = minus(v317, v316); + real2 v59 = load(in, 57 << inShift); + real2 v123 = load(in, 121 << inShift); + real2 v631 = reverse(minus(v59, v123)); + real2 v637 = plus(v59, v123); + real2 v27 = load(in, 25 << inShift); + real2 v91 = load(in, 89 << inShift); + real2 v636 = plus(v27, v91); + real2 v632 = minus(v91, v27); + real2 v957 = plus(v636, v637); + real2 v951 = reverse(minus(v636, v637)); + real2 v1111 = reverse(minus(v956, v957)); + real2 v1117 = plus(v956, v957); + real2 v1276 = plus(v1116, v1117); + real2 v1272 = minus(v1117, v1116); + real2 v1275 = minusplus(uminus(v1271), v1272); + real2 v1273 = minusplus(v1271, v1272); + real2 v1283 = ctimesminusplus(reverse(v1273), tbl[230 + tbloffset], ctimes(v1273, tbl[231 + tbloffset])); + real2 v1352 = minus(v1277, v1276); + real2 v1356 = plus(v1276, 
v1277); + real2 v1289 = ctimesminusplus(reverse(v1275), tbl[232 + tbloffset], ctimes(v1275, tbl[233 + tbloffset])); + real2 v1115 = minusplus(uminus(v1111), v1112); + real2 v1113 = minusplus(v1111, v1112); + real2 v1123 = ctimesminusplus(reverse(v1113), tbl[198 + tbloffset], ctimes(v1113, tbl[199 + tbloffset])); + real2 v1129 = ctimesminusplus(reverse(v1115), tbl[200 + tbloffset], ctimes(v1115, tbl[201 + tbloffset])); + real2 v1488 = plus(v1123, v1203); + real2 v1484 = minus(v1203, v1123); + real2 v1566 = plus(v1129, v1209); + real2 v1562 = minus(v1209, v1129); + real2 v85 = load(in, 83 << inShift); + real2 v21 = load(in, 19 << inShift); + real2 v512 = minus(v85, v21); + real2 v516 = plus(v21, v85); + real2 v117 = load(in, 115 << inShift); + real2 v53 = load(in, 51 << inShift); + real2 v517 = plus(v53, v117); + real2 v511 = reverse(minus(v53, v117)); + real2 v831 = reverse(minus(v516, v517)); + real2 v837 = plus(v516, v517); + real2 v69 = load(in, 67 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v192 = minus(v69, v5); + real2 v196 = plus(v5, v69); + real2 v37 = load(in, 35 << inShift); + real2 v101 = load(in, 99 << inShift); + real2 v197 = plus(v37, v101); + real2 v191 = reverse(minus(v37, v101)); + real2 v832 = minus(v197, v196); + real2 v836 = plus(v196, v197); + real2 v1152 = minus(v837, v836); + real2 v1156 = plus(v836, v837); + real2 v61 = load(in, 59 << inShift); + real2 v125 = load(in, 123 << inShift); + real2 v677 = plus(v61, v125); + real2 v671 = reverse(minus(v61, v125)); + real2 v29 = load(in, 27 << inShift); + real2 v93 = load(in, 91 << inShift); + real2 v672 = minus(v93, v29); + real2 v676 = plus(v29, v93); + real2 v997 = plus(v676, v677); + real2 v991 = reverse(minus(v676, v677)); + real2 v109 = load(in, 107 << inShift); + real2 v45 = load(in, 43 << inShift); + real2 v357 = plus(v45, v109); + real2 v351 = reverse(minus(v45, v109)); + real2 v77 = load(in, 75 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v352 = minus(v77, v13); 
+ real2 v356 = plus(v13, v77); + real2 v992 = minus(v357, v356); + real2 v996 = plus(v356, v357); + real2 v1157 = plus(v996, v997); + real2 v1151 = reverse(minus(v996, v997)); + real2 v1155 = minusplus(uminus(v1151), v1152); + real2 v1153 = minusplus(v1151, v1152); + real2 v1163 = ctimesminusplus(reverse(v1153), tbl[206 + tbloffset], ctimes(v1153, tbl[207 + tbloffset])); + real2 v1316 = plus(v1156, v1157); + real2 v1312 = minus(v1157, v1156); + real2 v41 = load(in, 39 << inShift); + real2 v105 = load(in, 103 << inShift); + real2 v277 = plus(v41, v105); + real2 v271 = reverse(minus(v41, v105)); + real2 v9 = load(in, 7 << inShift); + real2 v73 = load(in, 71 << inShift); + real2 v276 = plus(v9, v73); + real2 v272 = minus(v73, v9); + real2 v916 = plus(v276, v277); + real2 v912 = minus(v277, v276); + real2 v89 = load(in, 87 << inShift); + real2 v25 = load(in, 23 << inShift); + real2 v592 = minus(v89, v25); + real2 v596 = plus(v25, v89); + real2 v57 = load(in, 55 << inShift); + real2 v121 = load(in, 119 << inShift); + real2 v591 = reverse(minus(v57, v121)); + real2 v597 = plus(v57, v121); + real2 v911 = reverse(minus(v596, v597)); + real2 v917 = plus(v596, v597); + real2 v1236 = plus(v916, v917); + real2 v1232 = minus(v917, v916); + real2 v81 = load(in, 79 << inShift); + real2 v17 = load(in, 15 << inShift); + real2 v432 = minus(v81, v17); + real2 v436 = plus(v17, v81); + real2 v113 = load(in, 111 << inShift); + real2 v49 = load(in, 47 << inShift); + real2 v437 = plus(v49, v113); + real2 v431 = reverse(minus(v49, v113)); + real2 v1072 = minus(v437, v436); + real2 v1076 = plus(v436, v437); + real2 v65 = load(in, 63 << inShift); + real2 v129 = load(in, 127 << inShift); + real2 v757 = plus(v65, v129); + real2 v751 = reverse(minus(v65, v129)); + real2 v97 = load(in, 95 << inShift); + real2 v33 = load(in, 31 << inShift); + real2 v752 = minus(v97, v33); + real2 v756 = plus(v33, v97); + real2 v1077 = plus(v756, v757); + real2 v1071 = reverse(minus(v756, v757)); + real2 v1231 = 
reverse(minus(v1076, v1077)); + real2 v1237 = plus(v1076, v1077); + real2 v1317 = plus(v1236, v1237); + real2 v1311 = reverse(minus(v1236, v1237)); + real2 v1351 = reverse(minus(v1316, v1317)); + real2 v1357 = plus(v1316, v1317); + real2 v1371 = reverse(minus(v1356, v1357)); + real2 v1377 = plus(v1356, v1357); + store(out, 0 << %SHIFT%, plus(v1376, v1377)); + real2 v1390 = minus(v1376, v1377); + store(out, 64 << %SHIFT%, ctimesminusplus(v1390, tbl[0 + tbloffset], ctimes(reverse(v1390), tbl[1 + tbloffset]))); + real2 v1353 = minusplus(v1351, v1352); + real2 v1355 = minusplus(uminus(v1351), v1352); + real2 v1369 = ctimesminusplus(reverse(v1355), tbl[248 + tbloffset], ctimes(v1355, tbl[249 + tbloffset])); + store(out, 48 << %SHIFT%, plus(v1349, v1369)); + real2 v1404 = minus(v1349, v1369); + store(out, 112 << %SHIFT%, ctimesminusplus(v1404, tbl[0 + tbloffset], ctimes(reverse(v1404), tbl[1 + tbloffset]))); + real2 v1363 = ctimesminusplus(reverse(v1353), tbl[246 + tbloffset], ctimes(v1353, tbl[247 + tbloffset])); + store(out, 16 << %SHIFT%, plus(v1343, v1363)); + real2 v1398 = minus(v1343, v1363); + store(out, 80 << %SHIFT%, ctimesminusplus(v1398, tbl[0 + tbloffset], ctimes(reverse(v1398), tbl[1 + tbloffset]))); + real2 v1373 = minusplus(v1371, v1372); + real2 v1375 = minusplus(uminus(v1371), v1372); + store(out, 96 << %SHIFT%, ctimesminusplus(reverse(v1375), tbl[252 + tbloffset], ctimes(v1375, tbl[253 + tbloffset]))); + store(out, 32 << %SHIFT%, ctimesminusplus(reverse(v1373), tbl[250 + tbloffset], ctimes(v1373, tbl[251 + tbloffset]))); + real2 v1313 = minusplus(v1311, v1312); + real2 v1315 = minusplus(uminus(v1311), v1312); + real2 v1323 = ctimesminusplus(reverse(v1313), tbl[238 + tbloffset], ctimes(v1313, tbl[239 + tbloffset])); + real2 v1417 = plus(v1283, v1323); + real2 v1411 = reverse(minus(v1283, v1323)); + store(out, 8 << %SHIFT%, plus(v1416, v1417)); + real2 v1430 = minus(v1416, v1417); + store(out, 72 << %SHIFT%, ctimesminusplus(v1430, tbl[0 + tbloffset], 
ctimes(reverse(v1430), tbl[1 + tbloffset]))); + real2 v1413 = minusplus(v1411, v1412); + real2 v1415 = minusplus(uminus(v1411), v1412); + store(out, 104 << %SHIFT%, ctimesminusplus(reverse(v1415), tbl[256 + tbloffset], ctimes(v1415, tbl[257 + tbloffset]))); + store(out, 40 << %SHIFT%, ctimesminusplus(reverse(v1413), tbl[254 + tbloffset], ctimes(v1413, tbl[255 + tbloffset]))); + real2 v1329 = ctimesminusplus(reverse(v1315), tbl[240 + tbloffset], ctimes(v1315, tbl[241 + tbloffset])); + real2 v1443 = plus(v1289, v1329); + real2 v1437 = reverse(minus(v1289, v1329)); + store(out, 24 << %SHIFT%, plus(v1442, v1443)); + real2 v1456 = minus(v1442, v1443); + store(out, 88 << %SHIFT%, ctimesminusplus(v1456, tbl[0 + tbloffset], ctimes(reverse(v1456), tbl[1 + tbloffset]))); + real2 v1441 = minusplus(uminus(v1437), v1438); + real2 v1439 = minusplus(v1437, v1438); + store(out, 120 << %SHIFT%, ctimesminusplus(reverse(v1441), tbl[260 + tbloffset], ctimes(v1441, tbl[261 + tbloffset]))); + store(out, 56 << %SHIFT%, ctimesminusplus(reverse(v1439), tbl[258 + tbloffset], ctimes(v1439, tbl[259 + tbloffset]))); + real2 v1235 = minusplus(uminus(v1231), v1232); + real2 v1233 = minusplus(v1231, v1232); + real2 v1243 = ctimesminusplus(reverse(v1233), tbl[222 + tbloffset], ctimes(v1233, tbl[223 + tbloffset])); + real2 v1489 = plus(v1163, v1243); + real2 v1483 = reverse(minus(v1163, v1243)); + real2 v1509 = plus(v1488, v1489); + real2 v1503 = reverse(minus(v1488, v1489)); + store(out, 4 << %SHIFT%, plus(v1508, v1509)); + real2 v1522 = minus(v1508, v1509); + store(out, 68 << %SHIFT%, ctimesminusplus(v1522, tbl[0 + tbloffset], ctimes(reverse(v1522), tbl[1 + tbloffset]))); + real2 v1507 = minusplus(uminus(v1503), v1504); + real2 v1505 = minusplus(v1503, v1504); + store(out, 36 << %SHIFT%, ctimesminusplus(reverse(v1505), tbl[270 + tbloffset], ctimes(v1505, tbl[271 + tbloffset]))); + store(out, 100 << %SHIFT%, ctimesminusplus(reverse(v1507), tbl[272 + tbloffset], ctimes(v1507, tbl[273 + 
tbloffset]))); + real2 v1485 = minusplus(v1483, v1484); + real2 v1487 = minusplus(uminus(v1483), v1484); + real2 v1501 = ctimesminusplus(reverse(v1487), tbl[268 + tbloffset], ctimes(v1487, tbl[269 + tbloffset])); + store(out, 52 << %SHIFT%, plus(v1481, v1501)); + real2 v1534 = minus(v1481, v1501); + store(out, 116 << %SHIFT%, ctimesminusplus(v1534, tbl[0 + tbloffset], ctimes(reverse(v1534), tbl[1 + tbloffset]))); + real2 v1495 = ctimesminusplus(reverse(v1485), tbl[266 + tbloffset], ctimes(v1485, tbl[267 + tbloffset])); + store(out, 20 << %SHIFT%, plus(v1475, v1495)); + real2 v1528 = minus(v1475, v1495); + store(out, 84 << %SHIFT%, ctimesminusplus(v1528, tbl[0 + tbloffset], ctimes(reverse(v1528), tbl[1 + tbloffset]))); + real2 v1249 = ctimesminusplus(reverse(v1235), tbl[224 + tbloffset], ctimes(v1235, tbl[225 + tbloffset])); + real2 v1169 = ctimesminusplus(reverse(v1155), tbl[208 + tbloffset], ctimes(v1155, tbl[209 + tbloffset])); + real2 v1567 = plus(v1169, v1249); + real2 v1561 = reverse(minus(v1169, v1249)); + real2 v1581 = reverse(minus(v1566, v1567)); + real2 v1587 = plus(v1566, v1567); + store(out, 12 << %SHIFT%, plus(v1586, v1587)); + real2 v1600 = minus(v1586, v1587); + store(out, 76 << %SHIFT%, ctimesminusplus(v1600, tbl[0 + tbloffset], ctimes(reverse(v1600), tbl[1 + tbloffset]))); + real2 v1583 = minusplus(v1581, v1582); + store(out, 44 << %SHIFT%, ctimesminusplus(reverse(v1583), tbl[282 + tbloffset], ctimes(v1583, tbl[283 + tbloffset]))); + real2 v1585 = minusplus(uminus(v1581), v1582); + store(out, 108 << %SHIFT%, ctimesminusplus(reverse(v1585), tbl[284 + tbloffset], ctimes(v1585, tbl[285 + tbloffset]))); + real2 v1565 = minusplus(uminus(v1561), v1562); + real2 v1563 = minusplus(v1561, v1562); + real2 v1579 = ctimesminusplus(reverse(v1565), tbl[280 + tbloffset], ctimes(v1565, tbl[281 + tbloffset])); + store(out, 60 << %SHIFT%, plus(v1559, v1579)); + real2 v1612 = minus(v1559, v1579); + store(out, 124 << %SHIFT%, ctimesminusplus(v1612, tbl[0 + tbloffset], 
ctimes(reverse(v1612), tbl[1 + tbloffset]))); + real2 v1573 = ctimesminusplus(reverse(v1563), tbl[278 + tbloffset], ctimes(v1563, tbl[279 + tbloffset])); + store(out, 28 << %SHIFT%, plus(v1553, v1573)); + real2 v1606 = minus(v1553, v1573); + store(out, 92 << %SHIFT%, ctimesminusplus(v1606, tbl[0 + tbloffset], ctimes(reverse(v1606), tbl[1 + tbloffset]))); + real2 v833 = minusplus(v831, v832); + real2 v835 = minusplus(uminus(v831), v832); + real2 v955 = minusplus(uminus(v951), v952); + real2 v953 = minusplus(v951, v952); + real2 v963 = ctimesminusplus(reverse(v953), tbl[166 + tbloffset], ctimes(v953, tbl[167 + tbloffset])); + real2 v995 = minusplus(uminus(v991), v992); + real2 v993 = minusplus(v991, v992); + real2 v1003 = ctimesminusplus(reverse(v993), tbl[174 + tbloffset], ctimes(v993, tbl[175 + tbloffset])); + real2 v843 = ctimesminusplus(reverse(v833), tbl[142 + tbloffset], ctimes(v833, tbl[143 + tbloffset])); + real2 v1640 = minus(v963, v803); + real2 v1644 = plus(v803, v963); + real2 v1680 = minus(v1003, v843); + real2 v1684 = plus(v843, v1003); + real2 v1641 = minusplus(v1639, v1640); + real2 v1643 = minusplus(uminus(v1639), v1640); + real2 v1657 = ctimesminusplus(reverse(v1643), tbl[292 + tbloffset], ctimes(v1643, tbl[293 + tbloffset])); + real2 v913 = minusplus(v911, v912); + real2 v915 = minusplus(uminus(v911), v912); + real2 v1073 = minusplus(v1071, v1072); + real2 v1075 = minusplus(uminus(v1071), v1072); + real2 v923 = ctimesminusplus(reverse(v913), tbl[158 + tbloffset], ctimes(v913, tbl[159 + tbloffset])); + real2 v1083 = ctimesminusplus(reverse(v1073), tbl[190 + tbloffset], ctimes(v1073, tbl[191 + tbloffset])); + real2 v1685 = plus(v923, v1083); + real2 v1679 = reverse(minus(v923, v1083)); + real2 v1681 = minusplus(v1679, v1680); + real2 v1683 = minusplus(uminus(v1679), v1680); + real2 v1697 = ctimesminusplus(reverse(v1683), tbl[300 + tbloffset], ctimes(v1683, tbl[301 + tbloffset])); + real2 v1809 = plus(v1657, v1697); + real2 v1803 = 
reverse(minus(v1657, v1697)); + store(out, 26 << %SHIFT%, plus(v1808, v1809)); + real2 v1822 = minus(v1808, v1809); + store(out, 90 << %SHIFT%, ctimesminusplus(v1822, tbl[0 + tbloffset], ctimes(reverse(v1822), tbl[1 + tbloffset]))); + real2 v1807 = minusplus(uminus(v1803), v1804); + real2 v1805 = minusplus(v1803, v1804); + store(out, 58 << %SHIFT%, ctimesminusplus(reverse(v1805), tbl[318 + tbloffset], ctimes(v1805, tbl[319 + tbloffset]))); + store(out, 122 << %SHIFT%, ctimesminusplus(reverse(v1807), tbl[320 + tbloffset], ctimes(v1807, tbl[321 + tbloffset]))); + real2 v1651 = ctimesminusplus(reverse(v1641), tbl[290 + tbloffset], ctimes(v1641, tbl[291 + tbloffset])); + real2 v1691 = ctimesminusplus(reverse(v1681), tbl[298 + tbloffset], ctimes(v1681, tbl[299 + tbloffset])); + real2 v1783 = plus(v1651, v1691); + real2 v1777 = reverse(minus(v1651, v1691)); + real2 v1779 = minusplus(v1777, v1778); + real2 v1781 = minusplus(uminus(v1777), v1778); + store(out, 106 << %SHIFT%, ctimesminusplus(reverse(v1781), tbl[316 + tbloffset], ctimes(v1781, tbl[317 + tbloffset]))); + store(out, 42 << %SHIFT%, ctimesminusplus(reverse(v1779), tbl[314 + tbloffset], ctimes(v1779, tbl[315 + tbloffset]))); + store(out, 10 << %SHIFT%, plus(v1782, v1783)); + real2 v1796 = minus(v1782, v1783); + store(out, 74 << %SHIFT%, ctimesminusplus(v1796, tbl[0 + tbloffset], ctimes(reverse(v1796), tbl[1 + tbloffset]))); + real2 v1720 = minus(v1645, v1644); + real2 v1724 = plus(v1644, v1645); + real2 v1719 = reverse(minus(v1684, v1685)); + real2 v1725 = plus(v1684, v1685); + real2 v1745 = plus(v1724, v1725); + real2 v1739 = reverse(minus(v1724, v1725)); + store(out, 2 << %SHIFT%, plus(v1744, v1745)); + real2 v1758 = minus(v1744, v1745); + store(out, 66 << %SHIFT%, ctimesminusplus(v1758, tbl[0 + tbloffset], ctimes(reverse(v1758), tbl[1 + tbloffset]))); + real2 v1741 = minusplus(v1739, v1740); + real2 v1743 = minusplus(uminus(v1739), v1740); + store(out, 98 << %SHIFT%, ctimesminusplus(reverse(v1743), tbl[312 + 
tbloffset], ctimes(v1743, tbl[313 + tbloffset]))); + store(out, 34 << %SHIFT%, ctimesminusplus(reverse(v1741), tbl[310 + tbloffset], ctimes(v1741, tbl[311 + tbloffset]))); + real2 v1723 = minusplus(uminus(v1719), v1720); + real2 v1721 = minusplus(v1719, v1720); + real2 v1737 = ctimesminusplus(reverse(v1723), tbl[308 + tbloffset], ctimes(v1723, tbl[309 + tbloffset])); + store(out, 50 << %SHIFT%, plus(v1717, v1737)); + real2 v1770 = minus(v1717, v1737); + store(out, 114 << %SHIFT%, ctimesminusplus(v1770, tbl[0 + tbloffset], ctimes(reverse(v1770), tbl[1 + tbloffset]))); + real2 v1731 = ctimesminusplus(reverse(v1721), tbl[306 + tbloffset], ctimes(v1721, tbl[307 + tbloffset])); + store(out, 18 << %SHIFT%, plus(v1711, v1731)); + real2 v1764 = minus(v1711, v1731); + store(out, 82 << %SHIFT%, ctimesminusplus(v1764, tbl[0 + tbloffset], ctimes(reverse(v1764), tbl[1 + tbloffset]))); + real2 v809 = ctimesminusplus(reverse(v795), tbl[136 + tbloffset], ctimes(v795, tbl[137 + tbloffset])); + real2 v969 = ctimesminusplus(reverse(v955), tbl[168 + tbloffset], ctimes(v955, tbl[169 + tbloffset])); + real2 v1850 = minus(v969, v809); + real2 v1854 = plus(v809, v969); + real2 v849 = ctimesminusplus(reverse(v835), tbl[144 + tbloffset], ctimes(v835, tbl[145 + tbloffset])); + real2 v929 = ctimesminusplus(reverse(v915), tbl[160 + tbloffset], ctimes(v915, tbl[161 + tbloffset])); + real2 v889 = ctimesminusplus(reverse(v875), tbl[152 + tbloffset], ctimes(v875, tbl[153 + tbloffset])); + real2 v1089 = ctimesminusplus(reverse(v1075), tbl[192 + tbloffset], ctimes(v1075, tbl[193 + tbloffset])); + real2 v1009 = ctimesminusplus(reverse(v995), tbl[176 + tbloffset], ctimes(v995, tbl[177 + tbloffset])); + real2 v1890 = minus(v1009, v849); + real2 v1894 = plus(v849, v1009); + real2 v1849 = reverse(minus(v889, v1049)); + real2 v1855 = plus(v889, v1049); + real2 v1930 = minus(v1855, v1854); + real2 v1934 = plus(v1854, v1855); + real2 v1895 = plus(v929, v1089); + real2 v1889 = reverse(minus(v929, v1089)); + 
real2 v1929 = reverse(minus(v1894, v1895)); + real2 v1935 = plus(v1894, v1895); + real2 v1955 = plus(v1934, v1935); + real2 v1949 = reverse(minus(v1934, v1935)); + store(out, 6 << %SHIFT%, plus(v1954, v1955)); + real2 v1968 = minus(v1954, v1955); + store(out, 70 << %SHIFT%, ctimesminusplus(v1968, tbl[0 + tbloffset], ctimes(reverse(v1968), tbl[1 + tbloffset]))); + real2 v1951 = minusplus(v1949, v1950); + store(out, 38 << %SHIFT%, ctimesminusplus(reverse(v1951), tbl[346 + tbloffset], ctimes(v1951, tbl[347 + tbloffset]))); + real2 v1953 = minusplus(uminus(v1949), v1950); + store(out, 102 << %SHIFT%, ctimesminusplus(reverse(v1953), tbl[348 + tbloffset], ctimes(v1953, tbl[349 + tbloffset]))); + real2 v1931 = minusplus(v1929, v1930); + real2 v1933 = minusplus(uminus(v1929), v1930); + real2 v1947 = ctimesminusplus(reverse(v1933), tbl[344 + tbloffset], ctimes(v1933, tbl[345 + tbloffset])); + store(out, 54 << %SHIFT%, plus(v1927, v1947)); + real2 v1980 = minus(v1927, v1947); + store(out, 118 << %SHIFT%, ctimesminusplus(v1980, tbl[0 + tbloffset], ctimes(reverse(v1980), tbl[1 + tbloffset]))); + real2 v1941 = ctimesminusplus(reverse(v1931), tbl[342 + tbloffset], ctimes(v1931, tbl[343 + tbloffset])); + store(out, 22 << %SHIFT%, plus(v1921, v1941)); + real2 v1974 = minus(v1921, v1941); + store(out, 86 << %SHIFT%, ctimesminusplus(v1974, tbl[0 + tbloffset], ctimes(reverse(v1974), tbl[1 + tbloffset]))); + real2 v1851 = minusplus(v1849, v1850); + real2 v1853 = minusplus(uminus(v1849), v1850); + real2 v1867 = ctimesminusplus(reverse(v1853), tbl[328 + tbloffset], ctimes(v1853, tbl[329 + tbloffset])); + real2 v1891 = minusplus(v1889, v1890); + real2 v1893 = minusplus(uminus(v1889), v1890); + real2 v1907 = ctimesminusplus(reverse(v1893), tbl[336 + tbloffset], ctimes(v1893, tbl[337 + tbloffset])); + real2 v2019 = plus(v1867, v1907); + real2 v2013 = reverse(minus(v1867, v1907)); + store(out, 30 << %SHIFT%, plus(v2018, v2019)); + real2 v2032 = minus(v2018, v2019); + store(out, 94 << 
%SHIFT%, ctimesminusplus(v2032, tbl[0 + tbloffset], ctimes(reverse(v2032), tbl[1 + tbloffset]))); + real2 v2017 = minusplus(uminus(v2013), v2014); + store(out, 126 << %SHIFT%, ctimesminusplus(reverse(v2017), tbl[356 + tbloffset], ctimes(v2017, tbl[357 + tbloffset]))); + real2 v2015 = minusplus(v2013, v2014); + store(out, 62 << %SHIFT%, ctimesminusplus(reverse(v2015), tbl[354 + tbloffset], ctimes(v2015, tbl[355 + tbloffset]))); + real2 v1861 = ctimesminusplus(reverse(v1851), tbl[326 + tbloffset], ctimes(v1851, tbl[327 + tbloffset])); + real2 v1901 = ctimesminusplus(reverse(v1891), tbl[334 + tbloffset], ctimes(v1891, tbl[335 + tbloffset])); + real2 v1993 = plus(v1861, v1901); + real2 v1987 = reverse(minus(v1861, v1901)); + store(out, 14 << %SHIFT%, plus(v1992, v1993)); + real2 v2006 = minus(v1992, v1993); + store(out, 78 << %SHIFT%, ctimesminusplus(v2006, tbl[0 + tbloffset], ctimes(reverse(v2006), tbl[1 + tbloffset]))); + real2 v1991 = minusplus(uminus(v1987), v1988); + store(out, 110 << %SHIFT%, ctimesminusplus(reverse(v1991), tbl[352 + tbloffset], ctimes(v1991, tbl[353 + tbloffset]))); + real2 v1989 = minusplus(v1987, v1988); + store(out, 46 << %SHIFT%, ctimesminusplus(reverse(v1989), tbl[350 + tbloffset], ctimes(v1989, tbl[351 + tbloffset]))); + real2 v593 = minusplus(v591, v592); + real2 v595 = minusplus(uminus(v591), v592); + real2 v473 = minusplus(v471, v472); + real2 v475 = minusplus(uminus(v471), v472); + real2 v555 = minusplus(uminus(v551), v552); + real2 v553 = minusplus(v551, v552); + real2 v609 = ctimesminusplus(reverse(v595), tbl[96 + tbloffset], ctimes(v595, tbl[97 + tbloffset])); + real2 v195 = minusplus(uminus(v191), v192); + real2 v193 = minusplus(v191, v192); + real2 v275 = minusplus(uminus(v271), v272); + real2 v273 = minusplus(v271, v272); + real2 v673 = minusplus(v671, v672); + real2 v675 = minusplus(uminus(v671), v672); + real2 v689 = ctimesminusplus(reverse(v675), tbl[112 + tbloffset], ctimes(v675, tbl[113 + tbloffset])); + real2 v209 = 
ctimesminusplus(reverse(v195), tbl[16 + tbloffset], ctimes(v195, tbl[17 + tbloffset])); + real2 v289 = ctimesminusplus(reverse(v275), tbl[32 + tbloffset], ctimes(v275, tbl[33 + tbloffset])); + real2 v755 = minusplus(uminus(v751), v752); + real2 v753 = minusplus(v751, v752); + real2 v435 = minusplus(uminus(v431), v432); + real2 v433 = minusplus(v431, v432); + real2 v513 = minusplus(v511, v512); + real2 v515 = minusplus(uminus(v511), v512); + real2 v529 = ctimesminusplus(reverse(v515), tbl[80 + tbloffset], ctimes(v515, tbl[81 + tbloffset])); + real2 v353 = minusplus(v351, v352); + real2 v355 = minusplus(uminus(v351), v352); + real2 v369 = ctimesminusplus(reverse(v355), tbl[48 + tbloffset], ctimes(v355, tbl[49 + tbloffset])); + real2 v2631 = plus(v369, v689); + real2 v2625 = reverse(minus(v369, v689)); + real2 v449 = ctimesminusplus(reverse(v435), tbl[64 + tbloffset], ctimes(v435, tbl[65 + tbloffset])); + real2 v2710 = plus(v289, v609); + real2 v2706 = minus(v609, v289); + real2 v2630 = plus(v209, v529); + real2 v2626 = minus(v529, v209); + real2 v2790 = plus(v2630, v2631); + real2 v2786 = minus(v2631, v2630); + real2 v713 = minusplus(v711, v712); + real2 v715 = minusplus(uminus(v711), v712); + real2 v769 = ctimesminusplus(reverse(v755), tbl[128 + tbloffset], ctimes(v755, tbl[129 + tbloffset])); + real2 v2705 = reverse(minus(v449, v769)); + real2 v2711 = plus(v449, v769); + real2 v313 = minusplus(v311, v312); + real2 v315 = minusplus(uminus(v311), v312); + real2 v393 = minusplus(v391, v392); + real2 v395 = minusplus(uminus(v391), v392); + real2 v409 = ctimesminusplus(reverse(v395), tbl[56 + tbloffset], ctimes(v395, tbl[57 + tbloffset])); + real2 v729 = ctimesminusplus(reverse(v715), tbl[120 + tbloffset], ctimes(v715, tbl[121 + tbloffset])); + real2 v329 = ctimesminusplus(reverse(v315), tbl[40 + tbloffset], ctimes(v315, tbl[41 + tbloffset])); + real2 v489 = ctimesminusplus(reverse(v475), tbl[72 + tbloffset], ctimes(v475, tbl[73 + tbloffset])); + real2 v153 = 
minusplus(v151, v152); + real2 v155 = minusplus(uminus(v151), v152); + real2 v169 = ctimesminusplus(reverse(v155), tbl[8 + tbloffset], ctimes(v155, tbl[9 + tbloffset])); + real2 v2586 = minus(v489, v169); + real2 v2590 = plus(v169, v489); + real2 v233 = minusplus(v231, v232); + real2 v235 = minusplus(uminus(v231), v232); + real2 v633 = minusplus(v631, v632); + real2 v635 = minusplus(uminus(v631), v632); + real2 v649 = ctimesminusplus(reverse(v635), tbl[104 + tbloffset], ctimes(v635, tbl[105 + tbloffset])); + real2 v249 = ctimesminusplus(reverse(v235), tbl[24 + tbloffset], ctimes(v235, tbl[25 + tbloffset])); + real2 v569 = ctimesminusplus(reverse(v555), tbl[88 + tbloffset], ctimes(v555, tbl[89 + tbloffset])); + real2 v2670 = plus(v249, v569); + real2 v2666 = minus(v569, v249); + real2 v2785 = reverse(minus(v2710, v2711)); + real2 v2791 = plus(v2710, v2711); + real2 v2825 = reverse(minus(v2790, v2791)); + real2 v2831 = plus(v2790, v2791); + real2 v2671 = plus(v409, v729); + real2 v2665 = reverse(minus(v409, v729)); + real2 v2745 = reverse(minus(v2670, v2671)); + real2 v2751 = plus(v2670, v2671); + real2 v2806 = minus(v2731, v2730); + real2 v2810 = plus(v2730, v2731); + real2 v2846 = minus(v2811, v2810); + real2 v2850 = plus(v2810, v2811); + real2 v2591 = plus(v329, v649); + real2 v2585 = reverse(minus(v329, v649)); + real2 v2750 = plus(v2590, v2591); + real2 v2746 = minus(v2591, v2590); + real2 v2830 = plus(v2750, v2751); + real2 v2826 = minus(v2751, v2750); + real2 v2845 = reverse(minus(v2830, v2831)); + real2 v2851 = plus(v2830, v2831); + store(out, 3 << %SHIFT%, plus(v2850, v2851)); + real2 v2864 = minus(v2850, v2851); + store(out, 67 << %SHIFT%, ctimesminusplus(v2864, tbl[0 + tbloffset], ctimes(reverse(v2864), tbl[1 + tbloffset]))); + real2 v2849 = minusplus(uminus(v2845), v2846); + real2 v2847 = minusplus(v2845, v2846); + store(out, 35 << %SHIFT%, ctimesminusplus(reverse(v2847), tbl[506 + tbloffset], ctimes(v2847, tbl[507 + tbloffset]))); + store(out, 99 << 
%SHIFT%, ctimesminusplus(reverse(v2849), tbl[508 + tbloffset], ctimes(v2849, tbl[509 + tbloffset]))); + real2 v2827 = minusplus(v2825, v2826); + real2 v2829 = minusplus(uminus(v2825), v2826); + real2 v2837 = ctimesminusplus(reverse(v2827), tbl[502 + tbloffset], ctimes(v2827, tbl[503 + tbloffset])); + real2 v2809 = minusplus(uminus(v2805), v2806); + real2 v2807 = minusplus(v2805, v2806); + real2 v2817 = ctimesminusplus(reverse(v2807), tbl[498 + tbloffset], ctimes(v2807, tbl[499 + tbloffset])); + store(out, 19 << %SHIFT%, plus(v2817, v2837)); + real2 v2870 = minus(v2817, v2837); + store(out, 83 << %SHIFT%, ctimesminusplus(v2870, tbl[0 + tbloffset], ctimes(reverse(v2870), tbl[1 + tbloffset]))); + real2 v2823 = ctimesminusplus(reverse(v2809), tbl[500 + tbloffset], ctimes(v2809, tbl[501 + tbloffset])); + real2 v2843 = ctimesminusplus(reverse(v2829), tbl[504 + tbloffset], ctimes(v2829, tbl[505 + tbloffset])); + store(out, 51 << %SHIFT%, plus(v2823, v2843)); + real2 v2876 = minus(v2823, v2843); + store(out, 115 << %SHIFT%, ctimesminusplus(v2876, tbl[0 + tbloffset], ctimes(reverse(v2876), tbl[1 + tbloffset]))); + real2 v2787 = minusplus(v2785, v2786); + real2 v2789 = minusplus(uminus(v2785), v2786); + real2 v2803 = ctimesminusplus(reverse(v2789), tbl[496 + tbloffset], ctimes(v2789, tbl[497 + tbloffset])); + real2 v2727 = minusplus(v2725, v2726); + real2 v2729 = minusplus(uminus(v2725), v2726); + real2 v2743 = ctimesminusplus(reverse(v2729), tbl[484 + tbloffset], ctimes(v2729, tbl[485 + tbloffset])); + real2 v2914 = plus(v2743, v2783); + real2 v2910 = minus(v2783, v2743); + real2 v2749 = minusplus(uminus(v2745), v2746); + real2 v2747 = minusplus(v2745, v2746); + real2 v2763 = ctimesminusplus(reverse(v2749), tbl[488 + tbloffset], ctimes(v2749, tbl[489 + tbloffset])); + real2 v2909 = reverse(minus(v2763, v2803)); + real2 v2915 = plus(v2763, v2803); + store(out, 27 << %SHIFT%, plus(v2914, v2915)); + real2 v2928 = minus(v2914, v2915); + store(out, 91 << %SHIFT%, 
ctimesminusplus(v2928, tbl[0 + tbloffset], ctimes(reverse(v2928), tbl[1 + tbloffset]))); + real2 v2913 = minusplus(uminus(v2909), v2910); + store(out, 123 << %SHIFT%, ctimesminusplus(reverse(v2913), tbl[516 + tbloffset], ctimes(v2913, tbl[517 + tbloffset]))); + real2 v2911 = minusplus(v2909, v2910); + store(out, 59 << %SHIFT%, ctimesminusplus(reverse(v2911), tbl[514 + tbloffset], ctimes(v2911, tbl[515 + tbloffset]))); + real2 v2737 = ctimesminusplus(reverse(v2727), tbl[482 + tbloffset], ctimes(v2727, tbl[483 + tbloffset])); + real2 v2888 = plus(v2737, v2777); + real2 v2884 = minus(v2777, v2737); + real2 v2797 = ctimesminusplus(reverse(v2787), tbl[494 + tbloffset], ctimes(v2787, tbl[495 + tbloffset])); + real2 v2757 = ctimesminusplus(reverse(v2747), tbl[486 + tbloffset], ctimes(v2747, tbl[487 + tbloffset])); + real2 v2889 = plus(v2757, v2797); + real2 v2883 = reverse(minus(v2757, v2797)); + store(out, 11 << %SHIFT%, plus(v2888, v2889)); + real2 v2902 = minus(v2888, v2889); + store(out, 75 << %SHIFT%, ctimesminusplus(v2902, tbl[0 + tbloffset], ctimes(reverse(v2902), tbl[1 + tbloffset]))); + real2 v2887 = minusplus(uminus(v2883), v2884); + store(out, 107 << %SHIFT%, ctimesminusplus(reverse(v2887), tbl[512 + tbloffset], ctimes(v2887, tbl[513 + tbloffset]))); + real2 v2885 = minusplus(v2883, v2884); + store(out, 43 << %SHIFT%, ctimesminusplus(reverse(v2885), tbl[510 + tbloffset], ctimes(v2885, tbl[511 + tbloffset]))); + real2 v2669 = minusplus(uminus(v2665), v2666); + real2 v2667 = minusplus(v2665, v2666); + real2 v2707 = minusplus(v2705, v2706); + real2 v2709 = minusplus(uminus(v2705), v2706); + real2 v2717 = ctimesminusplus(reverse(v2707), tbl[478 + tbloffset], ctimes(v2707, tbl[479 + tbloffset])); + real2 v2627 = minusplus(v2625, v2626); + real2 v2629 = minusplus(uminus(v2625), v2626); + real2 v2637 = ctimesminusplus(reverse(v2627), tbl[462 + tbloffset], ctimes(v2627, tbl[463 + tbloffset])); + real2 v2961 = plus(v2637, v2717); + real2 v2955 = reverse(minus(v2637, 
v2717)); + real2 v2649 = minusplus(uminus(v2645), v2646); + real2 v2647 = minusplus(v2645, v2646); + real2 v2569 = minusplus(uminus(v2565), v2566); + real2 v2567 = minusplus(v2565, v2566); + real2 v2577 = ctimesminusplus(reverse(v2567), tbl[450 + tbloffset], ctimes(v2567, tbl[451 + tbloffset])); + real2 v2657 = ctimesminusplus(reverse(v2647), tbl[466 + tbloffset], ctimes(v2647, tbl[467 + tbloffset])); + real2 v2936 = minus(v2657, v2577); + real2 v2940 = plus(v2577, v2657); + real2 v2976 = minus(v2941, v2940); + real2 v2980 = plus(v2940, v2941); + real2 v2677 = ctimesminusplus(reverse(v2667), tbl[470 + tbloffset], ctimes(v2667, tbl[471 + tbloffset])); + real2 v2587 = minusplus(v2585, v2586); + real2 v2589 = minusplus(uminus(v2585), v2586); + real2 v2597 = ctimesminusplus(reverse(v2587), tbl[454 + tbloffset], ctimes(v2587, tbl[455 + tbloffset])); + real2 v2956 = minus(v2677, v2597); + real2 v2960 = plus(v2597, v2677); + real2 v2975 = reverse(minus(v2960, v2961)); + real2 v2981 = plus(v2960, v2961); + store(out, 7 << %SHIFT%, plus(v2980, v2981)); + real2 v2994 = minus(v2980, v2981); + store(out, 71 << %SHIFT%, ctimesminusplus(v2994, tbl[0 + tbloffset], ctimes(reverse(v2994), tbl[1 + tbloffset]))); + real2 v2979 = minusplus(uminus(v2975), v2976); + store(out, 103 << %SHIFT%, ctimesminusplus(reverse(v2979), tbl[528 + tbloffset], ctimes(v2979, tbl[529 + tbloffset]))); + real2 v2977 = minusplus(v2975, v2976); + store(out, 39 << %SHIFT%, ctimesminusplus(reverse(v2977), tbl[526 + tbloffset], ctimes(v2977, tbl[527 + tbloffset]))); + real2 v2939 = minusplus(uminus(v2935), v2936); + real2 v2937 = minusplus(v2935, v2936); + real2 v2953 = ctimesminusplus(reverse(v2939), tbl[520 + tbloffset], ctimes(v2939, tbl[521 + tbloffset])); + real2 v2957 = minusplus(v2955, v2956); + real2 v2959 = minusplus(uminus(v2955), v2956); + real2 v2973 = ctimesminusplus(reverse(v2959), tbl[524 + tbloffset], ctimes(v2959, tbl[525 + tbloffset])); + store(out, 55 << %SHIFT%, plus(v2953, v2973)); + real2 
v3006 = minus(v2953, v2973); + store(out, 119 << %SHIFT%, ctimesminusplus(v3006, tbl[0 + tbloffset], ctimes(reverse(v3006), tbl[1 + tbloffset]))); + real2 v2947 = ctimesminusplus(reverse(v2937), tbl[518 + tbloffset], ctimes(v2937, tbl[519 + tbloffset])); + real2 v2967 = ctimesminusplus(reverse(v2957), tbl[522 + tbloffset], ctimes(v2957, tbl[523 + tbloffset])); + store(out, 23 << %SHIFT%, plus(v2947, v2967)); + real2 v3000 = minus(v2947, v2967); + store(out, 87 << %SHIFT%, ctimesminusplus(v3000, tbl[0 + tbloffset], ctimes(reverse(v3000), tbl[1 + tbloffset]))); + real2 v2663 = ctimesminusplus(reverse(v2649), tbl[468 + tbloffset], ctimes(v2649, tbl[469 + tbloffset])); + real2 v2583 = ctimesminusplus(reverse(v2569), tbl[452 + tbloffset], ctimes(v2569, tbl[453 + tbloffset])); + real2 v3014 = minus(v2663, v2583); + real2 v3018 = plus(v2583, v2663); + real2 v3015 = minusplus(v3013, v3014); + real2 v3017 = minusplus(uminus(v3013), v3014); + real2 v2643 = ctimesminusplus(reverse(v2629), tbl[464 + tbloffset], ctimes(v2629, tbl[465 + tbloffset])); + real2 v2723 = ctimesminusplus(reverse(v2709), tbl[480 + tbloffset], ctimes(v2709, tbl[481 + tbloffset])); + real2 v3039 = plus(v2643, v2723); + real2 v3033 = reverse(minus(v2643, v2723)); + real2 v2683 = ctimesminusplus(reverse(v2669), tbl[472 + tbloffset], ctimes(v2669, tbl[473 + tbloffset])); + real2 v3031 = ctimesminusplus(reverse(v3017), tbl[532 + tbloffset], ctimes(v3017, tbl[533 + tbloffset])); + real2 v2603 = ctimesminusplus(reverse(v2589), tbl[456 + tbloffset], ctimes(v2589, tbl[457 + tbloffset])); + real2 v3034 = minus(v2683, v2603); + real2 v3038 = plus(v2603, v2683); + real2 v3037 = minusplus(uminus(v3033), v3034); + real2 v3035 = minusplus(v3033, v3034); + real2 v3051 = ctimesminusplus(reverse(v3037), tbl[536 + tbloffset], ctimes(v3037, tbl[537 + tbloffset])); + store(out, 63 << %SHIFT%, plus(v3031, v3051)); + real2 v3084 = minus(v3031, v3051); + store(out, 127 << %SHIFT%, ctimesminusplus(v3084, tbl[0 + tbloffset], 
ctimes(reverse(v3084), tbl[1 + tbloffset]))); + real2 v3025 = ctimesminusplus(reverse(v3015), tbl[530 + tbloffset], ctimes(v3015, tbl[531 + tbloffset])); + real2 v3045 = ctimesminusplus(reverse(v3035), tbl[534 + tbloffset], ctimes(v3035, tbl[535 + tbloffset])); + store(out, 31 << %SHIFT%, plus(v3025, v3045)); + real2 v3078 = minus(v3025, v3045); + store(out, 95 << %SHIFT%, ctimesminusplus(v3078, tbl[0 + tbloffset], ctimes(reverse(v3078), tbl[1 + tbloffset]))); + real2 v3058 = plus(v3018, v3019); + real2 v3054 = minus(v3019, v3018); + real2 v3053 = reverse(minus(v3038, v3039)); + real2 v3059 = plus(v3038, v3039); + real2 v3055 = minusplus(v3053, v3054); + store(out, 47 << %SHIFT%, ctimesminusplus(reverse(v3055), tbl[538 + tbloffset], ctimes(v3055, tbl[539 + tbloffset]))); + real2 v3057 = minusplus(uminus(v3053), v3054); + store(out, 111 << %SHIFT%, ctimesminusplus(reverse(v3057), tbl[540 + tbloffset], ctimes(v3057, tbl[541 + tbloffset]))); + store(out, 15 << %SHIFT%, plus(v3058, v3059)); + real2 v3072 = minus(v3058, v3059); + store(out, 79 << %SHIFT%, ctimesminusplus(v3072, tbl[0 + tbloffset], ctimes(reverse(v3072), tbl[1 + tbloffset]))); + real2 v683 = ctimesminusplus(reverse(v673), tbl[110 + tbloffset], ctimes(v673, tbl[111 + tbloffset])); + real2 v363 = ctimesminusplus(reverse(v353), tbl[46 + tbloffset], ctimes(v353, tbl[47 + tbloffset])); + real2 v2105 = plus(v363, v683); + real2 v2099 = reverse(minus(v363, v683)); + real2 v283 = ctimesminusplus(reverse(v273), tbl[30 + tbloffset], ctimes(v273, tbl[31 + tbloffset])); + real2 v723 = ctimesminusplus(reverse(v713), tbl[118 + tbloffset], ctimes(v713, tbl[119 + tbloffset])); + real2 v403 = ctimesminusplus(reverse(v393), tbl[54 + tbloffset], ctimes(v393, tbl[55 + tbloffset])); + real2 v603 = ctimesminusplus(reverse(v593), tbl[94 + tbloffset], ctimes(v593, tbl[95 + tbloffset])); + real2 v2180 = minus(v603, v283); + real2 v2184 = plus(v283, v603); + real2 v2145 = plus(v403, v723); + real2 v2139 = reverse(minus(v403, 
v723)); + real2 v543 = ctimesminusplus(reverse(v533), tbl[82 + tbloffset], ctimes(v533, tbl[83 + tbloffset])); + real2 v383 = ctimesminusplus(reverse(v373), tbl[50 + tbloffset], ctimes(v373, tbl[51 + tbloffset])); + real2 v703 = ctimesminusplus(reverse(v693), tbl[114 + tbloffset], ctimes(v693, tbl[115 + tbloffset])); + real2 v2125 = plus(v383, v703); + real2 v2119 = reverse(minus(v383, v703)); + real2 v223 = ctimesminusplus(reverse(v213), tbl[18 + tbloffset], ctimes(v213, tbl[19 + tbloffset])); + real2 v2120 = minus(v543, v223); + real2 v2124 = plus(v223, v543); + real2 v443 = ctimesminusplus(reverse(v433), tbl[62 + tbloffset], ctimes(v433, tbl[63 + tbloffset])); + real2 v203 = ctimesminusplus(reverse(v193), tbl[14 + tbloffset], ctimes(v193, tbl[15 + tbloffset])); + real2 v763 = ctimesminusplus(reverse(v753), tbl[126 + tbloffset], ctimes(v753, tbl[127 + tbloffset])); + real2 v2179 = reverse(minus(v443, v763)); + real2 v2185 = plus(v443, v763); + real2 v523 = ctimesminusplus(reverse(v513), tbl[78 + tbloffset], ctimes(v513, tbl[79 + tbloffset])); + real2 v2100 = minus(v523, v203); + real2 v2104 = plus(v203, v523); + real2 v2264 = plus(v2104, v2105); + real2 v2260 = minus(v2105, v2104); + real2 v643 = ctimesminusplus(reverse(v633), tbl[102 + tbloffset], ctimes(v633, tbl[103 + tbloffset])); + real2 v2265 = plus(v2184, v2185); + real2 v2259 = reverse(minus(v2184, v2185)); + real2 v563 = ctimesminusplus(reverse(v553), tbl[86 + tbloffset], ctimes(v553, tbl[87 + tbloffset])); + real2 v243 = ctimesminusplus(reverse(v233), tbl[22 + tbloffset], ctimes(v233, tbl[23 + tbloffset])); + real2 v2144 = plus(v243, v563); + real2 v2140 = minus(v563, v243); + real2 v143 = ctimesminusplus(reverse(v133), tbl[2 + tbloffset], ctimes(v133, tbl[3 + tbloffset])); + real2 v183 = ctimesminusplus(reverse(v173), tbl[10 + tbloffset], ctimes(v173, tbl[11 + tbloffset])); + real2 v2084 = plus(v183, v503); + real2 v2080 = minus(v503, v183); + real2 v163 = ctimesminusplus(reverse(v153), tbl[6 + 
tbloffset], ctimes(v153, tbl[7 + tbloffset])); + real2 v303 = ctimesminusplus(reverse(v293), tbl[34 + tbloffset], ctimes(v293, tbl[35 + tbloffset])); + real2 v623 = ctimesminusplus(reverse(v613), tbl[98 + tbloffset], ctimes(v613, tbl[99 + tbloffset])); + real2 v2039 = reverse(minus(v303, v623)); + real2 v2045 = plus(v303, v623); + real2 v463 = ctimesminusplus(reverse(v453), tbl[66 + tbloffset], ctimes(v453, tbl[67 + tbloffset])); + real2 v2044 = plus(v143, v463); + real2 v2040 = minus(v463, v143); + real2 v2204 = plus(v2044, v2045); + real2 v2200 = minus(v2045, v2044); + real2 v323 = ctimesminusplus(reverse(v313), tbl[38 + tbloffset], ctimes(v313, tbl[39 + tbloffset])); + real2 v2205 = plus(v2124, v2125); + real2 v2199 = reverse(minus(v2124, v2125)); + real2 v2280 = minus(v2205, v2204); + real2 v2284 = plus(v2204, v2205); + real2 v2225 = plus(v2144, v2145); + real2 v2219 = reverse(minus(v2144, v2145)); + real2 v2305 = plus(v2264, v2265); + real2 v2299 = reverse(minus(v2264, v2265)); + real2 v2240 = minus(v2085, v2084); + real2 v2244 = plus(v2084, v2085); + real2 v2279 = reverse(minus(v2244, v2245)); + real2 v2285 = plus(v2244, v2245); + real2 v2281 = minusplus(v2279, v2280); + real2 v2283 = minusplus(uminus(v2279), v2280); + real2 v2291 = ctimesminusplus(reverse(v2281), tbl[406 + tbloffset], ctimes(v2281, tbl[407 + tbloffset])); + real2 v483 = ctimesminusplus(reverse(v473), tbl[70 + tbloffset], ctimes(v473, tbl[71 + tbloffset])); + real2 v2060 = minus(v483, v163); + real2 v2064 = plus(v163, v483); + real2 v2065 = plus(v323, v643); + real2 v2059 = reverse(minus(v323, v643)); + real2 v2220 = minus(v2065, v2064); + real2 v2224 = plus(v2064, v2065); + real2 v2304 = plus(v2224, v2225); + real2 v2300 = minus(v2225, v2224); + real2 v2301 = minusplus(v2299, v2300); + real2 v2303 = minusplus(uminus(v2299), v2300); + real2 v2311 = ctimesminusplus(reverse(v2301), tbl[410 + tbloffset], ctimes(v2301, tbl[411 + tbloffset])); + store(out, 17 << %SHIFT%, plus(v2291, v2311)); + 
real2 v2344 = minus(v2291, v2311); + store(out, 81 << %SHIFT%, ctimesminusplus(v2344, tbl[0 + tbloffset], ctimes(reverse(v2344), tbl[1 + tbloffset]))); + real2 v2297 = ctimesminusplus(reverse(v2283), tbl[408 + tbloffset], ctimes(v2283, tbl[409 + tbloffset])); + real2 v2317 = ctimesminusplus(reverse(v2303), tbl[412 + tbloffset], ctimes(v2303, tbl[413 + tbloffset])); + store(out, 49 << %SHIFT%, plus(v2297, v2317)); + real2 v2350 = minus(v2297, v2317); + store(out, 113 << %SHIFT%, ctimesminusplus(v2350, tbl[0 + tbloffset], ctimes(reverse(v2350), tbl[1 + tbloffset]))); + real2 v2320 = minus(v2285, v2284); + real2 v2324 = plus(v2284, v2285); + real2 v2325 = plus(v2304, v2305); + real2 v2319 = reverse(minus(v2304, v2305)); + store(out, 1 << %SHIFT%, plus(v2324, v2325)); + real2 v2338 = minus(v2324, v2325); + store(out, 65 << %SHIFT%, ctimesminusplus(v2338, tbl[0 + tbloffset], ctimes(reverse(v2338), tbl[1 + tbloffset]))); + real2 v2321 = minusplus(v2319, v2320); + store(out, 33 << %SHIFT%, ctimesminusplus(reverse(v2321), tbl[414 + tbloffset], ctimes(v2321, tbl[415 + tbloffset]))); + real2 v2323 = minusplus(uminus(v2319), v2320); + store(out, 97 << %SHIFT%, ctimesminusplus(reverse(v2323), tbl[416 + tbloffset], ctimes(v2323, tbl[417 + tbloffset]))); + real2 v2201 = minusplus(v2199, v2200); + real2 v2203 = minusplus(uminus(v2199), v2200); + real2 v2263 = minusplus(uminus(v2259), v2260); + real2 v2261 = minusplus(v2259, v2260); + real2 v2243 = minusplus(uminus(v2239), v2240); + real2 v2241 = minusplus(v2239, v2240); + real2 v2257 = ctimesminusplus(reverse(v2243), tbl[400 + tbloffset], ctimes(v2243, tbl[401 + tbloffset])); + real2 v2217 = ctimesminusplus(reverse(v2203), tbl[392 + tbloffset], ctimes(v2203, tbl[393 + tbloffset])); + real2 v2388 = plus(v2217, v2257); + real2 v2384 = minus(v2257, v2217); + real2 v2277 = ctimesminusplus(reverse(v2263), tbl[404 + tbloffset], ctimes(v2263, tbl[405 + tbloffset])); + real2 v2221 = minusplus(v2219, v2220); + real2 v2223 = 
minusplus(uminus(v2219), v2220); + real2 v2237 = ctimesminusplus(reverse(v2223), tbl[396 + tbloffset], ctimes(v2223, tbl[397 + tbloffset])); + real2 v2389 = plus(v2237, v2277); + real2 v2383 = reverse(minus(v2237, v2277)); + store(out, 25 << %SHIFT%, plus(v2388, v2389)); + real2 v2402 = minus(v2388, v2389); + store(out, 89 << %SHIFT%, ctimesminusplus(v2402, tbl[0 + tbloffset], ctimes(reverse(v2402), tbl[1 + tbloffset]))); + real2 v2385 = minusplus(v2383, v2384); + real2 v2387 = minusplus(uminus(v2383), v2384); + store(out, 121 << %SHIFT%, ctimesminusplus(reverse(v2387), tbl[424 + tbloffset], ctimes(v2387, tbl[425 + tbloffset]))); + store(out, 57 << %SHIFT%, ctimesminusplus(reverse(v2385), tbl[422 + tbloffset], ctimes(v2385, tbl[423 + tbloffset]))); + real2 v2251 = ctimesminusplus(reverse(v2241), tbl[398 + tbloffset], ctimes(v2241, tbl[399 + tbloffset])); + real2 v2211 = ctimesminusplus(reverse(v2201), tbl[390 + tbloffset], ctimes(v2201, tbl[391 + tbloffset])); + real2 v2358 = minus(v2251, v2211); + real2 v2362 = plus(v2211, v2251); + real2 v2271 = ctimesminusplus(reverse(v2261), tbl[402 + tbloffset], ctimes(v2261, tbl[403 + tbloffset])); + real2 v2231 = ctimesminusplus(reverse(v2221), tbl[394 + tbloffset], ctimes(v2221, tbl[395 + tbloffset])); + real2 v2357 = reverse(minus(v2231, v2271)); + real2 v2363 = plus(v2231, v2271); + store(out, 9 << %SHIFT%, plus(v2362, v2363)); + real2 v2376 = minus(v2362, v2363); + store(out, 73 << %SHIFT%, ctimesminusplus(v2376, tbl[0 + tbloffset], ctimes(reverse(v2376), tbl[1 + tbloffset]))); + real2 v2361 = minusplus(uminus(v2357), v2358); + store(out, 105 << %SHIFT%, ctimesminusplus(reverse(v2361), tbl[420 + tbloffset], ctimes(v2361, tbl[421 + tbloffset]))); + real2 v2359 = minusplus(v2357, v2358); + store(out, 41 << %SHIFT%, ctimesminusplus(reverse(v2359), tbl[418 + tbloffset], ctimes(v2359, tbl[419 + tbloffset]))); + real2 v2121 = minusplus(v2119, v2120); + real2 v2123 = minusplus(uminus(v2119), v2120); + real2 v2083 = 
minusplus(uminus(v2079), v2080); + real2 v2081 = minusplus(v2079, v2080); + real2 v2091 = ctimesminusplus(reverse(v2081), tbl[366 + tbloffset], ctimes(v2081, tbl[367 + tbloffset])); + real2 v2043 = minusplus(uminus(v2039), v2040); + real2 v2041 = minusplus(v2039, v2040); + real2 v2051 = ctimesminusplus(reverse(v2041), tbl[358 + tbloffset], ctimes(v2041, tbl[359 + tbloffset])); + real2 v2131 = ctimesminusplus(reverse(v2121), tbl[374 + tbloffset], ctimes(v2121, tbl[375 + tbloffset])); + real2 v2163 = minusplus(uminus(v2159), v2160); + real2 v2161 = minusplus(v2159, v2160); + real2 v2171 = ctimesminusplus(reverse(v2161), tbl[382 + tbloffset], ctimes(v2161, tbl[383 + tbloffset])); + real2 v2409 = reverse(minus(v2091, v2171)); + real2 v2415 = plus(v2091, v2171); + real2 v2410 = minus(v2131, v2051); + real2 v2414 = plus(v2051, v2131); + real2 v2454 = plus(v2414, v2415); + real2 v2450 = minus(v2415, v2414); + real2 v2181 = minusplus(v2179, v2180); + real2 v2183 = minusplus(uminus(v2179), v2180); + real2 v2191 = ctimesminusplus(reverse(v2181), tbl[386 + tbloffset], ctimes(v2181, tbl[387 + tbloffset])); + real2 v2103 = minusplus(uminus(v2099), v2100); + real2 v2101 = minusplus(v2099, v2100); + real2 v2111 = ctimesminusplus(reverse(v2101), tbl[370 + tbloffset], ctimes(v2101, tbl[371 + tbloffset])); + real2 v2435 = plus(v2111, v2191); + real2 v2429 = reverse(minus(v2111, v2191)); + real2 v2141 = minusplus(v2139, v2140); + real2 v2143 = minusplus(uminus(v2139), v2140); + real2 v2151 = ctimesminusplus(reverse(v2141), tbl[378 + tbloffset], ctimes(v2141, tbl[379 + tbloffset])); + real2 v2063 = minusplus(uminus(v2059), v2060); + real2 v2061 = minusplus(v2059, v2060); + real2 v2071 = ctimesminusplus(reverse(v2061), tbl[362 + tbloffset], ctimes(v2061, tbl[363 + tbloffset])); + real2 v2434 = plus(v2071, v2151); + real2 v2430 = minus(v2151, v2071); + real2 v2455 = plus(v2434, v2435); + real2 v2449 = reverse(minus(v2434, v2435)); + store(out, 5 << %SHIFT%, plus(v2454, v2455)); + real2 
v2468 = minus(v2454, v2455); + store(out, 69 << %SHIFT%, ctimesminusplus(v2468, tbl[0 + tbloffset], ctimes(reverse(v2468), tbl[1 + tbloffset]))); + real2 v2451 = minusplus(v2449, v2450); + real2 v2453 = minusplus(uminus(v2449), v2450); + store(out, 101 << %SHIFT%, ctimesminusplus(reverse(v2453), tbl[436 + tbloffset], ctimes(v2453, tbl[437 + tbloffset]))); + store(out, 37 << %SHIFT%, ctimesminusplus(reverse(v2451), tbl[434 + tbloffset], ctimes(v2451, tbl[435 + tbloffset]))); + real2 v2411 = minusplus(v2409, v2410); + real2 v2413 = minusplus(uminus(v2409), v2410); + real2 v2433 = minusplus(uminus(v2429), v2430); + real2 v2431 = minusplus(v2429, v2430); + real2 v2421 = ctimesminusplus(reverse(v2411), tbl[426 + tbloffset], ctimes(v2411, tbl[427 + tbloffset])); + real2 v2441 = ctimesminusplus(reverse(v2431), tbl[430 + tbloffset], ctimes(v2431, tbl[431 + tbloffset])); + store(out, 21 << %SHIFT%, plus(v2421, v2441)); + real2 v2474 = minus(v2421, v2441); + store(out, 85 << %SHIFT%, ctimesminusplus(v2474, tbl[0 + tbloffset], ctimes(reverse(v2474), tbl[1 + tbloffset]))); + real2 v2427 = ctimesminusplus(reverse(v2413), tbl[428 + tbloffset], ctimes(v2413, tbl[429 + tbloffset])); + real2 v2447 = ctimesminusplus(reverse(v2433), tbl[432 + tbloffset], ctimes(v2433, tbl[433 + tbloffset])); + store(out, 53 << %SHIFT%, plus(v2427, v2447)); + real2 v2480 = minus(v2427, v2447); + store(out, 117 << %SHIFT%, ctimesminusplus(v2480, tbl[0 + tbloffset], ctimes(reverse(v2480), tbl[1 + tbloffset]))); + real2 v2057 = ctimesminusplus(reverse(v2043), tbl[360 + tbloffset], ctimes(v2043, tbl[361 + tbloffset])); + real2 v2097 = ctimesminusplus(reverse(v2083), tbl[368 + tbloffset], ctimes(v2083, tbl[369 + tbloffset])); + real2 v2157 = ctimesminusplus(reverse(v2143), tbl[380 + tbloffset], ctimes(v2143, tbl[381 + tbloffset])); + real2 v2197 = ctimesminusplus(reverse(v2183), tbl[388 + tbloffset], ctimes(v2183, tbl[389 + tbloffset])); + real2 v2117 = ctimesminusplus(reverse(v2103), tbl[372 + tbloffset], 
ctimes(v2103, tbl[373 + tbloffset])); + real2 v2507 = reverse(minus(v2117, v2197)); + real2 v2513 = plus(v2117, v2197); + real2 v2137 = ctimesminusplus(reverse(v2123), tbl[376 + tbloffset], ctimes(v2123, tbl[377 + tbloffset])); + real2 v2488 = minus(v2137, v2057); + real2 v2492 = plus(v2057, v2137); + real2 v2177 = ctimesminusplus(reverse(v2163), tbl[384 + tbloffset], ctimes(v2163, tbl[385 + tbloffset])); + real2 v2493 = plus(v2097, v2177); + real2 v2487 = reverse(minus(v2097, v2177)); + real2 v2532 = plus(v2492, v2493); + real2 v2528 = minus(v2493, v2492); + real2 v2077 = ctimesminusplus(reverse(v2063), tbl[364 + tbloffset], ctimes(v2063, tbl[365 + tbloffset])); + real2 v2512 = plus(v2077, v2157); + real2 v2508 = minus(v2157, v2077); + real2 v2527 = reverse(minus(v2512, v2513)); + real2 v2533 = plus(v2512, v2513); + real2 v2529 = minusplus(v2527, v2528); + real2 v2531 = minusplus(uminus(v2527), v2528); + store(out, 109 << %SHIFT%, ctimesminusplus(reverse(v2531), tbl[448 + tbloffset], ctimes(v2531, tbl[449 + tbloffset]))); + store(out, 45 << %SHIFT%, ctimesminusplus(reverse(v2529), tbl[446 + tbloffset], ctimes(v2529, tbl[447 + tbloffset]))); + store(out, 13 << %SHIFT%, plus(v2532, v2533)); + real2 v2546 = minus(v2532, v2533); + store(out, 77 << %SHIFT%, ctimesminusplus(v2546, tbl[0 + tbloffset], ctimes(reverse(v2546), tbl[1 + tbloffset]))); + real2 v2509 = minusplus(v2507, v2508); + real2 v2511 = minusplus(uminus(v2507), v2508); + real2 v2491 = minusplus(uminus(v2487), v2488); + real2 v2489 = minusplus(v2487, v2488); + real2 v2499 = ctimesminusplus(reverse(v2489), tbl[438 + tbloffset], ctimes(v2489, tbl[439 + tbloffset])); + real2 v2519 = ctimesminusplus(reverse(v2509), tbl[442 + tbloffset], ctimes(v2509, tbl[443 + tbloffset])); + store(out, 29 << %SHIFT%, plus(v2499, v2519)); + real2 v2552 = minus(v2499, v2519); + store(out, 93 << %SHIFT%, ctimesminusplus(v2552, tbl[0 + tbloffset], ctimes(reverse(v2552), tbl[1 + tbloffset]))); + real2 v2505 = 
ctimesminusplus(reverse(v2491), tbl[440 + tbloffset], ctimes(v2491, tbl[441 + tbloffset])); + real2 v2525 = ctimesminusplus(reverse(v2511), tbl[444 + tbloffset], ctimes(v2511, tbl[445 + tbloffset])); + store(out, 61 << %SHIFT%, plus(v2505, v2525)); + real2 v2558 = minus(v2505, v2525); + store(out, 125 << %SHIFT%, ctimesminusplus(v2558, tbl[0 + tbloffset], ctimes(reverse(v2558), tbl[1 + tbloffset]))); + // Pres : 76263 + } +} + +ALIGNED(8192) void but128b_%SHIFT%_%CONFIG%_%ISA%(real *RESTRICT out0, uint32_t *q, const real *RESTRICT in0, const int inShift, const real *RESTRICT tbl, const int K) { + const int k = 1 << (inShift - LOG2VECWIDTH); + int i=0; +#pragma omp parallel for + for(i=0;i < k;i++) { + int i0 = i << LOG2VECWIDTH; + real *out = out0 + q[i]; + const real *in = in0 + i0*2; + const int tbloffset = K * (i0 >> %SHIFT%); + + // Pres : 148586 + real2 v56 = load(in, 54 << inShift); + real2 v120 = load(in, 118 << inShift); + real2 v571 = reverse(minus(v120, v56)); + real2 v577 = plus(v56, v120); + real2 v24 = load(in, 22 << inShift); + real2 v88 = load(in, 86 << inShift); + real2 v576 = plus(v24, v88); + real2 v572 = minus(v88, v24); + real2 v573 = minusplus(v571, v572); + real2 v575 = minusplus(uminus(v571), v572); + real2 v589 = ctimesminusplus(reverse(v575), tbl[92 + tbloffset], ctimes(v575, tbl[93 + tbloffset])); + real2 v583 = ctimesminusplus(reverse(v573), tbl[90 + tbloffset], ctimes(v573, tbl[91 + tbloffset])); + real2 v897 = plus(v576, v577); + real2 v891 = reverse(minus(v577, v576)); + real2 v8 = load(in, 6 << inShift); + real2 v72 = load(in, 70 << inShift); + real2 v252 = minus(v72, v8); + real2 v256 = plus(v8, v72); + real2 v104 = load(in, 102 << inShift); + real2 v40 = load(in, 38 << inShift); + real2 v251 = reverse(minus(v104, v40)); + real2 v257 = plus(v40, v104); + real2 v255 = minusplus(uminus(v251), v252); + real2 v253 = minusplus(v251, v252); + real2 v263 = ctimesminusplus(reverse(v253), tbl[26 + tbloffset], ctimes(v253, tbl[27 + 
tbloffset])); + real2 v896 = plus(v256, v257); + real2 v892 = minus(v257, v256); + real2 v895 = minusplus(uminus(v891), v892); + real2 v893 = minusplus(v891, v892); + real2 v909 = ctimesminusplus(reverse(v895), tbl[156 + tbloffset], ctimes(v895, tbl[157 + tbloffset])); + real2 v903 = ctimesminusplus(reverse(v893), tbl[154 + tbloffset], ctimes(v893, tbl[155 + tbloffset])); + real2 v269 = ctimesminusplus(reverse(v255), tbl[28 + tbloffset], ctimes(v255, tbl[29 + tbloffset])); + real2 v1216 = plus(v896, v897); + real2 v1212 = minus(v897, v896); + real2 v2160 = minus(v583, v263); + real2 v2164 = plus(v263, v583); + real2 v2686 = minus(v589, v269); + real2 v2690 = plus(v269, v589); + real2 v96 = load(in, 94 << inShift); + real2 v32 = load(in, 30 << inShift); + real2 v736 = plus(v32, v96); + real2 v732 = minus(v96, v32); + real2 v64 = load(in, 62 << inShift); + real2 v128 = load(in, 126 << inShift); + real2 v737 = plus(v64, v128); + real2 v731 = reverse(minus(v128, v64)); + real2 v1057 = plus(v736, v737); + real2 v1051 = reverse(minus(v737, v736)); + real2 v733 = minusplus(v731, v732); + real2 v735 = minusplus(uminus(v731), v732); + real2 v749 = ctimesminusplus(reverse(v735), tbl[124 + tbloffset], ctimes(v735, tbl[125 + tbloffset])); + real2 v743 = ctimesminusplus(reverse(v733), tbl[122 + tbloffset], ctimes(v733, tbl[123 + tbloffset])); + real2 v16 = load(in, 14 << inShift); + real2 v80 = load(in, 78 << inShift); + real2 v412 = minus(v80, v16); + real2 v416 = plus(v16, v80); + real2 v112 = load(in, 110 << inShift); + real2 v48 = load(in, 46 << inShift); + real2 v417 = plus(v48, v112); + real2 v411 = reverse(minus(v112, v48)); + real2 v1056 = plus(v416, v417); + real2 v1052 = minus(v417, v416); + real2 v1055 = minusplus(uminus(v1051), v1052); + real2 v1053 = minusplus(v1051, v1052); + real2 v1063 = ctimesminusplus(reverse(v1053), tbl[186 + tbloffset], ctimes(v1053, tbl[187 + tbloffset])); + real2 v1665 = plus(v903, v1063); + real2 v1659 = reverse(minus(v1063, v903)); + 
real2 v1069 = ctimesminusplus(reverse(v1055), tbl[188 + tbloffset], ctimes(v1055, tbl[189 + tbloffset])); + real2 v1869 = reverse(minus(v1069, v909)); + real2 v1875 = plus(v909, v1069); + real2 v413 = minusplus(v411, v412); + real2 v415 = minusplus(uminus(v411), v412); + real2 v429 = ctimesminusplus(reverse(v415), tbl[60 + tbloffset], ctimes(v415, tbl[61 + tbloffset])); + real2 v1217 = plus(v1056, v1057); + real2 v1211 = reverse(minus(v1057, v1056)); + real2 v1297 = plus(v1216, v1217); + real2 v1291 = reverse(minus(v1217, v1216)); + real2 v2691 = plus(v429, v749); + real2 v2685 = reverse(minus(v749, v429)); + real2 v2765 = reverse(minus(v2691, v2690)); + real2 v2771 = plus(v2690, v2691); + real2 v2689 = minusplus(uminus(v2685), v2686); + real2 v2687 = minusplus(v2685, v2686); + real2 v2703 = ctimesminusplus(reverse(v2689), tbl[476 + tbloffset], ctimes(v2689, tbl[477 + tbloffset])); + real2 v2697 = ctimesminusplus(reverse(v2687), tbl[474 + tbloffset], ctimes(v2687, tbl[475 + tbloffset])); + real2 v1215 = minusplus(uminus(v1211), v1212); + real2 v1213 = minusplus(v1211, v1212); + real2 v1223 = ctimesminusplus(reverse(v1213), tbl[218 + tbloffset], ctimes(v1213, tbl[219 + tbloffset])); + real2 v1229 = ctimesminusplus(reverse(v1215), tbl[220 + tbloffset], ctimes(v1215, tbl[221 + tbloffset])); + real2 v423 = ctimesminusplus(reverse(v413), tbl[58 + tbloffset], ctimes(v413, tbl[59 + tbloffset])); + real2 v2165 = plus(v423, v743); + real2 v2159 = reverse(minus(v743, v423)); + real2 v2245 = plus(v2164, v2165); + real2 v2239 = reverse(minus(v2165, v2164)); + real2 v44 = load(in, 42 << inShift); + real2 v108 = load(in, 106 << inShift); + real2 v331 = reverse(minus(v108, v44)); + real2 v337 = plus(v44, v108); + real2 v76 = load(in, 74 << inShift); + real2 v12 = load(in, 10 << inShift); + real2 v336 = plus(v12, v76); + real2 v332 = minus(v76, v12); + real2 v976 = plus(v336, v337); + real2 v972 = minus(v337, v336); + real2 v335 = minusplus(uminus(v331), v332); + real2 v333 = 
minusplus(v331, v332); + real2 v343 = ctimesminusplus(reverse(v333), tbl[42 + tbloffset], ctimes(v333, tbl[43 + tbloffset])); + real2 v349 = ctimesminusplus(reverse(v335), tbl[44 + tbloffset], ctimes(v335, tbl[45 + tbloffset])); + real2 v124 = load(in, 122 << inShift); + real2 v60 = load(in, 58 << inShift); + real2 v651 = reverse(minus(v124, v60)); + real2 v657 = plus(v60, v124); + real2 v28 = load(in, 26 << inShift); + real2 v92 = load(in, 90 << inShift); + real2 v652 = minus(v92, v28); + real2 v656 = plus(v28, v92); + real2 v977 = plus(v656, v657); + real2 v971 = reverse(minus(v657, v656)); + real2 v973 = minusplus(v971, v972); + real2 v975 = minusplus(uminus(v971), v972); + real2 v983 = ctimesminusplus(reverse(v973), tbl[170 + tbloffset], ctimes(v973, tbl[171 + tbloffset])); + real2 v1131 = reverse(minus(v977, v976)); + real2 v1137 = plus(v976, v977); + real2 v655 = minusplus(uminus(v651), v652); + real2 v653 = minusplus(v651, v652); + real2 v669 = ctimesminusplus(reverse(v655), tbl[108 + tbloffset], ctimes(v655, tbl[109 + tbloffset])); + real2 v663 = ctimesminusplus(reverse(v653), tbl[106 + tbloffset], ctimes(v653, tbl[107 + tbloffset])); + real2 v2079 = reverse(minus(v663, v343)); + real2 v2085 = plus(v343, v663); + real2 v2605 = reverse(minus(v669, v349)); + real2 v2611 = plus(v349, v669); + real2 v989 = ctimesminusplus(reverse(v975), tbl[172 + tbloffset], ctimes(v975, tbl[173 + tbloffset])); + real2 v20 = load(in, 18 << inShift); + real2 v84 = load(in, 82 << inShift); + real2 v496 = plus(v20, v84); + real2 v492 = minus(v84, v20); + real2 v52 = load(in, 50 << inShift); + real2 v116 = load(in, 114 << inShift); + real2 v491 = reverse(minus(v116, v52)); + real2 v497 = plus(v52, v116); + real2 v817 = plus(v496, v497); + real2 v811 = reverse(minus(v497, v496)); + real2 v493 = minusplus(v491, v492); + real2 v495 = minusplus(uminus(v491), v492); + real2 v509 = ctimesminusplus(reverse(v495), tbl[76 + tbloffset], ctimes(v495, tbl[77 + tbloffset])); + real2 v503 = 
ctimesminusplus(reverse(v493), tbl[74 + tbloffset], ctimes(v493, tbl[75 + tbloffset])); + real2 v36 = load(in, 34 << inShift); + real2 v100 = load(in, 98 << inShift); + real2 v171 = reverse(minus(v100, v36)); + real2 v177 = plus(v36, v100); + real2 v68 = load(in, 66 << inShift); + real2 v4 = load(in, 2 << inShift); + real2 v176 = plus(v4, v68); + real2 v172 = minus(v68, v4); + real2 v816 = plus(v176, v177); + real2 v812 = minus(v177, v176); + real2 v1136 = plus(v816, v817); + real2 v1132 = minus(v817, v816); + real2 v1133 = minusplus(v1131, v1132); + real2 v1135 = minusplus(uminus(v1131), v1132); + real2 v1149 = ctimesminusplus(reverse(v1135), tbl[204 + tbloffset], ctimes(v1135, tbl[205 + tbloffset])); + real2 v1296 = plus(v1136, v1137); + real2 v1292 = minus(v1137, v1136); + real2 v1295 = minusplus(uminus(v1291), v1292); + real2 v1293 = minusplus(v1291, v1292); + real2 v1303 = ctimesminusplus(reverse(v1293), tbl[234 + tbloffset], ctimes(v1293, tbl[235 + tbloffset])); + real2 v1331 = reverse(minus(v1297, v1296)); + real2 v1337 = plus(v1296, v1297); + real2 v173 = minusplus(v171, v172); + real2 v175 = minusplus(uminus(v171), v172); + real2 v189 = ctimesminusplus(reverse(v175), tbl[12 + tbloffset], ctimes(v175, tbl[13 + tbloffset])); + real2 v1309 = ctimesminusplus(reverse(v1295), tbl[236 + tbloffset], ctimes(v1295, tbl[237 + tbloffset])); + real2 v815 = minusplus(uminus(v811), v812); + real2 v813 = minusplus(v811, v812); + real2 v1143 = ctimesminusplus(reverse(v1133), tbl[202 + tbloffset], ctimes(v1133, tbl[203 + tbloffset])); + real2 v1541 = reverse(minus(v1229, v1149)); + real2 v1547 = plus(v1149, v1229); + real2 v2610 = plus(v189, v509); + real2 v2606 = minus(v509, v189); + real2 v2770 = plus(v2610, v2611); + real2 v2766 = minus(v2611, v2610); + real2 v823 = ctimesminusplus(reverse(v813), tbl[138 + tbloffset], ctimes(v813, tbl[139 + tbloffset])); + real2 v829 = ctimesminusplus(reverse(v815), tbl[140 + tbloffset], ctimes(v815, tbl[141 + tbloffset])); + real2 v2811 
= plus(v2770, v2771); + real2 v2805 = reverse(minus(v2771, v2770)); + real2 v2767 = minusplus(v2765, v2766); + real2 v2769 = minusplus(uminus(v2765), v2766); + real2 v2607 = minusplus(v2605, v2606); + real2 v2609 = minusplus(uminus(v2605), v2606); + real2 v2617 = ctimesminusplus(reverse(v2607), tbl[458 + tbloffset], ctimes(v2607, tbl[459 + tbloffset])); + real2 v2623 = ctimesminusplus(reverse(v2609), tbl[460 + tbloffset], ctimes(v2609, tbl[461 + tbloffset])); + real2 v3013 = reverse(minus(v2703, v2623)); + real2 v3019 = plus(v2623, v2703); + real2 v2783 = ctimesminusplus(reverse(v2769), tbl[492 + tbloffset], ctimes(v2769, tbl[493 + tbloffset])); + real2 v2941 = plus(v2617, v2697); + real2 v2935 = reverse(minus(v2697, v2617)); + real2 v2777 = ctimesminusplus(reverse(v2767), tbl[490 + tbloffset], ctimes(v2767, tbl[491 + tbloffset])); + real2 v1660 = minus(v983, v823); + real2 v1664 = plus(v823, v983); + real2 v1874 = plus(v829, v989); + real2 v1870 = minus(v989, v829); + real2 v1909 = reverse(minus(v1875, v1874)); + real2 v1915 = plus(v1874, v1875); + real2 v1663 = minusplus(uminus(v1659), v1660); + real2 v1661 = minusplus(v1659, v1660); + real2 v1677 = ctimesminusplus(reverse(v1663), tbl[296 + tbloffset], ctimes(v1663, tbl[297 + tbloffset])); + real2 v1873 = minusplus(uminus(v1869), v1870); + real2 v1871 = minusplus(v1869, v1870); + real2 v1887 = ctimesminusplus(reverse(v1873), tbl[332 + tbloffset], ctimes(v1873, tbl[333 + tbloffset])); + real2 v1705 = plus(v1664, v1665); + real2 v1699 = reverse(minus(v1665, v1664)); + real2 v1671 = ctimesminusplus(reverse(v1661), tbl[294 + tbloffset], ctimes(v1661, tbl[295 + tbloffset])); + real2 v1881 = ctimesminusplus(reverse(v1871), tbl[330 + tbloffset], ctimes(v1871, tbl[331 + tbloffset])); + real2 v1469 = plus(v1143, v1223); + real2 v1463 = reverse(minus(v1223, v1143)); + real2 v54 = load(in, 52 << inShift); + real2 v118 = load(in, 116 << inShift); + real2 v537 = plus(v54, v118); + real2 v531 = reverse(minus(v118, v54)); + 
real2 v86 = load(in, 84 << inShift); + real2 v22 = load(in, 20 << inShift); + real2 v536 = plus(v22, v86); + real2 v532 = minus(v86, v22); + real2 v851 = reverse(minus(v537, v536)); + real2 v857 = plus(v536, v537); + real2 v533 = minusplus(v531, v532); + real2 v535 = minusplus(uminus(v531), v532); + real2 v549 = ctimesminusplus(reverse(v535), tbl[84 + tbloffset], ctimes(v535, tbl[85 + tbloffset])); + real2 v102 = load(in, 100 << inShift); + real2 v38 = load(in, 36 << inShift); + real2 v217 = plus(v38, v102); + real2 v211 = reverse(minus(v102, v38)); + real2 v70 = load(in, 68 << inShift); + real2 v6 = load(in, 4 << inShift); + real2 v216 = plus(v6, v70); + real2 v212 = minus(v70, v6); + real2 v213 = minusplus(v211, v212); + real2 v215 = minusplus(uminus(v211), v212); + real2 v229 = ctimesminusplus(reverse(v215), tbl[20 + tbloffset], ctimes(v215, tbl[21 + tbloffset])); + real2 v2646 = minus(v549, v229); + real2 v2650 = plus(v229, v549); + real2 v856 = plus(v216, v217); + real2 v852 = minus(v217, v216); + real2 v853 = minusplus(v851, v852); + real2 v855 = minusplus(uminus(v851), v852); + real2 v863 = ctimesminusplus(reverse(v853), tbl[146 + tbloffset], ctimes(v853, tbl[147 + tbloffset])); + real2 v869 = ctimesminusplus(reverse(v855), tbl[148 + tbloffset], ctimes(v855, tbl[149 + tbloffset])); + real2 v1176 = plus(v856, v857); + real2 v1172 = minus(v857, v856); + real2 v110 = load(in, 108 << inShift); + real2 v46 = load(in, 44 << inShift); + real2 v377 = plus(v46, v110); + real2 v371 = reverse(minus(v110, v46)); + real2 v78 = load(in, 76 << inShift); + real2 v14 = load(in, 12 << inShift); + real2 v372 = minus(v78, v14); + real2 v376 = plus(v14, v78); + real2 v1012 = minus(v377, v376); + real2 v1016 = plus(v376, v377); + real2 v373 = minusplus(v371, v372); + real2 v375 = minusplus(uminus(v371), v372); + real2 v389 = ctimesminusplus(reverse(v375), tbl[52 + tbloffset], ctimes(v375, tbl[53 + tbloffset])); + real2 v30 = load(in, 28 << inShift); + real2 v94 = load(in, 92 << 
inShift); + real2 v696 = plus(v30, v94); + real2 v692 = minus(v94, v30); + real2 v62 = load(in, 60 << inShift); + real2 v126 = load(in, 124 << inShift); + real2 v697 = plus(v62, v126); + real2 v691 = reverse(minus(v126, v62)); + real2 v1017 = plus(v696, v697); + real2 v1011 = reverse(minus(v697, v696)); + real2 v1171 = reverse(minus(v1017, v1016)); + real2 v1177 = plus(v1016, v1017); + real2 v1013 = minusplus(v1011, v1012); + real2 v1015 = minusplus(uminus(v1011), v1012); + real2 v1175 = minusplus(uminus(v1171), v1172); + real2 v1173 = minusplus(v1171, v1172); + real2 v1183 = ctimesminusplus(reverse(v1173), tbl[210 + tbloffset], ctimes(v1173, tbl[211 + tbloffset])); + real2 v1189 = ctimesminusplus(reverse(v1175), tbl[212 + tbloffset], ctimes(v1175, tbl[213 + tbloffset])); + real2 v1029 = ctimesminusplus(reverse(v1015), tbl[180 + tbloffset], ctimes(v1015, tbl[181 + tbloffset])); + real2 v1023 = ctimesminusplus(reverse(v1013), tbl[178 + tbloffset], ctimes(v1013, tbl[179 + tbloffset])); + real2 v1625 = plus(v863, v1023); + real2 v1619 = reverse(minus(v1023, v863)); + real2 v1835 = plus(v869, v1029); + real2 v1829 = reverse(minus(v1029, v869)); + real2 v693 = minusplus(v691, v692); + real2 v695 = minusplus(uminus(v691), v692); + real2 v709 = ctimesminusplus(reverse(v695), tbl[116 + tbloffset], ctimes(v695, tbl[117 + tbloffset])); + real2 v2645 = reverse(minus(v709, v389)); + real2 v2651 = plus(v389, v709); + real2 v1257 = plus(v1176, v1177); + real2 v1251 = reverse(minus(v1177, v1176)); + real2 v2731 = plus(v2650, v2651); + real2 v2725 = reverse(minus(v2651, v2650)); + real2 v114 = load(in, 112 << inShift); + real2 v50 = load(in, 48 << inShift); + real2 v457 = plus(v50, v114); + real2 v451 = reverse(minus(v114, v50)); + real2 v18 = load(in, 16 << inShift); + real2 v82 = load(in, 80 << inShift); + real2 v456 = plus(v18, v82); + real2 v452 = minus(v82, v18); + real2 v771 = reverse(minus(v457, v456)); + real2 v777 = plus(v456, v457); + real2 v453 = minusplus(v451, v452); 
+ real2 v455 = minusplus(uminus(v451), v452); + real2 v469 = ctimesminusplus(reverse(v455), tbl[68 + tbloffset], ctimes(v455, tbl[69 + tbloffset])); + real2 v66 = load(in, 64 << inShift); + real2 v2 = load(in, 0 << inShift); + real2 v132 = minus(v66, v2); + real2 v136 = plus(v2, v66); + real2 v98 = load(in, 96 << inShift); + real2 v34 = load(in, 32 << inShift); + real2 v131 = reverse(minus(v98, v34)); + real2 v137 = plus(v34, v98); + real2 v133 = minusplus(v131, v132); + real2 v135 = minusplus(uminus(v131), v132); + real2 v149 = ctimesminusplus(reverse(v135), tbl[4 + tbloffset], ctimes(v135, tbl[5 + tbloffset])); + real2 v2566 = minus(v469, v149); + real2 v2570 = plus(v149, v469); + real2 v772 = minus(v137, v136); + real2 v776 = plus(v136, v137); + real2 v1092 = minus(v777, v776); + real2 v1096 = plus(v776, v777); + real2 v773 = minusplus(v771, v772); + real2 v775 = minusplus(uminus(v771), v772); + real2 v783 = ctimesminusplus(reverse(v773), tbl[130 + tbloffset], ctimes(v773, tbl[131 + tbloffset])); + real2 v789 = ctimesminusplus(reverse(v775), tbl[132 + tbloffset], ctimes(v775, tbl[133 + tbloffset])); + real2 v74 = load(in, 72 << inShift); + real2 v10 = load(in, 8 << inShift); + real2 v296 = plus(v10, v74); + real2 v292 = minus(v74, v10); + real2 v42 = load(in, 40 << inShift); + real2 v106 = load(in, 104 << inShift); + real2 v291 = reverse(minus(v106, v42)); + real2 v297 = plus(v42, v106); + real2 v293 = minusplus(v291, v292); + real2 v295 = minusplus(uminus(v291), v292); + real2 v309 = ctimesminusplus(reverse(v295), tbl[36 + tbloffset], ctimes(v295, tbl[37 + tbloffset])); + real2 v932 = minus(v297, v296); + real2 v936 = plus(v296, v297); + real2 v122 = load(in, 120 << inShift); + real2 v58 = load(in, 56 << inShift); + real2 v617 = plus(v58, v122); + real2 v611 = reverse(minus(v122, v58)); + real2 v26 = load(in, 24 << inShift); + real2 v90 = load(in, 88 << inShift); + real2 v612 = minus(v90, v26); + real2 v616 = plus(v26, v90); + real2 v937 = plus(v616, v617); + 
real2 v931 = reverse(minus(v617, v616)); + real2 v1091 = reverse(minus(v937, v936)); + real2 v1097 = plus(v936, v937); + real2 v933 = minusplus(v931, v932); + real2 v935 = minusplus(uminus(v931), v932); + real2 v1093 = minusplus(v1091, v1092); + real2 v1095 = minusplus(uminus(v1091), v1092); + real2 v1103 = ctimesminusplus(reverse(v1093), tbl[194 + tbloffset], ctimes(v1093, tbl[195 + tbloffset])); + real2 v1468 = plus(v1103, v1183); + real2 v1464 = minus(v1183, v1103); + real2 v1508 = plus(v1468, v1469); + real2 v1504 = minus(v1469, v1468); + real2 v1252 = minus(v1097, v1096); + real2 v1256 = plus(v1096, v1097); + real2 v1336 = plus(v1256, v1257); + real2 v1332 = minus(v1257, v1256); + real2 v1335 = minusplus(uminus(v1331), v1332); + real2 v1333 = minusplus(v1331, v1332); + real2 v1343 = ctimesminusplus(reverse(v1333), tbl[242 + tbloffset], ctimes(v1333, tbl[243 + tbloffset])); + real2 v1349 = ctimesminusplus(reverse(v1335), tbl[244 + tbloffset], ctimes(v1335, tbl[245 + tbloffset])); + real2 v1376 = plus(v1336, v1337); + real2 v1372 = minus(v1337, v1336); + real2 v1465 = minusplus(v1463, v1464); + real2 v1467 = minusplus(uminus(v1463), v1464); + real2 v1255 = minusplus(uminus(v1251), v1252); + real2 v1253 = minusplus(v1251, v1252); + real2 v1481 = ctimesminusplus(reverse(v1467), tbl[264 + tbloffset], ctimes(v1467, tbl[265 + tbloffset])); + real2 v1475 = ctimesminusplus(reverse(v1465), tbl[262 + tbloffset], ctimes(v1465, tbl[263 + tbloffset])); + real2 v1109 = ctimesminusplus(reverse(v1095), tbl[196 + tbloffset], ctimes(v1095, tbl[197 + tbloffset])); + real2 v1542 = minus(v1189, v1109); + real2 v1546 = plus(v1109, v1189); + real2 v1545 = minusplus(uminus(v1541), v1542); + real2 v1543 = minusplus(v1541, v1542); + real2 v1553 = ctimesminusplus(reverse(v1543), tbl[274 + tbloffset], ctimes(v1543, tbl[275 + tbloffset])); + real2 v1559 = ctimesminusplus(reverse(v1545), tbl[276 + tbloffset], ctimes(v1545, tbl[277 + tbloffset])); + real2 v1582 = minus(v1547, v1546); + real2 
v1586 = plus(v1546, v1547); + real2 v1269 = ctimesminusplus(reverse(v1255), tbl[228 + tbloffset], ctimes(v1255, tbl[229 + tbloffset])); + real2 v1438 = minus(v1309, v1269); + real2 v1442 = plus(v1269, v1309); + real2 v1263 = ctimesminusplus(reverse(v1253), tbl[226 + tbloffset], ctimes(v1253, tbl[227 + tbloffset])); + real2 v943 = ctimesminusplus(reverse(v933), tbl[162 + tbloffset], ctimes(v933, tbl[163 + tbloffset])); + real2 v1624 = plus(v783, v943); + real2 v1620 = minus(v943, v783); + real2 v1623 = minusplus(uminus(v1619), v1620); + real2 v1621 = minusplus(v1619, v1620); + real2 v1700 = minus(v1625, v1624); + real2 v1704 = plus(v1624, v1625); + real2 v1631 = ctimesminusplus(reverse(v1621), tbl[286 + tbloffset], ctimes(v1621, tbl[287 + tbloffset])); + real2 v949 = ctimesminusplus(reverse(v935), tbl[164 + tbloffset], ctimes(v935, tbl[165 + tbloffset])); + real2 v1830 = minus(v949, v789); + real2 v1834 = plus(v789, v949); + real2 v1782 = plus(v1631, v1671); + real2 v1778 = minus(v1671, v1631); + real2 v1910 = minus(v1835, v1834); + real2 v1914 = plus(v1834, v1835); + real2 v1950 = minus(v1915, v1914); + real2 v1954 = plus(v1914, v1915); + real2 v1913 = minusplus(uminus(v1909), v1910); + real2 v1911 = minusplus(v1909, v1910); + real2 v613 = minusplus(v611, v612); + real2 v615 = minusplus(uminus(v611), v612); + real2 v629 = ctimesminusplus(reverse(v615), tbl[100 + tbloffset], ctimes(v615, tbl[101 + tbloffset])); + real2 v1744 = plus(v1704, v1705); + real2 v1740 = minus(v1705, v1704); + real2 v1637 = ctimesminusplus(reverse(v1623), tbl[288 + tbloffset], ctimes(v1623, tbl[289 + tbloffset])); + real2 v1927 = ctimesminusplus(reverse(v1913), tbl[340 + tbloffset], ctimes(v1913, tbl[341 + tbloffset])); + real2 v2571 = plus(v309, v629); + real2 v2565 = reverse(minus(v629, v309)); + real2 v1833 = minusplus(uminus(v1829), v1830); + real2 v1831 = minusplus(v1829, v1830); + real2 v1921 = ctimesminusplus(reverse(v1911), tbl[338 + tbloffset], ctimes(v1911, tbl[339 + tbloffset])); 
+ real2 v1804 = minus(v1677, v1637); + real2 v1808 = plus(v1637, v1677); + real2 v1847 = ctimesminusplus(reverse(v1833), tbl[324 + tbloffset], ctimes(v1833, tbl[325 + tbloffset])); + real2 v2014 = minus(v1887, v1847); + real2 v2018 = plus(v1847, v1887); + real2 v1841 = ctimesminusplus(reverse(v1831), tbl[322 + tbloffset], ctimes(v1831, tbl[323 + tbloffset])); + real2 v1988 = minus(v1881, v1841); + real2 v1992 = plus(v1841, v1881); + real2 v1703 = minusplus(uminus(v1699), v1700); + real2 v1701 = minusplus(v1699, v1700); + real2 v1717 = ctimesminusplus(reverse(v1703), tbl[304 + tbloffset], ctimes(v1703, tbl[305 + tbloffset])); + real2 v1711 = ctimesminusplus(reverse(v1701), tbl[302 + tbloffset], ctimes(v1701, tbl[303 + tbloffset])); + real2 v2730 = plus(v2570, v2571); + real2 v2726 = minus(v2571, v2570); + real2 v1412 = minus(v1303, v1263); + real2 v1416 = plus(v1263, v1303); + real2 v63 = load(in, 61 << inShift); + real2 v127 = load(in, 125 << inShift); + real2 v717 = plus(v63, v127); + real2 v711 = reverse(minus(v127, v63)); + real2 v95 = load(in, 93 << inShift); + real2 v31 = load(in, 29 << inShift); + real2 v712 = minus(v95, v31); + real2 v716 = plus(v31, v95); + real2 v1037 = plus(v716, v717); + real2 v1031 = reverse(minus(v717, v716)); + real2 v79 = load(in, 77 << inShift); + real2 v15 = load(in, 13 << inShift); + real2 v396 = plus(v15, v79); + real2 v392 = minus(v79, v15); + real2 v111 = load(in, 109 << inShift); + real2 v47 = load(in, 45 << inShift); + real2 v397 = plus(v47, v111); + real2 v391 = reverse(minus(v111, v47)); + real2 v1032 = minus(v397, v396); + real2 v1036 = plus(v396, v397); + real2 v1033 = minusplus(v1031, v1032); + real2 v1035 = minusplus(uminus(v1031), v1032); + real2 v1049 = ctimesminusplus(reverse(v1035), tbl[184 + tbloffset], ctimes(v1035, tbl[185 + tbloffset])); + real2 v1043 = ctimesminusplus(reverse(v1033), tbl[182 + tbloffset], ctimes(v1033, tbl[183 + tbloffset])); + real2 v1197 = plus(v1036, v1037); + real2 v1191 = 
reverse(minus(v1037, v1036)); + real2 v23 = load(in, 21 << inShift); + real2 v87 = load(in, 85 << inShift); + real2 v556 = plus(v23, v87); + real2 v552 = minus(v87, v23); + real2 v119 = load(in, 117 << inShift); + real2 v55 = load(in, 53 << inShift); + real2 v557 = plus(v55, v119); + real2 v551 = reverse(minus(v119, v55)); + real2 v877 = plus(v556, v557); + real2 v871 = reverse(minus(v557, v556)); + real2 v7 = load(in, 5 << inShift); + real2 v71 = load(in, 69 << inShift); + real2 v232 = minus(v71, v7); + real2 v236 = plus(v7, v71); + real2 v103 = load(in, 101 << inShift); + real2 v39 = load(in, 37 << inShift); + real2 v237 = plus(v39, v103); + real2 v231 = reverse(minus(v103, v39)); + real2 v876 = plus(v236, v237); + real2 v872 = minus(v237, v236); + real2 v1192 = minus(v877, v876); + real2 v1196 = plus(v876, v877); + real2 v1271 = reverse(minus(v1197, v1196)); + real2 v1277 = plus(v1196, v1197); + real2 v875 = minusplus(uminus(v871), v872); + real2 v873 = minusplus(v871, v872); + real2 v883 = ctimesminusplus(reverse(v873), tbl[150 + tbloffset], ctimes(v873, tbl[151 + tbloffset])); + real2 v1639 = reverse(minus(v1043, v883)); + real2 v1645 = plus(v883, v1043); + real2 v1195 = minusplus(uminus(v1191), v1192); + real2 v1193 = minusplus(v1191, v1192); + real2 v1209 = ctimesminusplus(reverse(v1195), tbl[216 + tbloffset], ctimes(v1195, tbl[217 + tbloffset])); + real2 v1203 = ctimesminusplus(reverse(v1193), tbl[214 + tbloffset], ctimes(v1193, tbl[215 + tbloffset])); + real2 v83 = load(in, 81 << inShift); + real2 v19 = load(in, 17 << inShift); + real2 v476 = plus(v19, v83); + real2 v472 = minus(v83, v19); + real2 v51 = load(in, 49 << inShift); + real2 v115 = load(in, 113 << inShift); + real2 v477 = plus(v51, v115); + real2 v471 = reverse(minus(v115, v51)); + real2 v797 = plus(v476, v477); + real2 v791 = reverse(minus(v477, v476)); + real2 v3 = load(in, 1 << inShift); + real2 v67 = load(in, 65 << inShift); + real2 v156 = plus(v3, v67); + real2 v152 = minus(v67, v3); + 
real2 v35 = load(in, 33 << inShift); + real2 v99 = load(in, 97 << inShift); + real2 v157 = plus(v35, v99); + real2 v151 = reverse(minus(v99, v35)); + real2 v792 = minus(v157, v156); + real2 v796 = plus(v156, v157); + real2 v793 = minusplus(v791, v792); + real2 v795 = minusplus(uminus(v791), v792); + real2 v803 = ctimesminusplus(reverse(v793), tbl[134 + tbloffset], ctimes(v793, tbl[135 + tbloffset])); + real2 v1112 = minus(v797, v796); + real2 v1116 = plus(v796, v797); + real2 v107 = load(in, 105 << inShift); + real2 v43 = load(in, 41 << inShift); + real2 v317 = plus(v43, v107); + real2 v311 = reverse(minus(v107, v43)); + real2 v75 = load(in, 73 << inShift); + real2 v11 = load(in, 9 << inShift); + real2 v316 = plus(v11, v75); + real2 v312 = minus(v75, v11); + real2 v956 = plus(v316, v317); + real2 v952 = minus(v317, v316); + real2 v59 = load(in, 57 << inShift); + real2 v123 = load(in, 121 << inShift); + real2 v631 = reverse(minus(v123, v59)); + real2 v637 = plus(v59, v123); + real2 v27 = load(in, 25 << inShift); + real2 v91 = load(in, 89 << inShift); + real2 v636 = plus(v27, v91); + real2 v632 = minus(v91, v27); + real2 v957 = plus(v636, v637); + real2 v951 = reverse(minus(v637, v636)); + real2 v1111 = reverse(minus(v957, v956)); + real2 v1117 = plus(v956, v957); + real2 v1276 = plus(v1116, v1117); + real2 v1272 = minus(v1117, v1116); + real2 v1275 = minusplus(uminus(v1271), v1272); + real2 v1273 = minusplus(v1271, v1272); + real2 v1283 = ctimesminusplus(reverse(v1273), tbl[230 + tbloffset], ctimes(v1273, tbl[231 + tbloffset])); + real2 v1352 = minus(v1277, v1276); + real2 v1356 = plus(v1276, v1277); + real2 v1289 = ctimesminusplus(reverse(v1275), tbl[232 + tbloffset], ctimes(v1275, tbl[233 + tbloffset])); + real2 v1115 = minusplus(uminus(v1111), v1112); + real2 v1113 = minusplus(v1111, v1112); + real2 v1123 = ctimesminusplus(reverse(v1113), tbl[198 + tbloffset], ctimes(v1113, tbl[199 + tbloffset])); + real2 v1129 = ctimesminusplus(reverse(v1115), tbl[200 + 
tbloffset], ctimes(v1115, tbl[201 + tbloffset])); + real2 v1488 = plus(v1123, v1203); + real2 v1484 = minus(v1203, v1123); + real2 v1566 = plus(v1129, v1209); + real2 v1562 = minus(v1209, v1129); + real2 v85 = load(in, 83 << inShift); + real2 v21 = load(in, 19 << inShift); + real2 v512 = minus(v85, v21); + real2 v516 = plus(v21, v85); + real2 v117 = load(in, 115 << inShift); + real2 v53 = load(in, 51 << inShift); + real2 v517 = plus(v53, v117); + real2 v511 = reverse(minus(v117, v53)); + real2 v831 = reverse(minus(v517, v516)); + real2 v837 = plus(v516, v517); + real2 v69 = load(in, 67 << inShift); + real2 v5 = load(in, 3 << inShift); + real2 v192 = minus(v69, v5); + real2 v196 = plus(v5, v69); + real2 v37 = load(in, 35 << inShift); + real2 v101 = load(in, 99 << inShift); + real2 v197 = plus(v37, v101); + real2 v191 = reverse(minus(v101, v37)); + real2 v832 = minus(v197, v196); + real2 v836 = plus(v196, v197); + real2 v1152 = minus(v837, v836); + real2 v1156 = plus(v836, v837); + real2 v61 = load(in, 59 << inShift); + real2 v125 = load(in, 123 << inShift); + real2 v677 = plus(v61, v125); + real2 v671 = reverse(minus(v125, v61)); + real2 v29 = load(in, 27 << inShift); + real2 v93 = load(in, 91 << inShift); + real2 v672 = minus(v93, v29); + real2 v676 = plus(v29, v93); + real2 v997 = plus(v676, v677); + real2 v991 = reverse(minus(v677, v676)); + real2 v109 = load(in, 107 << inShift); + real2 v45 = load(in, 43 << inShift); + real2 v357 = plus(v45, v109); + real2 v351 = reverse(minus(v109, v45)); + real2 v77 = load(in, 75 << inShift); + real2 v13 = load(in, 11 << inShift); + real2 v352 = minus(v77, v13); + real2 v356 = plus(v13, v77); + real2 v992 = minus(v357, v356); + real2 v996 = plus(v356, v357); + real2 v1157 = plus(v996, v997); + real2 v1151 = reverse(minus(v997, v996)); + real2 v1155 = minusplus(uminus(v1151), v1152); + real2 v1153 = minusplus(v1151, v1152); + real2 v1163 = ctimesminusplus(reverse(v1153), tbl[206 + tbloffset], ctimes(v1153, tbl[207 + 
tbloffset])); + real2 v1316 = plus(v1156, v1157); + real2 v1312 = minus(v1157, v1156); + real2 v41 = load(in, 39 << inShift); + real2 v105 = load(in, 103 << inShift); + real2 v277 = plus(v41, v105); + real2 v271 = reverse(minus(v105, v41)); + real2 v9 = load(in, 7 << inShift); + real2 v73 = load(in, 71 << inShift); + real2 v276 = plus(v9, v73); + real2 v272 = minus(v73, v9); + real2 v916 = plus(v276, v277); + real2 v912 = minus(v277, v276); + real2 v89 = load(in, 87 << inShift); + real2 v25 = load(in, 23 << inShift); + real2 v592 = minus(v89, v25); + real2 v596 = plus(v25, v89); + real2 v57 = load(in, 55 << inShift); + real2 v121 = load(in, 119 << inShift); + real2 v591 = reverse(minus(v121, v57)); + real2 v597 = plus(v57, v121); + real2 v911 = reverse(minus(v597, v596)); + real2 v917 = plus(v596, v597); + real2 v1236 = plus(v916, v917); + real2 v1232 = minus(v917, v916); + real2 v81 = load(in, 79 << inShift); + real2 v17 = load(in, 15 << inShift); + real2 v432 = minus(v81, v17); + real2 v436 = plus(v17, v81); + real2 v113 = load(in, 111 << inShift); + real2 v49 = load(in, 47 << inShift); + real2 v437 = plus(v49, v113); + real2 v431 = reverse(minus(v113, v49)); + real2 v1072 = minus(v437, v436); + real2 v1076 = plus(v436, v437); + real2 v65 = load(in, 63 << inShift); + real2 v129 = load(in, 127 << inShift); + real2 v757 = plus(v65, v129); + real2 v751 = reverse(minus(v129, v65)); + real2 v97 = load(in, 95 << inShift); + real2 v33 = load(in, 31 << inShift); + real2 v752 = minus(v97, v33); + real2 v756 = plus(v33, v97); + real2 v1077 = plus(v756, v757); + real2 v1071 = reverse(minus(v757, v756)); + real2 v1231 = reverse(minus(v1077, v1076)); + real2 v1237 = plus(v1076, v1077); + real2 v1317 = plus(v1236, v1237); + real2 v1311 = reverse(minus(v1237, v1236)); + real2 v1351 = reverse(minus(v1317, v1316)); + real2 v1357 = plus(v1316, v1317); + real2 v1371 = reverse(minus(v1357, v1356)); + real2 v1377 = plus(v1356, v1357); + store(out, 0 << %SHIFT%, plus(v1376, v1377)); + 
real2 v1390 = minus(v1376, v1377); + store(out, 64 << %SHIFT%, ctimesminusplus(v1390, tbl[0 + tbloffset], ctimes(reverse(v1390), tbl[1 + tbloffset]))); + real2 v1353 = minusplus(v1351, v1352); + real2 v1355 = minusplus(uminus(v1351), v1352); + real2 v1369 = ctimesminusplus(reverse(v1355), tbl[248 + tbloffset], ctimes(v1355, tbl[249 + tbloffset])); + store(out, 48 << %SHIFT%, plus(v1349, v1369)); + real2 v1404 = minus(v1349, v1369); + store(out, 112 << %SHIFT%, ctimesminusplus(v1404, tbl[0 + tbloffset], ctimes(reverse(v1404), tbl[1 + tbloffset]))); + real2 v1363 = ctimesminusplus(reverse(v1353), tbl[246 + tbloffset], ctimes(v1353, tbl[247 + tbloffset])); + store(out, 16 << %SHIFT%, plus(v1343, v1363)); + real2 v1398 = minus(v1343, v1363); + store(out, 80 << %SHIFT%, ctimesminusplus(v1398, tbl[0 + tbloffset], ctimes(reverse(v1398), tbl[1 + tbloffset]))); + real2 v1373 = minusplus(v1371, v1372); + real2 v1375 = minusplus(uminus(v1371), v1372); + store(out, 96 << %SHIFT%, ctimesminusplus(reverse(v1375), tbl[252 + tbloffset], ctimes(v1375, tbl[253 + tbloffset]))); + store(out, 32 << %SHIFT%, ctimesminusplus(reverse(v1373), tbl[250 + tbloffset], ctimes(v1373, tbl[251 + tbloffset]))); + real2 v1313 = minusplus(v1311, v1312); + real2 v1315 = minusplus(uminus(v1311), v1312); + real2 v1323 = ctimesminusplus(reverse(v1313), tbl[238 + tbloffset], ctimes(v1313, tbl[239 + tbloffset])); + real2 v1417 = plus(v1283, v1323); + real2 v1411 = reverse(minus(v1323, v1283)); + store(out, 8 << %SHIFT%, plus(v1416, v1417)); + real2 v1430 = minus(v1416, v1417); + store(out, 72 << %SHIFT%, ctimesminusplus(v1430, tbl[0 + tbloffset], ctimes(reverse(v1430), tbl[1 + tbloffset]))); + real2 v1413 = minusplus(v1411, v1412); + real2 v1415 = minusplus(uminus(v1411), v1412); + store(out, 104 << %SHIFT%, ctimesminusplus(reverse(v1415), tbl[256 + tbloffset], ctimes(v1415, tbl[257 + tbloffset]))); + store(out, 40 << %SHIFT%, ctimesminusplus(reverse(v1413), tbl[254 + tbloffset], ctimes(v1413, tbl[255 + 
tbloffset]))); + real2 v1329 = ctimesminusplus(reverse(v1315), tbl[240 + tbloffset], ctimes(v1315, tbl[241 + tbloffset])); + real2 v1443 = plus(v1289, v1329); + real2 v1437 = reverse(minus(v1329, v1289)); + store(out, 24 << %SHIFT%, plus(v1442, v1443)); + real2 v1456 = minus(v1442, v1443); + store(out, 88 << %SHIFT%, ctimesminusplus(v1456, tbl[0 + tbloffset], ctimes(reverse(v1456), tbl[1 + tbloffset]))); + real2 v1441 = minusplus(uminus(v1437), v1438); + real2 v1439 = minusplus(v1437, v1438); + store(out, 120 << %SHIFT%, ctimesminusplus(reverse(v1441), tbl[260 + tbloffset], ctimes(v1441, tbl[261 + tbloffset]))); + store(out, 56 << %SHIFT%, ctimesminusplus(reverse(v1439), tbl[258 + tbloffset], ctimes(v1439, tbl[259 + tbloffset]))); + real2 v1235 = minusplus(uminus(v1231), v1232); + real2 v1233 = minusplus(v1231, v1232); + real2 v1243 = ctimesminusplus(reverse(v1233), tbl[222 + tbloffset], ctimes(v1233, tbl[223 + tbloffset])); + real2 v1489 = plus(v1163, v1243); + real2 v1483 = reverse(minus(v1243, v1163)); + real2 v1509 = plus(v1488, v1489); + real2 v1503 = reverse(minus(v1489, v1488)); + store(out, 4 << %SHIFT%, plus(v1508, v1509)); + real2 v1522 = minus(v1508, v1509); + store(out, 68 << %SHIFT%, ctimesminusplus(v1522, tbl[0 + tbloffset], ctimes(reverse(v1522), tbl[1 + tbloffset]))); + real2 v1507 = minusplus(uminus(v1503), v1504); + real2 v1505 = minusplus(v1503, v1504); + store(out, 36 << %SHIFT%, ctimesminusplus(reverse(v1505), tbl[270 + tbloffset], ctimes(v1505, tbl[271 + tbloffset]))); + store(out, 100 << %SHIFT%, ctimesminusplus(reverse(v1507), tbl[272 + tbloffset], ctimes(v1507, tbl[273 + tbloffset]))); + real2 v1485 = minusplus(v1483, v1484); + real2 v1487 = minusplus(uminus(v1483), v1484); + real2 v1501 = ctimesminusplus(reverse(v1487), tbl[268 + tbloffset], ctimes(v1487, tbl[269 + tbloffset])); + store(out, 52 << %SHIFT%, plus(v1481, v1501)); + real2 v1534 = minus(v1481, v1501); + store(out, 116 << %SHIFT%, ctimesminusplus(v1534, tbl[0 + tbloffset], 
ctimes(reverse(v1534), tbl[1 + tbloffset]))); + real2 v1495 = ctimesminusplus(reverse(v1485), tbl[266 + tbloffset], ctimes(v1485, tbl[267 + tbloffset])); + store(out, 20 << %SHIFT%, plus(v1475, v1495)); + real2 v1528 = minus(v1475, v1495); + store(out, 84 << %SHIFT%, ctimesminusplus(v1528, tbl[0 + tbloffset], ctimes(reverse(v1528), tbl[1 + tbloffset]))); + real2 v1249 = ctimesminusplus(reverse(v1235), tbl[224 + tbloffset], ctimes(v1235, tbl[225 + tbloffset])); + real2 v1169 = ctimesminusplus(reverse(v1155), tbl[208 + tbloffset], ctimes(v1155, tbl[209 + tbloffset])); + real2 v1567 = plus(v1169, v1249); + real2 v1561 = reverse(minus(v1249, v1169)); + real2 v1581 = reverse(minus(v1567, v1566)); + real2 v1587 = plus(v1566, v1567); + store(out, 12 << %SHIFT%, plus(v1586, v1587)); + real2 v1600 = minus(v1586, v1587); + store(out, 76 << %SHIFT%, ctimesminusplus(v1600, tbl[0 + tbloffset], ctimes(reverse(v1600), tbl[1 + tbloffset]))); + real2 v1583 = minusplus(v1581, v1582); + store(out, 44 << %SHIFT%, ctimesminusplus(reverse(v1583), tbl[282 + tbloffset], ctimes(v1583, tbl[283 + tbloffset]))); + real2 v1585 = minusplus(uminus(v1581), v1582); + store(out, 108 << %SHIFT%, ctimesminusplus(reverse(v1585), tbl[284 + tbloffset], ctimes(v1585, tbl[285 + tbloffset]))); + real2 v1565 = minusplus(uminus(v1561), v1562); + real2 v1563 = minusplus(v1561, v1562); + real2 v1579 = ctimesminusplus(reverse(v1565), tbl[280 + tbloffset], ctimes(v1565, tbl[281 + tbloffset])); + store(out, 60 << %SHIFT%, plus(v1559, v1579)); + real2 v1612 = minus(v1559, v1579); + store(out, 124 << %SHIFT%, ctimesminusplus(v1612, tbl[0 + tbloffset], ctimes(reverse(v1612), tbl[1 + tbloffset]))); + real2 v1573 = ctimesminusplus(reverse(v1563), tbl[278 + tbloffset], ctimes(v1563, tbl[279 + tbloffset])); + store(out, 28 << %SHIFT%, plus(v1553, v1573)); + real2 v1606 = minus(v1553, v1573); + store(out, 92 << %SHIFT%, ctimesminusplus(v1606, tbl[0 + tbloffset], ctimes(reverse(v1606), tbl[1 + tbloffset]))); + real2 v833 
= minusplus(v831, v832); + real2 v835 = minusplus(uminus(v831), v832); + real2 v955 = minusplus(uminus(v951), v952); + real2 v953 = minusplus(v951, v952); + real2 v963 = ctimesminusplus(reverse(v953), tbl[166 + tbloffset], ctimes(v953, tbl[167 + tbloffset])); + real2 v995 = minusplus(uminus(v991), v992); + real2 v993 = minusplus(v991, v992); + real2 v1003 = ctimesminusplus(reverse(v993), tbl[174 + tbloffset], ctimes(v993, tbl[175 + tbloffset])); + real2 v843 = ctimesminusplus(reverse(v833), tbl[142 + tbloffset], ctimes(v833, tbl[143 + tbloffset])); + real2 v1640 = minus(v963, v803); + real2 v1644 = plus(v803, v963); + real2 v1680 = minus(v1003, v843); + real2 v1684 = plus(v843, v1003); + real2 v1641 = minusplus(v1639, v1640); + real2 v1643 = minusplus(uminus(v1639), v1640); + real2 v1657 = ctimesminusplus(reverse(v1643), tbl[292 + tbloffset], ctimes(v1643, tbl[293 + tbloffset])); + real2 v913 = minusplus(v911, v912); + real2 v915 = minusplus(uminus(v911), v912); + real2 v1073 = minusplus(v1071, v1072); + real2 v1075 = minusplus(uminus(v1071), v1072); + real2 v923 = ctimesminusplus(reverse(v913), tbl[158 + tbloffset], ctimes(v913, tbl[159 + tbloffset])); + real2 v1083 = ctimesminusplus(reverse(v1073), tbl[190 + tbloffset], ctimes(v1073, tbl[191 + tbloffset])); + real2 v1685 = plus(v923, v1083); + real2 v1679 = reverse(minus(v1083, v923)); + real2 v1681 = minusplus(v1679, v1680); + real2 v1683 = minusplus(uminus(v1679), v1680); + real2 v1697 = ctimesminusplus(reverse(v1683), tbl[300 + tbloffset], ctimes(v1683, tbl[301 + tbloffset])); + real2 v1809 = plus(v1657, v1697); + real2 v1803 = reverse(minus(v1697, v1657)); + store(out, 26 << %SHIFT%, plus(v1808, v1809)); + real2 v1822 = minus(v1808, v1809); + store(out, 90 << %SHIFT%, ctimesminusplus(v1822, tbl[0 + tbloffset], ctimes(reverse(v1822), tbl[1 + tbloffset]))); + real2 v1807 = minusplus(uminus(v1803), v1804); + real2 v1805 = minusplus(v1803, v1804); + store(out, 58 << %SHIFT%, ctimesminusplus(reverse(v1805), 
tbl[318 + tbloffset], ctimes(v1805, tbl[319 + tbloffset]))); + store(out, 122 << %SHIFT%, ctimesminusplus(reverse(v1807), tbl[320 + tbloffset], ctimes(v1807, tbl[321 + tbloffset]))); + real2 v1651 = ctimesminusplus(reverse(v1641), tbl[290 + tbloffset], ctimes(v1641, tbl[291 + tbloffset])); + real2 v1691 = ctimesminusplus(reverse(v1681), tbl[298 + tbloffset], ctimes(v1681, tbl[299 + tbloffset])); + real2 v1783 = plus(v1651, v1691); + real2 v1777 = reverse(minus(v1691, v1651)); + real2 v1779 = minusplus(v1777, v1778); + real2 v1781 = minusplus(uminus(v1777), v1778); + store(out, 106 << %SHIFT%, ctimesminusplus(reverse(v1781), tbl[316 + tbloffset], ctimes(v1781, tbl[317 + tbloffset]))); + store(out, 42 << %SHIFT%, ctimesminusplus(reverse(v1779), tbl[314 + tbloffset], ctimes(v1779, tbl[315 + tbloffset]))); + store(out, 10 << %SHIFT%, plus(v1782, v1783)); + real2 v1796 = minus(v1782, v1783); + store(out, 74 << %SHIFT%, ctimesminusplus(v1796, tbl[0 + tbloffset], ctimes(reverse(v1796), tbl[1 + tbloffset]))); + real2 v1720 = minus(v1645, v1644); + real2 v1724 = plus(v1644, v1645); + real2 v1719 = reverse(minus(v1685, v1684)); + real2 v1725 = plus(v1684, v1685); + real2 v1745 = plus(v1724, v1725); + real2 v1739 = reverse(minus(v1725, v1724)); + store(out, 2 << %SHIFT%, plus(v1744, v1745)); + real2 v1758 = minus(v1744, v1745); + store(out, 66 << %SHIFT%, ctimesminusplus(v1758, tbl[0 + tbloffset], ctimes(reverse(v1758), tbl[1 + tbloffset]))); + real2 v1741 = minusplus(v1739, v1740); + real2 v1743 = minusplus(uminus(v1739), v1740); + store(out, 98 << %SHIFT%, ctimesminusplus(reverse(v1743), tbl[312 + tbloffset], ctimes(v1743, tbl[313 + tbloffset]))); + store(out, 34 << %SHIFT%, ctimesminusplus(reverse(v1741), tbl[310 + tbloffset], ctimes(v1741, tbl[311 + tbloffset]))); + real2 v1723 = minusplus(uminus(v1719), v1720); + real2 v1721 = minusplus(v1719, v1720); + real2 v1737 = ctimesminusplus(reverse(v1723), tbl[308 + tbloffset], ctimes(v1723, tbl[309 + tbloffset])); + store(out, 
50 << %SHIFT%, plus(v1717, v1737)); + real2 v1770 = minus(v1717, v1737); + store(out, 114 << %SHIFT%, ctimesminusplus(v1770, tbl[0 + tbloffset], ctimes(reverse(v1770), tbl[1 + tbloffset]))); + real2 v1731 = ctimesminusplus(reverse(v1721), tbl[306 + tbloffset], ctimes(v1721, tbl[307 + tbloffset])); + store(out, 18 << %SHIFT%, plus(v1711, v1731)); + real2 v1764 = minus(v1711, v1731); + store(out, 82 << %SHIFT%, ctimesminusplus(v1764, tbl[0 + tbloffset], ctimes(reverse(v1764), tbl[1 + tbloffset]))); + real2 v809 = ctimesminusplus(reverse(v795), tbl[136 + tbloffset], ctimes(v795, tbl[137 + tbloffset])); + real2 v969 = ctimesminusplus(reverse(v955), tbl[168 + tbloffset], ctimes(v955, tbl[169 + tbloffset])); + real2 v1850 = minus(v969, v809); + real2 v1854 = plus(v809, v969); + real2 v849 = ctimesminusplus(reverse(v835), tbl[144 + tbloffset], ctimes(v835, tbl[145 + tbloffset])); + real2 v929 = ctimesminusplus(reverse(v915), tbl[160 + tbloffset], ctimes(v915, tbl[161 + tbloffset])); + real2 v889 = ctimesminusplus(reverse(v875), tbl[152 + tbloffset], ctimes(v875, tbl[153 + tbloffset])); + real2 v1089 = ctimesminusplus(reverse(v1075), tbl[192 + tbloffset], ctimes(v1075, tbl[193 + tbloffset])); + real2 v1009 = ctimesminusplus(reverse(v995), tbl[176 + tbloffset], ctimes(v995, tbl[177 + tbloffset])); + real2 v1890 = minus(v1009, v849); + real2 v1894 = plus(v849, v1009); + real2 v1849 = reverse(minus(v1049, v889)); + real2 v1855 = plus(v889, v1049); + real2 v1930 = minus(v1855, v1854); + real2 v1934 = plus(v1854, v1855); + real2 v1895 = plus(v929, v1089); + real2 v1889 = reverse(minus(v1089, v929)); + real2 v1929 = reverse(minus(v1895, v1894)); + real2 v1935 = plus(v1894, v1895); + real2 v1955 = plus(v1934, v1935); + real2 v1949 = reverse(minus(v1935, v1934)); + store(out, 6 << %SHIFT%, plus(v1954, v1955)); + real2 v1968 = minus(v1954, v1955); + store(out, 70 << %SHIFT%, ctimesminusplus(v1968, tbl[0 + tbloffset], ctimes(reverse(v1968), tbl[1 + tbloffset]))); + real2 v1951 = 
minusplus(v1949, v1950); + store(out, 38 << %SHIFT%, ctimesminusplus(reverse(v1951), tbl[346 + tbloffset], ctimes(v1951, tbl[347 + tbloffset]))); + real2 v1953 = minusplus(uminus(v1949), v1950); + store(out, 102 << %SHIFT%, ctimesminusplus(reverse(v1953), tbl[348 + tbloffset], ctimes(v1953, tbl[349 + tbloffset]))); + real2 v1931 = minusplus(v1929, v1930); + real2 v1933 = minusplus(uminus(v1929), v1930); + real2 v1947 = ctimesminusplus(reverse(v1933), tbl[344 + tbloffset], ctimes(v1933, tbl[345 + tbloffset])); + store(out, 54 << %SHIFT%, plus(v1927, v1947)); + real2 v1980 = minus(v1927, v1947); + store(out, 118 << %SHIFT%, ctimesminusplus(v1980, tbl[0 + tbloffset], ctimes(reverse(v1980), tbl[1 + tbloffset]))); + real2 v1941 = ctimesminusplus(reverse(v1931), tbl[342 + tbloffset], ctimes(v1931, tbl[343 + tbloffset])); + store(out, 22 << %SHIFT%, plus(v1921, v1941)); + real2 v1974 = minus(v1921, v1941); + store(out, 86 << %SHIFT%, ctimesminusplus(v1974, tbl[0 + tbloffset], ctimes(reverse(v1974), tbl[1 + tbloffset]))); + real2 v1851 = minusplus(v1849, v1850); + real2 v1853 = minusplus(uminus(v1849), v1850); + real2 v1867 = ctimesminusplus(reverse(v1853), tbl[328 + tbloffset], ctimes(v1853, tbl[329 + tbloffset])); + real2 v1891 = minusplus(v1889, v1890); + real2 v1893 = minusplus(uminus(v1889), v1890); + real2 v1907 = ctimesminusplus(reverse(v1893), tbl[336 + tbloffset], ctimes(v1893, tbl[337 + tbloffset])); + real2 v2019 = plus(v1867, v1907); + real2 v2013 = reverse(minus(v1907, v1867)); + store(out, 30 << %SHIFT%, plus(v2018, v2019)); + real2 v2032 = minus(v2018, v2019); + store(out, 94 << %SHIFT%, ctimesminusplus(v2032, tbl[0 + tbloffset], ctimes(reverse(v2032), tbl[1 + tbloffset]))); + real2 v2017 = minusplus(uminus(v2013), v2014); + store(out, 126 << %SHIFT%, ctimesminusplus(reverse(v2017), tbl[356 + tbloffset], ctimes(v2017, tbl[357 + tbloffset]))); + real2 v2015 = minusplus(v2013, v2014); + store(out, 62 << %SHIFT%, ctimesminusplus(reverse(v2015), tbl[354 + 
tbloffset], ctimes(v2015, tbl[355 + tbloffset]))); + real2 v1861 = ctimesminusplus(reverse(v1851), tbl[326 + tbloffset], ctimes(v1851, tbl[327 + tbloffset])); + real2 v1901 = ctimesminusplus(reverse(v1891), tbl[334 + tbloffset], ctimes(v1891, tbl[335 + tbloffset])); + real2 v1993 = plus(v1861, v1901); + real2 v1987 = reverse(minus(v1901, v1861)); + store(out, 14 << %SHIFT%, plus(v1992, v1993)); + real2 v2006 = minus(v1992, v1993); + store(out, 78 << %SHIFT%, ctimesminusplus(v2006, tbl[0 + tbloffset], ctimes(reverse(v2006), tbl[1 + tbloffset]))); + real2 v1991 = minusplus(uminus(v1987), v1988); + store(out, 110 << %SHIFT%, ctimesminusplus(reverse(v1991), tbl[352 + tbloffset], ctimes(v1991, tbl[353 + tbloffset]))); + real2 v1989 = minusplus(v1987, v1988); + store(out, 46 << %SHIFT%, ctimesminusplus(reverse(v1989), tbl[350 + tbloffset], ctimes(v1989, tbl[351 + tbloffset]))); + real2 v593 = minusplus(v591, v592); + real2 v595 = minusplus(uminus(v591), v592); + real2 v473 = minusplus(v471, v472); + real2 v475 = minusplus(uminus(v471), v472); + real2 v555 = minusplus(uminus(v551), v552); + real2 v553 = minusplus(v551, v552); + real2 v609 = ctimesminusplus(reverse(v595), tbl[96 + tbloffset], ctimes(v595, tbl[97 + tbloffset])); + real2 v195 = minusplus(uminus(v191), v192); + real2 v193 = minusplus(v191, v192); + real2 v275 = minusplus(uminus(v271), v272); + real2 v273 = minusplus(v271, v272); + real2 v673 = minusplus(v671, v672); + real2 v675 = minusplus(uminus(v671), v672); + real2 v689 = ctimesminusplus(reverse(v675), tbl[112 + tbloffset], ctimes(v675, tbl[113 + tbloffset])); + real2 v209 = ctimesminusplus(reverse(v195), tbl[16 + tbloffset], ctimes(v195, tbl[17 + tbloffset])); + real2 v289 = ctimesminusplus(reverse(v275), tbl[32 + tbloffset], ctimes(v275, tbl[33 + tbloffset])); + real2 v755 = minusplus(uminus(v751), v752); + real2 v753 = minusplus(v751, v752); + real2 v435 = minusplus(uminus(v431), v432); + real2 v433 = minusplus(v431, v432); + real2 v513 = 
minusplus(v511, v512); + real2 v515 = minusplus(uminus(v511), v512); + real2 v529 = ctimesminusplus(reverse(v515), tbl[80 + tbloffset], ctimes(v515, tbl[81 + tbloffset])); + real2 v353 = minusplus(v351, v352); + real2 v355 = minusplus(uminus(v351), v352); + real2 v369 = ctimesminusplus(reverse(v355), tbl[48 + tbloffset], ctimes(v355, tbl[49 + tbloffset])); + real2 v2631 = plus(v369, v689); + real2 v2625 = reverse(minus(v689, v369)); + real2 v449 = ctimesminusplus(reverse(v435), tbl[64 + tbloffset], ctimes(v435, tbl[65 + tbloffset])); + real2 v2710 = plus(v289, v609); + real2 v2706 = minus(v609, v289); + real2 v2630 = plus(v209, v529); + real2 v2626 = minus(v529, v209); + real2 v2790 = plus(v2630, v2631); + real2 v2786 = minus(v2631, v2630); + real2 v713 = minusplus(v711, v712); + real2 v715 = minusplus(uminus(v711), v712); + real2 v769 = ctimesminusplus(reverse(v755), tbl[128 + tbloffset], ctimes(v755, tbl[129 + tbloffset])); + real2 v2705 = reverse(minus(v769, v449)); + real2 v2711 = plus(v449, v769); + real2 v313 = minusplus(v311, v312); + real2 v315 = minusplus(uminus(v311), v312); + real2 v393 = minusplus(v391, v392); + real2 v395 = minusplus(uminus(v391), v392); + real2 v409 = ctimesminusplus(reverse(v395), tbl[56 + tbloffset], ctimes(v395, tbl[57 + tbloffset])); + real2 v729 = ctimesminusplus(reverse(v715), tbl[120 + tbloffset], ctimes(v715, tbl[121 + tbloffset])); + real2 v329 = ctimesminusplus(reverse(v315), tbl[40 + tbloffset], ctimes(v315, tbl[41 + tbloffset])); + real2 v489 = ctimesminusplus(reverse(v475), tbl[72 + tbloffset], ctimes(v475, tbl[73 + tbloffset])); + real2 v153 = minusplus(v151, v152); + real2 v155 = minusplus(uminus(v151), v152); + real2 v169 = ctimesminusplus(reverse(v155), tbl[8 + tbloffset], ctimes(v155, tbl[9 + tbloffset])); + real2 v2586 = minus(v489, v169); + real2 v2590 = plus(v169, v489); + real2 v233 = minusplus(v231, v232); + real2 v235 = minusplus(uminus(v231), v232); + real2 v633 = minusplus(v631, v632); + real2 v635 = 
minusplus(uminus(v631), v632); + real2 v649 = ctimesminusplus(reverse(v635), tbl[104 + tbloffset], ctimes(v635, tbl[105 + tbloffset])); + real2 v249 = ctimesminusplus(reverse(v235), tbl[24 + tbloffset], ctimes(v235, tbl[25 + tbloffset])); + real2 v569 = ctimesminusplus(reverse(v555), tbl[88 + tbloffset], ctimes(v555, tbl[89 + tbloffset])); + real2 v2670 = plus(v249, v569); + real2 v2666 = minus(v569, v249); + real2 v2785 = reverse(minus(v2711, v2710)); + real2 v2791 = plus(v2710, v2711); + real2 v2825 = reverse(minus(v2791, v2790)); + real2 v2831 = plus(v2790, v2791); + real2 v2671 = plus(v409, v729); + real2 v2665 = reverse(minus(v729, v409)); + real2 v2745 = reverse(minus(v2671, v2670)); + real2 v2751 = plus(v2670, v2671); + real2 v2806 = minus(v2731, v2730); + real2 v2810 = plus(v2730, v2731); + real2 v2846 = minus(v2811, v2810); + real2 v2850 = plus(v2810, v2811); + real2 v2591 = plus(v329, v649); + real2 v2585 = reverse(minus(v649, v329)); + real2 v2750 = plus(v2590, v2591); + real2 v2746 = minus(v2591, v2590); + real2 v2830 = plus(v2750, v2751); + real2 v2826 = minus(v2751, v2750); + real2 v2845 = reverse(minus(v2831, v2830)); + real2 v2851 = plus(v2830, v2831); + store(out, 3 << %SHIFT%, plus(v2850, v2851)); + real2 v2864 = minus(v2850, v2851); + store(out, 67 << %SHIFT%, ctimesminusplus(v2864, tbl[0 + tbloffset], ctimes(reverse(v2864), tbl[1 + tbloffset]))); + real2 v2849 = minusplus(uminus(v2845), v2846); + real2 v2847 = minusplus(v2845, v2846); + store(out, 35 << %SHIFT%, ctimesminusplus(reverse(v2847), tbl[506 + tbloffset], ctimes(v2847, tbl[507 + tbloffset]))); + store(out, 99 << %SHIFT%, ctimesminusplus(reverse(v2849), tbl[508 + tbloffset], ctimes(v2849, tbl[509 + tbloffset]))); + real2 v2827 = minusplus(v2825, v2826); + real2 v2829 = minusplus(uminus(v2825), v2826); + real2 v2837 = ctimesminusplus(reverse(v2827), tbl[502 + tbloffset], ctimes(v2827, tbl[503 + tbloffset])); + real2 v2809 = minusplus(uminus(v2805), v2806); + real2 v2807 = 
minusplus(v2805, v2806); + real2 v2817 = ctimesminusplus(reverse(v2807), tbl[498 + tbloffset], ctimes(v2807, tbl[499 + tbloffset])); + store(out, 19 << %SHIFT%, plus(v2817, v2837)); + real2 v2870 = minus(v2817, v2837); + store(out, 83 << %SHIFT%, ctimesminusplus(v2870, tbl[0 + tbloffset], ctimes(reverse(v2870), tbl[1 + tbloffset]))); + real2 v2823 = ctimesminusplus(reverse(v2809), tbl[500 + tbloffset], ctimes(v2809, tbl[501 + tbloffset])); + real2 v2843 = ctimesminusplus(reverse(v2829), tbl[504 + tbloffset], ctimes(v2829, tbl[505 + tbloffset])); + store(out, 51 << %SHIFT%, plus(v2823, v2843)); + real2 v2876 = minus(v2823, v2843); + store(out, 115 << %SHIFT%, ctimesminusplus(v2876, tbl[0 + tbloffset], ctimes(reverse(v2876), tbl[1 + tbloffset]))); + real2 v2787 = minusplus(v2785, v2786); + real2 v2789 = minusplus(uminus(v2785), v2786); + real2 v2803 = ctimesminusplus(reverse(v2789), tbl[496 + tbloffset], ctimes(v2789, tbl[497 + tbloffset])); + real2 v2727 = minusplus(v2725, v2726); + real2 v2729 = minusplus(uminus(v2725), v2726); + real2 v2743 = ctimesminusplus(reverse(v2729), tbl[484 + tbloffset], ctimes(v2729, tbl[485 + tbloffset])); + real2 v2914 = plus(v2743, v2783); + real2 v2910 = minus(v2783, v2743); + real2 v2749 = minusplus(uminus(v2745), v2746); + real2 v2747 = minusplus(v2745, v2746); + real2 v2763 = ctimesminusplus(reverse(v2749), tbl[488 + tbloffset], ctimes(v2749, tbl[489 + tbloffset])); + real2 v2909 = reverse(minus(v2803, v2763)); + real2 v2915 = plus(v2763, v2803); + store(out, 27 << %SHIFT%, plus(v2914, v2915)); + real2 v2928 = minus(v2914, v2915); + store(out, 91 << %SHIFT%, ctimesminusplus(v2928, tbl[0 + tbloffset], ctimes(reverse(v2928), tbl[1 + tbloffset]))); + real2 v2913 = minusplus(uminus(v2909), v2910); + store(out, 123 << %SHIFT%, ctimesminusplus(reverse(v2913), tbl[516 + tbloffset], ctimes(v2913, tbl[517 + tbloffset]))); + real2 v2911 = minusplus(v2909, v2910); + store(out, 59 << %SHIFT%, ctimesminusplus(reverse(v2911), tbl[514 + 
tbloffset], ctimes(v2911, tbl[515 + tbloffset]))); + real2 v2737 = ctimesminusplus(reverse(v2727), tbl[482 + tbloffset], ctimes(v2727, tbl[483 + tbloffset])); + real2 v2888 = plus(v2737, v2777); + real2 v2884 = minus(v2777, v2737); + real2 v2797 = ctimesminusplus(reverse(v2787), tbl[494 + tbloffset], ctimes(v2787, tbl[495 + tbloffset])); + real2 v2757 = ctimesminusplus(reverse(v2747), tbl[486 + tbloffset], ctimes(v2747, tbl[487 + tbloffset])); + real2 v2889 = plus(v2757, v2797); + real2 v2883 = reverse(minus(v2797, v2757)); + store(out, 11 << %SHIFT%, plus(v2888, v2889)); + real2 v2902 = minus(v2888, v2889); + store(out, 75 << %SHIFT%, ctimesminusplus(v2902, tbl[0 + tbloffset], ctimes(reverse(v2902), tbl[1 + tbloffset]))); + real2 v2887 = minusplus(uminus(v2883), v2884); + store(out, 107 << %SHIFT%, ctimesminusplus(reverse(v2887), tbl[512 + tbloffset], ctimes(v2887, tbl[513 + tbloffset]))); + real2 v2885 = minusplus(v2883, v2884); + store(out, 43 << %SHIFT%, ctimesminusplus(reverse(v2885), tbl[510 + tbloffset], ctimes(v2885, tbl[511 + tbloffset]))); + real2 v2669 = minusplus(uminus(v2665), v2666); + real2 v2667 = minusplus(v2665, v2666); + real2 v2707 = minusplus(v2705, v2706); + real2 v2709 = minusplus(uminus(v2705), v2706); + real2 v2717 = ctimesminusplus(reverse(v2707), tbl[478 + tbloffset], ctimes(v2707, tbl[479 + tbloffset])); + real2 v2627 = minusplus(v2625, v2626); + real2 v2629 = minusplus(uminus(v2625), v2626); + real2 v2637 = ctimesminusplus(reverse(v2627), tbl[462 + tbloffset], ctimes(v2627, tbl[463 + tbloffset])); + real2 v2961 = plus(v2637, v2717); + real2 v2955 = reverse(minus(v2717, v2637)); + real2 v2649 = minusplus(uminus(v2645), v2646); + real2 v2647 = minusplus(v2645, v2646); + real2 v2569 = minusplus(uminus(v2565), v2566); + real2 v2567 = minusplus(v2565, v2566); + real2 v2577 = ctimesminusplus(reverse(v2567), tbl[450 + tbloffset], ctimes(v2567, tbl[451 + tbloffset])); + real2 v2657 = ctimesminusplus(reverse(v2647), tbl[466 + tbloffset], 
ctimes(v2647, tbl[467 + tbloffset])); + real2 v2936 = minus(v2657, v2577); + real2 v2940 = plus(v2577, v2657); + real2 v2976 = minus(v2941, v2940); + real2 v2980 = plus(v2940, v2941); + real2 v2677 = ctimesminusplus(reverse(v2667), tbl[470 + tbloffset], ctimes(v2667, tbl[471 + tbloffset])); + real2 v2587 = minusplus(v2585, v2586); + real2 v2589 = minusplus(uminus(v2585), v2586); + real2 v2597 = ctimesminusplus(reverse(v2587), tbl[454 + tbloffset], ctimes(v2587, tbl[455 + tbloffset])); + real2 v2956 = minus(v2677, v2597); + real2 v2960 = plus(v2597, v2677); + real2 v2975 = reverse(minus(v2961, v2960)); + real2 v2981 = plus(v2960, v2961); + store(out, 7 << %SHIFT%, plus(v2980, v2981)); + real2 v2994 = minus(v2980, v2981); + store(out, 71 << %SHIFT%, ctimesminusplus(v2994, tbl[0 + tbloffset], ctimes(reverse(v2994), tbl[1 + tbloffset]))); + real2 v2979 = minusplus(uminus(v2975), v2976); + store(out, 103 << %SHIFT%, ctimesminusplus(reverse(v2979), tbl[528 + tbloffset], ctimes(v2979, tbl[529 + tbloffset]))); + real2 v2977 = minusplus(v2975, v2976); + store(out, 39 << %SHIFT%, ctimesminusplus(reverse(v2977), tbl[526 + tbloffset], ctimes(v2977, tbl[527 + tbloffset]))); + real2 v2939 = minusplus(uminus(v2935), v2936); + real2 v2937 = minusplus(v2935, v2936); + real2 v2953 = ctimesminusplus(reverse(v2939), tbl[520 + tbloffset], ctimes(v2939, tbl[521 + tbloffset])); + real2 v2957 = minusplus(v2955, v2956); + real2 v2959 = minusplus(uminus(v2955), v2956); + real2 v2973 = ctimesminusplus(reverse(v2959), tbl[524 + tbloffset], ctimes(v2959, tbl[525 + tbloffset])); + store(out, 55 << %SHIFT%, plus(v2953, v2973)); + real2 v3006 = minus(v2953, v2973); + store(out, 119 << %SHIFT%, ctimesminusplus(v3006, tbl[0 + tbloffset], ctimes(reverse(v3006), tbl[1 + tbloffset]))); + real2 v2947 = ctimesminusplus(reverse(v2937), tbl[518 + tbloffset], ctimes(v2937, tbl[519 + tbloffset])); + real2 v2967 = ctimesminusplus(reverse(v2957), tbl[522 + tbloffset], ctimes(v2957, tbl[523 + tbloffset])); + 
store(out, 23 << %SHIFT%, plus(v2947, v2967)); + real2 v3000 = minus(v2947, v2967); + store(out, 87 << %SHIFT%, ctimesminusplus(v3000, tbl[0 + tbloffset], ctimes(reverse(v3000), tbl[1 + tbloffset]))); + real2 v2663 = ctimesminusplus(reverse(v2649), tbl[468 + tbloffset], ctimes(v2649, tbl[469 + tbloffset])); + real2 v2583 = ctimesminusplus(reverse(v2569), tbl[452 + tbloffset], ctimes(v2569, tbl[453 + tbloffset])); + real2 v3014 = minus(v2663, v2583); + real2 v3018 = plus(v2583, v2663); + real2 v3015 = minusplus(v3013, v3014); + real2 v3017 = minusplus(uminus(v3013), v3014); + real2 v2643 = ctimesminusplus(reverse(v2629), tbl[464 + tbloffset], ctimes(v2629, tbl[465 + tbloffset])); + real2 v2723 = ctimesminusplus(reverse(v2709), tbl[480 + tbloffset], ctimes(v2709, tbl[481 + tbloffset])); + real2 v3039 = plus(v2643, v2723); + real2 v3033 = reverse(minus(v2723, v2643)); + real2 v2683 = ctimesminusplus(reverse(v2669), tbl[472 + tbloffset], ctimes(v2669, tbl[473 + tbloffset])); + real2 v3031 = ctimesminusplus(reverse(v3017), tbl[532 + tbloffset], ctimes(v3017, tbl[533 + tbloffset])); + real2 v2603 = ctimesminusplus(reverse(v2589), tbl[456 + tbloffset], ctimes(v2589, tbl[457 + tbloffset])); + real2 v3034 = minus(v2683, v2603); + real2 v3038 = plus(v2603, v2683); + real2 v3037 = minusplus(uminus(v3033), v3034); + real2 v3035 = minusplus(v3033, v3034); + real2 v3051 = ctimesminusplus(reverse(v3037), tbl[536 + tbloffset], ctimes(v3037, tbl[537 + tbloffset])); + store(out, 63 << %SHIFT%, plus(v3031, v3051)); + real2 v3084 = minus(v3031, v3051); + store(out, 127 << %SHIFT%, ctimesminusplus(v3084, tbl[0 + tbloffset], ctimes(reverse(v3084), tbl[1 + tbloffset]))); + real2 v3025 = ctimesminusplus(reverse(v3015), tbl[530 + tbloffset], ctimes(v3015, tbl[531 + tbloffset])); + real2 v3045 = ctimesminusplus(reverse(v3035), tbl[534 + tbloffset], ctimes(v3035, tbl[535 + tbloffset])); + store(out, 31 << %SHIFT%, plus(v3025, v3045)); + real2 v3078 = minus(v3025, v3045); + store(out, 95 << 
%SHIFT%, ctimesminusplus(v3078, tbl[0 + tbloffset], ctimes(reverse(v3078), tbl[1 + tbloffset]))); + real2 v3058 = plus(v3018, v3019); + real2 v3054 = minus(v3019, v3018); + real2 v3053 = reverse(minus(v3039, v3038)); + real2 v3059 = plus(v3038, v3039); + real2 v3055 = minusplus(v3053, v3054); + store(out, 47 << %SHIFT%, ctimesminusplus(reverse(v3055), tbl[538 + tbloffset], ctimes(v3055, tbl[539 + tbloffset]))); + real2 v3057 = minusplus(uminus(v3053), v3054); + store(out, 111 << %SHIFT%, ctimesminusplus(reverse(v3057), tbl[540 + tbloffset], ctimes(v3057, tbl[541 + tbloffset]))); + store(out, 15 << %SHIFT%, plus(v3058, v3059)); + real2 v3072 = minus(v3058, v3059); + store(out, 79 << %SHIFT%, ctimesminusplus(v3072, tbl[0 + tbloffset], ctimes(reverse(v3072), tbl[1 + tbloffset]))); + real2 v683 = ctimesminusplus(reverse(v673), tbl[110 + tbloffset], ctimes(v673, tbl[111 + tbloffset])); + real2 v363 = ctimesminusplus(reverse(v353), tbl[46 + tbloffset], ctimes(v353, tbl[47 + tbloffset])); + real2 v2105 = plus(v363, v683); + real2 v2099 = reverse(minus(v683, v363)); + real2 v283 = ctimesminusplus(reverse(v273), tbl[30 + tbloffset], ctimes(v273, tbl[31 + tbloffset])); + real2 v723 = ctimesminusplus(reverse(v713), tbl[118 + tbloffset], ctimes(v713, tbl[119 + tbloffset])); + real2 v403 = ctimesminusplus(reverse(v393), tbl[54 + tbloffset], ctimes(v393, tbl[55 + tbloffset])); + real2 v603 = ctimesminusplus(reverse(v593), tbl[94 + tbloffset], ctimes(v593, tbl[95 + tbloffset])); + real2 v2180 = minus(v603, v283); + real2 v2184 = plus(v283, v603); + real2 v2145 = plus(v403, v723); + real2 v2139 = reverse(minus(v723, v403)); + real2 v543 = ctimesminusplus(reverse(v533), tbl[82 + tbloffset], ctimes(v533, tbl[83 + tbloffset])); + real2 v383 = ctimesminusplus(reverse(v373), tbl[50 + tbloffset], ctimes(v373, tbl[51 + tbloffset])); + real2 v703 = ctimesminusplus(reverse(v693), tbl[114 + tbloffset], ctimes(v693, tbl[115 + tbloffset])); + real2 v2125 = plus(v383, v703); + real2 v2119 = 
reverse(minus(v703, v383)); + real2 v223 = ctimesminusplus(reverse(v213), tbl[18 + tbloffset], ctimes(v213, tbl[19 + tbloffset])); + real2 v2120 = minus(v543, v223); + real2 v2124 = plus(v223, v543); + real2 v443 = ctimesminusplus(reverse(v433), tbl[62 + tbloffset], ctimes(v433, tbl[63 + tbloffset])); + real2 v203 = ctimesminusplus(reverse(v193), tbl[14 + tbloffset], ctimes(v193, tbl[15 + tbloffset])); + real2 v763 = ctimesminusplus(reverse(v753), tbl[126 + tbloffset], ctimes(v753, tbl[127 + tbloffset])); + real2 v2179 = reverse(minus(v763, v443)); + real2 v2185 = plus(v443, v763); + real2 v523 = ctimesminusplus(reverse(v513), tbl[78 + tbloffset], ctimes(v513, tbl[79 + tbloffset])); + real2 v2100 = minus(v523, v203); + real2 v2104 = plus(v203, v523); + real2 v2264 = plus(v2104, v2105); + real2 v2260 = minus(v2105, v2104); + real2 v643 = ctimesminusplus(reverse(v633), tbl[102 + tbloffset], ctimes(v633, tbl[103 + tbloffset])); + real2 v2265 = plus(v2184, v2185); + real2 v2259 = reverse(minus(v2185, v2184)); + real2 v563 = ctimesminusplus(reverse(v553), tbl[86 + tbloffset], ctimes(v553, tbl[87 + tbloffset])); + real2 v243 = ctimesminusplus(reverse(v233), tbl[22 + tbloffset], ctimes(v233, tbl[23 + tbloffset])); + real2 v2144 = plus(v243, v563); + real2 v2140 = minus(v563, v243); + real2 v143 = ctimesminusplus(reverse(v133), tbl[2 + tbloffset], ctimes(v133, tbl[3 + tbloffset])); + real2 v183 = ctimesminusplus(reverse(v173), tbl[10 + tbloffset], ctimes(v173, tbl[11 + tbloffset])); + real2 v2084 = plus(v183, v503); + real2 v2080 = minus(v503, v183); + real2 v163 = ctimesminusplus(reverse(v153), tbl[6 + tbloffset], ctimes(v153, tbl[7 + tbloffset])); + real2 v303 = ctimesminusplus(reverse(v293), tbl[34 + tbloffset], ctimes(v293, tbl[35 + tbloffset])); + real2 v623 = ctimesminusplus(reverse(v613), tbl[98 + tbloffset], ctimes(v613, tbl[99 + tbloffset])); + real2 v2039 = reverse(minus(v623, v303)); + real2 v2045 = plus(v303, v623); + real2 v463 = ctimesminusplus(reverse(v453), 
tbl[66 + tbloffset], ctimes(v453, tbl[67 + tbloffset])); + real2 v2044 = plus(v143, v463); + real2 v2040 = minus(v463, v143); + real2 v2204 = plus(v2044, v2045); + real2 v2200 = minus(v2045, v2044); + real2 v323 = ctimesminusplus(reverse(v313), tbl[38 + tbloffset], ctimes(v313, tbl[39 + tbloffset])); + real2 v2205 = plus(v2124, v2125); + real2 v2199 = reverse(minus(v2125, v2124)); + real2 v2280 = minus(v2205, v2204); + real2 v2284 = plus(v2204, v2205); + real2 v2225 = plus(v2144, v2145); + real2 v2219 = reverse(minus(v2145, v2144)); + real2 v2305 = plus(v2264, v2265); + real2 v2299 = reverse(minus(v2265, v2264)); + real2 v2240 = minus(v2085, v2084); + real2 v2244 = plus(v2084, v2085); + real2 v2279 = reverse(minus(v2245, v2244)); + real2 v2285 = plus(v2244, v2245); + real2 v2281 = minusplus(v2279, v2280); + real2 v2283 = minusplus(uminus(v2279), v2280); + real2 v2291 = ctimesminusplus(reverse(v2281), tbl[406 + tbloffset], ctimes(v2281, tbl[407 + tbloffset])); + real2 v483 = ctimesminusplus(reverse(v473), tbl[70 + tbloffset], ctimes(v473, tbl[71 + tbloffset])); + real2 v2060 = minus(v483, v163); + real2 v2064 = plus(v163, v483); + real2 v2065 = plus(v323, v643); + real2 v2059 = reverse(minus(v643, v323)); + real2 v2220 = minus(v2065, v2064); + real2 v2224 = plus(v2064, v2065); + real2 v2304 = plus(v2224, v2225); + real2 v2300 = minus(v2225, v2224); + real2 v2301 = minusplus(v2299, v2300); + real2 v2303 = minusplus(uminus(v2299), v2300); + real2 v2311 = ctimesminusplus(reverse(v2301), tbl[410 + tbloffset], ctimes(v2301, tbl[411 + tbloffset])); + store(out, 17 << %SHIFT%, plus(v2291, v2311)); + real2 v2344 = minus(v2291, v2311); + store(out, 81 << %SHIFT%, ctimesminusplus(v2344, tbl[0 + tbloffset], ctimes(reverse(v2344), tbl[1 + tbloffset]))); + real2 v2297 = ctimesminusplus(reverse(v2283), tbl[408 + tbloffset], ctimes(v2283, tbl[409 + tbloffset])); + real2 v2317 = ctimesminusplus(reverse(v2303), tbl[412 + tbloffset], ctimes(v2303, tbl[413 + tbloffset])); + store(out, 
49 << %SHIFT%, plus(v2297, v2317)); + real2 v2350 = minus(v2297, v2317); + store(out, 113 << %SHIFT%, ctimesminusplus(v2350, tbl[0 + tbloffset], ctimes(reverse(v2350), tbl[1 + tbloffset]))); + real2 v2320 = minus(v2285, v2284); + real2 v2324 = plus(v2284, v2285); + real2 v2325 = plus(v2304, v2305); + real2 v2319 = reverse(minus(v2305, v2304)); + store(out, 1 << %SHIFT%, plus(v2324, v2325)); + real2 v2338 = minus(v2324, v2325); + store(out, 65 << %SHIFT%, ctimesminusplus(v2338, tbl[0 + tbloffset], ctimes(reverse(v2338), tbl[1 + tbloffset]))); + real2 v2321 = minusplus(v2319, v2320); + store(out, 33 << %SHIFT%, ctimesminusplus(reverse(v2321), tbl[414 + tbloffset], ctimes(v2321, tbl[415 + tbloffset]))); + real2 v2323 = minusplus(uminus(v2319), v2320); + store(out, 97 << %SHIFT%, ctimesminusplus(reverse(v2323), tbl[416 + tbloffset], ctimes(v2323, tbl[417 + tbloffset]))); + real2 v2201 = minusplus(v2199, v2200); + real2 v2203 = minusplus(uminus(v2199), v2200); + real2 v2263 = minusplus(uminus(v2259), v2260); + real2 v2261 = minusplus(v2259, v2260); + real2 v2243 = minusplus(uminus(v2239), v2240); + real2 v2241 = minusplus(v2239, v2240); + real2 v2257 = ctimesminusplus(reverse(v2243), tbl[400 + tbloffset], ctimes(v2243, tbl[401 + tbloffset])); + real2 v2217 = ctimesminusplus(reverse(v2203), tbl[392 + tbloffset], ctimes(v2203, tbl[393 + tbloffset])); + real2 v2388 = plus(v2217, v2257); + real2 v2384 = minus(v2257, v2217); + real2 v2277 = ctimesminusplus(reverse(v2263), tbl[404 + tbloffset], ctimes(v2263, tbl[405 + tbloffset])); + real2 v2221 = minusplus(v2219, v2220); + real2 v2223 = minusplus(uminus(v2219), v2220); + real2 v2237 = ctimesminusplus(reverse(v2223), tbl[396 + tbloffset], ctimes(v2223, tbl[397 + tbloffset])); + real2 v2389 = plus(v2237, v2277); + real2 v2383 = reverse(minus(v2277, v2237)); + store(out, 25 << %SHIFT%, plus(v2388, v2389)); + real2 v2402 = minus(v2388, v2389); + store(out, 89 << %SHIFT%, ctimesminusplus(v2402, tbl[0 + tbloffset], 
ctimes(reverse(v2402), tbl[1 + tbloffset]))); + real2 v2385 = minusplus(v2383, v2384); + real2 v2387 = minusplus(uminus(v2383), v2384); + store(out, 121 << %SHIFT%, ctimesminusplus(reverse(v2387), tbl[424 + tbloffset], ctimes(v2387, tbl[425 + tbloffset]))); + store(out, 57 << %SHIFT%, ctimesminusplus(reverse(v2385), tbl[422 + tbloffset], ctimes(v2385, tbl[423 + tbloffset]))); + real2 v2251 = ctimesminusplus(reverse(v2241), tbl[398 + tbloffset], ctimes(v2241, tbl[399 + tbloffset])); + real2 v2211 = ctimesminusplus(reverse(v2201), tbl[390 + tbloffset], ctimes(v2201, tbl[391 + tbloffset])); + real2 v2358 = minus(v2251, v2211); + real2 v2362 = plus(v2211, v2251); + real2 v2271 = ctimesminusplus(reverse(v2261), tbl[402 + tbloffset], ctimes(v2261, tbl[403 + tbloffset])); + real2 v2231 = ctimesminusplus(reverse(v2221), tbl[394 + tbloffset], ctimes(v2221, tbl[395 + tbloffset])); + real2 v2357 = reverse(minus(v2271, v2231)); + real2 v2363 = plus(v2231, v2271); + store(out, 9 << %SHIFT%, plus(v2362, v2363)); + real2 v2376 = minus(v2362, v2363); + store(out, 73 << %SHIFT%, ctimesminusplus(v2376, tbl[0 + tbloffset], ctimes(reverse(v2376), tbl[1 + tbloffset]))); + real2 v2361 = minusplus(uminus(v2357), v2358); + store(out, 105 << %SHIFT%, ctimesminusplus(reverse(v2361), tbl[420 + tbloffset], ctimes(v2361, tbl[421 + tbloffset]))); + real2 v2359 = minusplus(v2357, v2358); + store(out, 41 << %SHIFT%, ctimesminusplus(reverse(v2359), tbl[418 + tbloffset], ctimes(v2359, tbl[419 + tbloffset]))); + real2 v2121 = minusplus(v2119, v2120); + real2 v2123 = minusplus(uminus(v2119), v2120); + real2 v2083 = minusplus(uminus(v2079), v2080); + real2 v2081 = minusplus(v2079, v2080); + real2 v2091 = ctimesminusplus(reverse(v2081), tbl[366 + tbloffset], ctimes(v2081, tbl[367 + tbloffset])); + real2 v2043 = minusplus(uminus(v2039), v2040); + real2 v2041 = minusplus(v2039, v2040); + real2 v2051 = ctimesminusplus(reverse(v2041), tbl[358 + tbloffset], ctimes(v2041, tbl[359 + tbloffset])); + real2 
v2131 = ctimesminusplus(reverse(v2121), tbl[374 + tbloffset], ctimes(v2121, tbl[375 + tbloffset])); + real2 v2163 = minusplus(uminus(v2159), v2160); + real2 v2161 = minusplus(v2159, v2160); + real2 v2171 = ctimesminusplus(reverse(v2161), tbl[382 + tbloffset], ctimes(v2161, tbl[383 + tbloffset])); + real2 v2409 = reverse(minus(v2171, v2091)); + real2 v2415 = plus(v2091, v2171); + real2 v2410 = minus(v2131, v2051); + real2 v2414 = plus(v2051, v2131); + real2 v2454 = plus(v2414, v2415); + real2 v2450 = minus(v2415, v2414); + real2 v2181 = minusplus(v2179, v2180); + real2 v2183 = minusplus(uminus(v2179), v2180); + real2 v2191 = ctimesminusplus(reverse(v2181), tbl[386 + tbloffset], ctimes(v2181, tbl[387 + tbloffset])); + real2 v2103 = minusplus(uminus(v2099), v2100); + real2 v2101 = minusplus(v2099, v2100); + real2 v2111 = ctimesminusplus(reverse(v2101), tbl[370 + tbloffset], ctimes(v2101, tbl[371 + tbloffset])); + real2 v2435 = plus(v2111, v2191); + real2 v2429 = reverse(minus(v2191, v2111)); + real2 v2141 = minusplus(v2139, v2140); + real2 v2143 = minusplus(uminus(v2139), v2140); + real2 v2151 = ctimesminusplus(reverse(v2141), tbl[378 + tbloffset], ctimes(v2141, tbl[379 + tbloffset])); + real2 v2063 = minusplus(uminus(v2059), v2060); + real2 v2061 = minusplus(v2059, v2060); + real2 v2071 = ctimesminusplus(reverse(v2061), tbl[362 + tbloffset], ctimes(v2061, tbl[363 + tbloffset])); + real2 v2434 = plus(v2071, v2151); + real2 v2430 = minus(v2151, v2071); + real2 v2455 = plus(v2434, v2435); + real2 v2449 = reverse(minus(v2435, v2434)); + store(out, 5 << %SHIFT%, plus(v2454, v2455)); + real2 v2468 = minus(v2454, v2455); + store(out, 69 << %SHIFT%, ctimesminusplus(v2468, tbl[0 + tbloffset], ctimes(reverse(v2468), tbl[1 + tbloffset]))); + real2 v2451 = minusplus(v2449, v2450); + real2 v2453 = minusplus(uminus(v2449), v2450); + store(out, 101 << %SHIFT%, ctimesminusplus(reverse(v2453), tbl[436 + tbloffset], ctimes(v2453, tbl[437 + tbloffset]))); + store(out, 37 << %SHIFT%, 
ctimesminusplus(reverse(v2451), tbl[434 + tbloffset], ctimes(v2451, tbl[435 + tbloffset]))); + real2 v2411 = minusplus(v2409, v2410); + real2 v2413 = minusplus(uminus(v2409), v2410); + real2 v2433 = minusplus(uminus(v2429), v2430); + real2 v2431 = minusplus(v2429, v2430); + real2 v2421 = ctimesminusplus(reverse(v2411), tbl[426 + tbloffset], ctimes(v2411, tbl[427 + tbloffset])); + real2 v2441 = ctimesminusplus(reverse(v2431), tbl[430 + tbloffset], ctimes(v2431, tbl[431 + tbloffset])); + store(out, 21 << %SHIFT%, plus(v2421, v2441)); + real2 v2474 = minus(v2421, v2441); + store(out, 85 << %SHIFT%, ctimesminusplus(v2474, tbl[0 + tbloffset], ctimes(reverse(v2474), tbl[1 + tbloffset]))); + real2 v2427 = ctimesminusplus(reverse(v2413), tbl[428 + tbloffset], ctimes(v2413, tbl[429 + tbloffset])); + real2 v2447 = ctimesminusplus(reverse(v2433), tbl[432 + tbloffset], ctimes(v2433, tbl[433 + tbloffset])); + store(out, 53 << %SHIFT%, plus(v2427, v2447)); + real2 v2480 = minus(v2427, v2447); + store(out, 117 << %SHIFT%, ctimesminusplus(v2480, tbl[0 + tbloffset], ctimes(reverse(v2480), tbl[1 + tbloffset]))); + real2 v2057 = ctimesminusplus(reverse(v2043), tbl[360 + tbloffset], ctimes(v2043, tbl[361 + tbloffset])); + real2 v2097 = ctimesminusplus(reverse(v2083), tbl[368 + tbloffset], ctimes(v2083, tbl[369 + tbloffset])); + real2 v2157 = ctimesminusplus(reverse(v2143), tbl[380 + tbloffset], ctimes(v2143, tbl[381 + tbloffset])); + real2 v2197 = ctimesminusplus(reverse(v2183), tbl[388 + tbloffset], ctimes(v2183, tbl[389 + tbloffset])); + real2 v2117 = ctimesminusplus(reverse(v2103), tbl[372 + tbloffset], ctimes(v2103, tbl[373 + tbloffset])); + real2 v2507 = reverse(minus(v2197, v2117)); + real2 v2513 = plus(v2117, v2197); + real2 v2137 = ctimesminusplus(reverse(v2123), tbl[376 + tbloffset], ctimes(v2123, tbl[377 + tbloffset])); + real2 v2488 = minus(v2137, v2057); + real2 v2492 = plus(v2057, v2137); + real2 v2177 = ctimesminusplus(reverse(v2163), tbl[384 + tbloffset], ctimes(v2163, 
tbl[385 + tbloffset])); + real2 v2493 = plus(v2097, v2177); + real2 v2487 = reverse(minus(v2177, v2097)); + real2 v2532 = plus(v2492, v2493); + real2 v2528 = minus(v2493, v2492); + real2 v2077 = ctimesminusplus(reverse(v2063), tbl[364 + tbloffset], ctimes(v2063, tbl[365 + tbloffset])); + real2 v2512 = plus(v2077, v2157); + real2 v2508 = minus(v2157, v2077); + real2 v2527 = reverse(minus(v2513, v2512)); + real2 v2533 = plus(v2512, v2513); + real2 v2529 = minusplus(v2527, v2528); + real2 v2531 = minusplus(uminus(v2527), v2528); + store(out, 109 << %SHIFT%, ctimesminusplus(reverse(v2531), tbl[448 + tbloffset], ctimes(v2531, tbl[449 + tbloffset]))); + store(out, 45 << %SHIFT%, ctimesminusplus(reverse(v2529), tbl[446 + tbloffset], ctimes(v2529, tbl[447 + tbloffset]))); + store(out, 13 << %SHIFT%, plus(v2532, v2533)); + real2 v2546 = minus(v2532, v2533); + store(out, 77 << %SHIFT%, ctimesminusplus(v2546, tbl[0 + tbloffset], ctimes(reverse(v2546), tbl[1 + tbloffset]))); + real2 v2509 = minusplus(v2507, v2508); + real2 v2511 = minusplus(uminus(v2507), v2508); + real2 v2491 = minusplus(uminus(v2487), v2488); + real2 v2489 = minusplus(v2487, v2488); + real2 v2499 = ctimesminusplus(reverse(v2489), tbl[438 + tbloffset], ctimes(v2489, tbl[439 + tbloffset])); + real2 v2519 = ctimesminusplus(reverse(v2509), tbl[442 + tbloffset], ctimes(v2509, tbl[443 + tbloffset])); + store(out, 29 << %SHIFT%, plus(v2499, v2519)); + real2 v2552 = minus(v2499, v2519); + store(out, 93 << %SHIFT%, ctimesminusplus(v2552, tbl[0 + tbloffset], ctimes(reverse(v2552), tbl[1 + tbloffset]))); + real2 v2505 = ctimesminusplus(reverse(v2491), tbl[440 + tbloffset], ctimes(v2491, tbl[441 + tbloffset])); + real2 v2525 = ctimesminusplus(reverse(v2511), tbl[444 + tbloffset], ctimes(v2511, tbl[445 + tbloffset])); + store(out, 61 << %SHIFT%, plus(v2505, v2525)); + real2 v2558 = minus(v2505, v2525); + store(out, 125 << %SHIFT%, ctimesminusplus(v2558, tbl[0 + tbloffset], ctimes(reverse(v2558), tbl[1 + tbloffset]))); + 
// Pres : 76263 + } +}