From 914fc6fde1908eafb2f6188fe5bf507fe83745ee Mon Sep 17 00:00:00 2001 From: Mikael Mortensen Date: Wed, 15 May 2024 18:45:30 +0200 Subject: [PATCH] Adding fast multipole method for Legendre-Chebyshev transforms (#83) * Adding FMM * Update fmm.c * Adding new tests * Trigger * Adding stdint.h * Fixing mac CI --- .github/workflows/ci.yml | 6 +- Make.inc | 41 +- Makefile | 2 + src/fmm.c | 67 ++ src/fmm.h | 22 + src/fmm_source.c | 1729 ++++++++++++++++++++++++++++++++++++++ src/fmm_source.h | 58 ++ test/test_fmm.c | 39 + test/test_fmm_source.c | 177 ++++ 9 files changed, 2120 insertions(+), 21 deletions(-) create mode 100644 src/fmm.c create mode 100644 src/fmm.h create mode 100644 src/fmm_source.c create mode 100644 src/fmm_source.h create mode 100644 test/test_fmm.c create mode 100644 test/test_fmm_source.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3edbbab2..60e8de7f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: CC: ${{ matrix.compiler }} LD_LIBRARY_PATH: $LD_LIBRARY_PATH:. steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies on ubuntu gcc if: startsWith(matrix.os, 'ubuntu') && startsWith(matrix.compiler, 'gcc') run: | @@ -65,13 +65,13 @@ jobs: name: Documentation runs-on: macOS-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install Doxygen run: brew install doxygen - name: Build site run: doxygen Doxyfile - if: ${{ github.ref == 'refs/heads/master' }} - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: docs/html diff --git a/Make.inc b/Make.inc index d3be1378..097e0532 100644 --- a/Make.inc +++ b/Make.inc @@ -84,7 +84,7 @@ ASM = src/recurrence/recurrence_default.s \ src/rotations/rotations_AVX512F.s \ src/rotations/rotations_NEON.s -SRC = src/recurrence.c src/transforms.c src/rotations.c src/permute.c src/tdc.c src/drivers.c src/fftw.c src/isometries.c src/helmholtzhodge.c +SRC = src/fmm.c src/recurrence.c src/transforms.c src/rotations.c src/permute.c src/tdc.c src/drivers.c src/fftw.c src/isometries.c src/helmholtzhodge.c machine := $(shell $(CC) -dumpmachine | cut -d'-' -f1) @@ -120,23 +120,28 @@ ifdef FT_PREFIX endif else ifeq ($(UNAME), Darwin) - CFLAGS += -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include - LDFLAGS += -L/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib - ifeq ($(FT_USE_APPLEBLAS), 1) - CFLAGS += -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers - LDFLAGS += -L/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A + ifeq (,$(shell which conda)) + CFLAGS += -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include + LDFLAGS += -L/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib + ifeq ($(FT_USE_APPLEBLAS), 1) + CFLAGS += -I/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers + LDFLAGS += -L/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A + else + CFLAGS += -I/opt/homebrew/opt/openblas/include + LDFLAGS += -L/opt/homebrew/opt/openblas/lib + endif + CFLAGS += -I/opt/homebrew/opt/libomp/include + LDFLAGS += -L/opt/homebrew/opt/libomp/lib + CFLAGS += -I/opt/homebrew/opt/fftw/include + LDFLAGS += -L/opt/homebrew/opt/fftw/lib + CFLAGS += -I/opt/homebrew/opt/mpfr/include + LDFLAGS += -L/opt/homebrew/opt/mpfr/lib + CFLAGS += -I/opt/homebrew/opt/gmp/include + LDFLAGS += -L/opt/homebrew/opt/gmp/lib else - CFLAGS += -I/usr/local/opt/openblas/include - LDFLAGS += -L/usr/local/opt/openblas/lib + CFLAGS += -I$(CONDA_PREFIX)/include + LDFLAGS += -L$(CONDA_PREFIX)/lib endif - CFLAGS += -I/usr/local/opt/libomp/include - LDFLAGS += -L/usr/local/opt/libomp/lib - CFLAGS += -I/usr/local/opt/fftw/include - LDFLAGS += -L/usr/local/opt/fftw/lib - CFLAGS += -I/usr/local/opt/mpfr/include - LDFLAGS += -L/usr/local/opt/mpfr/lib - CFLAGS += -I/usr/local/opt/gmp/include - LDFLAGS += -L/usr/local/opt/gmp/lib else ifeq ($(UNAME), Windows) CFLAGS += -Ic:\tools\vcpkg\installed\x64-windows\include\openblas CFLAGS += -Ic:\tools\vcpkg\installed\x64-windows\include @@ -164,7 +169,7 @@ ifndef FT_OPENMP endif endif -LDLIBS += $(FT_OPENMP) -lm -lmpfr -l$(FT_BLAS) -lfftw3 +LDLIBS += $(FT_OPENMP) -lm -lmpfr -l$(FT_BLAS) -lfftw3 -lfftw3f ifdef FT_QUADMATH AFLAGS += -DFT_QUADMATH @@ -174,5 +179,5 @@ ifdef FT_QUADMATH endif ifneq ($(FT_FFTW_WITH_COMBINED_THREADS), 1) - LDLIBS += -lfftw3_threads + LDLIBS += -lfftw3_threads -lfftw3f_threads endif diff --git a/Makefile b/Makefile index d1cc9f72..1ed9e1f5 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,7 @@ lib: $(CC) $(CFLAGS) $(LIBFLAGS) $(ASM) $(SRC) $(LDFLAGS) $(LDLIBS) -o lib$(LIB).$(SLIB) tests: + $(CC) src/ftutilities.c test/test_fmm.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_fmm $(CC) src/ftutilities.c test/test_recurrence.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_recurrence $(CC) src/ftutilities.c test/test_transforms.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_transforms $(CC) src/ftutilities.c test/test_permute.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o test_permute @@ -58,6 +59,7 @@ examples: $(CC) src/ftutilities.c examples/subspaceangles.c $(CFLAGS) -L$(LIBDIR) -l$(LIB) $(LDFLAGS) $(LDLIBS) -o subspaceangles runtests: + ./test_fmm ./test_recurrence ./test_transforms ./test_permute diff --git a/src/fmm.c b/src/fmm.c new file mode 100644 index 00000000..64133b05 --- /dev/null +++ b/src/fmm.c @@ -0,0 +1,67 @@ +#include "fasttransforms.h" +#include "ftinternal.h" +#include "fmm.h" + +size_t get_number_of_blocks(const size_t level) { return pow(2, level + 1) - 1; } + +size_t get_h(const size_t level, const size_t L) { return pow(2, L - level - 1); } + +size_t get_number_of_submatrices(const size_t level) { return 3 * get_number_of_blocks(level); } + +size_t get_total_number_of_blocks(const size_t L) { return pow(2, L + 1) - (L + 2); } + +size_t get_total_number_of_submatrices(const size_t L) { return 3 * get_total_number_of_blocks(L); } + +void get_ij(size_t *ij, const size_t level, const size_t block, const size_t s, const size_t L) +{ + size_t h = get_h(level, L); + ij[0] = 2 * block * s * h; + ij[1] = ij[0] + 2 * s * h; +} + +/*#if defined(FT_QUADMATH) + #define FLT quadruple + #define X(name) FT_CONCAT(ft_, name, q) + #define Y(name) FT_CONCAT(, name, q) + #include "fmm_source.c" + #undef FLT + #undef X + #undef Y + + #define FLT long double + #define X(name) FT_CONCAT(ft_, name, l) + #define X2(name) FT_CONCAT(ft_, name, q) + #define Y(name) FT_CONCAT(, name, l) + #include "fmm_source.c" + #undef FLT + #undef X + #undef X2 + #undef Y +#else + #define FLT long double + #define X(name) FT_CONCAT(ft_, name, l) + #define X2(name) FT_CONCAT(ft_, name, l) + #define Y(name) FT_CONCAT(, name, l) + #include "fmm_source.c" + #undef FLT + #undef X + #undef X2 + #undef Y +#endif*/ + + +#define FLT double +#define X(name) FT_CONCAT(ft_, name, ) +#define FT_USE_DOUBLE +#include "fmm_source.c" +#undef FLT +#undef X +#undef FT_USE_DOUBLE + +#define FLT float +#define X(name) FT_CONCAT(ft_, name, f) +#define FT_USE_SINGLE +#include "fmm_source.c" +#undef FLT +#undef X +#undef FT_USE_SINGLE diff --git a/src/fmm.h b/src/fmm.h new file mode 100644 index 00000000..3de19b05 --- /dev/null +++ b/src/fmm.h @@ -0,0 +1,22 @@ +#ifndef FMM_H +#define FMM_H + +#include +#include +#include +#include +#include + +enum { L2C = 0, C2L = 1, BOTH = 2 }; + +#ifdef CLOCK_UPTIME_RAW +#define tic clock_gettime_nsec_np(CLOCK_UPTIME_RAW) +#define dtics(a, b) (double)(b - a) / 1.0E9 +#define toc(a) (double)(tic - a) / 1.0E9 +#else +#define tic clock() +#define dtics(a, b) (double)(b - a) / (double)CLOCKS_PER_SEC +#define toc(a) (double)(tic - a) / (double)CLOCKS_PER_SEC +#endif + +#endif \ No newline at end of file diff --git a/src/fmm_source.c b/src/fmm_source.c new file mode 100644 index 00000000..f4842bdd --- /dev/null +++ b/src/fmm_source.c @@ -0,0 +1,1729 @@ +#include "fmm_source.h" + +// C ← α*A*B + β*C, C ← α*Aᵀ*B + β*C +#if defined(FT_USE_SINGLE) +// y ← α*A*x + β*y, y ← α*Aᵀ*x + β*y +void X(rm_gemv)(char TRANS, int m, int n, FLT alpha, FLT *A, int LDA, FLT *x, FLT beta, FLT *y) +{ + if (TRANS == 'N') + cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, A, LDA, x, 1, beta, y, 1); + else if (TRANS == 'T') + cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, A, LDA, x, 1, beta, y, 1); +} + +void X(rm_gemm)(char TRANS, int m, int n, int p, FLT alpha, FLT *A, int LDA, FLT *B, int LDB, + FLT beta, FLT *C, int LDC) +{ + if (TRANS == 'N') + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, LDA, B, LDB, beta, C, + LDC); + else if (TRANS == 'T') + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, p, alpha, A, LDA, B, LDB, beta, C, + LDC); +} + +FLT *X(fftw_malloc)(int N) { return (FLT *)fftwf_malloc(N); } + +typedef fftwf_plan X(fftw_plan); + +fftwf_plan X(fftw_plan_r2r_1d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftwf_plan_r2r_1d(M, fun, fun_hat, FFTW_REDFT10, FFTW_MEASURE); +} + +fftwf_plan X(fftw_plan_r2r_2d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftwf_plan_r2r_2d(M, M, fun, fun_hat, FFTW_REDFT10, FFTW_REDFT10, FFTW_MEASURE); +} + +void X(fftw_execute)(fftwf_plan plan) { fftwf_execute(plan); } + +void X(fftw_destroy_plan)(fftwf_plan plan) { fftwf_destroy_plan(plan); } + +#elif defined(FT_USE_DOUBLE) + +void X(rm_gemv)(char TRANS, int m, int n, FLT alpha, FLT *A, int LDA, FLT *x, FLT beta, FLT *y) +{ + if (TRANS == 'N') + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, A, LDA, x, 1, beta, y, 1); + else if (TRANS == 'T') + cblas_dgemv(CblasRowMajor, CblasTrans, n, m, alpha, A, LDA, x, 1, beta, y, 1); +} + +void X(rm_gemm)(char TRANS, int m, int n, int p, FLT alpha, FLT *A, int LDA, FLT *B, int LDB, + FLT beta, FLT *C, int LDC) +{ + if (TRANS == 'N') + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, p, alpha, A, LDA, B, LDB, beta, C, + LDC); + else if (TRANS == 'T') + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, p, alpha, A, LDA, B, LDB, beta, C, + LDC); +} + +FLT *X(fftw_malloc)(int N) { return (FLT *)fftw_malloc(N); } + +typedef fftw_plan X(fftw_plan); + +fftw_plan X(fftw_plan_r2r_1d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftw_plan_r2r_1d(M, fun, fun_hat, FFTW_REDFT10, FFTW_MEASURE); +} + +fftw_plan X(fftw_plan_r2r_2d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftw_plan_r2r_2d(M, M, fun, fun_hat, FFTW_REDFT10, FFTW_REDFT10, FFTW_MEASURE); +} + +void X(fftw_execute)(fftw_plan plan) { fftw_execute(plan); } + +void X(fftw_destroy_plan)(fftw_plan plan) { fftw_destroy_plan(plan); } + +// Compact storage of lower triangular transform matrix. (18 x 18 + 18) / 2 +// items +const double BMe[171] __attribute__((aligned)) = { + 1.0000000000000000e+00, -5.0000000000000000e-01, 5.0000000000000000e-01, + -2.5000000000000000e-01, -1.0000000000000000e+00, 2.5000000000000000e-01, + 2.5000000000000000e-01, 3.7500000000000000e-01, -7.5000000000000000e-01, + 1.2500000000000000e-01, 1.8750000000000000e-01, 5.0000000000000000e-01, + 7.5000000000000000e-01, -5.0000000000000000e-01, 6.2500000000000000e-02, + -1.8750000000000000e-01, -3.1250000000000000e-01, 0.0000000000000000e+00, + 7.8125000000000000e-01, -3.1250000000000000e-01, 3.1250000000000000e-02, + -1.5625000000000000e-01, -3.7500000000000000e-01, -5.1562500000000000e-01, + -4.3750000000000000e-01, 6.5625000000000000e-01, -1.8750000000000000e-01, + 1.5625000000000000e-02, 1.5625000000000000e-01, 2.7343750000000000e-01, + 1.0937500000000000e-01, -2.7343750000000000e-01, -6.5625000000000000e-01, + 4.9218750000000000e-01, -1.0937500000000000e-01, 7.8125000000000000e-03, + 1.3671875000000000e-01, 3.1250000000000000e-01, 4.0625000000000000e-01, + 4.3750000000000000e-01, 1.0937500000000000e-01, -6.8750000000000000e-01, + 3.4375000000000000e-01, -6.2500000000000000e-02, 3.9062500000000000e-03, + -1.3671875000000000e-01, -2.4609375000000000e-01, -1.4062500000000000e-01, + 9.3750000000000000e-02, 4.2187500000000000e-01, 4.2187500000000000e-01, + -6.0937500000000000e-01, 2.2851562500000000e-01, -3.5156250000000000e-02, + 1.9531250000000000e-03, -1.2304687500000000e-01, -2.7343750000000000e-01, + -3.4179687500000000e-01, -3.9062500000000000e-01, -2.7343750000000000e-01, + 1.7187500000000000e-01, 5.9082031250000000e-01, -4.8828125000000000e-01, + 1.4648437500000000e-01, -1.9531250000000000e-02, 9.7656250000000000e-04, + 1.2304687500000000e-01, 2.2558593750000000e-01, 1.5039062500000000e-01, + -1.0742187500000000e-02, -2.5781250000000000e-01, -4.3505859375000000e-01, + -1.3964843750000000e-01, 6.2841796875000000e-01, -3.6523437500000000e-01, + 9.1308593750000000e-02, -1.0742187500000000e-02, 4.8828125000000000e-04, + 1.1279296875000000e-01, 2.4609375000000000e-01, 2.9882812500000000e-01, + 3.4765625000000000e-01, 3.0834960937500000e-01, 6.4453125000000000e-02, + -3.5449218750000000e-01, -3.9257812500000000e-01, 5.7861328125000000e-01, + -2.5976562500000000e-01, 5.5664062500000000e-02, -5.8593750000000000e-03, + 2.4414062500000000e-04, -1.1279296875000000e-01, -2.0947265625000000e-01, + -1.5234375000000000e-01, -3.3325195312500000e-02, 1.5551757812500000e-01, + 3.4753417968750000e-01, 3.3007812500000000e-01, -1.2377929687500000e-01, + -5.3955078125000000e-01, 4.8559570312500000e-01, -1.7773437500000000e-01, + 3.3325195312500000e-02, -3.1738281250000000e-03, 1.2207031250000000e-04, + -1.0473632812500000e-01, -2.2558593750000000e-01, -2.6788330078125000e-01, + -3.1274414062500000e-01, -3.0676269531250000e-01, -1.6918945312500000e-01, + 1.3629150390625000e-01, 4.1162109375000000e-01, 1.4184570312500000e-01, + -5.8447265625000000e-01, 3.8153076171875000e-01, -1.1791992187500000e-01, + 1.9653320312500000e-02, -1.7089843750000000e-03, 6.1035156250000000e-05, + 1.0473632812500000e-01, 1.9638061523437500e-01, 1.5106201171875000e-01, + 5.8746337890625000e-02, -8.9721679687500000e-02, -2.6358032226562500e-01, + -3.4515380859375000e-01, -1.4877319335937500e-01, 3.1127929687500000e-01, + 3.6056518554687500e-01, -5.5499267578125000e-01, 2.8518676757812500e-01, + -7.6293945312500000e-02, 1.1444091796875000e-02, -9.1552734375000000e-04, + 3.0517578125000000e-05, 9.8190307617187500e-02, 2.0947265625000000e-01, + 2.4438476562500000e-01, 2.8466796875000000e-01, 2.9406738281250000e-01, + 2.1533203125000000e-01, 2.6855468750000000e-03, -2.7978515625000000e-01, + -3.4722900390625000e-01, 1.0205078125000000e-01, 4.9633789062500000e-01, + -4.8291015625000000e-01, 2.0495605468750000e-01, -4.8339843750000000e-02, + 6.5917968750000000e-03, -4.8828125000000000e-04, 1.5258789062500000e-05, + -9.8190307617187500e-02, -1.8547058105468750e-01, -1.4837646484375000e-01, + -7.4188232421875000e-02, 4.5654296875000000e-02, 1.9662475585937500e-01, + 3.1024169921875000e-01, 2.5628662109375000e-01, -5.2917480468750000e-02, + -3.8806152343750000e-01, -1.3177490234375000e-01, 5.4837036132812500e-01, + -3.9428710937500000e-01, 1.4266967773437500e-01, -3.0090332031250000e-02, + 3.7612915039062500e-03, -2.5939941406250000e-04, 7.6293945312500000e-06}; + +// Transpose of BMe +const double BMeT[171] __attribute__((aligned)) = { + 1.0000000000000000e+00, -5.0000000000000000e-01, -2.5000000000000000e-01, + 2.5000000000000000e-01, 1.8750000000000000e-01, -1.8750000000000000e-01, + -1.5625000000000000e-01, 1.5625000000000000e-01, 1.3671875000000000e-01, + -1.3671875000000000e-01, -1.2304687500000000e-01, 1.2304687500000000e-01, + 1.1279296875000000e-01, -1.1279296875000000e-01, -1.0473632812500000e-01, + 1.0473632812500000e-01, 9.8190307617187500e-02, -9.8190307617187500e-02, + 5.0000000000000000e-01, -1.0000000000000000e+00, 3.7500000000000000e-01, + 5.0000000000000000e-01, -3.1250000000000000e-01, -3.7500000000000000e-01, + 2.7343750000000000e-01, 3.1250000000000000e-01, -2.4609375000000000e-01, + -2.7343750000000000e-01, 2.2558593750000000e-01, 2.4609375000000000e-01, + -2.0947265625000000e-01, -2.2558593750000000e-01, 1.9638061523437500e-01, + 2.0947265625000000e-01, -1.8547058105468750e-01, 2.5000000000000000e-01, + -7.5000000000000000e-01, 7.5000000000000000e-01, 0.0000000000000000e+00, + -5.1562500000000000e-01, 1.0937500000000000e-01, 4.0625000000000000e-01, + -1.4062500000000000e-01, -3.4179687500000000e-01, 1.5039062500000000e-01, + 2.9882812500000000e-01, -1.5234375000000000e-01, -2.6788330078125000e-01, + 1.5106201171875000e-01, 2.4438476562500000e-01, -1.4837646484375000e-01, + 1.2500000000000000e-01, -5.0000000000000000e-01, 7.8125000000000000e-01, + -4.3750000000000000e-01, -2.7343750000000000e-01, 4.3750000000000000e-01, + 9.3750000000000000e-02, -3.9062500000000000e-01, -1.0742187500000000e-02, + 3.4765625000000000e-01, -3.3325195312500000e-02, -3.1274414062500000e-01, + 5.8746337890625000e-02, 2.8466796875000000e-01, -7.4188232421875000e-02, + 6.2500000000000000e-02, -3.1250000000000000e-01, 6.5625000000000000e-01, + -6.5625000000000000e-01, 1.0937500000000000e-01, 4.2187500000000000e-01, + -2.7343750000000000e-01, -2.5781250000000000e-01, 3.0834960937500000e-01, + 1.5551757812500000e-01, -3.0676269531250000e-01, -8.9721679687500000e-02, + 2.9406738281250000e-01, 4.5654296875000000e-02, 3.1250000000000000e-02, + -1.8750000000000000e-01, 4.9218750000000000e-01, -6.8750000000000000e-01, + 4.2187500000000000e-01, 1.7187500000000000e-01, -4.3505859375000000e-01, + 6.4453125000000000e-02, 3.4753417968750000e-01, -1.6918945312500000e-01, + -2.6358032226562500e-01, 2.1533203125000000e-01, 1.9662475585937500e-01, + 1.5625000000000000e-02, -1.0937500000000000e-01, 3.4375000000000000e-01, + -6.0937500000000000e-01, 5.9082031250000000e-01, -1.3964843750000000e-01, + -3.5449218750000000e-01, 3.3007812500000000e-01, 1.3629150390625000e-01, + -3.4515380859375000e-01, 2.6855468750000000e-03, 3.1024169921875000e-01, + 7.8125000000000000e-03, -6.2500000000000000e-02, 2.2851562500000000e-01, + -4.8828125000000000e-01, 6.2841796875000000e-01, -3.9257812500000000e-01, + -1.2377929687500000e-01, 4.1162109375000000e-01, -1.4877319335937500e-01, + -2.7978515625000000e-01, 2.5628662109375000e-01, 3.9062500000000000e-03, + -3.5156250000000000e-02, 1.4648437500000000e-01, -3.6523437500000000e-01, + 5.7861328125000000e-01, -5.3955078125000000e-01, 1.4184570312500000e-01, + 3.1127929687500000e-01, -3.4722900390625000e-01, -5.2917480468750000e-02, + 1.9531250000000000e-03, -1.9531250000000000e-02, 9.1308593750000000e-02, + -2.5976562500000000e-01, 4.8559570312500000e-01, -5.8447265625000000e-01, + 3.6056518554687500e-01, 1.0205078125000000e-01, -3.8806152343750000e-01, + 9.7656250000000000e-04, -1.0742187500000000e-02, 5.5664062500000000e-02, + -1.7773437500000000e-01, 3.8153076171875000e-01, -5.5499267578125000e-01, + 4.9633789062500000e-01, -1.3177490234375000e-01, 4.8828125000000000e-04, + -5.8593750000000000e-03, 3.3325195312500000e-02, -1.1791992187500000e-01, + 2.8518676757812500e-01, -4.8291015625000000e-01, 5.4837036132812500e-01, + 2.4414062500000000e-04, -3.1738281250000000e-03, 1.9653320312500000e-02, + -7.6293945312500000e-02, 2.0495605468750000e-01, -3.9428710937500000e-01, + 1.2207031250000000e-04, -1.7089843750000000e-03, 1.1444091796875000e-02, + -4.8339843750000000e-02, 1.4266967773437500e-01, 6.1035156250000000e-05, + -9.1552734375000000e-04, 6.5917968750000000e-03, -3.0090332031250000e-02, + 3.0517578125000000e-05, -4.8828125000000000e-04, 3.7612915039062500e-03, + 1.5258789062500000e-05, -2.5939941406250000e-04, 7.6293945312500000e-06}; + +// Mixed radix 2 and 3 algorithm +void dct_radix23(double *input, double *output, size_t st) +{ + double out[18], z[9], B[3], C[3], T[9]; + double f[3], g[3], h[3], t[3]; + + static const double M_SQRT3_2 = 8.660254037844385966e-01; + // sqrt(3)*sin(pi*(i+0.5)/9) + static const double SIN[3] __attribute__((aligned)) = { + 3.007674663608705945e-01, 8.660254037844385966e-01, 1.326827896337876789e+00}; + // sqrt(3)*sin(2*pi*(i+0.5)/9) + static const double SIN2[3] __attribute__((aligned)) = { + 5.923962654520477100e-01, 1.500000000000000000e+00, 1.705737063904886330e+00}; + // cos(pi*(i+0.5)/9) + static const double COS[3] __attribute__((aligned)) = { + 9.848077530122080203e-01, 8.660254037844385966e-01, 6.427876096865393629e-01}; + // cos(2*pi*(i+0.5)/9) + static const double COS2[3] __attribute__((aligned)) = { + 9.396926207859084279e-01, 5.000000000000000000e-01, -1.736481776669303589e-01}; + // 2*cos(pi*(i+0.5)/18) + static const double COSH[9] __attribute__((aligned)) = { + 1.992389396183491090e+00, 1.931851652578136624e+00, 1.812615574073299873e+00, + 1.638304088577983597e+00, 1.414213562373095145e+00, 1.147152872702092097e+00, + 8.452365234813988826e-01, 5.176380902050414790e-01, 1.743114854953163595e-01}; + + // even indices + for (size_t i = 0; i < 9; i++) + { // radix 2 + z[i] = input[i * st] + input[(17 - i) * st]; + } + + // radix 3 on the 9 even indices + for (size_t i = 0; i < 3; i++) + { + f[i] = z[i] + z[5 - i] + z[6 + i]; + t[i] = 2 * z[i] - z[5 - i] - z[6 + i]; + } + + for (size_t i = 0; i < 3; i++) + { + double tmp = z[5 - i] - z[6 + i]; + g[i] = t[i] * COS[i] + tmp * SIN[i]; + h[i] = t[i] * COS2[i] - tmp * SIN2[i]; + } + + B[0] = g[0] + g[1] + g[2]; + B[1] = M_SQRT3_2 * (g[0] - g[2]); + B[2] = 0.5 * (g[0] + g[2]) - g[1]; + C[0] = h[0] + h[1] + h[2]; + C[1] = M_SQRT3_2 * (h[0] - h[2]); + C[2] = 0.5 * (h[0] + h[2]) - h[1]; + out[0] = f[0] + f[1] + f[2]; + out[3] = M_SQRT3_2 * (f[0] - f[2]); + out[6] = 0.5 * (f[0] + f[2]) - f[1]; + out[1] = B[0] * 0.5; + out[2] = C[0] * 0.5; + out[4] = B[1] - out[2]; + out[5] = C[1] - out[1]; + out[7] = B[2] - out[5]; + out[8] = C[2] - out[4]; + + // odd indices + for (size_t i = 0; i < 9; i++) + { + z[i] = (input[i * st] - input[(17 - i) * st]) * COSH[i]; + } + + // radix 3 on the 9 odd indices + for (size_t i = 0; i < 3; i++) + { + f[i] = z[i] + z[5 - i] + z[6 + i]; + } + out[9] = (f[0] + f[1] + f[2]) * 0.5; // T[0] is out[9] + T[3] = M_SQRT3_2 * (f[0] - f[2]); + T[6] = 0.5 * (f[0] + f[2]) - f[1]; + + for (size_t i = 0; i < 3; i++) + { + t[i] = 3 * z[i] - f[i]; + } + + for (size_t i = 0; i < 3; i++) + { + double tmp = z[5 - i] - z[6 + i]; + g[i] = t[i] * COS[i] + tmp * SIN[i]; + h[i] = t[i] * COS2[i] - tmp * SIN2[i]; + } + + B[0] = g[0] + g[1] + g[2]; + B[1] = M_SQRT3_2 * (g[0] - g[2]); + B[2] = 0.5 * (g[0] + g[2]) - g[1]; + C[0] = h[0] + h[1] + h[2]; + C[1] = M_SQRT3_2 * (h[0] - h[2]); + C[2] = 0.5 * (h[0] + h[2]) - h[1]; + T[1] = B[0] * 0.5; + T[2] = C[0] * 0.5; + T[4] = B[1] - T[2]; + T[5] = C[1] - T[1]; + T[7] = B[2] - T[5]; + T[8] = C[2] - T[4]; + + for (size_t i = 1; i < 9; i++) + { + out[9 + i] = T[i] - out[8 + i]; + } + + for (size_t i = 0; i < 9; i++) + { + output[2 * i * st] = out[i]; + output[(2 * i + 1) * st] = out[9 + i]; + } +} + +void dct2(double *input, double *output) +{ + double tmp[324]; + static const double sc = 1.0 / 81.0; + for (size_t i = 0; i < 18; i++) + { + dct_radix23(input + i * 18, tmp + i * 18, 1); + } + for (size_t i = 0; i < 18; i++) + { + dct_radix23(tmp + i, output + i, 18); + } + for (size_t i = 0; i < 324; i++) + { + output[i] *= sc; + } + for (size_t i = 0; i < 18; i++) + { + output[i] *= 0.5; + output[i * 18] *= 0.5; + } +} + +#else + +void X(rm_gemv)(char TRANS, int m, int n, FLT alpha, FLT *A, int LDA, FLT *x, FLT beta, FLT *y) +{ + if (TRANS == 'N') + X(gemv)('T', m, n, alpha, A, LDA, x, beta, y); + else + X(gemv)('N', m, n, alpha, A, LDA, x, beta, y); +} + +void X(rm_gemm)(char TRANS, int m, int n, int p, FLT alpha, FLT *A, int LDA, FLT *B, int LDB, + FLT beta, FLT *C, int LDC) +{ + if (TRANS == 'T') + X(gemm)('N', m, n, p, alpha, A, LDA, B, LDB, beta, C, LDC); + else + X(gemm)('T', m, n, p, alpha, A, LDA, B, LDB, beta, C, LDC); +} + +FLT *X(fftwl_malloc)(int N) { return (FLT *)fftwl_malloc(N); } + +typedef fftwl_plan X(fftw_plan); + +fftwl_plan X(fftw_plan_r2r_1d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftwl_plan_r2r_1d(M, fun, fun_hat, FFTW_REDFT10, FFTW_MEASURE); +} + +fftwl_plan X(fftw_plan_r2r_2d)(int M, FLT *fun, FLT *fun_hat) +{ + return fftwl_plan_r2r_2d(M, M, fun, fun_hat, FFTW_REDFT10, FFTW_REDFT10, FFTW_MEASURE); +} + +void X(fftw_execute)(fftwl_plan plan) { fftwl_execute(plan); } + +void X(fftw_destroy_plan)(fftwl_plan plan) { fftwl_destroy_plan(plan); } + +#endif + +FLT X(_Lambda0)(const FLT z) { return exp(lgamma(z + 0.5) - lgamma(z + 1)); } + +FLT X(Lambda)(const FLT z) +{ + FLT zz, y; + const FLT zp = z + 0.25; + const FLT s = 1 / sqrt(zp); + const FLT z2 = 1 / (zp * zp); + + if (z < 10) return X(Lambda)(z + 1) * (z + 1) / (z + 0.5); + + y = 1 - 1.5625e-02 * z2; + if (z > 2014) // 2000 + goto scalereturn; + zz = z2 * z2; + y += 2.5634765625e-03 * zz; + if (z > 141) // 92 + goto scalereturn; + zz *= z2; + y -= 1.2798309326171875e-03 * zz; + if (z > 40) // 31 + goto scalereturn; + zz *= z2; + y += 1.343511044979095458984375e-03 * zz; + if (z > 20) goto scalereturn; + zz *= z2; + y -= 2.4328966392204165e-03 * zz; + if (z > 14) goto scalereturn; + zz *= z2; + y += 6.754237533641572e-03 * zz; + goto scalereturn; +scalereturn: + return y * s; +} + +// Lambda with integer argument. For initializing direct solver +FLT X(LambdaI)(const size_t z) +{ + const static FLT LamInt[256] = { + 1.7724538509055161e+00, 8.8622692545275805e-01, 6.6467019408956851e-01, + 5.5389182840797380e-01, 4.8465534985697706e-01, 4.3618981487127934e-01, + 3.9984066363200604e-01, 3.7128061622971992e-01, 3.4807557771536241e-01, + 3.2873804562006448e-01, 3.1230114333906128e-01, 2.9810563682364938e-01, + 2.8568456862266400e-01, 2.7469670059871537e-01, 2.6488610414876129e-01, + 2.5605656734380255e-01, 2.4805479961430874e-01, 2.4075907021388790e-01, + 2.3407131826350211e-01, 2.2791154673025205e-01, 2.2221375806199575e-01, + 2.1692295429861491e-01, 2.1199288715546458e-01, 2.0738434613034576e-01, + 2.0306383891929691e-01, 1.9900256214091097e-01, 1.9517558979204730e-01, + 1.9156122701812048e-01, 1.8814049082136833e-01, 1.8489668925548267e-01, + 1.8181507776789130e-01, 1.7888257651357048e-01, 1.7608753625554593e-01, + 1.7341954328197706e-01, 1.7086925588077151e-01, 1.6842826651104620e-01, + 1.6608898503172612e-01, 1.6384453928805415e-01, 1.6168869008689554e-01, + 1.5961575816270457e-01, 1.5762056118567075e-01, 1.5569835921999184e-01, + 1.5384480732451575e-01, 1.5205591421609116e-01, 1.5032800609999922e-01, + 1.4865769492111033e-01, 1.4704185041109827e-01, 1.4547757540672487e-01, + 1.4396218399623817e-01, 1.4249318211872553e-01, 1.4106825029753828e-01, + 1.3968522823579768e-01, 1.3834210104122271e-01, 1.3703698688045646e-01, + 1.3576812589082260e-01, 1.3453387020090604e-01, 1.3333267493125509e-01, + 1.3216309006343707e-01, 1.3102375308013156e-01, 1.2991338229131691e-01, + 1.2883077077222260e-01, 1.2777478084786012e-01, 1.2674433906682897e-01, + 1.2573843161391765e-01, 1.2475610011693392e-01, 1.2379643780834211e-01, + 1.2285858600676376e-01, 1.2194173088731031e-01, 1.2104510051313890e-01, + 1.2016796210362340e-01, 1.1930961951716895e-01, 1.1846941092901987e-01, + 1.1764670668645723e-01, 1.1684090732559109e-01, 1.1605144173555330e-01, + 1.1527776545731629e-01, 1.1451935910562341e-01, 1.1377572690363885e-01, + 1.1304639532092321e-01, 1.1233091180623384e-01, 1.1162884360744486e-01, + 1.1093977667159645e-01, 1.1026331461872085e-01, 1.0959907778366831e-01, + 1.0894670232067029e-01, 1.0830583936584282e-01, 1.0767615425325071e-01, + 1.0705732578053088e-01, 1.0644904552041423e-01, 1.0585101717479392e-01, + 1.0526295596826729e-01, 1.0468458807833175e-01, 1.0411565009964517e-01, + 1.0355588853996966e-01, 1.0300505934560812e-01, 1.0246292745431544e-01, + 1.0192926637382421e-01, 1.0140385778426843e-01, 1.0088649116292012e-01, + 1.0037696342977405e-01, 9.9875078612625179e-02, 9.9380647530384461e-02, + 9.8893487493470808e-02, 9.8413422020201535e-02, 9.7940280568181340e-02, + 9.7473898279761426e-02, 9.7014115740705953e-02, 9.6560778751263399e-02, + 9.6113738108896438e-02, 9.5672849401974888e-02, 9.5237972813784100e-02, + 9.4808972936244532e-02, 9.4385718592779153e-02, 9.3968082669802250e-02, + 9.3555941956338207e-02, 9.3149176991310659e-02, 9.2747671918072247e-02, + 9.2351314345772789e-02, 9.1959995217189006e-02, 9.1573608682663010e-02, + 9.1192051979818570e-02, 9.0815225318744947e-02, 9.0443031772356644e-02, + 9.0075377171656007e-02, 8.9712170005641273e-02, 8.9353321325618698e-02, + 8.8998744653691647e-02, 8.8648355895212541e-02, 8.8302073254996866e-02, + 8.7959817157109280e-02, 8.7621510168043482e-02, 8.7287076923127288e-02, + 8.6956444055994217e-02, 8.6629540130971683e-02, 8.6306295578244180e-02, + 8.5986642631658089e-02, 8.5670515269041708e-02, 8.5357849154921117e-02, + 8.5048581585519228e-02, 8.4742651435931030e-02, 8.4439999109374136e-02, + 8.4140566488418903e-02, 8.3844296888107572e-02, 8.3551135010876423e-02, + 8.3261026903199767e-02, 8.2973919913878397e-02, 8.2689762653899351e-02, + 8.2408504957797654e-02, 8.2130097846453740e-02, 8.1854493491264307e-02, + 8.1581645179626752e-02, 8.1311507281680975e-02, 8.1044035218254387e-02, + 8.0779185429959446e-02, 8.0516915347394635e-02, 8.0257183362403048e-02, + 7.9999948800344056e-02, 7.9745171893336589e-02, 7.9492813754433622e-02, + 7.9242836352690124e-02, 7.8995202489087965e-02, 7.8749875773283351e-02, + 7.8506820601143584e-02, 7.8266002133041912e-02, 7.8027386272880209e-02, + 7.7790939647810864e-02, 7.7556629588630716e-02, 7.7324424110820439e-02, + 7.7094291896204911e-02, 7.6866202275210224e-02, 7.6640125209694890e-02, + 7.6416031276333216e-02, 7.6193891650529921e-02, 7.5973678090846306e-02, + 7.5755362923918587e-02, 7.5538919029850243e-02, 7.5324319828060898e-02, + 7.5111539263574847e-02, 7.4900551793733353e-02, 7.4691332375315098e-02, + 7.4483856452050343e-02, 7.4278099942514289e-02, 7.4074039228386498e-02, + 7.3871651143063044e-02, 7.3670912960609070e-02, 7.3471802385039850e-02, + 7.3274297539918778e-02, 7.3078376958261235e-02, 7.2884019572733952e-02, + 7.2691204706139420e-02, 7.2499912062175889e-02, 7.2310121716463394e-02, + 7.2121814107826768e-02, 7.1934970029827211e-02, 7.1749570622533843e-02, + 7.1565597364527347e-02, 7.1383032065128041e-02, 7.1201856856840912e-02, + 7.1022054188010511e-02, 7.0843606815678820e-02, 7.0666497798639621e-02, + 7.0490710490682812e-02, 7.0316228534022709e-02, 7.0143035852904420e-02, + 6.9971116647382606e-02, 6.9800455387267035e-02, 6.9631036806229979e-02, + 6.9462845896070005e-02, 6.9295867901127531e-02, 6.9130088312847310e-02, + 6.8965492864483391e-02, 6.8802067525941965e-02, 6.8639798498758134e-02, + 6.8478672211202365e-02, 6.8318675313512642e-02, 6.8159794673248661e-02, + 6.8002017370764292e-02, 6.7845330694794787e-02, 6.7689722138155342e-02, + 6.7535179393547681e-02, 6.7381690349471446e-02, 6.7229243086237345e-02, + 6.7077825872079153e-02, 6.6927427159361480e-02, 6.6778035580880774e-02, + 6.6629639946256591e-02, 6.6482229238410892e-02, 6.6335792610132449e-02, + 6.6190319380724269e-02, 6.6045799032731417e-02, 6.5902221208747211e-02, + 6.5759575708295381e-02, 6.5617852484786132e-02, 6.5477041642544101e-02, + 6.5337133433906180e-02, 6.5198118256387230e-02, 6.5059986649911833e-02, + 6.4922729294110332e-02, 6.4786337005677333e-02, 6.4650800735790978e-02, + 6.4516111567591405e-02, 6.4382260713716735e-02, 6.4249239513895010e-02, + 6.4117039432590700e-02, 6.3985652056704242e-02, 6.3855069093323211e-02, + 6.3725282367523783e-02, 6.3596283820221103e-02, 6.3468065506067428e-02, + 6.3340619591396613e-02, 6.3213938352213811e-02, 6.3088014172229326e-02, + 6.2962839540935220e-02, 6.2838407051723888e-02, 6.2714709400047267e-02, + 6.2591739381615802e-02}; + if (z > 255) return X(Lambda)((const FLT)z); + return LamInt[z]; +} + +size_t X(directM)(const FLT *input_array, FLT *output_array, X(fmm_plan) * fmmplan, + const size_t strides) +{ + size_t s = fmmplan->s; + size_t N = fmmplan->N; + size_t flops = 0; + size_t h = 2 * s; + size_t nL = N / h; + const FLT *a = fmmplan->dplan->a; + + for (size_t block = 0; block < nL - 1; block++) + { + size_t i0 = block * h; + for (size_t n = 0; n < 4 * s; n = n + 2) + { + const size_t n1 = n / 2; + const FLT *ap = &a[i0 + n1]; + const FLT a0 = a[n1]; + size_t i; + if (strides == 1) + { + FLT *vp = &output_array[i0]; + const FLT *up = &input_array[i0 + n]; + if (n < h) + { + for (i = 0; i < h; i++) + { + vp[i] += a0 * ap[i] * up[i]; + } + } + else + { + for (i = 0; i < 2 * h - n; i++) + { + vp[i] += a0 * ap[i] * up[i]; + } + } + } + else + { + FLT *vp = &output_array[i0 * strides]; + const FLT *up = &input_array[(i0 + n) * strides]; + if (n < h) + { + for (i = 0; i < h; i++) + { + vp[i * strides] += a0 * ap[i] * up[i * strides]; + } + } + else + { + for (i = 0; i < 2 * h - n; i++) + { + vp[i * strides] += a0 * ap[i] * up[i * strides]; + } + } + } + flops += i * 3; + } + } + + // Last block + size_t i0 = (nL - 1) * h; + for (size_t n = 0; n < N - i0; n++) + { + const FLT *ap = &a[n + i0]; + const FLT a0 = a[n]; + const FLT *cp = &input_array[(i0 + 2 * n) * strides]; + FLT *op = &output_array[i0 * strides]; + if ((int)N - (2 * n + i0) <= 0) break; + if (strides == 1) + { + for (int i = 0; i < N - (2 * n + i0); i++) + { + op[i] += a0 * ap[i] * cp[i]; + } + } + else + { + for (int i = 0; i < N - (2 * n + i0); i++) + { + op[i * strides] += a0 * ap[i] * cp[i * strides]; + } + } + flops += (N - (2 * n + i0)) * 3; + } + + FLT *op = &output_array[0]; + if (strides == 1) + { + { + for (size_t i = 0; i < N; i++) + output_array[i] *= M_2_PI; + } + } + else + { + for (size_t i = 0; i < N; i++) + { + (*op) *= M_2_PI; + op += strides; + } + } + output_array[0] *= 0.5; + return flops; +} + +size_t X(directL)(const FLT *input, FLT *output_array, X(fmm_plan) * fmmplan, size_t strides) +{ + const FLT sqrt_pi = 1.77245385090551602729816e0; + size_t s = fmmplan->s; + size_t N = fmmplan->N; + const FLT *an = fmmplan->dplan->an; + const FLT *dn = fmmplan->dplan->dn; + size_t h = 2 * s; + size_t nL = N / h; + size_t flops = 0; + FLT *op = &output_array[0]; + const FLT *ia = &input[0]; + const FLT *ap = &an[0]; + if (strides == 1) + { + for (size_t i = 0; i < N; i++) + (*op++) += sqrt_pi * (*ia++) * (*ap++); + } + else + { + for (size_t i = 0; i < N; i++) + { + (*op) += sqrt_pi * (*ia++) * (*ap++); + op += strides; + } + } + flops += N * 3; + + for (size_t block = 0; block < nL - 1; block++) + { + size_t i0 = block * h; + for (size_t n = 2; n < 4 * s; n = n + 2) + { + const size_t n1 = n / 2; + const FLT *ap = &an[i0 + n1]; + const FLT d0 = dn[n1]; + size_t i; + if (strides == 1) + { + FLT *vp = &output_array[i0]; + const FLT *ia = &input[i0 + n]; + if (n < h) + { + for (i = 0; i < h; i++) + { + (*vp++) -= d0 * (*ia++) * (*ap++); + } + } + else + { + for (i = 0; i < 2 * h - n; i++) + { + (*vp++) -= d0 * (*ia++) * (*ap++); + } + } + } + else + { + FLT *vp = &output_array[i0 * strides]; + const FLT *ia = &input[i0 + n]; + if (n < h) + { + for (i = 0; i < h; i++) + { + (*vp) -= d0 * (*ia++) * (*ap++); + vp += strides; + } + } + else + { + for (i = 0; i < 2 * h - n; i++) + { + (*vp) -= d0 * (*ia++) * (*ap++); + vp += strides; + } + } + } + flops += i * 3; + } + } + + // Last block + size_t i0 = (nL - 1) * h; + for (size_t n = 1; n < N - i0; n++) + { + const FLT *ap = &an[n + i0]; + const FLT *ia = &input[i0 + 2 * n]; + FLT *op = &output_array[i0 * strides]; + if (strides == 1) + { + for (size_t i = i0; i < N - 2 * n; i++) + { + (*op++) -= dn[n] * (*ap++) * (*ia++); + } + } + else + { + for (size_t i = i0; i < N - 2 * n; i++) + { + (*op) -= dn[n] * (*ap++) * (*ia++); + op += strides; + } + } + flops += (N - (2 * n + i0)) * 3; + } + + // Multiply result by (x+1/2) + op = &output_array[0]; + if (strides == 1) + { + for (size_t i = 0; i < N; i++) + { + (*op++) *= (i + 0.5); + } + } + else + { + for (size_t i = 0; i < N; i++) + { + (*op) *= (i + 0.5); + op += strides; + } + } + return flops; +} + +void X(matvectri)(const FLT *A, const FLT *x, FLT *b, FLT *w, const size_t m, const bool upper) +{ + // compact triangular matrix A + size_t i, j; + if (upper == false) + { + FLT *zp = &w[0]; + FLT *zm = &w[m]; + const FLT *xp = &x[m]; + + if (m % 2 == 0) + { + for (i = 0; i < m; i = i + 2) + { + zp[i] = x[i] + xp[i]; + zm[i] = x[i] - xp[i]; + zp[i + 1] = x[i + 1] - xp[i + 1]; + zm[i + 1] = x[i + 1] + xp[i + 1]; + } + const FLT *a0 = &A[0]; + for (i = 0; i < m; i = i + 2) + { + const FLT *z0 = &zp[0]; + const FLT *z1 = &zm[0]; + const FLT *a1 = a0 + i + 1; + FLT s0 = (*a0++) * (*z0++); + FLT s1 = (*a1++) * (*z1++); + for (size_t j = 1; j < i + 1; j++) + { + s0 += (*a0++) * (*z0++); + s1 += (*a1++) * (*z1++); + } + s1 += (*a1++) * (*z1); + a0 = a1; + b[i] = s0; + b[i + 1] = s1; + } + } + else + { + for (i = 0; i < m - 2; i = i + 2) + { + zp[i] = x[i] + xp[i]; + zm[i] = x[i] - xp[i]; + zp[i + 1] = x[i + 1] - xp[i + 1]; + zm[i + 1] = x[i + 1] + xp[i + 1]; + } + zp[i] = x[i] + xp[i]; + zm[i] = x[i] - xp[i]; + + const FLT *a0 = &A[0]; + for (i = 0; i < m - 2; i = i + 2) + { + const FLT *a1 = a0 + i + 1; + FLT s0 = (*a0++) * zp[0]; + FLT s1 = (*a1++) * zm[0]; + for (j = 1; j < i + 1; j++) + { + s0 += (*a0++) * zp[j]; + s1 += (*a1++) * zm[j]; + } + s1 += (*a1++) * zm[j]; + a0 = a1; + b[i] = s0; + b[i + 1] = s1; + } + FLT s0 = (*a0++) * zp[0]; + for (j = 1; j < i + 1; j++) + { + s0 += (*a0++) * zp[j]; + } + b[i] = s0; + } + } + else + { + FLT *bp = &b[m]; + const FLT *ap = &A[0]; + for (i = 0; i < m; i++) + { + FLT se = 0.0; + FLT so = 0.0; + for (j = i; j < m - 1; j = j + 2) + { + se += (*ap++) * x[j]; + so += (*ap++) * x[j + 1]; + } + if ((i + m) % 2 == 1) se += (*ap++) * x[j]; + (*b++) += se + so; + (*bp++) += se - so; + } + } +} + +void X(vandermonde)(FLT *T, const size_t h, const size_t N) +{ + FLT *x = (FLT *)X(fftw_malloc)(2 * h * sizeof(FLT)); + FLT *x2 = (FLT *)X(fftw_malloc)(2 * h * sizeof(FLT)); + FLT *Tm = (FLT *)X(fftw_malloc)(2 * h * N * sizeof(FLT)); + + for (size_t i = 0; i < 2 * h; i++) + { + x[i] = -1 + (FLT)(i) / ((FLT)h); + x2[i] = 2 * x[i]; + Tm[i * N] = 1; + Tm[i * N + 1] = x[i]; + } + + for (size_t i = 0; i < 2 * h; i++) + { + for (size_t j = 2; j < N; j++) + { + Tm[i * N + j] = Tm[i * N + j - 1] * x2[i] - Tm[i * N + j - 2]; + } + } + + for (size_t i = 0; i < h; i++) + { + for (size_t j = 0; j < N; j++) + { + T[i * N + j] = Tm[2 * i * N + j]; // even + T[(h + i) * N + j] = Tm[(2 * i + 1) * N + j]; // odd + } + } + fftw_free(x); + fftw_free(x2); + fftw_free(Tm); +} + +void X(free_direct)(X(direct_plan) * plan) +{ + if (plan->a != NULL) + { + fftw_free(plan->a); + plan->a = NULL; + } + if (plan->an != NULL) + { + fftw_free(plan->an); + plan->an = NULL; + } + if (plan->dn != NULL) + { + fftw_free(plan->dn); + plan->dn = NULL; + } + free(plan); + plan = NULL; +} + +void X(free_fmm_2d)(X(fmm_plan_2d) * plan) +{ + if (plan->fmmplan0 == plan->fmmplan1) + { + X(free_fmm)(plan->fmmplan0); + plan->fmmplan1 = NULL; + } + else if (plan->fmmplan0 != NULL) + { + X(free_fmm)(plan->fmmplan0); + } + else if (plan->fmmplan1 != NULL) + { + X(free_fmm)(plan->fmmplan1); + } + free(plan); + plan = NULL; +} + +void X(free_fmm)(X(fmm_plan) * plan) +{ + if (plan->A[0] != NULL) + { + fftw_free(plan->A[0]); + plan->A[0] = NULL; + } + if (plan->A[1] != NULL) + { + fftw_free(plan->A[1]); + plan->A[1] = NULL; + } + if (plan->A != NULL) + { + free(plan->A); + plan->A = NULL; + } + if (plan->T != NULL) + { + fftw_free(plan->T); + plan->T = NULL; + } + if (plan->M != 18) + { + if (plan->B != NULL) + { + fftw_free(plan->B); + plan->B = NULL; + } + if (plan->BT != NULL) + { + fftw_free(plan->BT); + plan->BT = NULL; + } + } + if (plan->ia != NULL) + { + fftw_free(plan->ia); + plan->ia = NULL; + } + if (plan->oa != NULL) + { + fftw_free(plan->ia); + plan->ia = NULL; + } + if (plan->work != NULL) + { + fftw_free(plan->work); + plan->work = NULL; + } + if (plan->wk != NULL) + { + fftw_free(plan->wk[0]); + for (size_t level = 0; level < plan->L; level++) + plan->wk[level] = NULL; + fftw_free(plan->wk); + } + if (plan->ck != NULL) + { + fftw_free(plan->ck[0]); + for (size_t level = 0; level < plan->L; level++) + plan->ck[level] = NULL; + fftw_free(plan->ck); + } + if (plan->dplan != NULL) + { + X(free_direct)(plan->dplan); + } + free(plan); + plan = NULL; +} + +X(direct_plan) * X(create_direct)(size_t N, size_t direction) +{ + X(direct_plan) *dplan = (X(direct_plan) *)malloc(sizeof(X(direct_plan))); + FLT *a = (FLT *)X(fftw_malloc)(N * sizeof(FLT)); + dplan->a = a; + dplan->an = NULL; + dplan->dn = NULL; + + size_t N0 = 256 < N ? 256 : N; + for (size_t i = 0; i < N0; i++) // Integer, precomputed values + a[i] = X(LambdaI)(i); + size_t DN = 8; + for (size_t i = N0; i < N; i += DN) + { + a[i] = X(LambdaI)(i); + size_t J0 = i + DN < N ? i + DN : N; + for (size_t j = i + 1; j < J0; j++) + { + // a[j] = a[j - 1] * (1 - 0.5 / j); + // a[j] = ((j << 1) -1) * a[j - 1] / (j << 1) ; + a[j] = a[j - 1] - 0.5 * a[j - 1] / j; + } + } + + if ((direction == C2L) | (direction == BOTH)) + { + FLT *dn = (FLT *)X(fftw_malloc)((N + 1) / 2 * sizeof(FLT)); + FLT *an = (FLT *)X(fftw_malloc)(N * sizeof(FLT)); + dn[0] = 0; + an[0] = M_2_SQRTPI; + // Using Lambda(i-0.5) = 1/(i*Lambda(i)) + for (size_t i = 1; i < N; i++) + { + an[i] = 1 / (a[i] * (2 * i * i + i)); + } + for (size_t i = 1; i < (N + 1) / 2; i++) + dn[i] = a[i - 1] / (2 * i); + + dplan->an = an; + dplan->dn = dn; + } + dplan->direction = direction; + dplan->N = N; + + return dplan; +} + +size_t X(direct)(const FLT *u, FLT *b, X(direct_plan) * dplan, size_t direction, size_t strides) +{ + size_t flops = 0; + const FLT sqrt_pi = 1.77245385090551602e0; + const size_t N = dplan->N; + for (size_t i = 0; i < N; i++) + b[i * strides] = 0.0; + flops += N; + if (direction == L2C) + { + const FLT *a = dplan->a; + for (size_t n = 0; n < N; n = n + 2) + { + const FLT *ap = &a[n / 2]; + const FLT *cp = &u[n * strides]; + const FLT a0 = ap[0] * M_2_PI; + for (size_t i = 0; i < N - n; i++) + { + b[i * strides] += a0 * ap[i] * cp[i * strides]; + } + flops += 3 * (N - n); + } + b[0] /= 2; + flops += N; + } + else + { + FLT *vn = (FLT *)X(fftw_malloc)(N * sizeof(FLT)); + const FLT *an = dplan->an; + const FLT *dn = dplan->dn; + vn[0] = u[0]; + for (size_t i = 1; i < N; i++) + vn[i] = u[i * strides] * i; + + for (size_t n = 0; n < N; n++) + b[n * strides] = sqrt_pi * vn[n] * an[n]; + + for (size_t n = 2; n < N; n = n + 2) + { + const FLT *ap = &an[n / 2]; + const FLT *vp = &vn[n]; + for (size_t i = 0; i < N - n; i++) + b[i * strides] -= dn[n / 2] * ap[i] * vp[i]; + flops += 3 * (N - n); + } + for (size_t i = 0; i < N; i++) + b[i * strides] *= (i + 0.5); + flops += N; + fftw_free(vn); + } + return flops; +} + +X(fmm_plan) * X(create_fmm)(const size_t N, const size_t maxs, const size_t M, + const size_t direction, const size_t lagrange, const size_t v) +{ + X(fmm_plan) *fmmplan = (X(fmm_plan) *)malloc(sizeof(X(fmm_plan))); + X(fftw_plan) plan1d, plan; + uint64_t t1 = tic; + size_t Nn; + size_t s; + size_t ij[2]; + size_t directions[2]; + size_t num_directions = 2; + switch (direction) + { + case L2C: + directions[0] = 0; + num_directions = 1; + break; + case C2L: + directions[0] = 1; + num_directions = 1; + break; + default: + directions[0] = 0; + directions[1] = 1; + break; + } + FLT **A = (FLT **)calloc(2, sizeof(FLT *)); + fmmplan->A = A; + fmmplan->T = NULL; + fmmplan->B = NULL; + fmmplan->BT = NULL; + fmmplan->ia = NULL; + fmmplan->oa = NULL; + fmmplan->work = NULL; + fmmplan->wk = NULL; + fmmplan->ck = NULL; + fmmplan->dplan = NULL; + fmmplan->lagrange = lagrange; + + int L = ceil(log2((FLT)N / (FLT)maxs)) - 2; + if (L < 1) + { + if (v > 1) printf("Levels < 1. Using only direct method\n"); + fmmplan->dplan = X(create_direct)(N, direction); + fmmplan->Nn = N; + fmmplan->L = 0; + return fmmplan; + } + + s = ceil((FLT)N / (FLT)pow(2, L + 2)); + Nn = s * pow(2, L + 2); + + fmmplan->dplan = X(create_direct)(N, direction); + fmmplan->M = M; + fmmplan->L = L; + fmmplan->N = N; + fmmplan->Nn = Nn; + fmmplan->s = s; + if (v > 1) + { + printf("N %lu\n", N); + printf("Num levels %d\n", L); + printf("Num submatrices %lu\n", get_total_number_of_submatrices(L)); + printf("Num blocks %lu\n", get_total_number_of_blocks(L)); + printf("Given max s %lu \n", maxs); + printf("Computed s %lu \n", s); + printf("Computed N %lu\n", Nn); + printf("Lagrange %lu\n", lagrange); + } + + FLT *fun = (FLT *)X(fftw_malloc)(M * M * sizeof(FLT)); + FLT *fun_hat = (FLT *)X(fftw_malloc)(M * M * sizeof(FLT)); + bool use_FFTW = true; + +#if defined(FT_USE_DOUBLE) + use_FFTW = (M != 18 && lagrange == 0); +#endif + + if (use_FFTW) + { + if (v > 1) printf("using FFTW for planning\n"); + plan1d = X(fftw_plan_r2r_1d)(M, fun, fun_hat); + plan = X(fftw_plan_r2r_2d)(M, fun, fun_hat); + } + + const size_t MM = M * M; + if (direction == BOTH) + { + A[0] = (FLT *)X(fftw_malloc)(get_total_number_of_submatrices(L) * MM * sizeof(FLT)); + A[1] = (FLT *)X(fftw_malloc)(get_total_number_of_submatrices(L) * MM * sizeof(FLT)); + } + else + { + A[direction] = (FLT *)X(fftw_malloc)(get_total_number_of_submatrices(L) * MM * sizeof(FLT)); + } + + FLT *xj = (FLT *)X(fftw_malloc)(M * sizeof(FLT)); + FLT *xjh = (FLT *)X(fftw_malloc)(M * sizeof(FLT)); + for (size_t i = 0; i < M; i++) + { + xj[i] = cos((i + 0.5) * M_PI / M); + } + + FLT *fx0 = (FLT *)X(fftw_malloc)(2 * MM * sizeof(FLT)); + FLT *fx1 = (FLT *)X(fftw_malloc)(MM * sizeof(FLT)); + FLT *lx1 = (FLT *)X(fftw_malloc)(MM * sizeof(FLT)); + + size_t kk = 0; + for (size_t level = 0; level < L; level++) + { + size_t h = s * get_h(level, L); + for (size_t k = 0; k < M; k++) + xjh[k] = xj[k] * h; + for (size_t block = 0; block < get_number_of_blocks(level); block++) + { + get_ij(ij, level, block, s, L); + for (size_t q = 0; q < 2; q++) + { + size_t y0 = 2 * (ij[1] + q * h) + h; + for (size_t p = 0; p < q + 1; p++) + { + size_t x0 = 2 * (ij[0] + p * h) + h; + for (size_t di = 0; di < num_directions; di++) + { + const size_t dir = directions[di]; + // ff is input to the DCT + FLT *ff = (lagrange == 0) ? &fun[0] : &A[dir][kk * MM]; + FLT *f00 = &fx0[q * MM]; + FLT *fpq = &fx0[(q - p) * MM]; + + for (size_t i = 0; i < M; i++) + { + FLT x = x0 + xjh[i]; + for (size_t j = 0; j < M; j++) + { + FLT y = y0 + xjh[j]; + size_t ix = i * M + j; + size_t xi = j * M + i; + if (di == 0 && block == 0 && p == 0) + { + if (j < M - i) + { // Persymmetric Lambda((y-x)/2) + FLT m0 = X(Lambda)((y - x) / 2); + // FLT m0 = LambdaE((y - x) / 2); + f00[ix] = m0; + f00[MM - xi - 1] = m0; + } + } + if (di == 0) + { + if (j >= i) + { // Symmetric Lambda((x+y)/2) + FLT m1 = X(Lambda)((x + y) / 2); + // FLT m1 = LambdaE((x + y) / 2); + fx1[ix] = m1; + fx1[xi] = m1; + } + } + if (dir == L2C) + { + (*ff++) = fpq[ix] * fx1[ix]; + } + else + { + (*ff++) = 2 * (fpq[ix] / (fx1[ix] * (x + y) * (x + y + 1) * (x - y + 1))); + } + } + } + + if (lagrange == 0) + { +#if defined(FT_USE_DOUBLE) + if (use_FFTW) +#endif + { + X(fftw_execute)(plan); + for (size_t i = 0; i < M; i++) + { + for (size_t j = 0; j < M; j++) + { + fun_hat[i * M + j] *= (1. / (M * M)); + } + } + for (size_t i = 0; i < M; i++) + { + fun_hat[i] /= 2; + fun_hat[i * M] /= 2; + } + memcpy(&A[dir][kk * MM], &fun_hat[0], MM * sizeof(FLT)); + } +#if defined(FT_USE_DOUBLE) + else + { + dct2(&fun[0], &A[dir][kk * MM]); + } +#endif + } + } + kk += 1; + } + } + } + } + + FLT *wj = NULL; + if (lagrange == 1) + { + wj = (FLT *)X(fftw_malloc)(M * sizeof(FLT)); + for (size_t j = 0; j < M; j++) + { + int sign = (j % 2 == 0) ? 1 : -1; + wj[j] = sign * sin((j + 0.5) * M_PI / M); + } + } + + FLT *T = (FLT *)X(fftw_malloc)(2 * s * M * sizeof(FLT)); + if (lagrange == 0) + X(vandermonde)(T, s, M); + else + { + FLT *xh = (FLT *)X(fftw_malloc)(2 * s * sizeof(FLT)); + for (size_t i = 0; i < 2 * s; i++) + { + xh[i] = -1 + (FLT)(i) / ((FLT)s); + } + for (size_t i = 0; i < s; i++) + { + FLT sume = 0.0; + FLT sumo = 0.0; + for (size_t j = 0; j < M; j++) + { + FLT se = wj[j] / (xh[2 * i] - xj[j]); + FLT so = wj[j] / (xh[2 * i + 1] - xj[j]); + T[i * M + j] = se; // even + T[(s + i) * M + j] = so; // odd + sume += se; + sumo += so; + } + for (size_t j = 0; j < M; j++) + { + T[i * M + j] /= sume; + T[(s + i) * M + j] /= sumo; + } + } + fftw_free(xh); + } + fmmplan->T = T; + + FLT *B = NULL; + FLT *BT = NULL; +#if defined(FT_USE_DOUBLE) + if (!use_FFTW && lagrange == 0) + { + if (v > 1) printf("Using exact binomial matrix\n"); + B = (FLT *)&BMe[0]; + BT = (FLT *)&BMeT[0]; + } + else +#endif + { + if (lagrange == 0) + { + B = (FLT *)X(fftw_malloc)((MM + M) / 2 * sizeof(FLT)); + BT = (FLT *)X(fftw_malloc)((MM + M) / 2 * sizeof(FLT)); + FLT *Ba = (FLT *)X(fftw_malloc)(MM * sizeof(FLT)); + FLT *BTa = (FLT *)X(fftw_malloc)(MM * sizeof(FLT)); + FLT *th = &Ba[0]; + for (size_t k = 0; k < M; k++) + { + for (size_t j = 0; j < M; j++) + { + fun[j] = cos(k * acos((xj[j] - 1) / 2)); + fun_hat[j] = 0.0; + } + X(fftw_execute)(plan1d); + *th++ = fun_hat[0] / M / 2; + for (size_t j = 1; j < M; j++) + *th++ = fun_hat[j] / M; + } + // Make transpose + th = &Ba[0]; + FLT *tht = &BTa[0]; + for (size_t i = 0; i < M; i++) + { + for (size_t j = 0; j < M; j++) + { + tht[i * M + j] = th[j * M + i]; + } + } + /// Move to compact storage + th = &B[0]; + for (size_t k = 0; k < M; k++) + { + for (size_t j = 0; j < k + 1; j++) + { + (*th++) = Ba[k * M + j]; + } + } + tht = &BT[0]; + for (size_t k = 0; k < M; k++) + { + for (size_t j = k; j < M; j++) + { + (*tht++) = BTa[k * M + j]; + } + } + fftw_free(Ba); + fftw_free(BTa); + } + else + { + B = (FLT *)X(fftw_malloc)(2 * MM * sizeof(FLT)); + FLT *xh = (FLT *)X(fftw_malloc)(2 * M * sizeof(FLT)); + for (size_t i = 0; i < M; i++) + { + xh[i] = (xj[i] - 1) / 2; + xh[M + i] = (xj[i] + 1) / 2; + } + for (size_t i = 0; i < M; i++) + { + FLT sum0 = 0.0; + FLT sum1 = 0.0; + for (size_t j = 0; j < M; j++) + { + FLT s0 = wj[j] / (xh[i] - xj[j]); + FLT s1 = wj[j] / (xh[M + i] - xj[j]); + B[i * M + j] = s0; + B[MM + i * M + j] = s1; + sum0 += s0; + sum1 += s1; + } + for (size_t j = 0; j < M; j++) + { + B[i * M + j] /= sum0; + B[MM + i * M + j] /= sum1; + } + } + fftw_free(xh); + } + } + + FLT *ia = (FLT *)X(fftw_malloc)(Nn / 2 * sizeof(FLT)); + FLT *oa = (FLT *)X(fftw_malloc)(Nn / 2 * sizeof(FLT)); + FLT *work = (FLT *)X(fftw_malloc)(2 * M * sizeof(FLT)); + FLT **wk = (FLT **)X(fftw_malloc)(L * sizeof(FLT *)); + FLT **ck = (FLT **)X(fftw_malloc)(L * sizeof(FLT *)); + size_t Nb = get_total_number_of_blocks(L); + wk[0] = (FLT *)X(fftw_malloc)(Nb * 2 * M * sizeof(FLT)); + ck[0] = (FLT *)X(fftw_malloc)(Nb * 2 * M * sizeof(FLT)); + for (size_t level = 1; level < L; level++) + { + size_t b = get_number_of_blocks(level - 1); + wk[level] = wk[level - 1] + b * 2 * M; + ck[level] = ck[level - 1] + b * 2 * M; + } + fmmplan->ia = ia; + fmmplan->oa = oa; + fmmplan->wk = wk; + fmmplan->ck = ck; + fmmplan->work = work; + if (use_FFTW) + { + X(fftw_destroy_plan)(plan); + X(fftw_destroy_plan)(plan1d); + } + + fftw_free(fun); + fftw_free(fun_hat); + fftw_free(fx0); + fftw_free(fx1); + fftw_free(lx1); + fftw_free(xj); + fftw_free(xjh); + if (lagrange == 1) + { + fftw_free(wj); + } + fmmplan->B = B; + fmmplan->BT = BT; + uint64_t t2 = tic; + if (v > 1) printf("Initialization %2.4e s\n", dtics(t1, t2)); + return fmmplan; +} + +X(fmm_plan_2d) * X(create_fmm_2d)(size_t N0, size_t N1, int axis, size_t maxs, size_t M, + size_t direction, size_t lagrange, size_t v) +{ + X(fmm_plan_2d) *fmmplan2d = (X(fmm_plan_2d) *)malloc(sizeof(X(fmm_plan_2d))); + fmmplan2d->fmmplan0 = NULL; + fmmplan2d->fmmplan1 = NULL; + if (v > 1) + { + printf("crate_fmm_2d\n"); + } + if (axis == 0) + { + fmmplan2d->fmmplan0 = X(create_fmm)(N0, maxs, M, direction, lagrange, v); + } + else if (axis == 1) + { + fmmplan2d->fmmplan1 = X(create_fmm)(N1, maxs, M, direction, lagrange, v); + } + else if (axis == -1) + { + fmmplan2d->fmmplan0 = X(create_fmm)(N0, maxs, M, direction, lagrange, v); + if (N0 == N1) + { + fmmplan2d->fmmplan1 = fmmplan2d->fmmplan0; + } + else + { + fmmplan2d->fmmplan1 = X(create_fmm)(N1, maxs, M, direction, lagrange, v); + } + } + fmmplan2d->N0 = N0; + fmmplan2d->N1 = N1; + fmmplan2d->axis = axis; + return fmmplan2d; +} + +size_t X(execute2D)(const FLT *input_array, FLT *output_array, X(fmm_plan_2d) * fmmplan2d, + size_t direction) +{ + size_t flops = 0; + if (fmmplan2d->axis == 0) + { + for (size_t i = 0; i < fmmplan2d->N1; i++) + { + flops += X(execute)(&input_array[i], &output_array[i], fmmplan2d->fmmplan0, direction, + fmmplan2d->N1); + } + } + else if (fmmplan2d->axis == 1) + { + for (size_t i = 0; i < fmmplan2d->N0; i++) + { + size_t N1 = fmmplan2d->N1; + flops += X(execute)(&input_array[i * N1], &output_array[i * N1], fmmplan2d->fmmplan1, + direction, 1); + } + } + else if (fmmplan2d->axis == -1) + { + FLT *out = (FLT *)calloc(fmmplan2d->N0 * fmmplan2d->N1, sizeof(FLT)); + for (size_t i = 0; i < fmmplan2d->N1; i++) + { + flops += X(execute)(&input_array[i], &out[i], fmmplan2d->fmmplan0, direction, fmmplan2d->N1); + } + + for (size_t i = 0; i < fmmplan2d->N0; i++) + { + size_t N1 = fmmplan2d->N1; + flops += X(execute)(&out[i * N1], &output_array[i * N1], fmmplan2d->fmmplan1, direction, 1); + } + fftw_free(out); + } + return flops; +} + +size_t X(execute)(const FLT *input_array, FLT *output_array, X(fmm_plan) * fmmplan, + size_t direction, const size_t stride) +{ + size_t Nn = fmmplan->Nn; + size_t N = fmmplan->N; + size_t L = fmmplan->L; + size_t s = fmmplan->s; + size_t M = fmmplan->M; + FLT *T = fmmplan->T; + FLT *B = fmmplan->B; + FLT *BT = fmmplan->BT; + FLT *A = fmmplan->A[direction]; + size_t flops = 0; + size_t lagrange = fmmplan->lagrange; + assert((direction == C2L) | (direction == L2C)); + + if (T == NULL) + { + flops = X(direct)(input_array, output_array, fmmplan->dplan, direction, stride); + return flops; + } + + FLT *ia = fmmplan->ia; + FLT *oa = fmmplan->oa; + FLT **wk = fmmplan->wk; + FLT **ck = fmmplan->ck; + FLT *input = NULL; + if (direction == C2L) + { // Need to modify input array, so make copy + input = (FLT *)X(fftw_malloc)(N * sizeof(FLT)); + input[0] = input_array[0]; + input[1] = input_array[stride]; + FLT *w0 = &input[2]; + size_t ii = 2 * stride; + for (size_t i = 2; i < N; i++) + { + (*w0++) = input_array[ii] * i; + ii += stride; + } + } + + + for (size_t odd = 0; odd < 2; odd++) + { + for (size_t i = 0; i < Nn / 2; i++) + { + oa[i] = 0.0; + } + + memset(&wk[0][0], 0, 2 * M * get_total_number_of_blocks(L) * sizeof(FLT)); + memset(&ck[0][0], 0, 2 * M * get_total_number_of_blocks(L) * sizeof(FLT)); + + const FLT *ap; + switch (direction) + { + case L2C: + ap = &input_array[odd * stride]; + break; + + case C2L: + ap = &input[odd]; + break; + } + + size_t rest = N % 2; + FLT *iap = &ia[0]; + if (stride == 1 || direction == C2L) + { + for (size_t i = 0; i < N / 2 + rest * (1 - odd); i++) + { + *iap++ = *ap; + ap += 2; + } + } + else + { + for (size_t i = 0; i < N / 2 + rest * (1 - odd); i++) + { + *iap++ = *ap; + ap += 2 * stride; + } + } + + for (size_t i = N / 2 + rest * (1 - odd); i < Nn / 2; i++) + { + *iap++ = 0; + } + + const size_t MM = M * M; + const size_t K = get_number_of_blocks(L - 1) * 2; + + X(rm_gemm)('N', K, M, s, 1.0, &ia[2 * s], s, &T[odd * s * M], M, 0, wk[L - 1], M); + flops += 2 * K * s * M; + for (size_t level = L; level-- > 1;) + { + FLT *w1 = wk[level - 1]; + for (size_t block = 1; block < get_number_of_blocks(level); block++) + { + size_t Nd = block * 2 * M; + FLT *wq = &wk[level][Nd]; + int b0 = (block - 1) / 2; + int q0 = (block - 1) % 2; + if (lagrange == 0) + { + X(matvectri)(&B[0], wq, &w1[(b0 * 2 + q0) * M], fmmplan->work, M, false); + flops += MM; //+2*M; + } + else + { + X(rm_gemv)('T', M, M, 1, &B[0], M, &wq[0], 0, &w1[(b0 * 2 + q0) * M]); + X(rm_gemv)('T', M, M, 1, &B[MM], M, &wq[M], 1, &w1[(b0 * 2 + q0) * M]); + flops += 4 * MM; + } + } + } + + size_t ik = 0; + for (size_t level = 0; level < L; level++) + { + for (size_t block = 0; block < get_number_of_blocks(level); block++) + { + size_t Nd = block * 2 * M; + FLT *cp = &ck[level][Nd]; + FLT *wq = &wk[level][Nd]; + X(rm_gemv)('N', M, M, 1, &A[ik * MM], M, wq, 0, cp); + X(rm_gemv)('N', M, M, 1, &A[(ik + 1) * MM], M, &wq[M], 1, cp); + X(rm_gemv)('N', M, M, 1, &A[(ik + 2) * MM], M, &wq[M], 0, &cp[M]); + flops += 6 * MM; + ik += 3; + } + } + + for (size_t level = 0; level < L - 1; level++) + { + FLT *c0 = ck[level]; + FLT *c1 = ck[level + 1]; + for (size_t block = 0; block < get_number_of_blocks(level + 1) - 1; block++) + { + if (lagrange == 0) + { + X(matvectri)(&BT[0], &c0[block * M], &c1[block * 2 * M], NULL, M, true); + flops += MM; + } + else + { + X(rm_gemv)('N', M, M, 1, &B[0], M, &c0[block * M], 1, &c1[block * 2 * M]); + X(rm_gemv)('N', M, M, 1, &B[MM], M, &c0[block * M], 1, &c1[block * 2 * M + M]); + flops += 4 * MM; + } + } + } + X(rm_gemm)('T', K, s, M, 1.0, ck[L - 1], M, &T[odd * s * M], M, 0, &oa[0], s); + + flops += 2 * K * s * M; + FLT *oaa = &output_array[odd * stride]; + FLT *oap = &oa[0]; + if (stride == 1) + { + for (size_t i = 0; i < N; i = i + 2) + { + *oaa += (*oap++); + oaa += 2; + } + } + else + { + const size_t s2 = 2 * stride; + for (size_t i = 0; i < N / 2 + rest * (1 - odd); i++) + { + *oaa += (*oap++); + oaa += s2; + } + } + // flops += 3*N/2; + } + + switch (direction) + { + case L2C: + flops += X(directM)(input_array, output_array, fmmplan, stride); + break; + + case C2L: + flops += X(directL)(input, output_array, fmmplan, stride); + break; + } + + if (input != NULL) fftw_free(input); + return flops; +} diff --git a/src/fmm_source.h b/src/fmm_source.h new file mode 100644 index 00000000..c6514055 --- /dev/null +++ b/src/fmm_source.h @@ -0,0 +1,58 @@ + +typedef struct { + size_t direction; + size_t N; + FLT *a; + FLT *dn; + FLT *an; +} X(direct_plan); + +typedef struct { + size_t direction; + size_t M; + size_t N; + size_t Nn; + size_t L; + size_t s; + size_t lagrange; + FLT **A; + FLT *T; + FLT *B; + FLT *BT; + FLT *ia; + FLT *oa; + FLT *work; + FLT **wk; + FLT **ck; + X(direct_plan) * dplan; +} X(fmm_plan); + +typedef struct { + X(fmm_plan) * fmmplan0; + X(fmm_plan) * fmmplan1; + size_t N0; + size_t N1; + int axis; +} X(fmm_plan_2d); + +void X(rm_gemv)(char TRANS, int m, int n, FLT alpha, FLT *A, int LDA, FLT *x, + FLT beta, FLT *y); +void X(rm_gemm)(char TRANS, int m, int n, int p, FLT alpha, FLT *A, int LDA, + FLT *B, int LDB, FLT beta, FLT *C, int LDC); +void X(free_fmm)(X(fmm_plan) * plan); +void X(free_fmm_2d)(X(fmm_plan_2d) * plan); +void X(free_direct)(X(direct_plan) * dplan); +X(fmm_plan_2d) * X(create_fmm_2d)(size_t N0, size_t N1, int axis, size_t maxs, + size_t M, size_t direction, + const size_t lagrange, size_t v); +X(fmm_plan) * X(create_fmm)(const size_t N, const size_t maxs, const size_t M, + const size_t direction, const size_t lagrange, + const size_t v); +X(direct_plan) * X(create_direct)(size_t N, size_t direction); +size_t X(execute2D)(const FLT *input_array, FLT *output_array, + X(fmm_plan_2d) * fmmplan2d, size_t direction); +size_t X(execute)(const FLT *input_array, FLT *output_array, + X(fmm_plan) * fmmplan, size_t direction, const size_t stride); +size_t X(direct)(const FLT *u, FLT *b, X(direct_plan) * dplan, size_t direction, + size_t strides); +FLT X(Lambda)(const FLT z); diff --git a/test/test_fmm.c b/test/test_fmm.c new file mode 100644 index 00000000..1d46092d --- /dev/null +++ b/test/test_fmm.c @@ -0,0 +1,39 @@ +#include "fasttransforms.h" +#include "ftutilities.h" +#include "fmm.h" + +#define FLT double +#define X(name) FT_CONCAT(ft_, name, ) +#define Y(name) FT_CONCAT(, name, ) +#define FT_USE_DOUBLE +#include "test_fmm_source.c" +#undef FLT +#undef X +#undef Y +#undef FT_USE_DOUBLE + +#define FLT float +#define X(name) FT_CONCAT(ft_, name, f) +#define Y(name) FT_CONCAT(, name, f) +#define FT_USE_SINGLE +#include "test_fmm_source.c" +#undef FLT +#undef X +#undef Y +#undef FT_USE_SINGLE + +int main(void) { + int checksum = 0; + printf("\nTesting fast multipole method for Legendre-Chebyshev transforms.\n"); + printf("\n\tSingle precision.\n\n"); + test_fmm_transformsf(&checksum, 1e5); + test_fmm_transforms2df(&checksum, 2048); + printf("\n\tDouble precision.\n\n"); + test_fmm_transforms(&checksum, 1e5); + test_fmm_transforms2d(&checksum, 2048); + //test_fmm(8192, 64, 18, 0.0, 1, 0, 2); + //test_fmmf(8192, 32, 8, 0.0, 1, 0, 2); + //test_fmm_speed(8192, 64, 1000, L2C, 18, 0, 2); + //test_fmm_speedf(8192, 32, 1000, L2C, 8, 0, 2); + return 0; +} diff --git a/test/test_fmm_source.c b/test/test_fmm_source.c new file mode 100644 index 00000000..6cbf85cb --- /dev/null +++ b/test/test_fmm_source.c @@ -0,0 +1,177 @@ +#include "fmm_source.h" + +void Y(test_fmm)(size_t N, size_t maxs, size_t M, FLT m, size_t random, size_t lagrange, + size_t verbose) +{ + if (verbose > 1) printf("test_forward_backward\n"); + X(fmm_plan) *fmmplan = X(create_fmm)(N, maxs, M, BOTH, lagrange, verbose); + FLT *input_array = (FLT *)calloc(N, sizeof(FLT)); + FLT *output_array = (FLT *)calloc(N, sizeof(FLT)); + + // srand48((unsigned int)time(NULL)); + srand48(1); + // Initialize some input array + if (random == 1) + { + for (size_t i = 0; i < N; i++) + input_array[i] = (2 * drand48() - 1) / pow(i + 1, m); + } + else + { + for (size_t i = 0; i < N; i++) + input_array[i] = 0.5 / pow(i + 1, m); + } + + // Leg to Cheb + if (verbose > 1) printf("Leg2cheb\n"); + + size_t flops = X(execute)(input_array, output_array, fmmplan, L2C, 1); + + FLT *ia = (FLT *)calloc(N, sizeof(FLT)); + // Cheb to Leg + flops += X(execute)(output_array, ia, fmmplan, C2L, 1); + + // Compute maximum error + FLT error = 0.0; + for (size_t j = 0; j < N; j++) + { + error = MAX(Y(fabs)(input_array[j] - ia[j]), error); + } + + FLT e0 = 0; + for (size_t j = 0; j < N; j++) + { + e0 = MAX(Y(fabs)(ia[j]), e0); + } + FLT ulp; +#if defined(FT_USE_SINGLE) + ulp = nextafterf(e0, 1e8) - e0; +#elif defined(FT_USE_DOUBLE) + ulp = nextafter(e0, 1e8) - e0; +#endif + + printf("N %6lu L inf Error = %2.8e \n", N, error / e0); + printf(" Flops = %lu\n", flops); + printf(" ulp = %2.16e\n", ulp); + printf(" error ulp = %2.1f\n", error / ulp); +#ifdef TEST + assert(error < 1e-10); +#endif + free(ia); + free(input_array); + free(output_array); + X(free_fmm)(fmmplan); +} + +void Y(test_fmm_speed)(size_t N, size_t maxs, size_t repeat, size_t direction, size_t M, + size_t lagrange, size_t verbose) +{ + if (verbose > 1) printf("test_fmm_speed %lu\n", direction); + X(fmm_plan) *fmmplan = X(create_fmm)(N, maxs, M, direction, lagrange, verbose); + + FLT *input_array = (FLT *)calloc(N, sizeof(FLT)); + FLT *output_array = (FLT *)calloc(N, sizeof(FLT)); + // Initialize some input array + for (size_t i = 0; i < N; i++) + input_array[i] = 1.0; + + uint64_t t0 = tic; + FLT min_time = 1e8; + size_t flops; + for (size_t i = 0; i < repeat; i++) + { + for (size_t j = 0; j < N; j++) + output_array[j] = 0.0; + uint64_t g0 = tic; + flops = X(execute)(input_array, output_array, fmmplan, direction, 1); + FLT s1 = toc(g0); + min_time = s1 < min_time ? s1 : min_time; + } + + uint64_t t1 = tic; + printf("Timing N %6lu avg / min = %2.4e / %2.4e flops = %lu\n", N, dtics(t0, t1) / repeat, + min_time, flops); + free(input_array); + free(output_array); + X(free_fmm)(fmmplan); +} + +void Y(test_fmm_transforms)(int *checksum, int N) +{ + FLT err; + size_t flops; +#if defined(FT_USE_SINGLE) + size_t s = 32; + size_t M = 10; +#elif defined(FT_USE_DOUBLE) + size_t s = 64; + size_t M = 18; +#endif + + printf("\nTesting the accuracy of FMM Chebyshev--Legendre transforms (1D).\n\n"); + printf("\t\t\t Test \t\t\t\t | inf-norm Maximum error\n"); + printf("---------------------------------------------------------|----------------------\n"); + for (int n = 64; n < N; n *= 2) + { + FLT m = 0; + FLT *input_array = (FLT *)calloc(n, sizeof(FLT)); + FLT *output_array = (FLT *)calloc(n, sizeof(FLT)); + FLT *ia = (FLT *)calloc(n, sizeof(FLT)); + srand48(1); + FLT e0 = 0; + for (int i = 0; i < n; i++) + { + // input_array[i] = 1; + input_array[i] = (2 * drand48() - 1) / pow(i + 1, m); + e0 = MAX(e0, Y(fabs)(input_array[i])); + } + + X(fmm_plan) *fmmplan = X(create_fmm)(n, s, M, BOTH, 0, 0); + flops = X(execute)(input_array, output_array, fmmplan, L2C, 1); + flops += X(execute)(output_array, ia, fmmplan, C2L, 1); + X(free_fmm)(fmmplan); + err = X(normInf_2arg)(ia, input_array, n) / e0; + printf("n = %5i: \t\t\t\t\t\t |%20.2e ", n, (double)err); + X(checktest)(err, 2 * Y(sqrt)(n), checksum); + free(input_array); + free(output_array); + free(ia); + } +} +void Y(test_fmm_transforms2d)(int *checksum, int N) +{ + FLT err; + size_t flops; +#if defined(FT_USE_SINGLE) + size_t s = 32; + size_t M = 10; +#elif defined(FT_USE_DOUBLE) + size_t s = 64; + size_t M = 18; +#endif + + printf("\nTesting the accuracy of FMM Chebyshev--Legendre transforms (2D).\n\n"); + printf("\t\t\t Test \t\t\t\t | 2-norm Relative Error\n"); + printf("---------------------------------------------------------|----------------------\n"); + for (int n = 64; n < N; n *= 2) + { + FLT *input_array = (FLT *)calloc(n*n, sizeof(FLT)); + FLT *output_array = (FLT *)calloc(n*n, sizeof(FLT)); + FLT *ia = (FLT *)calloc(n*n, sizeof(FLT)); + for (int i = 0; i < n; i++) + { + input_array[i+i*n] = 1; + } + + X(fmm_plan_2d) *fmmplan = X(create_fmm_2d)(n, n, 0, s, M, BOTH, 0, 0); + flops = X(execute2D)(input_array, output_array, fmmplan, L2C); + flops += X(execute2D)(output_array, ia, fmmplan, C2L); + X(free_fmm_2d)(fmmplan); + err = X(norm_2arg)(ia, input_array, n*n) / X(norm_1arg)(ia, n*n); + printf("(n×n) = (%4ix%4i): \t\t\t\t\t |%20.2e ", n, n, (double)err); + X(checktest)(err, 2 * Y(sqrt)(n), checksum); + free(input_array); + free(output_array); + free(ia); + } +} \ No newline at end of file