From bca8d24f95eaca3ddee4d5cf3fed0421a27c7297 Mon Sep 17 00:00:00 2001 From: Hartmut Kaiser Date: Fri, 27 Jan 2017 16:22:53 -0600 Subject: [PATCH] Applying some optimizations - also add clang-format and editorconfig file --- .clang-format | 89 ++ .editorconfig | 15 + src/grid_fmm.cpp | 2443 ++++++++++++++++----------------- src/node_server_actions_1.cpp | 634 +++++---- src/taylor.hpp | 756 +++++----- 5 files changed, 1982 insertions(+), 1955 deletions(-) create mode 100644 .clang-format create mode 100644 .editorconfig diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..a8b90be92 --- /dev/null +++ b/.clang-format @@ -0,0 +1,89 @@ +# Copyright (c) 2016 Thomas Heller +# Copyright (c) 2016 Hartmut Kaiser +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: DontAlign +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: true +#BinPackArguments: true +#BinPackParameters: true +BraceWrapping: + AfterClass: true + AfterControlStatement: false + AfterEnum: true + AfterFunction: false + AfterNamespace: false + AfterStruct: true + AfterUnion: true + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: true +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: "///" +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +ExperimentalAutoDetectBinPacking: false +# ForEachMacros: [''] +IncludeCategories: + - Regex: '^<.*' + Priority: 1 + - Regex: '.*' + Priority: 2 +# IncludeIsMainRegex: '' +IndentCaseLabels: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +Language: Cpp +# MacroBlockBegin: '' +# MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: Inner +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 4 +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: true +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 4 +UseTab: Never +... diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..e0ba0c6cc --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +; Copyright (c) 2013 Hartmut Kaiser +; +; Distributed under the Boost Software License, Version 1.0. (See accompanying +; file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +; Download plugins for your favorite editor at editorconfig.org + +root = true + +[*] +indent_style = space +indent_size = 4 +charset = latin1 +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/src/grid_fmm.cpp b/src/grid_fmm.cpp index a04c67e92..329999cdf 100644 --- a/src/grid_fmm.cpp +++ b/src/grid_fmm.cpp @@ -31,1413 +31,1368 @@ static std::vector ilist_n; static std::vector ilist_d; // interactions of root node (empty list? boundary stuff?) (David) static std::vector ilist_r; -static std::vector> - ilist_d_bnd(geo::direction::count()); -static std::vector> - ilist_n_bnd(geo::direction::count()); +static std::vector> ilist_d_bnd(geo::direction::count()); +static std::vector> ilist_n_bnd(geo::direction::count()); static taylor<4, real> factor; extern options opts; template -void load_multipole(taylor<4, T> &m, space_vector &c, - const gravity_boundary_type &data, integer iter, - bool monopole) { - if (monopole) { - m = T(0.0); - m = T((*(data.m))[iter]); - } else { - auto const &tmp1 = (*(data.M))[iter]; +void load_multipole(taylor<4, T>& m, space_vector& c, const gravity_boundary_type& data, + integer iter, bool monopole) { + if (monopole) { + m = T(0.0); + m = T((*(data.m))[iter]); + } else { + auto const& tmp1 = (*(data.M))[iter]; #pragma GCC ivdep - for (int i = 0; i != 20; ++i) { - m[i] = tmp1[i]; - } - auto const &tmp2 = (*(data.x))[iter]; + for (int i = 0; i != 20; ++i) { + m[i] = tmp1[i]; + } + auto const& tmp2 = (*(data.x))[iter]; #pragma GCC ivdep - for (integer d = 0; d != NDIM; ++d) { - c[d] = tmp2[d]; + for (integer d = 0; d != NDIM; ++d) { + c[d] = tmp2[d]; + } } - } } void find_eigenvectors(real q[3][3], real e[3][3], real lambda[3]) { - PROF_BEGIN; - real b0[3], b1[3], A, bdif; - int iter = 0; - for (int l = 0; l < 3; l++) { - b0[0] = b0[1] = b0[2] = 0.0; - b0[l] = 1.0; - do { - iter++; - b1[0] = b1[1] = b1[2] = 0.0; - for (int i = 0; i < 3; i++) { + PROF_BEGIN; + real b0[3], b1[3], A, bdif; + int iter = 0; + for (int l = 0; l < 3; l++) { + b0[0] = b0[1] = b0[2] = 0.0; + b0[l] = 1.0; + do { + iter++; + b1[0] = b1[1] = b1[2] = 0.0; + for (int i = 0; i < 3; i++) { + for (int m = 0; m < 3; m++) { + b1[i] += q[i][m] * b0[m]; + } + } + A = sqrt(sqr(b1[0]) + sqr(b1[1]) + sqr(b1[2])); + bdif = 0.0; + for (int i = 0; i < 3; i++) { + b1[i] = b1[i] / A; + bdif += pow(b0[i] - b1[i], 2); + } + for (int i = 0; i < 3; i++) { + b0[i] = b1[i]; + } + + } while (fabs(bdif) > 1.0e-14); for (int m = 0; m < 3; m++) { - b1[i] += q[i][m] * b0[m]; + e[l][m] = b0[m]; } - } - A = sqrt(sqr(b1[0]) + sqr(b1[1]) + sqr(b1[2])); - bdif = 0.0; - for (int i = 0; i < 3; i++) { - b1[i] = b1[i] / A; - bdif += pow(b0[i] - b1[i], 2); - } - for (int i = 0; i < 3; i++) { - b0[i] = b1[i]; - } - - } while (fabs(bdif) > 1.0e-14); - for (int m = 0; m < 3; m++) { - e[l][m] = b0[m]; - } - for (int i = 0; i < 3; i++) { - A += b0[i] * q[l][i]; + for (int i = 0; i < 3; i++) { + A += b0[i] * q[l][i]; + } + lambda[l] = sqrt(A) / sqrt(sqr(e[l][0]) + sqr(e[l][1]) + sqr(e[l][2])); } - lambda[l] = sqrt(A) / sqrt(sqr(e[l][0]) + sqr(e[l][1]) + sqr(e[l][2])); - } - PROF_END; + PROF_END; } std::pair grid::find_axis() const { - PROF_BEGIN; - real quad_moment[NDIM][NDIM]; - real eigen[NDIM][NDIM]; - real lambda[NDIM]; - space_vector this_com; - real mtot = 0.0; - for (integer i = 0; i != NDIM; ++i) { - this_com[i] = 0.0; - for (integer j = 0; j != NDIM; ++j) { - quad_moment[i][j] = 0.0; + PROF_BEGIN; + real quad_moment[NDIM][NDIM]; + real eigen[NDIM][NDIM]; + real lambda[NDIM]; + space_vector this_com; + real mtot = 0.0; + for (integer i = 0; i != NDIM; ++i) { + this_com[i] = 0.0; + for (integer j = 0; j != NDIM; ++j) { + quad_moment[i][j] = 0.0; + } } - } - - auto &M = *M_ptr; - auto &mon = *mon_ptr; - std::vector const &com0 = *(com_ptr[0]); - for (integer i = 0; i != G_NX; ++i) { - for (integer j = 0; j != G_NX; ++j) { - for (integer k = 0; k != G_NX; ++k) { - const integer iii1 = gindex(i, j, k); - const integer iii0 = gindex(i + H_BW, j + H_BW, k + H_BW); - space_vector const &com0iii1 = com0[iii1]; - multipole const &Miii1 = M[iii1]; - for (integer n = 0; n != NDIM; ++n) { - real mass; - if (is_leaf) { - mass = mon[iii1]; - } else { - mass = Miii1(); - } - this_com[n] += mass * com0iii1[n]; - mtot += mass; - for (integer m = 0; m != NDIM; ++m) { - if (!is_leaf) { - quad_moment[n][m] += Miii1(n, m); + + auto& M = *M_ptr; + auto& mon = *mon_ptr; + std::vector const& com0 = *(com_ptr[0]); + for (integer i = 0; i != G_NX; ++i) { + for (integer j = 0; j != G_NX; ++j) { + for (integer k = 0; k != G_NX; ++k) { + const integer iii1 = gindex(i, j, k); + const integer iii0 = gindex(i + H_BW, j + H_BW, k + H_BW); + space_vector const& com0iii1 = com0[iii1]; + multipole const& Miii1 = M[iii1]; + for (integer n = 0; n != NDIM; ++n) { + real mass; + if (is_leaf) { + mass = mon[iii1]; + } else { + mass = Miii1(); + } + this_com[n] += mass * com0iii1[n]; + mtot += mass; + for (integer m = 0; m != NDIM; ++m) { + if (!is_leaf) { + quad_moment[n][m] += Miii1(n, m); + } + quad_moment[n][m] += mass * com0iii1[n] * com0iii1[m]; + } + } } - quad_moment[n][m] += mass * com0iii1[n] * com0iii1[m]; - } } - } } - } - for (integer j = 0; j != NDIM; ++j) { - this_com[j] /= mtot; - } - - find_eigenvectors(quad_moment, eigen, lambda); - integer index; - if (lambda[0] > lambda[1] && lambda[0] > lambda[2]) { - index = 0; - } else if (lambda[1] > lambda[2]) { - index = 1; - } else { - index = 2; - } - space_vector rc; - for (integer j = 0; j != NDIM; ++j) { - rc[j] = eigen[index][j]; - } - std::pair pair{rc, this_com}; - PROF_END; - return pair; + for (integer j = 0; j != NDIM; ++j) { + this_com[j] /= mtot; + } + + find_eigenvectors(quad_moment, eigen, lambda); + integer index; + if (lambda[0] > lambda[1] && lambda[0] > lambda[2]) { + index = 0; + } else if (lambda[1] > lambda[2]) { + index = 1; + } else { + index = 2; + } + space_vector rc; + for (integer j = 0; j != NDIM; ++j) { + rc[j] = eigen[index][j]; + } + std::pair pair{rc, this_com}; + PROF_END; + return pair; } void grid::solve_gravity(gsolve_type type) { - - compute_multipoles(type); - compute_interactions(type); - compute_expansions(type); + compute_multipoles(type); + compute_interactions(type); + compute_expansions(type); } void grid::compute_interactions(gsolve_type type) { - PROF_BEGIN; - - // calculating the contribution of all the inner cells - // calculating the interaction - - auto &mon = *mon_ptr; - // L stores the gravitational potentatial (Dominic) - // L should be L, (10) in the paper - // L_c stores the correction for angular momentum (Dominic) - // L_c, (20) in the paper (Dominic) - std::fill(std::begin(L), std::end(L), ZERO); - if (opts.ang_con) { - std::fill(std::begin(L_c), std::end(L_c), ZERO); - } - - // Non-leaf nodes use taylor expansion - // For leaf node, calculates the gravitational potential between two particles - if (!is_leaf) { - // Non-leaf means that multipole-multipole interaction (David) - - // Fetch the interaction list for the current cell (David) - const auto &this_ilist = is_root ? ilist_r : ilist_n; - const integer list_size = this_ilist.size(); - - // Space vector is a vector pack (David) - // Center of masses of each cell (com_ptr1 is the center of mass of the parent cell) - std::vector const &com0 = (*(com_ptr[0])); - // Do 8 cell-cell interactions at the same time (simd.hpp) (Dominic) - // This could lead to problems either on Haswell or on KNL as the number is hardcoded (David) - // It is unclear what the underlying vectorization library does, if simd_len > architectural length (David) - hpx::parallel::for_loop_strided( - for_loop_policy, 0, list_size, simd_len, - [&com0, &this_ilist, list_size, type, this](std::size_t li) { - - // variable for every dimension (per simd element, SoA style) (David) - // dX is distance between X and Y - // X and Y are the two cells interacting - // X and Y store the 3D center of masses - std::array dX; - std::array X; - std::array Y; - - // m multipole-moments of the cells - taylor<4, simd_vector> m0; - taylor<4, simd_vector> m1; - // n angular momentum of the cells - taylor<4, simd_vector> n0; - taylor<4, simd_vector> n1; - - // vector of multipoles (== taylor<4>s) (David) - auto &M = *M_ptr; - // No idea what the next loop does. It seems to fetch data (David) + PROF_BEGIN; + + // calculating the contribution of all the inner cells + // calculating the interaction + + auto& mon = *mon_ptr; + // L stores the gravitational potentatial (Dominic) + // L should be L, (10) in the paper + // L_c stores the correction for angular momentum (Dominic) + // L_c, (20) in the paper (Dominic) + std::fill(std::begin(L), std::end(L), ZERO); + if (opts.ang_con) { + std::fill(std::begin(L_c), std::end(L_c), ZERO); + } + + // Non-leaf nodes use taylor expansion + // For leaf node, calculates the gravitational potential between two particles + if (!is_leaf) { + // Non-leaf means that multipole-multipole interaction (David) + + // Fetch the interaction list for the current cell (David) + const auto& this_ilist = is_root ? ilist_r : ilist_n; + const integer list_size = this_ilist.size(); + + // Space vector is a vector pack (David) + // Center of masses of each cell (com_ptr1 is the center of mass of the parent cell) + std::vector const& com0 = (*(com_ptr[0])); + // Do 8 cell-cell interactions at the same time (simd.hpp) (Dominic) + // This could lead to problems either on Haswell or on KNL as the number is hardcoded + // (David) + // It is unclear what the underlying vectorization library does, if simd_len > architectural + // length (David) + hpx::parallel::for_loop_strided(for_loop_policy, 0, list_size, simd_len, + [&com0, &this_ilist, list_size, type, this](std::size_t li) { + + // variable for every dimension (per simd element, SoA style) (David) + // dX is distance between X and Y + // X and Y are the two cells interacting + // X and Y store the 3D center of masses + std::array dX; + std::array X; + std::array Y; + + // m multipole-moments of the cells + taylor<4, simd_vector> m0; + taylor<4, simd_vector> m1; + // n angular momentum of the cells + taylor<4, simd_vector> n0; + taylor<4, simd_vector> n1; + + // vector of multipoles (== taylor<4>s) (David) + auto& M = *M_ptr; +// No idea what the next loop does. It seems to fetch data (David) // FIXME: replace with vector-pack gather-loads - // only uses first 10 coefficients +// only uses first 10 coefficients #pragma GCC ivdep - for (integer i = 0; i != simd_len; ++i) { - const auto index = - std::min(integer(li + i), integer(list_size - 1)); - const integer iii0 = this_ilist[index].first; - const integer iii1 = this_ilist[index].second; - space_vector const &com0iii0 = com0[iii0]; - space_vector const &com0iii1 = com0[iii1]; - for (integer d = 0; d < NDIM; ++d) { - // load the 3D center of masses - X[d][i] = com0iii0[d]; - Y[d][i] = com0iii1[d]; - } - - // cell specific taylor series coefficients - multipole const &Miii0 = M[iii0]; - multipole const &Miii1 = M[iii1]; - for (integer j = 0; j != taylor_sizes[3]; ++j) { - m0[j][i] = Miii1[j]; - m1[j][i] = Miii0[j]; - } - - // RHO computes the gravitational potential based on the mass density - // non-RHO computes the time derivative (hydrodynamics code V_t) - if (type == RHO) { - // this branch computes the angular momentum correction, (20) in the paper - // divide by mass of other cell - real const tmp1 = Miii1() / Miii0(); - real const tmp2 = Miii0() / Miii1(); - // calculating the coeffcieints for formula (M are the octopole moments) - // the coefficients are calculated in (17) and (18) - // TODO: Dominic - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j][i] = Miii1[j] - Miii0[j] * tmp1; - n1[j][i] = Miii0[j] - Miii1[j] * tmp2; - } - } - } + for (integer i = 0; i != simd_len && integer(li + i) < list_size; ++i) { + const integer iii0 = this_ilist[li + i].first; + const integer iii1 = this_ilist[li + i].second; + space_vector const& com0iii0 = com0[iii0]; + space_vector const& com0iii1 = com0[iii1]; + for (integer d = 0; d < NDIM; ++d) { + // load the 3D center of masses + X[d][i] = com0iii0[d]; + Y[d][i] = com0iii1[d]; + } + + // cell specific taylor series coefficients + multipole const& Miii0 = M[iii0]; + multipole const& Miii1 = M[iii1]; + for (integer j = 0; j != taylor_sizes[3]; ++j) { + m0[j][i] = Miii1[j]; + m1[j][i] = Miii0[j]; + } + + // RHO computes the gravitational potential based on the mass density + // non-RHO computes the time derivative (hydrodynamics code V_t) + if (type == RHO) { + // this branch computes the angular momentum correction, (20) in the paper + // divide by mass of other cell + real const tmp1 = Miii1() / Miii0(); + real const tmp2 = Miii0() / Miii1(); + // calculating the coeffcieints for formula (M are the octopole moments) + // the coefficients are calculated in (17) and (18) + // TODO: Dominic + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j][i] = Miii1[j] - Miii0[j] * tmp1; + n1[j][i] = Miii0[j] - Miii1[j] * tmp2; + } + } + } - if (type != RHO) { + if (type != RHO) { #pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j] = ZERO; - n1[j] = ZERO; - } - } + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j] = ZERO; + n1[j] = ZERO; + } + } - // distance between cells in all dimensions +// distance between cells in all dimensions #pragma GCC ivdep - for (integer d = 0; d < NDIM; ++d) { - dX[d] = X[d] - Y[d]; - } - - // R_i in paper is the dX in the code - // D is taylor expansion value for a given X expansion of the gravitational potential (multipole expansion) - taylor<5, simd_vector> D; - // A0, A1 are the contributions to L (for each cell) - taylor<4, simd_vector> A0, A1; - // B0, B1 are the contributions to L_c (for each cell) - std::array B0 = { - simd_vector(ZERO), simd_vector(ZERO), simd_vector(ZERO)}; - std::array B1 = { - simd_vector(ZERO), simd_vector(ZERO), simd_vector(ZERO)}; - - // calculates all D-values, calculate all coefficients of 1/r (not the potential), formula (6)-(9) and (19) - D.set_basis(dX); - - // the following loops calculate formula (10), potential from A->B and B->A (basically alternating) - A0[0] = m0[0] * D[0]; - A1[0] = m1[0] * D[0]; - if (type != RHO) { - for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { - A0[0] -= m0[i] * D[i]; - A1[0] += m1[i] * D[i]; - } - } - for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { - const auto tmp = D[i] * (factor[i] / real(2)); - A0[0] += m0[i] * tmp; - A1[0] += m1[i] * tmp; - } - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - const auto tmp = D[i] * (factor[i] / real(6)); - A0[0] -= m0[i] * tmp; - A1[0] += m1[i] * tmp; - } - - for (integer a = 0; a < NDIM; ++a) { - A0(a) = m0() * D(a); - A1(a) = -m1() * D(a); - for (integer b = 0; b < NDIM; ++b) { - if (type != RHO) { - A0(a) -= m0(a) * D(a, b); - A1(a) -= m1(a) * D(a, b); - } - for (integer c = b; c < NDIM; ++c) { - const auto tmp1 = D(a, b, c) * (factor(c, b) / real(2)); - A0(a) += m0(c, b) * tmp1; - A1(a) -= m1(c, b) * tmp1; - } - } - } - - if (type == RHO) { - for (integer a = 0; a != NDIM; ++a) { - for (integer b = 0; b != NDIM; ++b) { - for (integer c = b; c != NDIM; ++c) { - for (integer d = c; d != NDIM; ++d) { - const auto tmp = - D(a, b, c, d) * (factor(b, c, d) / real(6)); - B0[a] -= n0(b, c, d) * tmp; - B1[a] -= n1(b, c, d) * tmp; - } + for (integer d = 0; d < NDIM; ++d) { + dX[d] = X[d] - Y[d]; + } + + // R_i in paper is the dX in the code + // D is taylor expansion value for a given X expansion of the gravitational + // potential (multipole expansion) + taylor<5, simd_vector> D; + // A0, A1 are the contributions to L (for each cell) + taylor<4, simd_vector> A0, A1; + // B0, B1 are the contributions to L_c (for each cell) + std::array B0 = { + simd_vector(ZERO), simd_vector(ZERO), simd_vector(ZERO)}; + std::array B1 = { + simd_vector(ZERO), simd_vector(ZERO), simd_vector(ZERO)}; + + // calculates all D-values, calculate all coefficients of 1/r (not the potential), + // formula (6)-(9) and (19) + D.set_basis(dX); + + // the following loops calculate formula (10), potential from A->B and B->A + // (basically alternating) + A0[0] = m0[0] * D[0]; + A1[0] = m1[0] * D[0]; + if (type != RHO) { + for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { + A0[0] -= m0[i] * D[i]; + A1[0] += m1[i] * D[i]; + } + } + for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { + const auto tmp = D[i] * (factor[i] / real(2)); + A0[0] += m0[i] * tmp; + A1[0] += m1[i] * tmp; + } + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + const auto tmp = D[i] * (factor[i] / real(6)); + A0[0] -= m0[i] * tmp; + A1[0] += m1[i] * tmp; + } + + for (integer a = 0; a < NDIM; ++a) { + A0(a) = m0() * D(a); + A1(a) = -m1() * D(a); + for (integer b = 0; b < NDIM; ++b) { + if (type != RHO) { + A0(a) -= m0(a) * D(a, b); + A1(a) -= m1(a) * D(a, b); + } + for (integer c = b; c < NDIM; ++c) { + const auto tmp1 = D(a, b, c) * (factor(c, b) / real(2)); + A0(a) += m0(c, b) * tmp1; + A1(a) -= m1(c, b) * tmp1; + } + } } - } - } - } - - for (integer a = 0; a < NDIM; ++a) { - for (integer b = a; b < NDIM; ++b) { - A0(a, b) = m0() * D(a, b); - A1(a, b) = m1() * D(a, b); - for (integer c = 0; c < NDIM; ++c) { - const auto &tmp = D(a, b, c); - A0(a, b) -= m0(c) * tmp; - A1(a, b) += m1(c) * tmp; - } - } - } - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - const auto &tmp = D[i]; - A1[i] = -m1[0] * tmp; - A0[i] = m0[0] * tmp; - } + if (type == RHO) { + for (integer a = 0; a != NDIM; ++a) { + for (integer b = 0; b != NDIM; ++b) { + for (integer c = b; c != NDIM; ++c) { + for (integer d = c; d != NDIM; ++d) { + const auto tmp = D(a, b, c, d) * (factor(b, c, d) / real(6)); + B0[a] -= n0(b, c, d) * tmp; + B1[a] -= n1(b, c, d) * tmp; + } + } + } + } + } + + for (integer a = 0; a < NDIM; ++a) { + for (integer b = a; b < NDIM; ++b) { + A0(a, b) = m0() * D(a, b); + A1(a, b) = m1() * D(a, b); + for (integer c = 0; c < NDIM; ++c) { + const auto& tmp = D(a, b, c); + A0(a, b) -= m0(c) * tmp; + A1(a, b) += m1(c) * tmp; + } + } + } + + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + const auto& tmp = D[i]; + A1[i] = -m1[0] * tmp; + A0[i] = m0[0] * tmp; + } #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - // potential and correction have been computed, no scatter the results - for (integer i = 0; i != simd_len && i + li < list_size; ++i) { - const integer iii0 = this_ilist[li + i].first; - const integer iii1 = this_ilist[li + i].second; - expansion &Liii0 = L[iii0]; - expansion &Liii1 = L[iii1]; - - expansion tmp1, tmp2; - #pragma GCC ivdep - for (integer j = 0; j != taylor_sizes[3]; ++j) { - tmp1[j] = A0[j][i]; - tmp2[j] = A1[j][i]; - } - Liii0 += tmp1; - Liii1 += tmp2; - - if (type == RHO && opts.ang_con) { - space_vector &L_ciii0 = L_c[iii0]; - space_vector &L_ciii1 = L_c[iii1]; - for (integer j = 0; j != NDIM; ++j) { - L_ciii0[j] += B0[j][i]; - L_ciii1[j] += B1[j][i]; - } - } - } - }); - } else { + // potential and correction have been computed, no scatter the results + for (integer i = 0; i != simd_len && i + li < list_size; ++i) { + const integer iii0 = this_ilist[li + i].first; + const integer iii1 = this_ilist[li + i].second; + + expansion tmp1, tmp2; +#pragma GCC ivdep + for (integer j = 0; j != taylor_sizes[3]; ++j) { + tmp1[j] = A0[j][i]; + tmp2[j] = A1[j][i]; + } + L[iii0] += tmp1; + L[iii1] += tmp2; + + if (type == RHO && opts.ang_con) { + space_vector& L_ciii0 = L_c[iii0]; + space_vector& L_ciii1 = L_c[iii1]; + for (integer j = 0; j != NDIM; ++j) { + L_ciii0[j] += B0[j][i]; + L_ciii1[j] += B1[j][i]; + } + } + } + }); + } else { #if !defined(HPX_HAVE_DATAPAR) - const v4sd d0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; - const v4sd d1 = {1.0 / dx, -1.0 / sqr(dx), -1.0 / sqr(dx), -1.0 / sqr(dx)}; + const v4sd d0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; + const v4sd d1 = {1.0 / dx, -1.0 / sqr(dx), -1.0 / sqr(dx), -1.0 / sqr(dx)}; #else - // first compute potential and the the three components of the force - // vectorizating across across the dimensions, therefore 1 component for the potential + 3 components for the force + // first compute potential and the the three components of the force + // vectorizating across across the dimensions, therefore 1 component for the potential + 3 + // components for the force - // Coefficients for potential evaluation? (David) - const std::array di0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), - +1.0 / sqr(dx)}; - const v4sd d0(di0.data()); + // Coefficients for potential evaluation? (David) + const std::array di0 = { + 1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; + const v4sd d0(di0.data()); - // negative of d0 because it's the force in the opposite direction - const std::array di1 = {1.0 / dx, -1.0 / sqr(dx), -1.0 / sqr(dx), - -1.0 / sqr(dx)}; - const v4sd d1(di1.data()); + // negative of d0 because it's the force in the opposite direction + const std::array di1 = { + 1.0 / dx, -1.0 / sqr(dx), -1.0 / sqr(dx), -1.0 / sqr(dx)}; + const v4sd d1(di1.data()); #endif - // Number of body-body interactions current leaf cell, probably includes interactions with bodies in neighboring cells (David) - const integer dsize = ilist_d.size(); + // Number of body-body interactions current leaf cell, probably includes interactions with + // bodies in neighboring cells (David) + const integer dsize = ilist_d.size(); - // Iterate interaction description between bodies in the current cell and closeby bodies (David) - for (integer li = 0; li < dsize; ++li) { - // Retrieve the interaction description for the current body (David) - const auto &ele = ilist_d[li]; - // Extract the indices of the two interacting bodies (David) - const integer iii0 = ele.first; - const integer iii1 = ele.second; + // Iterate interaction description between bodies in the current cell and closeby bodies + // (David) + for (integer li = 0; li < dsize; ++li) { + // Retrieve the interaction description for the current body (David) + const auto& ele = ilist_d[li]; + // Extract the indices of the two interacting bodies (David) + const integer iii0 = ele.first; + const integer iii1 = ele.second; #if !defined(HPX_HAVE_DATAPAR) - v4sd m0, m1; + v4sd m0, m1; #pragma GCC ivdep - for (integer i = 0; i != 4; ++i) { - m0[i] = mon[iii1]; - } + for (integer i = 0; i != 4; ++i) { + m0[i] = mon[iii1]; + } #pragma GCC ivdep - for (integer i = 0; i != 4; ++i) { - m1[i] = mon[iii0]; - } -#else - // fetch both interacting bodies (monopoles) (David) - // broadcasts a single value - v4sd m0 = mon[iii1]; - v4sd m1 = mon[iii0]; + for (integer i = 0; i != 4; ++i) { + m1[i] = mon[iii0]; + } + +#ifdef USE_GRAV_PAR + std::lock_guard lock(*L_mtx); #endif + L[iii0] += m0 * ele.four * d0; + L[iii1] += m1 * ele.four * d1; +#else #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - - // potential and force calculation - auto tmp1 = m0 * ele.four * d0; - auto tmp2 = m1 * ele.four * d1; - // Expansion is typedef to taylor<4, real> (David) - // update the results, this is a scatter operation - expansion &Liii0 = L[iii0]; - expansion &Liii1 = L[iii1]; - - // Strange: A taylor<4> creates a array of size 20? -#pragma GCC ivdep - for (integer i = 0; i != 4; ++i) { - Liii0[i] += tmp1[i]; - Liii1[i] += tmp2[i]; - } + // fetch both interacting bodies (monopoles) (David) + // broadcasts a single value + L[iii0] += mon[iii1] * ele.four * d0; + L[iii1] += mon[iii0] * ele.four * d1; +#endif + } } - } - PROF_END; + PROF_END; } -void grid::compute_boundary_interactions(gsolve_type type, - const geo::direction &dir, - bool is_monopole, - const gravity_boundary_type &mpoles) { - if (!is_leaf) { - if (!is_monopole) { - compute_boundary_interactions_multipole_multipole(type, ilist_n_bnd[dir], - mpoles); - } else { - compute_boundary_interactions_monopole_multipole(type, ilist_d_bnd[dir], - mpoles); - } - } else { - if (!is_monopole) { - compute_boundary_interactions_multipole_monopole(type, ilist_d_bnd[dir], - mpoles); +void grid::compute_boundary_interactions(gsolve_type type, const geo::direction& dir, + bool is_monopole, const gravity_boundary_type& mpoles) { + if (!is_leaf) { + if (!is_monopole) { + compute_boundary_interactions_multipole_multipole(type, ilist_n_bnd[dir], mpoles); + } else { + compute_boundary_interactions_monopole_multipole(type, ilist_d_bnd[dir], mpoles); + } } else { - compute_boundary_interactions_monopole_monopole(type, ilist_d_bnd[dir], - mpoles); + if (!is_monopole) { + compute_boundary_interactions_multipole_monopole(type, ilist_d_bnd[dir], mpoles); + } else { + compute_boundary_interactions_monopole_monopole(type, ilist_d_bnd[dir], mpoles); + } } - } } -void grid::compute_boundary_interactions_multipole_multipole( - gsolve_type type, const std::vector &ilist_n_bnd, - const gravity_boundary_type &mpoles) { - PROF_BEGIN; - auto &M = *M_ptr; - auto &mon = *mon_ptr; +void grid::compute_boundary_interactions_multipole_multipole(gsolve_type type, + const std::vector& ilist_n_bnd, + const gravity_boundary_type& mpoles) { + PROF_BEGIN; + auto& M = *M_ptr; + auto& mon = *mon_ptr; - std::vector const &com0 = *(com_ptr[0]); - hpx::parallel::for_loop( - for_loop_policy, 0, ilist_n_bnd.size(), - [&mpoles, &com0, &ilist_n_bnd, type, this, &M](std::size_t si) { + std::vector const& com0 = *(com_ptr[0]); + hpx::parallel::for_loop(for_loop_policy, 0, ilist_n_bnd.size(), + [&mpoles, &com0, &ilist_n_bnd, type, this, &M](std::size_t si) { - taylor<4, simd_vector> m0; - taylor<4, simd_vector> n0; - std::array dX; - std::array X; - space_vector Y; + taylor<4, simd_vector> m0; + taylor<4, simd_vector> n0; + std::array dX; + std::array X; + space_vector Y; - boundary_interaction_type const &bnd = ilist_n_bnd[si]; - integer index = mpoles.is_local ? bnd.second : si; + boundary_interaction_type const& bnd = ilist_n_bnd[si]; + integer index = mpoles.is_local ? bnd.second : si; - load_multipole(m0, Y, mpoles, index, false); + load_multipole(m0, Y, mpoles, index, false); - std::array simdY = { - simd_vector(real(Y[0])), simd_vector(real(Y[1])), - simd_vector(real(Y[2])), - }; + std::array simdY = { + simd_vector(real(Y[0])), simd_vector(real(Y[1])), simd_vector(real(Y[2])), + }; - const integer list_size = bnd.first.size(); - for (integer li = 0; li < list_size; li += simd_len) { + const integer list_size = bnd.first.size(); + for (integer li = 0; li < list_size; li += simd_len) { #pragma GCC ivdep - for (integer i = 0; i != simd_len; ++i) { - const auto index = - std::min(integer(li + i), integer(list_size - 1)); - const integer iii0 = bnd.first[index]; - space_vector const &com0iii0 = com0[iii0]; - for (integer d = 0; d < NDIM; ++d) { - X[d][i] = com0iii0[d]; - } - if (type == RHO) { - multipole const &Miii0 = M[iii0]; - real const tmp = m0()[i] / Miii0(); - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j][i] = m0[j][i] - Miii0[j] * tmp; - } - } - } - if (type != RHO) { + for (integer i = 0; i != simd_len && li + i < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + space_vector const& com0iii0 = com0[iii0]; + for (integer d = 0; d < NDIM; ++d) { + X[d][i] = com0iii0[d]; + } + if (type == RHO) { + multipole const& Miii0 = M[iii0]; + real const tmp = m0()[i] / Miii0(); + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j][i] = m0[j][i] - Miii0[j] * tmp; + } + } + } + if (type != RHO) { #pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j] = ZERO; - } - } + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j] = ZERO; + } + } #pragma GCC ivdep - for (integer d = 0; d < NDIM; ++d) { - dX[d] = X[d] - simdY[d]; - } - - taylor<5, simd_vector> D; - taylor<4, simd_vector> A0; - std::array B0 = { - simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; - - D.set_basis(dX); - A0[0] = m0[0] * D[0]; - if (type != RHO) { - for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { - A0[0] -= m0[i] * D[i]; - } - } - for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { - A0[0] += m0[i] * D[i] * (factor[i] / real(2)); - } - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - A0[0] -= m0[i] * D[i] * (factor[i] / real(6)); - } - - for (integer a = 0; a < NDIM; ++a) { - A0(a) = m0() * D(a); - for (integer b = 0; b < NDIM; ++b) { - if (type != RHO) { - A0(a) -= m0(a) * D(a, b); - } - for (integer c = b; c < NDIM; ++c) { - const auto tmp = D(a, b, c) * (factor(c, b) / real(2)); - A0(a) += m0(c, b) * tmp; - } - } - } - - if (type == RHO) { - for (integer a = 0; a < NDIM; ++a) { - for (integer b = 0; b < NDIM; ++b) { - for (integer c = b; c < NDIM; ++c) { - for (integer d = c; d < NDIM; ++d) { - const auto tmp = - D(a, b, c, d) * (factor(b, c, d) / real(6)); - B0[a] -= n0(b, c, d) * tmp; - } + for (integer d = 0; d < NDIM; ++d) { + dX[d] = X[d] - simdY[d]; + } + + taylor<5, simd_vector> D; + taylor<4, simd_vector> A0; + std::array B0 = { + simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; + + D.set_basis(dX); + A0[0] = m0[0] * D[0]; + if (type != RHO) { + for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { + A0[0] -= m0[i] * D[i]; + } + } + for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { + A0[0] += m0[i] * D[i] * (factor[i] / real(2)); + } + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + A0[0] -= m0[i] * D[i] * (factor[i] / real(6)); + } + + for (integer a = 0; a < NDIM; ++a) { + A0(a) = m0() * D(a); + for (integer b = 0; b < NDIM; ++b) { + if (type != RHO) { + A0(a) -= m0(a) * D(a, b); + } + for (integer c = b; c < NDIM; ++c) { + const auto tmp = D(a, b, c) * (factor(c, b) / real(2)); + A0(a) += m0(c, b) * tmp; + } + } + } + + if (type == RHO) { + for (integer a = 0; a < NDIM; ++a) { + for (integer b = 0; b < NDIM; ++b) { + for (integer c = b; c < NDIM; ++c) { + for (integer d = c; d < NDIM; ++d) { + const auto tmp = D(a, b, c, d) * (factor(b, c, d) / real(6)); + B0[a] -= n0(b, c, d) * tmp; + } + } + } + } + } + + for (integer a = 0; a < NDIM; ++a) { + for (integer b = a; b < NDIM; ++b) { + A0(a, b) = m0() * D(a, b); + for (integer c = 0; c < NDIM; ++c) { + A0(a, b) -= m0(c) * D(a, b, c); + } + } + } + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + A0[i] = m0[0] * D[i]; } - } - } - } - - for (integer a = 0; a < NDIM; ++a) { - for (integer b = a; b < NDIM; ++b) { - A0(a, b) = m0() * D(a, b); - for (integer c = 0; c < NDIM; ++c) { - A0(a, b) -= m0(c) * D(a, b, c); - } - } - } - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - A0[i] = m0[0] * D[i]; - } #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - for (integer i = 0; i != simd_len && i + li < list_size; ++i) { - const integer iii0 = bnd.first[li + i]; - expansion &Liii0 = L[iii0]; + for (integer i = 0; i != simd_len && i + li < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + expansion& Liii0 = L[iii0]; - expansion tmp; + expansion tmp; #pragma GCC ivdep - for (integer j = 0; j != taylor_sizes[3]; ++j) { - tmp[j] = A0[j][i]; - } - Liii0 += tmp; + for (integer j = 0; j != taylor_sizes[3]; ++j) { + tmp[j] = A0[j][i]; + } + Liii0 += tmp; - if (type == RHO && opts.ang_con) { - space_vector &L_ciii0 = L_c[iii0]; + if (type == RHO && opts.ang_con) { + space_vector& L_ciii0 = L_c[iii0]; #pragma GCC ivdep - for (integer j = 0; j != NDIM; ++j) { - L_ciii0[j] += B0[j][i]; - } + for (integer j = 0; j != NDIM; ++j) { + L_ciii0[j] += B0[j][i]; + } + } + } } - } - } - }); - PROF_END; + }); + PROF_END; } -void grid::compute_boundary_interactions_multipole_monopole( - gsolve_type type, const std::vector &ilist_n_bnd, - const gravity_boundary_type &mpoles) { - PROF_BEGIN; - auto &M = *M_ptr; - auto &mon = *mon_ptr; - - std::vector const &com0 = *(com_ptr[0]); - hpx::parallel::for_loop( - for_loop_policy, 0, ilist_n_bnd.size(), - [&mpoles, &com0, &ilist_n_bnd, type, this](std::size_t si) { - - taylor<4, simd_vector> m0; - taylor<4, simd_vector> n0; - std::array dX; - std::array X; - space_vector Y; - - boundary_interaction_type const &bnd = ilist_n_bnd[si]; - const integer list_size = bnd.first.size(); - integer index = mpoles.is_local ? bnd.second : si; - load_multipole(m0, Y, mpoles, index, false); - - std::array simdY = { - simd_vector(real(Y[0])), simd_vector(real(Y[1])), - simd_vector(real(Y[2])), - }; +void grid::compute_boundary_interactions_multipole_monopole(gsolve_type type, + const std::vector& ilist_n_bnd, + const gravity_boundary_type& mpoles) { + PROF_BEGIN; + auto& M = *M_ptr; + auto& mon = *mon_ptr; - if (type == RHO) { -#pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j] = m0[j]; - } - } else { + std::vector const& com0 = *(com_ptr[0]); + hpx::parallel::for_loop(for_loop_policy, 0, ilist_n_bnd.size(), + [&mpoles, &com0, &ilist_n_bnd, type, this](std::size_t si) { + + taylor<4, simd_vector> m0; + taylor<4, simd_vector> n0; + std::array dX; + std::array X; + space_vector Y; + + boundary_interaction_type const& bnd = ilist_n_bnd[si]; + const integer list_size = bnd.first.size(); + integer index = mpoles.is_local ? bnd.second : si; + load_multipole(m0, Y, mpoles, index, false); + + std::array simdY = { + simd_vector(real(Y[0])), simd_vector(real(Y[1])), simd_vector(real(Y[2])), + }; + + if (type == RHO) { #pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j] = ZERO; - } - } - for (integer li = 0; li < list_size; li += simd_len) { + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j] = m0[j]; + } + } else { #pragma GCC ivdep - for (integer i = 0; i != simd_len; ++i) { - const auto index = - std::min(integer(li + i), integer(list_size - 1)); - const integer iii0 = bnd.first[index]; - space_vector const &com0iii0 = com0[iii0]; - for (integer d = 0; d < NDIM; ++d) { - X[d][i] = com0iii0[d]; + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j] = ZERO; + } } - } + for (integer li = 0; li < list_size; li += simd_len) { #pragma GCC ivdep - for (integer d = 0; d < NDIM; ++d) { - dX[d] = X[d] - simdY[d]; - } - - taylor<5, simd_vector> D; - taylor<2, simd_vector> A0; - std::array B0 = { - simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; - - D.set_basis(dX); - A0[0] = m0[0] * D[0]; - if (type != RHO) { - for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { - A0[0] -= m0[i] * D[i]; - } - } - for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { - A0[0] += m0[i] * D[i] * (factor[i] / real(2)); - } - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - A0[0] -= m0[i] * D[i] * (factor[i] / real(6)); - } - - for (integer a = 0; a < NDIM; ++a) { - A0(a) = m0() * D(a); - for (integer b = 0; b < NDIM; ++b) { - if (type != RHO) { - A0(a) -= m0(a) * D(a, b); - } - for (integer c = b; c < NDIM; ++c) { - A0(a) += m0(c, b) * D(a, b, c) * (factor(b, c) / real(2)); - } - } - } - - if (type == RHO) { - for (integer a = 0; a < NDIM; ++a) { - for (integer b = 0; b < NDIM; ++b) { - for (integer c = b; c < NDIM; ++c) { - for (integer d = c; d < NDIM; ++d) { - const auto tmp = - D(a, b, c, d) * (factor(b, c, d) / real(6)); - B0[a] -= n0(b, c, d) * tmp; - } + for (integer i = 0; i != simd_len && li + i < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + space_vector const& com0iii0 = com0[iii0]; + for (integer d = 0; d < NDIM; ++d) { + X[d][i] = com0iii0[d]; + } + } +#pragma GCC ivdep + for (integer d = 0; d < NDIM; ++d) { + dX[d] = X[d] - simdY[d]; + } + + taylor<5, simd_vector> D; + taylor<2, simd_vector> A0; + std::array B0 = { + simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; + + D.set_basis(dX); + A0[0] = m0[0] * D[0]; + if (type != RHO) { + for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { + A0[0] -= m0[i] * D[i]; + } + } + for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { + A0[0] += m0[i] * D[i] * (factor[i] / real(2)); + } + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + A0[0] -= m0[i] * D[i] * (factor[i] / real(6)); + } + + for (integer a = 0; a < NDIM; ++a) { + A0(a) = m0() * D(a); + for (integer b = 0; b < NDIM; ++b) { + if (type != RHO) { + A0(a) -= m0(a) * D(a, b); + } + for (integer c = b; c < NDIM; ++c) { + A0(a) += m0(c, b) * D(a, b, c) * (factor(b, c) / real(2)); + } + } + } + + if (type == RHO) { + for (integer a = 0; a < NDIM; ++a) { + for (integer b = 0; b < NDIM; ++b) { + for (integer c = b; c < NDIM; ++c) { + for (integer d = c; d < NDIM; ++d) { + const auto tmp = D(a, b, c, d) * (factor(b, c, d) / real(6)); + B0[a] -= n0(b, c, d) * tmp; + } + } + } + } } - } - } - } #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - for (integer i = 0; i != simd_len && i + li < list_size; ++i) { - const integer iii0 = bnd.first[li + i]; - expansion &Liii0 = L[iii0]; + for (integer i = 0; i != simd_len && i + li < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + expansion& Liii0 = L[iii0]; #pragma GCC ivdep - for (integer j = 0; j != 4; ++j) { - Liii0[j] += A0[j][i]; - } - if (type == RHO && opts.ang_con) { - space_vector &L_ciii0 = L_c[iii0]; + for (integer j = 0; j != 4; ++j) { + Liii0[j] += A0[j][i]; + } + if (type == RHO && opts.ang_con) { + space_vector& L_ciii0 = L_c[iii0]; #pragma GCC ivdep - for (integer j = 0; j != NDIM; ++j) { - L_ciii0[j] += B0[j][i]; - } + for (integer j = 0; j != NDIM; ++j) { + L_ciii0[j] += B0[j][i]; + } + } + } } - } - } - }); - PROF_END; + }); + PROF_END; } -void grid::compute_boundary_interactions_monopole_multipole( - gsolve_type type, const std::vector &ilist_n_bnd, - const gravity_boundary_type &mpoles) { - PROF_BEGIN; - auto &M = *M_ptr; - auto &mon = *mon_ptr; +void grid::compute_boundary_interactions_monopole_multipole(gsolve_type type, + const std::vector& ilist_n_bnd, + const gravity_boundary_type& mpoles) { + PROF_BEGIN; + auto& M = *M_ptr; + auto& mon = *mon_ptr; - std::array Xbase = {X[0][hindex(H_BW, H_BW, H_BW)], - X[1][hindex(H_BW, H_BW, H_BW)], - X[2][hindex(H_BW, H_BW, H_BW)]}; + std::array Xbase = {X[0][hindex(H_BW, H_BW, H_BW)], X[1][hindex(H_BW, H_BW, H_BW)], + X[2][hindex(H_BW, H_BW, H_BW)]}; - std::vector const &com0 = *(com_ptr[0]); - hpx::parallel::for_loop( - for_loop_policy, 0, ilist_n_bnd.size(), - [&mpoles, &Xbase, &com0, &ilist_n_bnd, type, this, &M](std::size_t si) { + std::vector const& com0 = *(com_ptr[0]); + hpx::parallel::for_loop(for_loop_policy, 0, ilist_n_bnd.size(), + [&mpoles, &Xbase, &com0, &ilist_n_bnd, type, this, &M](std::size_t si) { - simd_vector m0; - taylor<4, simd_vector> n0; - std::array dX; - std::array X; - std::array Y; + simd_vector m0; + taylor<4, simd_vector> n0; + std::array dX; + std::array X; + std::array Y; - boundary_interaction_type const &bnd = ilist_n_bnd[si]; - integer index = mpoles.is_local ? bnd.second : si; - const integer list_size = bnd.first.size(); + boundary_interaction_type const& bnd = ilist_n_bnd[si]; + integer index = mpoles.is_local ? bnd.second : si; + const integer list_size = bnd.first.size(); #pragma GCC ivdep - for (integer d = 0; d != NDIM; ++d) { - Y[d] = bnd.x[d] * dx + Xbase[d]; - } + for (integer d = 0; d != NDIM; ++d) { + Y[d] = bnd.x[d] * dx + Xbase[d]; + } - m0 = (*(mpoles.m))[index]; - for (integer li = 0; li < list_size; li += simd_len) { - for (integer i = 0; i != simd_len && li + i < list_size; ++i) { - const integer iii0 = bnd.first[li + i]; - space_vector const &com0iii0 = com0[iii0]; + m0 = (*(mpoles.m))[index]; + for (integer li = 0; li < list_size; li += simd_len) { + for (integer i = 0; i != simd_len && li + i < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + space_vector const& com0iii0 = com0[iii0]; #pragma GCC ivdep - for (integer d = 0; d < NDIM; ++d) { - X[d][i] = com0iii0[d]; - } - if (type == RHO) { - multipole const &Miii0 = M[iii0]; - real const tmp = m0[i] / Miii0(); + for (integer d = 0; d < NDIM; ++d) { + X[d][i] = com0iii0[d]; + } + if (type == RHO) { + multipole const& Miii0 = M[iii0]; + real const tmp = m0[i] / Miii0(); #pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j][i] = -Miii0[j] * tmp; - } - } - } + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j][i] = -Miii0[j] * tmp; + } + } + } - if (type != RHO) { + if (type != RHO) { #pragma GCC ivdep - for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { - n0[j] = ZERO; - } - } + for (integer j = taylor_sizes[2]; j != taylor_sizes[3]; ++j) { + n0[j] = ZERO; + } + } #pragma GCC ivdep - for (integer d = 0; d < NDIM; ++d) { - dX[d] = X[d] - Y[d]; - } - - taylor<5, simd_vector> D; - taylor<4, simd_vector> A0; - std::array B0 = { - simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; - - D.set_basis(dX); - - A0[0] = m0 * D[0]; - for (integer i = taylor_sizes[0]; i != taylor_sizes[2]; ++i) { - A0[i] = m0 * D[i]; - } - if (type == RHO) { - for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { - A0[i] = m0 * D[i]; - } - } - - if (type == RHO) { - for (integer a = 0; a != NDIM; ++a) { - for (integer b = 0; b != NDIM; ++b) { - for (integer c = b; c != NDIM; ++c) { - for (integer d = c; d != NDIM; ++d) { - const auto tmp = - D(a, b, c, d) * (factor(b, c, d) / real(6)); - B0[a] -= n0(b, c, d) * tmp; - } + for (integer d = 0; d < NDIM; ++d) { + dX[d] = X[d] - Y[d]; + } + + taylor<5, simd_vector> D; + taylor<4, simd_vector> A0; + std::array B0 = { + simd_vector(0.0), simd_vector(0.0), simd_vector(0.0)}; + + D.set_basis(dX); + + A0[0] = m0 * D[0]; + for (integer i = taylor_sizes[0]; i != taylor_sizes[2]; ++i) { + A0[i] = m0 * D[i]; + } + if (type == RHO) { + for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + A0[i] = m0 * D[i]; + } + } + + if (type == RHO) { + for (integer a = 0; a != NDIM; ++a) { + for (integer b = 0; b != NDIM; ++b) { + for (integer c = b; c != NDIM; ++c) { + for (integer d = c; d != NDIM; ++d) { + const auto tmp = D(a, b, c, d) * (factor(b, c, d) / real(6)); + B0[a] -= n0(b, c, d) * tmp; + } + } + } + } } - } - } - } #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - for (integer i = 0; i != simd_len && i + li < list_size; ++i) { - const integer iii0 = bnd.first[li + i]; - expansion &Liii0 = L[iii0]; + for (integer i = 0; i != simd_len && i + li < list_size; ++i) { + const integer iii0 = bnd.first[li + i]; + expansion& Liii0 = L[iii0]; #pragma GCC ivdep - for (integer j = 0; j != 20; ++j) { - Liii0[j] += A0[j][i]; - } + for (integer j = 0; j != 20; ++j) { + Liii0[j] += A0[j][i]; + } - if (type == RHO && opts.ang_con) { - space_vector &L_ciii0 = L_c[iii0]; + if (type == RHO && opts.ang_con) { + space_vector& L_ciii0 = L_c[iii0]; #pragma GCC ivdep - for (integer j = 0; j != NDIM; ++j) { - L_ciii0[j] += B0[j][i]; - } + for (integer j = 0; j != NDIM; ++j) { + L_ciii0[j] += B0[j][i]; + } + } + } } - } - } - }); - PROF_END; + }); + PROF_END; } -void grid::compute_boundary_interactions_monopole_monopole( - gsolve_type type, const std::vector &ilist_n_bnd, - const gravity_boundary_type &mpoles) { - PROF_BEGIN; - auto &M = *M_ptr; - auto &mon = *mon_ptr; +void grid::compute_boundary_interactions_monopole_monopole(gsolve_type type, + const std::vector& ilist_n_bnd, + const gravity_boundary_type& mpoles) { + PROF_BEGIN; + auto& M = *M_ptr; + auto& mon = *mon_ptr; #if !defined(HPX_HAVE_DATAPAR) - const v4sd d0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; + const v4sd d0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; #else - const std::array di0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), - +1.0 / sqr(dx)}; - const v4sd d0(di0.data()); + const std::array di0 = {1.0 / dx, +1.0 / sqr(dx), +1.0 / sqr(dx), +1.0 / sqr(dx)}; + const v4sd d0(di0.data()); #endif - hpx::parallel::for_loop( - for_loop_policy, 0, ilist_n_bnd.size(), - [&mpoles, &ilist_n_bnd, &d0, this](std::size_t si) { - - boundary_interaction_type const &bnd = ilist_n_bnd[si]; - const integer dsize = bnd.first.size(); - const integer lev = 0; - integer index = mpoles.is_local ? bnd.second : si; + hpx::parallel::for_loop( + for_loop_policy, 0, ilist_n_bnd.size(), [&mpoles, &ilist_n_bnd, &d0, this](std::size_t si) { + + boundary_interaction_type const& bnd = ilist_n_bnd[si]; + const integer dsize = bnd.first.size(); + const integer lev = 0; + integer index = mpoles.is_local ? bnd.second : si; #if !defined(HPX_HAVE_DATAPAR) - const auto tmp = (*(mpoles).m)[index]; - v4sd m0; + const auto tmp = (*(mpoles).m)[index]; + v4sd m0; #pragma GCC ivdep - for (integer i = 0; i != 4; ++i) { - m0[i] = tmp; - } + for (integer i = 0; i != 4; ++i) { + m0[i] = tmp; + } #else - v4sd m0 = (*(mpoles).m)[index]; + v4sd m0 = (*(mpoles).m)[index]; #endif - m0 *= d0; + m0 *= d0; #ifdef USE_GRAV_PAR - std::lock_guard lock(*L_mtx); + std::lock_guard lock(*L_mtx); #endif - for (integer li = 0; li < dsize; ++li) { - const integer iii0 = bnd.first[li]; - auto tmp1 = m0 * bnd.four[li]; - expansion &Liii0 = L[iii0]; + for (integer li = 0; li < dsize; ++li) { + const integer iii0 = bnd.first[li]; + auto tmp1 = m0 * bnd.four[li]; + expansion& Liii0 = L[iii0]; #pragma GCC ivdep - for (integer i = 0; i != 4; ++i) { - Liii0[i] += tmp1[i]; - } - } - }); - PROF_END; + for (integer i = 0; i != 4; ++i) { + Liii0[i] += tmp1[i]; + } + } + }); + PROF_END; } void compute_ilist() { - - factor = 0.0; - factor() += 1.0; - for (integer a = 0; a < NDIM; ++a) { - factor(a) += 1.0; - for (integer b = 0; b < NDIM; ++b) { - factor(a, b) += 1.0; - for (integer c = 0; c < NDIM; ++c) { - factor(a, b, c) += 1.0; - } - } - } - - const integer inx = INX; - const integer nx = inx; - interaction_type np; - interaction_type dp; - std::vector ilist_r0; - std::vector ilist_n0; - std::vector ilist_d0; - std::array, geo::direction::count()> - ilist_n0_bnd; - std::array, geo::direction::count()> - ilist_d0_bnd; - const real theta0 = opts.theta; - const auto theta = [](integer i0, integer j0, integer k0, integer i1, - integer j1, integer k1) { - real tmp = (sqr(i0 - i1) + sqr(j0 - j1) + sqr(k0 - k1)); - // protect against sqrt(0) - if (tmp > 0.0) { - return 1.0 / (std::sqrt(tmp)); - } else { - return 1.0e+10; - } - }; - const auto interior = [](integer i, integer j, integer k) { - bool rc = true; - if (i < 0 || i >= G_NX) { - rc = false; - } else if (j < 0 || j >= G_NX) { - rc = false; - } else if (k < 0 || k >= G_NX) { - rc = false; - } - return rc; - }; - const auto neighbor_dir = [](integer i, integer j, integer k) { - integer i0 = 0, j0 = 0, k0 = 0; - if (i < 0) { - i0 = -1; - } else if (i >= G_NX) { - i0 = +1; - } - if (j < 0) { - j0 = -1; - } else if (j >= G_NX) { - j0 = +1; - } - if (k < 0) { - k0 = -1; - } else if (k >= G_NX) { - k0 = +1; - } - geo::direction d; - d.set(i0, j0, k0); - return d; - }; - integer max_d = 0; - integer width = INX; - for (integer i0 = 0; i0 != nx; ++i0) { - for (integer j0 = 0; j0 != nx; ++j0) { - for (integer k0 = 0; k0 != nx; ++k0) { - integer this_d = 0; - integer ilb = std::min(i0 - width, integer(0)); - integer jlb = std::min(j0 - width, integer(0)); - integer klb = std::min(k0 - width, integer(0)); - integer iub = std::max(i0 + width + 1, G_NX); - integer jub = std::max(j0 + width + 1, G_NX); - integer kub = std::max(k0 + width + 1, G_NX); - for (integer i1 = ilb; i1 < iub; ++i1) { - for (integer j1 = jlb; j1 < jub; ++j1) { - for (integer k1 = klb; k1 < kub; ++k1) { - const real x = i0 - i1; - const real y = j0 - j1; - const real z = k0 - k1; - // protect against sqrt(0) - const real tmp = sqr(x) + sqr(y) + sqr(z); - const real r = (tmp == 0) ? 0 : std::sqrt(tmp); - const real r3 = r * r * r; - v4sd four; - if (r > 0.0) { - four[0] = -1.0 / r; - four[1] = x / r3; - four[2] = y / r3; - four[3] = z / r3; - } else { - for (integer i = 0; i != 4; ++i) { - four[i] = 0.0; - } - } - if (i0 == i1 && j0 == j1 && k0 == k1) { - continue; - } - const integer i0_c = (i0 + INX) / 2 - INX / 2; - const integer j0_c = (j0 + INX) / 2 - INX / 2; - const integer k0_c = (k0 + INX) / 2 - INX / 2; - const integer i1_c = (i1 + INX) / 2 - INX / 2; - const integer j1_c = (j1 + INX) / 2 - INX / 2; - const integer k1_c = (k1 + INX) / 2 - INX / 2; - const real theta_f = theta(i0, j0, k0, i1, j1, k1); - const real theta_c = theta(i0_c, j0_c, k0_c, i1_c, j1_c, k1_c); - const integer iii0 = gindex(i0, j0, k0); - const integer iii1n = - gindex((i1 + INX) % INX, (j1 + INX) % INX, (k1 + INX) % INX); - const integer iii1 = gindex(i1, j1, k1); - if (theta_c > theta0 && theta_f <= theta0) { - np.first = iii0; - np.second = iii1n; - np.four = four; - np.x[XDIM] = i1; - np.x[YDIM] = j1; - np.x[ZDIM] = k1; - if (interior(i1, j1, k1) && interior(i0, j0, k0)) { - if (iii1 > iii0) { - ilist_n0.push_back(np); - } - } else if (interior(i0, j0, k0)) { - ilist_n0_bnd[neighbor_dir(i1, j1, k1)].push_back(np); - } - } - if (theta_c > theta0) { - ++this_d; - dp.first = iii0; - dp.second = iii1n; - dp.x[XDIM] = i1; - dp.x[YDIM] = j1; - dp.x[ZDIM] = k1; - dp.four = four; - if (interior(i1, j1, k1) && interior(i0, j0, k0)) { - if (iii1 > iii0) { - ilist_d0.push_back(dp); - } - } else if (interior(i0, j0, k0)) { - ilist_d0_bnd[neighbor_dir(i1, j1, k1)].push_back(dp); - } - } - if (theta_f <= theta0) { - np.first = iii0; - np.second = iii1n; - np.x[XDIM] = i1; - np.x[YDIM] = j1; - np.x[ZDIM] = k1; - np.four = four; - if (interior(i1, j1, k1) && interior(i0, j0, k0)) { - if (iii1 > iii0) { - ilist_r0.push_back(np); - } - } - } + factor = 0.0; + factor() += 1.0; + for (integer a = 0; a < NDIM; ++a) { + factor(a) += 1.0; + for (integer b = 0; b < NDIM; ++b) { + factor(a, b) += 1.0; + for (integer c = 0; c < NDIM; ++c) { + factor(a, b, c) += 1.0; } - } - } - max_d = std::max(max_d, this_d); - } - } - } - printf("# direct = %i\n", int(max_d)); - ilist_n = std::vector(ilist_n0.begin(), ilist_n0.end()); - ilist_d = std::vector(ilist_d0.begin(), ilist_d0.end()); - ilist_r = std::vector(ilist_r0.begin(), ilist_r0.end()); - for (auto &dir : geo::direction::full_set()) { - auto &d = ilist_d_bnd[dir]; - auto &d0 = ilist_d0_bnd[dir]; - auto &n = ilist_n_bnd[dir]; - auto &n0 = ilist_n0_bnd[dir]; - for (auto i0 : d0) { - bool found = false; - for (auto &i : d) { - if (i.second == i0.second) { - i.first.push_back(i0.first); - i.four.push_back(i0.four); - found = true; - break; } - } - if (!found) { - boundary_interaction_type i; - i.second = i0.second; - i.x = i0.x; - n.push_back(i); - i.first.push_back(i0.first); - i.four.push_back(i0.four); - d.push_back(i); - } } - for (auto i0 : n0) { - bool found = false; - for (auto &i : n) { - if (i.second == i0.second) { - i.first.push_back(i0.first); - i.four.push_back(i0.four); - found = true; - break; - } - } - assert(found); - } - } -} -expansion_pass_type -grid::compute_expansions(gsolve_type type, - const expansion_pass_type *parent_expansions) { - PROF_BEGIN; - expansion_pass_type exp_ret; - if (!is_leaf) { - exp_ret.first.resize(INX * INX * INX); - if (type == RHO) { - exp_ret.second.resize(INX * INX * INX); - } - } - const integer inx = INX; - const integer nxp = (inx / 2); - auto child_index = [=](integer ip, integer jp, integer kp, integer ci, - integer bw = 0) -> integer { - const integer ic = (2 * (ip) + bw) + ((ci >> 0) & 1); - const integer jc = (2 * (jp) + bw) + ((ci >> 1) & 1); - const integer kc = (2 * (kp) + bw) + ((ci >> 2) & 1); - return (inx + 2 * bw) * (inx + 2 * bw) * ic + (inx + 2 * bw) * jc + kc; - }; - - for (integer ip = 0; ip != nxp; ++ip) { - for (integer jp = 0; jp != nxp; ++jp) { - for (integer kp = 0; kp != nxp; ++kp) { - const integer iiip = nxp * nxp * ip + nxp * jp + kp; - std::array X; - std::array dX; - taylor<4, simd_vector> l; - std::array lc; - if (!is_root) { - const integer index = - (INX * INX / 4) * (ip) + (INX / 2) * (jp) + (kp); - for (integer j = 0; j != 20; ++j) { - l[j] = parent_expansions->first[index][j]; - } - if (type == RHO && opts.ang_con) { - for (integer j = 0; j != NDIM; ++j) { - lc[j] = parent_expansions->second[index][j]; - } - } + const integer inx = INX; + const integer nx = inx; + interaction_type np; + interaction_type dp; + std::vector ilist_r0; + std::vector ilist_n0; + std::vector ilist_d0; + std::array, geo::direction::count()> ilist_n0_bnd; + std::array, geo::direction::count()> ilist_d0_bnd; + const real theta0 = opts.theta; + const auto theta = [](integer i0, integer j0, integer k0, integer i1, integer j1, integer k1) { + real tmp = (sqr(i0 - i1) + sqr(j0 - j1) + sqr(k0 - k1)); + // protect against sqrt(0) + if (tmp > 0.0) { + return 1.0 / (std::sqrt(tmp)); } else { - for (integer j = 0; j != 20; ++j) { - l[j] = 0.0; - } - if (opts.ang_con) { - for (integer j = 0; j != NDIM; ++j) { - lc[j] = 0.0; - } - } + return 1.0e+10; + } + }; + const auto interior = [](integer i, integer j, integer k) { + bool rc = true; + if (i < 0 || i >= G_NX) { + rc = false; + } else if (j < 0 || j >= G_NX) { + rc = false; + } else if (k < 0 || k >= G_NX) { + rc = false; + } + return rc; + }; + const auto neighbor_dir = [](integer i, integer j, integer k) { + integer i0 = 0, j0 = 0, k0 = 0; + if (i < 0) { + i0 = -1; + } else if (i >= G_NX) { + i0 = +1; } - for (integer ci = 0; ci != NCHILD; ++ci) { - const integer iiic = child_index(ip, jp, kp, ci); - for (integer d = 0; d < NDIM; ++d) { - X[d][ci] = (*(com_ptr[0]))[iiic][d]; - } + if (j < 0) { + j0 = -1; + } else if (j >= G_NX) { + j0 = +1; } - const auto &Y = (*(com_ptr[1]))[iiip]; - for (integer d = 0; d < NDIM; ++d) { - dX[d] = X[d] - Y[d]; + if (k < 0) { + k0 = -1; + } else if (k >= G_NX) { + k0 = +1; } - l <<= dX; - for (integer ci = 0; ci != NCHILD; ++ci) { - const integer iiic = child_index(ip, jp, kp, ci); - expansion &Liiic = L[iiic]; - for (integer j = 0; j != 20; ++j) { - Liiic[j] += l[j][ci]; - } - - space_vector &L_ciiic = L_c[iiic]; - if (opts.ang_con) { - if (type == RHO && opts.ang_con) { - for (integer j = 0; j != NDIM; ++j) { - L_ciiic[j] += lc[j][ci]; - } + geo::direction d; + d.set(i0, j0, k0); + return d; + }; + integer max_d = 0; + integer width = INX; + for (integer i0 = 0; i0 != nx; ++i0) { + for (integer j0 = 0; j0 != nx; ++j0) { + for (integer k0 = 0; k0 != nx; ++k0) { + integer this_d = 0; + integer ilb = std::min(i0 - width, integer(0)); + integer jlb = std::min(j0 - width, integer(0)); + integer klb = std::min(k0 - width, integer(0)); + integer iub = std::max(i0 + width + 1, G_NX); + integer jub = std::max(j0 + width + 1, G_NX); + integer kub = std::max(k0 + width + 1, G_NX); + for (integer i1 = ilb; i1 < iub; ++i1) { + for (integer j1 = jlb; j1 < jub; ++j1) { + for (integer k1 = klb; k1 < kub; ++k1) { + const real x = i0 - i1; + const real y = j0 - j1; + const real z = k0 - k1; + // protect against sqrt(0) + const real tmp = sqr(x) + sqr(y) + sqr(z); + const real r = (tmp == 0) ? 0 : std::sqrt(tmp); + const real r3 = r * r * r; + v4sd four; + if (r > 0.0) { + four[0] = -1.0 / r; + four[1] = x / r3; + four[2] = y / r3; + four[3] = z / r3; + } else { + for (integer i = 0; i != 4; ++i) { + four[i] = 0.0; + } + } + if (i0 == i1 && j0 == j1 && k0 == k1) { + continue; + } + const integer i0_c = (i0 + INX) / 2 - INX / 2; + const integer j0_c = (j0 + INX) / 2 - INX / 2; + const integer k0_c = (k0 + INX) / 2 - INX / 2; + const integer i1_c = (i1 + INX) / 2 - INX / 2; + const integer j1_c = (j1 + INX) / 2 - INX / 2; + const integer k1_c = (k1 + INX) / 2 - INX / 2; + const real theta_f = theta(i0, j0, k0, i1, j1, k1); + const real theta_c = theta(i0_c, j0_c, k0_c, i1_c, j1_c, k1_c); + const integer iii0 = gindex(i0, j0, k0); + const integer iii1n = + gindex((i1 + INX) % INX, (j1 + INX) % INX, (k1 + INX) % INX); + const integer iii1 = gindex(i1, j1, k1); + if (theta_c > theta0 && theta_f <= theta0) { + np.first = iii0; + np.second = iii1n; + np.four = four; + np.x[XDIM] = i1; + np.x[YDIM] = j1; + np.x[ZDIM] = k1; + if (interior(i1, j1, k1) && interior(i0, j0, k0)) { + if (iii1 > iii0) { + ilist_n0.push_back(np); + } + } else if (interior(i0, j0, k0)) { + ilist_n0_bnd[neighbor_dir(i1, j1, k1)].push_back(np); + } + } + if (theta_c > theta0) { + ++this_d; + dp.first = iii0; + dp.second = iii1n; + dp.x[XDIM] = i1; + dp.x[YDIM] = j1; + dp.x[ZDIM] = k1; + dp.four = four; + if (interior(i1, j1, k1) && interior(i0, j0, k0)) { + if (iii1 > iii0) { + ilist_d0.push_back(dp); + } + } else if (interior(i0, j0, k0)) { + ilist_d0_bnd[neighbor_dir(i1, j1, k1)].push_back(dp); + } + } + if (theta_f <= theta0) { + np.first = iii0; + np.second = iii1n; + np.x[XDIM] = i1; + np.x[YDIM] = j1; + np.x[ZDIM] = k1; + np.four = four; + if (interior(i1, j1, k1) && interior(i0, j0, k0)) { + if (iii1 > iii0) { + ilist_r0.push_back(np); + } + } + } + } + } + } + max_d = std::max(max_d, this_d); } - } - - if (!is_leaf) { - integer index = child_index(ip, jp, kp, ci, 0); - for (integer j = 0; j != 20; ++j) { - exp_ret.first[index][j] = Liiic[j]; + } + } + printf("# direct = %i\n", int(max_d)); + ilist_n = std::vector(ilist_n0.begin(), ilist_n0.end()); + ilist_d = std::vector(ilist_d0.begin(), ilist_d0.end()); + ilist_r = std::vector(ilist_r0.begin(), ilist_r0.end()); + for (auto& dir : geo::direction::full_set()) { + auto& d = ilist_d_bnd[dir]; + auto& d0 = ilist_d0_bnd[dir]; + auto& n = ilist_n_bnd[dir]; + auto& n0 = ilist_n0_bnd[dir]; + for (auto i0 : d0) { + bool found = false; + for (auto& i : d) { + if (i.second == i0.second) { + i.first.push_back(i0.first); + i.four.push_back(i0.four); + found = true; + break; + } } - - if (type == RHO && opts.ang_con) { - for (integer j = 0; j != 3; ++j) { - exp_ret.second[index][j] = L_ciiic[j]; - } + if (!found) { + boundary_interaction_type i; + i.second = i0.second; + i.x = i0.x; + n.push_back(i); + i.first.push_back(i0.first); + i.four.push_back(i0.four); + d.push_back(i); } - } } - } - } - } - - if (is_leaf) { - for (integer i = 0; i != G_NX; ++i) { - for (integer j = 0; j != G_NX; ++j) { - for (integer k = 0; k != G_NX; ++k) { - const integer iii = gindex(i, j, k); - const integer iii0 = h0index(i, j, k); - const integer iiih = hindex(i + H_BW, j + H_BW, k + H_BW); - if (type == RHO) { - G[iii][phi_i] = L[iii](); - for (integer d = 0; d < NDIM; ++d) { - G[iii][gx_i + d] = -L[iii](d); - if (opts.ang_con == true) { - G[iii][gx_i + d] -= L_c[iii][d]; - } + for (auto i0 : n0) { + bool found = false; + for (auto& i : n) { + if (i.second == i0.second) { + i.first.push_back(i0.first); + i.four.push_back(i0.four); + found = true; + break; + } } - U[pot_i][iiih] = G[iii][phi_i] * U[rho_i][iiih]; - } else { - dphi_dt[iii0] = L[iii](); - } + assert(found); } - } } - } - PROF_END; - return exp_ret; } -multipole_pass_type -grid::compute_multipoles(gsolve_type type, - const multipole_pass_type *child_poles) { - - // if( int(*Muse_counter) > 0) - // printf( "%i\n", int(*Muse_counter)); - PROF_BEGIN; - integer lev = 0; - const real dx3 = dx * dx * dx; - M_ptr = std::make_shared>(); - mon_ptr = std::make_shared>(); - if (com_ptr[1] == nullptr) { - com_ptr[1] = std::make_shared>(G_N3 / 8); - } - if (type == RHO) { - com_ptr[0] = std::make_shared>(G_N3); - } - auto &M = *M_ptr; - auto &mon = *mon_ptr; - if (is_leaf) { - M.resize(0); - mon.resize(G_N3); - } else { - M.resize(G_N3); - mon.resize(0); - } - if (type == RHO) { - const integer iii0 = hindex(H_BW, H_BW, H_BW); - const std::array x0 = {X[XDIM][iii0], X[YDIM][iii0], - X[ZDIM][iii0]}; - - // std::array i; - // for (i[0] = 0; i[0] != G_NX; ++i[0]) { - // for (i[1] = 0; i[1] != G_NX; ++i[1]) { - // for (i[2] = 0; i[2] != G_NX; ++i[2]) { - // const integer iii = gindex(i[0], i[1], i[2]); - // for (integer dim = 0; dim != NDIM; ++dim) { - // (*(com_ptr[0]))[iii][dim] = x0[dim] + (i[dim]) * - // dx; - // } - // } - // } - // } - for (integer i = 0; i != G_NX; ++i) { - for (integer j = 0; j != G_NX; ++j) { - for (integer k = 0; k != G_NX; ++k) { - auto &com0iii = (*(com_ptr[0]))[gindex(i, j, k)]; - // for (integer dim = 0; dim != NDIM; ++dim) - com0iii[0] = x0[0] + i * dx; - com0iii[1] = x0[1] + j * dx; - com0iii[2] = x0[2] + k * dx; +expansion_pass_type grid::compute_expansions( + gsolve_type type, const expansion_pass_type* parent_expansions) { + PROF_BEGIN; + expansion_pass_type exp_ret; + if (!is_leaf) { + exp_ret.first.resize(INX * INX * INX); + if (type == RHO) { + exp_ret.second.resize(INX * INX * INX); } - } } - } - - multipole_pass_type mret; - if (!is_root) { - mret.first.resize(INX * INX * INX / NCHILD); - mret.second.resize(INX * INX * INX / NCHILD); - } - taylor<4, real> MM; - integer index = 0; - for (integer inx = INX; (inx >= INX / 2); inx >>= 1) { - - const integer nxp = inx; - const integer nxc = (2 * inx); - - auto child_index = [=](integer ip, integer jp, integer kp, - integer ci) -> integer { - const integer ic = (2 * ip) + ((ci >> 0) & 1); - const integer jc = (2 * jp) + ((ci >> 1) & 1); - const integer kc = (2 * kp) + ((ci >> 2) & 1); - return nxc * nxc * ic + nxc * jc + kc; + const integer inx = INX; + const integer nxp = (inx / 2); + auto child_index = [=]( + integer ip, integer jp, integer kp, integer ci, integer bw = 0) -> integer { + const integer ic = (2 * (ip) + bw) + ((ci >> 0) & 1); + const integer jc = (2 * (jp) + bw) + ((ci >> 1) & 1); + const integer kc = (2 * (kp) + bw) + ((ci >> 2) & 1); + return (inx + 2 * bw) * (inx + 2 * bw) * ic + (inx + 2 * bw) * jc + kc; }; for (integer ip = 0; ip != nxp; ++ip) { - for (integer jp = 0; jp != nxp; ++jp) { - for (integer kp = 0; kp != nxp; ++kp) { - const integer iiip = nxp * nxp * ip + nxp * jp + kp; - if (lev != 0) { - if (type == RHO) { - simd_vector mc; - std::array X; - for (integer ci = 0; ci != NCHILD; ++ci) { - const integer iiic = child_index(ip, jp, kp, ci); - if (is_leaf) { - mc[ci] = mon[iiic]; + for (integer jp = 0; jp != nxp; ++jp) { + for (integer kp = 0; kp != nxp; ++kp) { + const integer iiip = nxp * nxp * ip + nxp * jp + kp; + std::array X; + std::array dX; + taylor<4, simd_vector> l; + std::array lc; + if (!is_root) { + const integer index = (INX * INX / 4) * (ip) + (INX / 2) * (jp) + (kp); + for (integer j = 0; j != 20; ++j) { + l[j] = parent_expansions->first[index][j]; + } + if (type == RHO && opts.ang_con) { + for (integer j = 0; j != NDIM; ++j) { + lc[j] = parent_expansions->second[index][j]; + } + } } else { - mc[ci] = M[iiic](); + for (integer j = 0; j != 20; ++j) { + l[j] = 0.0; + } + if (opts.ang_con) { + for (integer j = 0; j != NDIM; ++j) { + lc[j] = 0.0; + } + } } - for (integer d = 0; d < NDIM; ++d) { - X[d][ci] = (*(com_ptr[0]))[iiic][d]; + for (integer ci = 0; ci != NCHILD; ++ci) { + const integer iiic = child_index(ip, jp, kp, ci); + for (integer d = 0; d < NDIM; ++d) { + X[d][ci] = (*(com_ptr[0]))[iiic][d]; + } } - } - real mtot = mc.sum(); - for (integer d = 0; d < NDIM; ++d) { - (*(com_ptr[1]))[iiip][d] = (X[d] * mc).sum() / mtot; - } - } - taylor<4, simd_vector> mc, mp; - std::array x, y, dx; - for (integer ci = 0; ci != NCHILD; ++ci) { - const integer iiic = child_index(ip, jp, kp, ci); - const space_vector &X = (*(com_ptr[lev - 1]))[iiic]; - if (is_leaf) { - mc()[ci] = mon[iiic]; - for (integer j = 1; j != 20; ++j) { - mc.ptr()[j][ci] = 0.0; + const auto& Y = (*(com_ptr[1]))[iiip]; + for (integer d = 0; d < NDIM; ++d) { + dX[d] = X[d] - Y[d]; } - } else { - for (integer j = 0; j != 20; ++j) { - mc.ptr()[j][ci] = M[iiic].ptr()[j]; + l <<= dX; + for (integer ci = 0; ci != NCHILD; ++ci) { + const integer iiic = child_index(ip, jp, kp, ci); + expansion& Liiic = L[iiic]; + for (integer j = 0; j != 20; ++j) { + Liiic[j] += l[j][ci]; + } + + space_vector& L_ciiic = L_c[iiic]; + if (opts.ang_con) { + if (type == RHO && opts.ang_con) { + for (integer j = 0; j != NDIM; ++j) { + L_ciiic[j] += lc[j][ci]; + } + } + } + + if (!is_leaf) { + integer index = child_index(ip, jp, kp, ci, 0); + for (integer j = 0; j != 20; ++j) { + exp_ret.first[index][j] = Liiic[j]; + } + + if (type == RHO && opts.ang_con) { + for (integer j = 0; j != 3; ++j) { + exp_ret.second[index][j] = L_ciiic[j]; + } + } + } } - } - for (integer d = 0; d < NDIM; ++d) { - x[d][ci] = X[d]; - } } - const space_vector &Y = (*(com_ptr[lev]))[iiip]; - for (integer d = 0; d < NDIM; ++d) { - dx[d] = x[d] - simd_vector(Y[d]); - } - mp = mc >> dx; - for (integer j = 0; j != 20; ++j) { - MM[j] = mp[j].sum(); - } - } else { - if (child_poles == nullptr) { - const integer iiih = hindex(ip + H_BW, jp + H_BW, kp + H_BW); - const integer iii0 = h0index(ip, jp, kp); - if (type == RHO) { - mon[iiip] = U[rho_i][iiih] * dx3; - } else { - mon[iiip] = dUdt[rho_i][iii0] * dx3; - } - } else { - M[iiip] = child_poles->first[index]; - if (type == RHO) { - (*(com_ptr)[lev])[iiip] = child_poles->second[index]; - } - ++index; + } + } + + if (is_leaf) { + for (integer i = 0; i != G_NX; ++i) { + for (integer j = 0; j != G_NX; ++j) { + for (integer k = 0; k != G_NX; ++k) { + const integer iii = gindex(i, j, k); + const integer iii0 = h0index(i, j, k); + const integer iiih = hindex(i + H_BW, j + H_BW, k + H_BW); + if (type == RHO) { + G[iii][phi_i] = L[iii](); + for (integer d = 0; d < NDIM; ++d) { + G[iii][gx_i + d] = -L[iii](d); + if (opts.ang_con == true) { + G[iii][gx_i + d] -= L_c[iii][d]; + } + } + U[pot_i][iiih] = G[iii][phi_i] * U[rho_i][iiih]; + } else { + dphi_dt[iii0] = L[iii](); + } + } } - } - if (!is_root && (lev == 1)) { - mret.first[index] = MM; - mret.second[index] = (*(com_ptr[lev]))[iiip]; - ++index; - } } - } } - ++lev; - index = 0; - } - PROF_END; - return mret; + PROF_END; + return exp_ret; } -gravity_boundary_type grid::get_gravity_boundary(const geo::direction &dir, - bool is_local) { - PROF_BEGIN; - // std::array lb, ub; - gravity_boundary_type data; - data.is_local = is_local; - auto &M = *M_ptr; - auto &mon = *mon_ptr; - if (!is_local) { - data.allocate(); - integer iter = 0; - const std::vector &list = - ilist_n_bnd[dir.flip()]; +multipole_pass_type grid::compute_multipoles( + gsolve_type type, const multipole_pass_type* child_poles) { + // if( int(*Muse_counter) > 0) + // printf( "%i\n", int(*Muse_counter)); + PROF_BEGIN; + integer lev = 0; + const real dx3 = dx * dx * dx; + M_ptr = std::make_shared>(); + mon_ptr = std::make_shared>(); + if (com_ptr[1] == nullptr) { + com_ptr[1] = std::make_shared>(G_N3 / 8); + } + if (type == RHO) { + com_ptr[0] = std::make_shared>(G_N3); + } + auto& M = *M_ptr; + auto& mon = *mon_ptr; if (is_leaf) { - data.m->reserve(list.size()); - for (auto i : list) { - const integer iii = i.second; - data.m->push_back(mon[iii]); - } + M.resize(0); + mon.resize(G_N3); } else { - data.M->reserve(list.size()); - data.x->reserve(list.size()); - for (auto i : list) { - const integer iii = i.second; - const integer top = M[iii].size(); - data.M->push_back(M[iii]); - data.x->push_back((*(com_ptr[0]))[iii]); - } + M.resize(G_N3); + mon.resize(0); } - } else { - if (is_leaf) { - data.m = mon_ptr; + if (type == RHO) { + const integer iii0 = hindex(H_BW, H_BW, H_BW); + const std::array x0 = {X[XDIM][iii0], X[YDIM][iii0], X[ZDIM][iii0]}; + + // std::array i; + // for (i[0] = 0; i[0] != G_NX; ++i[0]) { + // for (i[1] = 0; i[1] != G_NX; ++i[1]) { + // for (i[2] = 0; i[2] != G_NX; ++i[2]) { + // const integer iii = gindex(i[0], i[1], i[2]); + // for (integer dim = 0; dim != NDIM; ++dim) { + // (*(com_ptr[0]))[iii][dim] = x0[dim] + (i[dim]) * + // dx; + // } + // } + // } + // } + for (integer i = 0; i != G_NX; ++i) { + for (integer j = 0; j != G_NX; ++j) { + for (integer k = 0; k != G_NX; ++k) { + auto& com0iii = (*(com_ptr[0]))[gindex(i, j, k)]; + // for (integer dim = 0; dim != NDIM; ++dim) + com0iii[0] = x0[0] + i * dx; + com0iii[1] = x0[1] + j * dx; + com0iii[2] = x0[2] + k * dx; + } + } + } + } + + multipole_pass_type mret; + if (!is_root) { + mret.first.resize(INX * INX * INX / NCHILD); + mret.second.resize(INX * INX * INX / NCHILD); + } + taylor<4, real> MM; + integer index = 0; + for (integer inx = INX; (inx >= INX / 2); inx >>= 1) { + const integer nxp = inx; + const integer nxc = (2 * inx); + + auto child_index = [=](integer ip, integer jp, integer kp, integer ci) -> integer { + const integer ic = (2 * ip) + ((ci >> 0) & 1); + const integer jc = (2 * jp) + ((ci >> 1) & 1); + const integer kc = (2 * kp) + ((ci >> 2) & 1); + return nxc * nxc * ic + nxc * jc + kc; + }; + + for (integer ip = 0; ip != nxp; ++ip) { + for (integer jp = 0; jp != nxp; ++jp) { + for (integer kp = 0; kp != nxp; ++kp) { + const integer iiip = nxp * nxp * ip + nxp * jp + kp; + if (lev != 0) { + if (type == RHO) { + simd_vector mc; + std::array X; + for (integer ci = 0; ci != NCHILD; ++ci) { + const integer iiic = child_index(ip, jp, kp, ci); + if (is_leaf) { + mc[ci] = mon[iiic]; + } else { + mc[ci] = M[iiic](); + } + for (integer d = 0; d < NDIM; ++d) { + X[d][ci] = (*(com_ptr[0]))[iiic][d]; + } + } + real mtot = mc.sum(); + for (integer d = 0; d < NDIM; ++d) { + (*(com_ptr[1]))[iiip][d] = (X[d] * mc).sum() / mtot; + } + } + taylor<4, simd_vector> mc, mp; + std::array x, y, dx; + for (integer ci = 0; ci != NCHILD; ++ci) { + const integer iiic = child_index(ip, jp, kp, ci); + const space_vector& X = (*(com_ptr[lev - 1]))[iiic]; + if (is_leaf) { + mc()[ci] = mon[iiic]; + for (integer j = 1; j != 20; ++j) { + mc.ptr()[j][ci] = 0.0; + } + } else { + for (integer j = 0; j != 20; ++j) { + mc.ptr()[j][ci] = M[iiic].ptr()[j]; + } + } + for (integer d = 0; d < NDIM; ++d) { + x[d][ci] = X[d]; + } + } + const space_vector& Y = (*(com_ptr[lev]))[iiip]; + for (integer d = 0; d < NDIM; ++d) { + dx[d] = x[d] - simd_vector(Y[d]); + } + mp = mc >> dx; + for (integer j = 0; j != 20; ++j) { + MM[j] = mp[j].sum(); + } + } else { + if (child_poles == nullptr) { + const integer iiih = hindex(ip + H_BW, jp + H_BW, kp + H_BW); + const integer iii0 = h0index(ip, jp, kp); + if (type == RHO) { + mon[iiip] = U[rho_i][iiih] * dx3; + } else { + mon[iiip] = dUdt[rho_i][iii0] * dx3; + } + } else { + M[iiip] = child_poles->first[index]; + if (type == RHO) { + (*(com_ptr)[lev])[iiip] = child_poles->second[index]; + } + ++index; + } + } + if (!is_root && (lev == 1)) { + mret.first[index] = MM; + mret.second[index] = (*(com_ptr[lev]))[iiip]; + ++index; + } + } + } + } + ++lev; + index = 0; + } + PROF_END; + return mret; +} + +gravity_boundary_type grid::get_gravity_boundary(const geo::direction& dir, bool is_local) { + PROF_BEGIN; + // std::array lb, ub; + gravity_boundary_type data; + data.is_local = is_local; + auto& M = *M_ptr; + auto& mon = *mon_ptr; + if (!is_local) { + data.allocate(); + integer iter = 0; + const std::vector& list = ilist_n_bnd[dir.flip()]; + if (is_leaf) { + data.m->reserve(list.size()); + for (auto i : list) { + const integer iii = i.second; + data.m->push_back(mon[iii]); + } + } else { + data.M->reserve(list.size()); + data.x->reserve(list.size()); + for (auto i : list) { + const integer iii = i.second; + const integer top = M[iii].size(); + data.M->push_back(M[iii]); + data.x->push_back((*(com_ptr[0]))[iii]); + } + } } else { - data.M = M_ptr; - data.x = com_ptr[0]; + if (is_leaf) { + data.m = mon_ptr; + } else { + data.M = M_ptr; + data.x = com_ptr[0]; + } } - } - PROF_END; - return data; + PROF_END; + return data; } diff --git a/src/node_server_actions_1.cpp b/src/node_server_actions_1.cpp index 52c023eca..bb56c01f1 100644 --- a/src/node_server_actions_1.cpp +++ b/src/node_server_actions_1.cpp @@ -5,10 +5,10 @@ * Author: dmarce1 */ -#include "node_server.hpp" #include "diagnostics.hpp" #include "future.hpp" #include "node_client.hpp" +#include "node_server.hpp" #include "options.hpp" #include "profiler.hpp" #include "taylor.hpp" @@ -27,429 +27,411 @@ hpx::mutex rec_size_mutex; integer rec_size = -1; void set_locality_data(real omega, space_vector pivot, integer record_size) { - grid::set_omega(omega); - grid::set_pivot(pivot); - rec_size = record_size; + grid::set_omega(omega); + grid::set_pivot(pivot); + rec_size = record_size; } -hpx::id_type make_new_node(const node_location &loc, - const hpx::id_type &_parent) { - return hpx::new_(hpx::find_here(), loc, _parent, ZERO, ZERO) - .get(); +hpx::id_type make_new_node(const node_location& loc, const hpx::id_type& _parent) { + return hpx::new_(hpx::find_here(), loc, _parent, ZERO, ZERO).get(); } HPX_PLAIN_ACTION(make_new_node, make_new_node_action); HPX_PLAIN_ACTION(set_locality_data, set_locality_data_action); -hpx::future node_client::load(integer i, - const hpx::id_type &_me, - bool do_o, - std::string s) const { - return hpx::async(get_gid(), i, _me, do_o, - s); +hpx::future node_client::load( + integer i, const hpx::id_type& _me, bool do_o, std::string s) const { + return hpx::async(get_gid(), i, _me, do_o, s); } -grid::output_list_type node_server::load(integer cnt, const hpx::id_type &_me, - bool do_output, std::string filename) { +grid::output_list_type node_server::load( + integer cnt, const hpx::id_type& _me, bool do_output, std::string filename) { + if (rec_size == -1 && my_location.level() == 0) { + real omega = 0; + space_vector pivot; + + // run output on separate thread + hpx::threads::run_as_os_thread([&]() { + FILE* fp = fopen(filename.c_str(), "rb"); + if (fp == NULL) { + printf("Failed to open file\n"); + abort(); + } + fseek(fp, -sizeof(integer), SEEK_END); + std::size_t read_cnt = fread(&rec_size, sizeof(integer), 1, fp); + fseek(fp, -4 * sizeof(real) - sizeof(integer), SEEK_END); + read_cnt += fread(&omega, sizeof(real), 1, fp); + for (auto& d : geo::dimension::full_set()) { + real temp_pivot; + read_cnt += fread(&temp_pivot, sizeof(real), 1, fp); + pivot[d] = temp_pivot; + } + fclose(fp); + }).get(); + + auto localities = hpx::find_all_localities(); + std::vector> futs; + futs.reserve(localities.size()); + for (auto& locality : localities) { + futs.push_back(hpx::async(locality, omega, pivot, rec_size)); + } + wait_all_and_propagate_exceptions(futs); + } - if (rec_size == -1 && my_location.level() == 0) { + static auto localities = hpx::find_all_localities(); + me = _me; - real omega = 0; - space_vector pivot; + char flag = '0'; + integer total_nodes = 0; + std::vector counts(NCHILD); // run output on separate thread hpx::threads::run_as_os_thread([&]() { - FILE *fp = fopen(filename.c_str(), "rb"); - if (fp == NULL) { - printf("Failed to open file\n"); - abort(); - } - fseek(fp, -sizeof(integer), SEEK_END); - std::size_t read_cnt = fread(&rec_size, sizeof(integer), 1, fp); - fseek(fp, -4 * sizeof(real) - sizeof(integer), SEEK_END); - read_cnt += fread(&omega, sizeof(real), 1, fp); - for (auto &d : geo::dimension::full_set()) { - real temp_pivot; - read_cnt += fread(&temp_pivot, sizeof(real), 1, fp); - pivot[d] = temp_pivot; - } - fclose(fp); + FILE* fp = fopen(filename.c_str(), "rb"); + fseek(fp, cnt * rec_size, SEEK_SET); + std::size_t read_cnt = fread(&flag, sizeof(char), 1, fp); + for (auto& this_cnt : counts) { + read_cnt += fread(&this_cnt, sizeof(integer), 1, fp); + } + load_me(fp); + fseek(fp, 0L, SEEK_END); + total_nodes = ftell(fp) / rec_size; + fclose(fp); }).get(); - auto localities = hpx::find_all_localities(); - std::vector> futs; - futs.reserve(localities.size()); - for (auto &locality : localities) { - futs.push_back(hpx::async(locality, omega, - pivot, rec_size)); + std::vector> futs; + // printf( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1\n" ); + if (flag == '1') { + is_refined = true; + children.resize(NCHILD); + constexpr auto full_set = geo::octant::full_set(); + futs.reserve(full_set.size()); + for (auto& ci : full_set) { + integer loc_id = ((cnt * localities.size()) / (total_nodes + 1)); + children[ci] = hpx::async( + localities[loc_id], my_location.get_child(ci), me.get_gid()); + futs.push_back( + children[ci].load(counts[ci], children[ci].get_gid(), do_output, filename)); + } + } else if (flag == '0') { + is_refined = false; + children.clear(); + } else { + printf("Corrupt checkpoint file\n"); + // sleep(10); + hpx::this_thread::sleep_for(std::chrono::seconds(10)); + abort(); } - wait_all_and_propagate_exceptions(futs); - } - - static auto localities = hpx::find_all_localities(); - me = _me; - - char flag = '0'; - integer total_nodes = 0; - std::vector counts(NCHILD); - - // run output on separate thread - hpx::threads::run_as_os_thread([&]() { - FILE *fp = fopen(filename.c_str(), "rb"); - fseek(fp, cnt * rec_size, SEEK_SET); - std::size_t read_cnt = fread(&flag, sizeof(char), 1, fp); - for (auto &this_cnt : counts) { - read_cnt += fread(&this_cnt, sizeof(integer), 1, fp); + grid::output_list_type my_list; + for (auto&& fut : futs) { + if (do_output) { + grid::merge_output_lists(my_list, fut.get()); + } else { + fut.get(); + } } - load_me(fp); - fseek(fp, 0L, SEEK_END); - total_nodes = ftell(fp) / rec_size; - fclose(fp); - }).get(); - - std::vector> futs; - // printf( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1\n" ); - if (flag == '1') { - is_refined = true; - children.resize(NCHILD); - constexpr auto full_set = geo::octant::full_set(); - futs.reserve(full_set.size()); - for (auto &ci : full_set) { - integer loc_id = ((cnt * localities.size()) / (total_nodes + 1)); - children[ci] = hpx::async( - localities[loc_id], my_location.get_child(ci), me.get_gid()); - futs.push_back(children[ci].load(counts[ci], children[ci].get_gid(), - do_output, filename)); + // printf( "***************************************\n" ); + if (!is_refined && do_output) { + my_list = grid_ptr->get_output_list(false); + // grid_ptr = nullptr; } - } else if (flag == '0') { - is_refined = false; - children.clear(); - } else { - printf("Corrupt checkpoint file\n"); - // sleep(10); - hpx::this_thread::sleep_for(std::chrono::seconds(10)); - abort(); - } - grid::output_list_type my_list; - for (auto &&fut : futs) { - if (do_output) { - grid::merge_output_lists(my_list, fut.get()); - } else { - fut.get(); - } - } - // printf( "***************************************\n" ); - if (!is_refined && do_output) { - my_list = grid_ptr->get_output_list(false); - // grid_ptr = nullptr; - } - // hpx::async(localities[0]).get(); - if (my_location.level() == 0) { - if (do_output) { - if (hydro_on && opts.problem == DWD) { - diagnostics(); - } - grid::output(my_list, "data.silo", current_time, - get_rotation_count() / opts.output_dt, false); + // hpx::async(localities[0]).get(); + if (my_location.level() == 0) { + if (do_output) { + if (hydro_on && opts.problem == DWD) { + diagnostics(); + } + grid::output( + my_list, "data.silo", current_time, get_rotation_count() / opts.output_dt, false); + } + printf("Loaded checkpoint file\n"); + my_list = decltype(my_list)(); } - printf("Loaded checkpoint file\n"); - my_list = decltype(my_list)(); - } - return my_list; + return my_list; } typedef node_server::output_action output_action_type; HPX_REGISTER_ACTION(output_action_type); -hpx::future -node_client::output(std::string fname, int cycle, bool flag) const { - return hpx::async(get_gid(), fname, - cycle, flag); +hpx::future node_client::output( + std::string fname, int cycle, bool flag) const { + return hpx::async(get_gid(), fname, cycle, flag); } -grid::output_list_type node_server::output(std::string fname, int cycle, - bool analytic) const { - if (is_refined) { - std::vector> futs; - futs.reserve(children.size()); - for (auto i = children.begin(); i != children.end(); ++i) { - futs.push_back(i->output(fname, cycle, analytic)); - } - auto i = futs.begin(); - grid::output_list_type my_list = i->get(); - for (++i; i != futs.end(); ++i) { - grid::merge_output_lists(my_list, i->get()); - } +grid::output_list_type node_server::output(std::string fname, int cycle, bool analytic) const { + if (is_refined) { + std::vector> futs; + futs.reserve(children.size()); + for (auto i = children.begin(); i != children.end(); ++i) { + futs.push_back(i->output(fname, cycle, analytic)); + } + auto i = futs.begin(); + grid::output_list_type my_list = i->get(); + for (++i; i != futs.end(); ++i) { + grid::merge_output_lists(my_list, i->get()); + } - if (my_location.level() == 0) { - // hpx::apply([](const grid::output_list_type& - // olists, const char* filename) { - printf("Outputing...\n"); - grid::output(my_list, fname, get_time(), cycle, analytic); - printf("Done...\n"); - // }, std::move(my_list), fname.c_str()); - } - return my_list; + if (my_location.level() == 0) { + // hpx::apply([](const grid::output_list_type& + // olists, const char* filename) { + printf("Outputing...\n"); + grid::output(my_list, fname, get_time(), cycle, analytic); + printf("Done...\n"); + // }, std::move(my_list), fname.c_str()); + } + return my_list; - } else { - return grid_ptr->get_output_list(analytic); - } + } else { + return grid_ptr->get_output_list(analytic); + } } typedef node_server::regrid_gather_action regrid_gather_action_type; HPX_REGISTER_ACTION(regrid_gather_action_type); hpx::future node_client::regrid_gather(bool rb) const { - return hpx::async(get_gid(), rb); + return hpx::async(get_gid(), rb); } integer node_server::regrid_gather(bool rebalance_only) { - integer count = integer(1); - - if (is_refined) { - if (!rebalance_only) { - /* Turning refinement off */ - if (refinement_flag == 0) { - children.clear(); - is_refined = false; - grid_ptr->set_leaf(true); - } - } + integer count = integer(1); if (is_refined) { - std::vector> futs; - futs.reserve(children.size()); - for (auto &child : children) { - futs.push_back(child.regrid_gather(rebalance_only)); - } - auto futi = futs.begin(); - for (auto &ci : geo::octant::full_set()) { - auto child_cnt = futi->get(); - ++futi; - child_descendant_count[ci] = child_cnt; - count += child_cnt; - } - } else { - for (auto &ci : geo::octant::full_set()) { - child_descendant_count[ci] = 0; - } - } - } else if (!rebalance_only) { - // if (grid_ptr->refine_me(my_location.level())) { - if (refinement_flag != 0) { - refinement_flag = 0; - count += NCHILD; - - children.resize(NCHILD); - std::vector clocs(NCHILD); - - /* Turning refinement on*/ - is_refined = true; - grid_ptr->set_leaf(false); - - for (auto &ci : geo::octant::full_set()) { - child_descendant_count[ci] = 1; - children[ci] = - hpx::new_(hpx::find_here(), my_location.get_child(ci), - me, current_time, rotational_time); - std::array lb = {2 * H_BW, 2 * H_BW, 2 * H_BW}; - std::array ub; - lb[XDIM] += (1 & (ci >> 0)) * (INX); - lb[YDIM] += (1 & (ci >> 1)) * (INX); - lb[ZDIM] += (1 & (ci >> 2)) * (INX); - for (integer d = 0; d != NDIM; ++d) { - ub[d] = lb[d] + (INX); + if (!rebalance_only) { + /* Turning refinement off */ + if (refinement_flag == 0) { + children.clear(); + is_refined = false; + grid_ptr->set_leaf(true); + } } - std::vector outflows(NF, ZERO); - if (ci == 0) { - outflows = grid_ptr->get_outflows(); + + if (is_refined) { + std::vector> futs; + futs.reserve(children.size()); + for (auto& child : children) { + futs.push_back(child.regrid_gather(rebalance_only)); + } + auto futi = futs.begin(); + for (auto& ci : geo::octant::full_set()) { + auto child_cnt = futi->get(); + ++futi; + child_descendant_count[ci] = child_cnt; + count += child_cnt; + } + } else { + for (auto& ci : geo::octant::full_set()) { + child_descendant_count[ci] = 0; + } } - if (current_time > ZERO) { - children[ci] - .set_grid(grid_ptr->get_prolong(lb, ub), std::move(outflows)) - .get(); + } else if (!rebalance_only) { + // if (grid_ptr->refine_me(my_location.level())) { + if (refinement_flag != 0) { + refinement_flag = 0; + count += NCHILD; + + children.resize(NCHILD); + std::vector clocs(NCHILD); + + /* Turning refinement on*/ + is_refined = true; + grid_ptr->set_leaf(false); + + for (auto& ci : geo::octant::full_set()) { + child_descendant_count[ci] = 1; + children[ci] = hpx::new_( + hpx::find_here(), my_location.get_child(ci), me, current_time, rotational_time); + std::array lb = {2 * H_BW, 2 * H_BW, 2 * H_BW}; + std::array ub; + lb[XDIM] += (1 & (ci >> 0)) * (INX); + lb[YDIM] += (1 & (ci >> 1)) * (INX); + lb[ZDIM] += (1 & (ci >> 2)) * (INX); + for (integer d = 0; d != NDIM; ++d) { + ub[d] = lb[d] + (INX); + } + std::vector outflows(NF, ZERO); + if (ci == 0) { + outflows = grid_ptr->get_outflows(); + } + if (current_time > ZERO) { + children[ci].set_grid(grid_ptr->get_prolong(lb, ub), std::move(outflows)).get(); + } + } } - } } - } - return count; + return count; } typedef node_server::regrid_scatter_action regrid_scatter_action_type; HPX_REGISTER_ACTION(regrid_scatter_action_type); hpx::future node_client::regrid_scatter(integer a, integer b) const { - return hpx::async(get_gid(), a, - b); + return hpx::async(get_gid(), a, b); } void node_server::regrid_scatter(integer a_, integer total) { - refinement_flag = 0; - std::vector> futs; - if (is_refined) { - integer a = a_; - std::vector localities; - { - timings::scope ts(timings_, timings::time_find_localities); - localities = hpx::find_all_localities(); - } - ++a; - for (auto &ci : geo::octant::full_set()) { - const integer loc_index = a * localities.size() / total; - const auto child_loc = localities[loc_index]; - a += child_descendant_count[ci]; - const hpx::id_type id = children[ci].get_gid(); - integer current_child_id = - hpx::naming::get_locality_id_from_gid(id.get_gid()); - auto current_child_loc = localities[current_child_id]; - if (child_loc != current_child_loc) { - children[ci] = children[ci].copy_to_locality(child_loc); - } - } - a = a_ + 1; - constexpr auto full_set = geo::octant::full_set(); - futs.reserve(full_set.size()); - for (auto &ci : full_set) { - futs.push_back(children[ci].regrid_scatter(a, total)); - a += child_descendant_count[ci]; + refinement_flag = 0; + std::vector> futs; + if (is_refined) { + integer a = a_; + std::vector localities; + { + timings::scope ts(timings_, timings::time_find_localities); + localities = hpx::find_all_localities(); + } + ++a; + for (auto& ci : geo::octant::full_set()) { + const integer loc_index = a * localities.size() / total; + const auto child_loc = localities[loc_index]; + a += child_descendant_count[ci]; + const hpx::id_type id = children[ci].get_gid(); + integer current_child_id = hpx::naming::get_locality_id_from_gid(id.get_gid()); + auto current_child_loc = localities[current_child_id]; + if (child_loc != current_child_loc) { + children[ci] = children[ci].copy_to_locality(child_loc); + } + } + a = a_ + 1; + constexpr auto full_set = geo::octant::full_set(); + futs.reserve(full_set.size()); + for (auto& ci : full_set) { + futs.push_back(children[ci].regrid_scatter(a, total)); + a += child_descendant_count[ci]; + } } - } - clear_family(); - wait_all_and_propagate_exceptions(futs); + clear_family(); + wait_all_and_propagate_exceptions(futs); } typedef node_server::regrid_action regrid_action_type; HPX_REGISTER_ACTION(regrid_action_type); -hpx::future node_client::regrid(const hpx::id_type &g, bool rb) const { - return hpx::async(get_gid(), g, rb); +hpx::future node_client::regrid(const hpx::id_type& g, bool rb) const { + return hpx::async(get_gid(), g, rb); } -int node_server::regrid(const hpx::id_type &root_gid, bool rb) { - timings::scope ts(timings_, timings::time_regrid); - assert(grid_ptr != nullptr); - printf("-----------------------------------------------\n"); - if (!rb) { - printf("checking for refinement\n"); - check_for_refinement(); - } - printf("regridding\n"); - integer a = regrid_gather(rb); - printf("rebalancing %i nodes\n", int(a)); - regrid_scatter(0, a); - assert(grid_ptr != nullptr); - std::vector null_neighbors(geo::direction::count()); - printf("forming tree connections\n"); - form_tree(root_gid, hpx::invalid_id, null_neighbors); - if (current_time > ZERO) { - printf("solving gravity\n"); - solve_gravity(true); - } - printf("regrid done\n-----------------------------------------------\n"); - return a; +int node_server::regrid(const hpx::id_type& root_gid, bool rb) { + timings::scope ts(timings_, timings::time_regrid); + assert(grid_ptr != nullptr); + printf("-----------------------------------------------\n"); + if (!rb) { + printf("checking for refinement\n"); + check_for_refinement(); + } + printf("regridding\n"); + integer a = regrid_gather(rb); + printf("rebalancing %i nodes\n", int(a)); + regrid_scatter(0, a); + assert(grid_ptr != nullptr); + std::vector null_neighbors(geo::direction::count()); + printf("forming tree connections\n"); + form_tree(root_gid, hpx::invalid_id, null_neighbors); + if (current_time > ZERO) { + printf("solving gravity\n"); + solve_gravity(true); + } + printf("regrid done\n-----------------------------------------------\n"); + return a; } typedef node_server::save_action save_action_type; HPX_REGISTER_ACTION(save_action_type); integer node_client::save(integer i, std::string s) const { - return hpx::async(get_gid(), i, s).get(); + return hpx::async(get_gid(), i, s).get(); } integer node_server::save(integer cnt, std::string filename) const { - char flag = is_refined ? '1' : '0'; - integer record_size = 0; - - // run output on separate thread - hpx::threads::run_as_os_thread([&]() { - FILE *fp = fopen(filename.c_str(), (cnt == 0) ? "wb" : "ab"); - fwrite(&flag, sizeof(flag), 1, fp); - ++cnt; - // printf(" \rSaved %li sub-grids\r", - //(long int) cnt); - integer value = cnt; - std::array values; - for (auto &ci : geo::octant::full_set()) { - if (ci != 0 && is_refined) { - value += child_descendant_count[ci - 1]; - } - values[ci] = value; - fwrite(&value, sizeof(value), 1, fp); - } - record_size = save_me(fp) + sizeof(flag) + NCHILD * sizeof(integer); - fclose(fp); - }).get(); + char flag = is_refined ? '1' : '0'; + integer record_size = 0; - if (is_refined) { - for (auto &ci : geo::octant::full_set()) { - cnt = children[ci].save(cnt, filename); - } - } - - if (my_location.level() == 0) { // run output on separate thread hpx::threads::run_as_os_thread([&]() { - FILE *fp = fopen(filename.c_str(), "ab"); - real omega = grid::get_omega(); - space_vector pivot = grid::get_pivot(); - fwrite(&omega, sizeof(real), 1, fp); - for (auto &d : geo::dimension::full_set()) { - real temp_pivot; - fwrite(&temp_pivot, sizeof(real), 1, fp); - pivot[d] = temp_pivot; - } - fwrite(&record_size, sizeof(integer), 1, fp); - fclose(fp); + FILE* fp = fopen(filename.c_str(), (cnt == 0) ? "wb" : "ab"); + fwrite(&flag, sizeof(flag), 1, fp); + ++cnt; + // printf(" \rSaved %li sub-grids\r", + //(long int) cnt); + integer value = cnt; + std::array values; + for (auto& ci : geo::octant::full_set()) { + if (ci != 0 && is_refined) { + value += child_descendant_count[ci - 1]; + } + values[ci] = value; + fwrite(&value, sizeof(value), 1, fp); + } + record_size = save_me(fp) + sizeof(flag) + NCHILD * sizeof(integer); + fclose(fp); }).get(); - printf("Saved %li grids to checkpoint file\n", (long int)cnt); - } + if (is_refined) { + for (auto& ci : geo::octant::full_set()) { + cnt = children[ci].save(cnt, filename); + } + } - return cnt; + if (my_location.level() == 0) { + // run output on separate thread + hpx::threads::run_as_os_thread([&]() { + FILE* fp = fopen(filename.c_str(), "ab"); + real omega = grid::get_omega(); + space_vector pivot = grid::get_pivot(); + fwrite(&omega, sizeof(real), 1, fp); + for (auto& d : geo::dimension::full_set()) { + real temp_pivot; + fwrite(&temp_pivot, sizeof(real), 1, fp); + pivot[d] = temp_pivot; + } + fwrite(&record_size, sizeof(integer), 1, fp); + fclose(fp); + }).get(); + + printf("Saved %li grids to checkpoint file\n", (long int) cnt); + } + + return cnt; } typedef node_server::set_aunt_action set_aunt_action_type; HPX_REGISTER_ACTION(set_aunt_action_type); -hpx::future node_client::set_aunt(const hpx::id_type &aunt, - const geo::face &f) const { - return hpx::async(get_gid(), aunt, f); +hpx::future node_client::set_aunt(const hpx::id_type& aunt, const geo::face& f) const { + return hpx::async(get_gid(), aunt, f); } -void node_server::set_aunt(const hpx::id_type &aunt, const geo::face &face) { - aunts[face] = aunt; +void node_server::set_aunt(const hpx::id_type& aunt, const geo::face& face) { + aunts[face] = aunt; } typedef node_server::set_grid_action set_grid_action_type; HPX_REGISTER_ACTION(set_grid_action_type); -hpx::future node_client::set_grid(std::vector &&g, - std::vector &&o) const { - return hpx::async(get_gid(), g, o); +hpx::future node_client::set_grid(std::vector&& g, std::vector&& o) const { + return hpx::async(get_gid(), g, o); } -void node_server::set_grid(const std::vector &data, - std::vector &&outflows) { - grid_ptr->set_prolong(data, std::move(outflows)); +void node_server::set_grid(const std::vector& data, std::vector&& outflows) { + grid_ptr->set_prolong(data, std::move(outflows)); } typedef node_server::solve_gravity_action solve_gravity_action_type; HPX_REGISTER_ACTION(solve_gravity_action_type); hpx::future node_client::solve_gravity(bool ene) const { - return hpx::async(get_gid(), ene); + return hpx::async(get_gid(), ene); } void node_server::solve_gravity(bool ene) { - if (!gravity_on) { - return; - } - std::vector> child_futs; - child_futs.reserve(children.size()); - for (auto &child : children) { - child_futs.push_back(child.solve_gravity(ene)); - } - compute_fmm(RHO, ene); - wait_all_and_propagate_exceptions(child_futs); + if (!gravity_on) { + return; + } + std::vector> child_futs; + child_futs.reserve(children.size()); + for (auto& child : children) { + child_futs.push_back(child.solve_gravity(ene)); + } + compute_fmm(RHO, ene); + wait_all_and_propagate_exceptions(child_futs); } diff --git a/src/taylor.hpp b/src/taylor.hpp index f08550b05..847395ef8 100644 --- a/src/taylor.hpp +++ b/src/taylor.hpp @@ -9,7 +9,6 @@ #define TAYLOR_HPP_ #include "defs.hpp" -#include "simd.hpp" #include "profiler.hpp" #include "simd.hpp" @@ -23,192 +22,181 @@ #include #endif -//class simd_vector; +// class simd_vector; #define MAX_ORDER 5 -struct taylor_consts { - static const real delta[3][3]; - static integer map2[3][3]; - static integer map3[3][3][3]; - static integer map4[3][3][3][3]; +struct taylor_consts +{ + static const real delta[3][3]; + static integer map2[3][3]; + static integer map3[3][3][3]; + static integer map4[3][3][3][3]; }; -constexpr integer taylor_sizes[MAX_ORDER] = {1, 4, 10, 20, 35}; // +constexpr integer taylor_sizes[MAX_ORDER] = {1, 4, 10, 20, 35}; // -template -class taylor { +template +class taylor +{ private: - static constexpr integer my_size = taylor_sizes[N - 1]; - static taylor_consts tc; - std::array data; + static constexpr integer my_size = taylor_sizes[N - 1]; + static taylor_consts tc; + std::array data; + public: - T& operator[](integer i) { - return data[i]; - } - const T& operator[](integer i) const { - return data[i]; - } - static constexpr decltype(my_size) size() { - return my_size; - } - taylor() = default; - ~taylor() = default; - taylor(const taylor&) = default; - taylor(taylor && other) { - data = std::move(other.data); - } - taylor& operator=(const taylor&) = default; - taylor& operator=(taylor && other) { - data = std::move(other.data); - return *this; - } - - taylor& operator=(T d) { + T& operator[](integer i) { + return data[i]; + } + const T& operator[](integer i) const { + return data[i]; + } + static constexpr decltype(my_size) size() { + return my_size; + } + taylor() = default; + ~taylor() = default; + taylor(const taylor&) = default; + taylor(taylor&& other) { + data = std::move(other.data); + } + taylor& operator=(const taylor&) = default; + taylor& operator=(taylor&& other) { + data = std::move(other.data); + return *this; + } + + taylor& operator=(T d) { #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - data[i] = d; - } + for (integer i = 0; i != my_size; ++i) { + data[i] = d; + } #else - hpx::parallel::fill( - hpx::parallel::execution::dataseq, - data.begin(), data.end(), d); + hpx::parallel::fill(hpx::parallel::execution::dataseq, data.begin(), data.end(), d); #endif - return *this; - } + return *this; + } - taylor& operator*=(T d) { + taylor& operator*=(T d) { #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - data[i] *= d; - } + for (integer i = 0; i != my_size; ++i) { + data[i] *= d; + } #else - hpx::parallel::transform( - hpx::parallel::execution::dataseq, - data.begin(), data.end(), data.begin(), - [d](auto const& val) - { - return val * d; - }); + hpx::parallel::transform(hpx::parallel::execution::dataseq, data.begin(), data.end(), + data.begin(), [d](auto const& val) { return val * d; }); #endif - return *this; - } + return *this; + } - taylor& operator/=(T d) { + taylor& operator/=(T d) { #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - data[i] /= d; - } + for (integer i = 0; i != my_size; ++i) { + data[i] /= d; + } #else - hpx::parallel::transform( - hpx::parallel::execution::dataseq, - data.begin(), data.end(), data.begin(), - [d](auto const& val) - { - return val / d; - }); + hpx::parallel::transform(hpx::parallel::execution::dataseq, data.begin(), data.end(), + data.begin(), [d](auto const& val) { return val / d; }); #endif - return *this; - } + return *this; + } - taylor& operator+=(const taylor& other) { + taylor& operator+=(const taylor& other) { // #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - data[i] += other.data[i]; - } -// #else -// hpx::parallel::transform( -// hpx::parallel::execution::dataseq, -// data.begin(), data.end(), -// other.data.begin(), other.data.end(), data.begin(), -// [](auto const& t1, auto const& t2) -// { -// return t1 + t2; -// }); -// #endif - return *this; - } - - taylor& operator-=(const taylor& other) { + for (integer i = 0; i != my_size; ++i) { + data[i] += other.data[i]; + } + // #else + // hpx::parallel::transform( + // hpx::parallel::execution::dataseq, + // data.begin(), data.end(), + // other.data.begin(), other.data.end(), data.begin(), + // [](auto const& t1, auto const& t2) + // { + // return t1 + t2; + // }); + // #endif + return *this; + } + + taylor& operator-=(const taylor& other) { // #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - data[i] -= other.data[i]; - } -// #else -// hpx::parallel::transform( -// hpx::parallel::execution::dataseq, -// data.begin(), data.end(), -// other.data.begin(), other.data.end(), data.begin(), -// [](auto const& t1, auto const& t2) -// { -// return t1 - t2; -// }); -// #endif - return *this; - } - - taylor operator+(const taylor& other) const { - taylor r = *this; - r += other; - return r; - } - - taylor operator-(const taylor& other) const { - taylor r = *this; - r -= other; - return r; - } - - taylor operator*(const T& d) const { - taylor r = *this; - r *= d; - return r; - } - - taylor operator/(const T& d) const { - taylor r = *this; - r /= d; - return r; - } - - taylor operator+() const { - return *this; - } - - taylor operator-() const { - taylor r = *this; + for (integer i = 0; i != my_size; ++i) { + data[i] -= other.data[i]; + } + // #else + // hpx::parallel::transform( + // hpx::parallel::execution::dataseq, + // data.begin(), data.end(), + // other.data.begin(), other.data.end(), data.begin(), + // [](auto const& t1, auto const& t2) + // { + // return t1 - t2; + // }); + // #endif + return *this; + } + + taylor operator+(const taylor& other) const { + taylor r = *this; + r += other; + return r; + } + + taylor operator-(const taylor& other) const { + taylor r = *this; + r -= other; + return r; + } + + taylor operator*(const T& d) const { + taylor r = *this; + r *= d; + return r; + } + + taylor operator/(const T& d) const { + taylor r = *this; + r /= d; + return r; + } + + taylor operator+() const { + return *this; + } + + taylor& operator+=(v4sd& other) { +#pragma GCC ivdep + for (integer i = 0; i != 4; ++i) { + data[i] += other[i]; + } + return *this; + } + + taylor operator-() const { + taylor r = *this; #if !defined(HPX_HAVE_DATAPAR) #pragma GCC ivdep - for (integer i = 0; i != my_size; ++i) { - r.data[i] = -r.data[i]; - } + for (integer i = 0; i != my_size; ++i) { + r.data[i] = -r.data[i]; + } #else - hpx::parallel::transform( - hpx::parallel::execution::dataseq, - r.data.begin(), r.data.end(), r.data.begin(), - [](T const& val) - { - return -val; - }); + hpx::parallel::transform(hpx::parallel::execution::dataseq, r.data.begin(), r.data.end(), + r.data.begin(), [](T const& val) { return -val; }); #endif - return r; - } + return r; + } #if defined(HPX_HAVE_DATAPAR) - friend bool operator==(taylor const& lhs, taylor const& rhs) - { - return hpx::parallel::equal( - hpx::parallel::execution::dataseq, - lhs.data.begin(), lhs.data.end(), rhs.data.begin(), - [](T const& t1, T const& t2) - { - return all_of(t1 == t2); - }); + friend bool operator==(taylor const& lhs, taylor const& rhs) { + return hpx::parallel::equal(hpx::parallel::execution::dataseq, lhs.data.begin(), + lhs.data.end(), rhs.data.begin(), + [](T const& t1, T const& t2) { return all_of(t1 == t2); }); } #endif @@ -232,265 +220,264 @@ class taylor { return tc.map4[i][j][k][l]; } + T const& operator()() const { + return data[index()]; + } + + T const& operator()(integer i) const { + return data[index(i)]; + } - T const& operator()() const { - return data[index()]; - } - - T const& operator()(integer i) const { - return data[index(i)]; - } - - T const& operator()(integer i, integer j) const { - return data[index(i, j)]; - } - - T const& operator()(integer i, integer j, integer k) const { - return data[index(i, j, k)]; - } - - T const& operator()(integer i, integer j, integer k, integer l) const { - return data[index(i, j, k, l)]; - } - - T& operator()() { - return data[index()]; - } - - T& operator()(integer i) { - return data[index(i)]; - } - - T& operator()(integer i, integer j) { - return data[index(i, j)]; - } - - T& operator()(integer i, integer j, integer k) { - return data[index(i, j, k)]; - } - - T& operator()(integer i, integer j, integer k, integer l) { - return data[index(i, j, k, l)]; - } - - taylor& operator>>=(const std::array& X) { - //PROF_BEGIN; - const taylor& A = *this; - taylor B = A; - - if (N > 1) { - for (integer a = 0; a < NDIM; ++a) { - B(a) += A() * X[a]; - } - if (N > 2) { - for (integer a = 0; a < NDIM; ++a) { - for (integer b = a; b < NDIM; ++b) { - B(a, b) += A(a) * X[b] + X[a] * A(b); - B(a, b) += A() * X[a] * X[b]; - } - } - if (N > 3) { - for (integer a = 0; a < NDIM; ++a) { - for (integer b = a; b < NDIM; ++b) { - for (integer c = b; c < NDIM; ++c) { - B(a, b, c) += A(a, b) * X[c] + A(b, c) * X[a] + A(c, a) * X[b]; - B(a, b, c) += A(a) * X[b] * X[c] + A(b) * X[c] * X[a] + A(c) * X[a] * X[b]; - B(a, b, c) += A() * X[a] * X[b] * X[c]; - } - } - } - } - } - } - *this = B; - //PROF_END; - return *this; - } - - taylor operator>>(const std::array& X) const { - taylor r = *this; - r >>= X; - return r; - } - - taylor& operator<<=(const std::array& X) { - //PROF_BEGIN; - const taylor& A = *this; - taylor B = A; - - if (N > 1) { - for (integer a = 0; a != NDIM; a++) { - B() += A(a) * X[a]; - } - if (N > 2) { - for (integer a = 0; a != NDIM; a++) { - for (integer b = 0; b != NDIM; b++) { - B() += A(a, b) * X[a] * X[b] * T(real(1) / real(2)); - } - } - for (integer a = 0; a != NDIM; a++) { - for (integer b = 0; b != NDIM; b++) { - B(a) += A(a, b) * X[b]; - } - } - if (N > 3) { - for (integer a = 0; a != NDIM; a++) { - for (integer b = 0; b != NDIM; b++) { - for (integer c = 0; c != NDIM; c++) { - B() += A(a, b, c) * X[a] * X[b] * X[c] * T(real(1) / real(6)); - } - } - } - for (integer a = 0; a != NDIM; a++) { - for (integer b = 0; b != NDIM; b++) { - for (integer c = 0; c != NDIM; c++) { - B(a) += A(a, b, c) * X[b] * X[c] * T(real(1) / real(2)); - } - } - } - for (integer a = 0; a != NDIM; a++) { - for (integer b = 0; b != NDIM; b++) { - for (integer c = a; c != NDIM; c++) { - B(a, c) += A(a, b, c) * X[b]; - } - } - } - } - } - } - *this = B; - //PROF_END; - return *this; - } - - taylor operator<<(const std::array& X) const { - taylor r = *this; - r <<= X; - return r; - } - - void set_basis(const std::array& X); - - T* ptr() { - return data.data(); - } - - const T* ptr() const { - return data.data(); - } - - template - void serialize(Arc& arc, const unsigned) { - arc & data; - } + T const& operator()(integer i, integer j) const { + return data[index(i, j)]; + } + T const& operator()(integer i, integer j, integer k) const { + return data[index(i, j, k)]; + } + + T const& operator()(integer i, integer j, integer k, integer l) const { + return data[index(i, j, k, l)]; + } + + T& operator()() { + return data[index()]; + } + + T& operator()(integer i) { + return data[index(i)]; + } + + T& operator()(integer i, integer j) { + return data[index(i, j)]; + } + + T& operator()(integer i, integer j, integer k) { + return data[index(i, j, k)]; + } + + T& operator()(integer i, integer j, integer k, integer l) { + return data[index(i, j, k, l)]; + } + + taylor& operator>>=(const std::array& X) { + // PROF_BEGIN; + const taylor& A = *this; + taylor B = A; + + if (N > 1) { + for (integer a = 0; a < NDIM; ++a) { + B(a) += A() * X[a]; + } + if (N > 2) { + for (integer a = 0; a < NDIM; ++a) { + for (integer b = a; b < NDIM; ++b) { + B(a, b) += A(a) * X[b] + X[a] * A(b); + B(a, b) += A() * X[a] * X[b]; + } + } + if (N > 3) { + for (integer a = 0; a < NDIM; ++a) { + for (integer b = a; b < NDIM; ++b) { + for (integer c = b; c < NDIM; ++c) { + B(a, b, c) += A(a, b) * X[c] + A(b, c) * X[a] + A(c, a) * X[b]; + B(a, b, c) += + A(a) * X[b] * X[c] + A(b) * X[c] * X[a] + A(c) * X[a] * X[b]; + B(a, b, c) += A() * X[a] * X[b] * X[c]; + } + } + } + } + } + } + *this = B; + // PROF_END; + return *this; + } + + taylor operator>>(const std::array& X) const { + taylor r = *this; + r >>= X; + return r; + } + + taylor& operator<<=(const std::array& X) { + // PROF_BEGIN; + const taylor& A = *this; + taylor B = A; + + if (N > 1) { + for (integer a = 0; a != NDIM; a++) { + B() += A(a) * X[a]; + } + if (N > 2) { + for (integer a = 0; a != NDIM; a++) { + for (integer b = 0; b != NDIM; b++) { + B() += A(a, b) * X[a] * X[b] * T(real(1) / real(2)); + } + } + for (integer a = 0; a != NDIM; a++) { + for (integer b = 0; b != NDIM; b++) { + B(a) += A(a, b) * X[b]; + } + } + if (N > 3) { + for (integer a = 0; a != NDIM; a++) { + for (integer b = 0; b != NDIM; b++) { + for (integer c = 0; c != NDIM; c++) { + B() += A(a, b, c) * X[a] * X[b] * X[c] * T(real(1) / real(6)); + } + } + } + for (integer a = 0; a != NDIM; a++) { + for (integer b = 0; b != NDIM; b++) { + for (integer c = 0; c != NDIM; c++) { + B(a) += A(a, b, c) * X[b] * X[c] * T(real(1) / real(2)); + } + } + } + for (integer a = 0; a != NDIM; a++) { + for (integer b = 0; b != NDIM; b++) { + for (integer c = a; c != NDIM; c++) { + B(a, c) += A(a, b, c) * X[b]; + } + } + } + } + } + } + *this = B; + // PROF_END; + return *this; + } + + taylor operator<<(const std::array& X) const { + taylor r = *this; + r <<= X; + return r; + } + + void set_basis(const std::array& X); + + T* ptr() { + return data.data(); + } + + const T* ptr() const { + return data.data(); + } + + template + void serialize(Arc& arc, const unsigned) { + arc& data; + } }; #include "space_vector.hpp" -template +template taylor_consts taylor::tc; -template<> +template <> inline void taylor<5, simd_vector>::set_basis(const std::array& X) { constexpr integer N = 5; using T = simd_vector; - //PROF_BEGIN; + // PROF_BEGIN; - // also highly optimized + // also highly optimized - // A is D in the paper in formula (6) + // A is D in the paper in formula (6) taylor& A = *this; - const T r2 = sqr(X[0]) + sqr(X[1]) + sqr(X[2]); - T r2inv = 0.0; -// #if !defined(HPX_HAVE_DATAPAR) - for (integer i = 0; i != simd_len; ++i) { - if (r2[i] > 0.0) { - r2inv[i] = ONE / r2[i]; - } - } -// #else -// where(r2 > 0.0) | r2inv = ONE / r2; -// #endif - - // parts of formula (6) + const T r2 = sqr(X[0]) + sqr(X[1]) + sqr(X[2]); + T r2inv = 0.0; +#if !defined(HPX_HAVE_DATAPAR) + for (integer i = 0; i != simd_len; ++i) { + if (r2[i] > 0.0) { + r2inv[i] = ONE / r2[i]; + } + } +#else + where(r2 > 0.0) | r2inv = ONE / r2; +#endif + + // parts of formula (6) const T d0 = -sqrt(r2inv); - // parts of formula (7) + // parts of formula (7) const T d1 = -d0 * r2inv; - // parts of formula (8) + // parts of formula (8) const T d2 = T(-3) * d1 * r2inv; - // parts of formula (9) + // parts of formula (9) const T d3 = T(-5) * d2 * r2inv; -// const T d4 = -T(7) * d3 * r2inv; + // const T d4 = -T(7) * d3 * r2inv; // Previously we've had this code. In my measurements the old code was // about 15% faster than the current code. However, I have measured it on // non-KNL platforms only. -// taylor XX; -// for (integer a = 0; a != NDIM; a++) { -// auto const tmpa = X[a]; -// XX(a) = tmpa; -// -// for (integer b = a; b != NDIM; b++) { -// auto const tmpab = tmpa * X[b]; -// XX(a, b) = tmpab; -// -// for (integer c = b; c != NDIM; c++) { -// XX(a, b, c) = tmpab * X[c]; -// } -// } -// } -// A[0] = d0; -// for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { -// A[i] = XX[i] * d1; -// } -// for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { -// A[i] = XX[i] * d2; -// } -// for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { -// A[i] = XX[i] * d3; -// } -// for (integer i = taylor_sizes[3]; i != taylor_sizes[4]; ++i) { -// A[i] = ZERO; -// } -// for (integer a = 0; a != NDIM; a++) { -// auto const XXa = XX(a) * d2; -// A(a, a) += d1; -// A(a, a, a) += XXa; -// A(a, a, a, a) += XX(a, a) * d3; -// A(a, a, a, a) += 2.0 * d2; -// for (integer b = a; b != NDIM; b++) { -// auto const XXab = XX(a, b) * d3; -// A(a, a, b) += XX(b) * d2; -// A(a, b, b) += XXa; -// A(a, a, a, b) += XXab; -// A(a, b, b, b) += XXab; -// A(a, a, b, b) += d2; -// for (integer c = b; c != NDIM; c++) { -// A(a, a, b, c) += XX(b, c) * d3; -// A(a, b, b, c) += XX(a, c) * d3; -// A(a, b, c, c) += XXab; -// } -// } -// } + // taylor XX; + // for (integer a = 0; a != NDIM; a++) { + // auto const tmpa = X[a]; + // XX(a) = tmpa; + // + // for (integer b = a; b != NDIM; b++) { + // auto const tmpab = tmpa * X[b]; + // XX(a, b) = tmpab; + // + // for (integer c = b; c != NDIM; c++) { + // XX(a, b, c) = tmpab * X[c]; + // } + // } + // } + // A[0] = d0; + // for (integer i = taylor_sizes[0]; i != taylor_sizes[1]; ++i) { + // A[i] = XX[i] * d1; + // } + // for (integer i = taylor_sizes[1]; i != taylor_sizes[2]; ++i) { + // A[i] = XX[i] * d2; + // } + // for (integer i = taylor_sizes[2]; i != taylor_sizes[3]; ++i) { + // A[i] = XX[i] * d3; + // } + // for (integer i = taylor_sizes[3]; i != taylor_sizes[4]; ++i) { + // A[i] = ZERO; + // } + // for (integer a = 0; a != NDIM; a++) { + // auto const XXa = XX(a) * d2; + // A(a, a) += d1; + // A(a, a, a) += XXa; + // A(a, a, a, a) += XX(a, a) * d3; + // A(a, a, a, a) += 2.0 * d2; + // for (integer b = a; b != NDIM; b++) { + // auto const XXab = XX(a, b) * d3; + // A(a, a, b) += XX(b) * d2; + // A(a, b, b) += XXa; + // A(a, a, a, b) += XXab; + // A(a, b, b, b) += XXab; + // A(a, a, b, b) += d2; + // for (integer c = b; c != NDIM; c++) { + // A(a, a, b, c) += XX(b, c) * d3; + // A(a, b, b, c) += XX(a, c) * d3; + // A(a, b, c, c) += XXab; + // } + // } + // } /////////////////////////////////////////////////////////////////////////// - // forumla (6) + // formula (6) A[0] = d0; - // forumla (7) + // formula (7) for (integer i = taylor_sizes[0], a = 0; a != NDIM; ++a, ++i) { A[i] = X[a] * d1; } - // forumla (8) + // formula (8) for (integer i = taylor_sizes[1], a = 0; a != NDIM; ++a) { - T const Xad2 = X[a] * d2; - for (integer b = a; b != NDIM; ++b, ++i) { - A[i] = Xad2 * X[b]; - } + T const Xad2 = X[a] * d2; + for (integer b = a; b != NDIM; ++b, ++i) { + A[i] = Xad2 * X[b]; + } } - // formula (9) + // formula (9) for (integer i = taylor_sizes[2], a = 0; a != NDIM; ++a) { T const Xad3 = X[a] * d3; for (integer b = a; b != NDIM; ++b) { @@ -501,12 +488,11 @@ inline void taylor<5, simd_vector>::set_basis(const std::array::set_basis(const std::array