Expose the slp_vectorize flag.

Commit fa55bf5 · bluescarni · Sep 4, 2023
1 parent: c9c67cd
Show file tree

Hide file tree

Showing 5 changed files with 23 additions and 15 deletions.
diff --git a/heyoka/cfunc.cpp b/heyoka/cfunc.cpp
@@ -712,7 +712,7 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
     m.def(
         fmt::format("_add_cfunc_{}", suffix).c_str(),
         [](std::vector<hey::expression> fn, std::optional<std::vector<hey::expression>> vars, bool high_accuracy,
-           bool compact_mode, bool parallel_mode, unsigned opt_level, bool force_avx512,
+           bool compact_mode, bool parallel_mode, unsigned opt_level, bool force_avx512, bool slp_vectorize,
            std::optional<std::uint32_t> batch_size, bool fast_math, long long prec) {
             // Compute the SIMD size.
             const auto simd_size = batch_size ? *batch_size : hey::recommended_simd_size<T>();
@@ -729,8 +729,9 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
             ptr_s_t fptr_scal_s = nullptr, fptr_batch_s = nullptr;
 
             hey::llvm_state s_scal{kw::opt_level = opt_level, kw::force_avx512 = force_avx512,
-                                   kw::fast_math = fast_math},
-                s_batch{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::fast_math = fast_math};
+                                   kw::slp_vectorize = slp_vectorize, kw::fast_math = fast_math},
+                s_batch{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::slp_vectorize = slp_vectorize,
+                        kw::fast_math = fast_math};
 
             // Variable to store the decomposition.
             std::vector<hey::expression> dc;
@@ -879,8 +880,8 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
         },
         "fn"_a, "vars"_a = py::none{}, "high_accuracy"_a.noconvert() = false,
         "compact_mode"_a.noconvert() = default_cm<T>, "parallel_mode"_a.noconvert() = false,
-        "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "batch_size"_a.noconvert() = py::none{},
-        "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
+        "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false,
+        "batch_size"_a.noconvert() = py::none{}, "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
 }
 
 } // namespace

diff --git a/heyoka/core.cpp b/heyoka/core.cpp
@@ -164,6 +164,7 @@ PYBIND11_MODULE(core, m)
         .def_property_readonly("opt_level", &hey::llvm_state::get_opt_level)
         .def_property_readonly("fast_math", [](const hey::llvm_state &s) { return s.fast_math(); })
         .def_property_readonly("force_avx512", [](const hey::llvm_state &s) { return s.force_avx512(); })
+        .def_property_readonly("slp_vectorize", &hey::llvm_state::get_slp_vectorize)
         // Repr.
         .def("__repr__",
              [](const hey::llvm_state &s) {

diff --git a/heyoka/expose_batch_integrators.cpp b/heyoka/expose_batch_integrators.cpp
@@ -62,7 +62,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
     auto tab_ctor_impl = [](const auto &sys, const py::iterable &state_ob, std::optional<py::iterable> time_ob,
                             std::optional<py::iterable> pars_ob, T tol, bool high_accuracy, bool compact_mode,
                             std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode, unsigned opt_level,
-                            bool force_avx512, bool fast_math) {
+                            bool force_avx512, bool slp_vectorize, bool fast_math) {
         // Fetch the dtype corresponding to T.
         const auto dt = get_dtype<T>();
 
@@ -142,6 +142,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
                                                  kw::parallel_mode = parallel_mode,
                                                  kw::opt_level = opt_level,
                                                  kw::force_avx512 = force_avx512,
+                                                 kw::slp_vectorize = slp_vectorize,
                                                  kw::fast_math = fast_math};
         } else {
             // Times not provided.
@@ -164,6 +165,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
                                                  kw::parallel_mode = parallel_mode,
                                                  kw::opt_level = opt_level,
                                                  kw::force_avx512 = force_avx512,
+                                                 kw::slp_vectorize = slp_vectorize,
                                                  kw::fast_math = fast_math};
         }
     };
@@ -178,19 +180,19 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
         .def(py::init([tab_ctor_impl](const variant_t &sys, const py::iterable &state, std::optional<py::iterable> time,
                                       std::optional<py::iterable> pars, T tol, bool high_accuracy, bool compact_mode,
                                       std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode,
-                                      unsigned opt_level, bool force_avx512, bool fast_math) {
+                                      unsigned opt_level, bool force_avx512, bool slp_vectorize, bool fast_math) {
                  return std::visit(
                      [&](const auto &value) {
                          return tab_ctor_impl(value, state, std::move(time), std::move(pars), tol, high_accuracy,
                                               compact_mode, std::move(tes), std::move(ntes), parallel_mode, opt_level,
-                                              force_avx512, fast_math);
+                                              force_avx512, slp_vectorize, fast_math);
                      },
                      sys);
              }),
              "sys"_a, "state"_a, "time"_a = py::none{}, "pars"_a = py::none{}, "tol"_a.noconvert() = static_cast<T>(0),
              "high_accuracy"_a = false, "compact_mode"_a = false, "t_events"_a = py::list{}, "nt_events"_a = py::list{},
              "parallel_mode"_a = false, "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false,
-             "fast_math"_a.noconvert() = false)
+             "slp_vectorize"_a.noconvert() = false, "fast_math"_a.noconvert() = false)
         .def_property_readonly("decomposition", &hey::taylor_adaptive_batch<T>::get_decomposition)
         .def_property_readonly("state_vars", &hey::taylor_adaptive_batch<T>::get_state_vars)
         .def_property_readonly("rhs", &hey::taylor_adaptive_batch<T>::get_rhs)

diff --git a/heyoka/taylor_add_jet.cpp b/heyoka/taylor_add_jet.cpp
@@ -226,7 +226,7 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
         name,
         [](const U &sys, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
            const std::vector<hey::expression> &sv_funcs, bool parallel_mode, unsigned opt_level, bool force_avx512,
-           bool fast_math, long long prec) {
+           bool slp_vectorize, bool fast_math, long long prec) {
             // Forbid batch sizes > 1 for everything but double.
             if (!std::is_same_v<T, double> && batch_size > 1u) {
                 py_throw(PyExc_ValueError, "Batch sizes greater than 1 are not supported for this floating-point type");
@@ -255,7 +255,8 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
             // Add the jet function.
             using jptr_t = void (*)(T *, const T *, const T *);
             jptr_t jptr = nullptr;
-            hey::llvm_state s{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::fast_math = fast_math};
+            hey::llvm_state s{kw::opt_level = opt_level, kw::force_avx512 = force_avx512,
+                              kw::slp_vectorize = slp_vectorize, kw::fast_math = fast_math};
 
             {
                 // NOTE: release the GIL during compilation.
@@ -381,7 +382,8 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
         },
         "sys"_a, "order"_a, "batch_size"_a = 1u, "high_accuracy"_a = false, "compact_mode"_a = default_cm<T>,
         "sv_funcs"_a = py::list{}, "parallel_mode"_a = false, "opt_level"_a.noconvert() = 3,
-        "force_avx512"_a.noconvert() = false, "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
+        "force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false, "fast_math"_a.noconvert() = false,
+        "prec"_a.noconvert() = 0);
 }
 
 } // namespace

diff --git a/heyoka/taylor_expose_integrator.cpp b/heyoka/taylor_expose_integrator.cpp
@@ -44,6 +44,7 @@
 #endif
 
 #include <heyoka/expression.hpp>
+#include <heyoka/llvm_state.hpp>
 #include <heyoka/step_callback.hpp>
 #include <heyoka/taylor.hpp>
 
@@ -91,7 +92,7 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
     py::class_<hey::taylor_adaptive<T>> cl(m, (fmt::format("taylor_adaptive_{}", suffix)).c_str(), py::dynamic_attr{});
     cl.def(py::init([](const sys_t &sys, std::vector<T> state, T time, std::vector<T> pars, T tol, bool high_accuracy,
                        bool compact_mode, std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode,
-                       unsigned opt_level, bool force_avx512, bool fast_math, long long prec) {
+                       unsigned opt_level, bool force_avx512, bool slp_vectorize, bool fast_math, long long prec) {
                return std::visit(
                    [&](const auto &val) {
                        // NOTE: GIL release is fine here even if the events contain
@@ -112,6 +113,7 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
                                                       kw::parallel_mode = parallel_mode,
                                                       kw::opt_level = opt_level,
                                                       kw::force_avx512 = force_avx512,
+                                                      kw::slp_vectorize = slp_vectorize,
                                                       kw::fast_math = fast_math,
                                                       kw::prec = prec};
                    },
@@ -120,8 +122,8 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
            "sys"_a, "state"_a.noconvert(), "time"_a.noconvert() = static_cast<T>(0), "pars"_a.noconvert() = py::list{},
            "tol"_a.noconvert() = static_cast<T>(0), "high_accuracy"_a = false, "compact_mode"_a = default_cm<T>,
            "t_events"_a = py::list{}, "nt_events"_a = py::list{}, "parallel_mode"_a = false,
-           "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "fast_math"_a.noconvert() = false,
-           "prec"_a.noconvert() = 0)
+           "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false,
+           "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0)
         .def_property_readonly(
             "state",
             [](py::object &o) {