Skip to content

Commit

Permalink
Expose the slp_vectorize flag.
Browse files: browse the repository at this point in the history
  • Loading branch information
Francesco Biscani authored and committed on Sep 4, 2023
1 parent c9c67cd commit fa55bf5
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 15 deletions.
11 changes: 6 additions & 5 deletions heyoka/cfunc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
m.def(
fmt::format("_add_cfunc_{}", suffix).c_str(),
[](std::vector<hey::expression> fn, std::optional<std::vector<hey::expression>> vars, bool high_accuracy,
bool compact_mode, bool parallel_mode, unsigned opt_level, bool force_avx512,
bool compact_mode, bool parallel_mode, unsigned opt_level, bool force_avx512, bool slp_vectorize,
std::optional<std::uint32_t> batch_size, bool fast_math, long long prec) {
// Compute the SIMD size.
const auto simd_size = batch_size ? *batch_size : hey::recommended_simd_size<T>();
Expand All @@ -729,8 +729,9 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
ptr_s_t fptr_scal_s = nullptr, fptr_batch_s = nullptr;

hey::llvm_state s_scal{kw::opt_level = opt_level, kw::force_avx512 = force_avx512,
kw::fast_math = fast_math},
s_batch{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::fast_math = fast_math};
kw::slp_vectorize = slp_vectorize, kw::fast_math = fast_math},
s_batch{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::slp_vectorize = slp_vectorize,
kw::fast_math = fast_math};

// Variable to store the decomposition.
std::vector<hey::expression> dc;
Expand Down Expand Up @@ -879,8 +880,8 @@ void expose_add_cfunc_impl(py::module &m, const char *suffix)
},
"fn"_a, "vars"_a = py::none{}, "high_accuracy"_a.noconvert() = false,
"compact_mode"_a.noconvert() = default_cm<T>, "parallel_mode"_a.noconvert() = false,
"opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "batch_size"_a.noconvert() = py::none{},
"fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
"opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false,
"batch_size"_a.noconvert() = py::none{}, "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
}

} // namespace
Expand Down
1 change: 1 addition & 0 deletions heyoka/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ PYBIND11_MODULE(core, m)
.def_property_readonly("opt_level", &hey::llvm_state::get_opt_level)
.def_property_readonly("fast_math", [](const hey::llvm_state &s) { return s.fast_math(); })
.def_property_readonly("force_avx512", [](const hey::llvm_state &s) { return s.force_avx512(); })
.def_property_readonly("slp_vectorize", &hey::llvm_state::get_slp_vectorize)
// Repr.
.def("__repr__",
[](const hey::llvm_state &s) {
Expand Down
10 changes: 6 additions & 4 deletions heyoka/expose_batch_integrators.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
auto tab_ctor_impl = [](const auto &sys, const py::iterable &state_ob, std::optional<py::iterable> time_ob,
std::optional<py::iterable> pars_ob, T tol, bool high_accuracy, bool compact_mode,
std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode, unsigned opt_level,
bool force_avx512, bool fast_math) {
bool force_avx512, bool slp_vectorize, bool fast_math) {
// Fetch the dtype corresponding to T.
const auto dt = get_dtype<T>();

Expand Down Expand Up @@ -142,6 +142,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
kw::parallel_mode = parallel_mode,
kw::opt_level = opt_level,
kw::force_avx512 = force_avx512,
kw::slp_vectorize = slp_vectorize,
kw::fast_math = fast_math};
} else {
// Times not provided.
Expand All @@ -164,6 +165,7 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
kw::parallel_mode = parallel_mode,
kw::opt_level = opt_level,
kw::force_avx512 = force_avx512,
kw::slp_vectorize = slp_vectorize,
kw::fast_math = fast_math};
}
};
Expand All @@ -178,19 +180,19 @@ void expose_batch_integrator_impl(py::module_ &m, const std::string &suffix)
.def(py::init([tab_ctor_impl](const variant_t &sys, const py::iterable &state, std::optional<py::iterable> time,
std::optional<py::iterable> pars, T tol, bool high_accuracy, bool compact_mode,
std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode,
unsigned opt_level, bool force_avx512, bool fast_math) {
unsigned opt_level, bool force_avx512, bool slp_vectorize, bool fast_math) {
return std::visit(
[&](const auto &value) {
return tab_ctor_impl(value, state, std::move(time), std::move(pars), tol, high_accuracy,
compact_mode, std::move(tes), std::move(ntes), parallel_mode, opt_level,
force_avx512, fast_math);
force_avx512, slp_vectorize, fast_math);
},
sys);
}),
"sys"_a, "state"_a, "time"_a = py::none{}, "pars"_a = py::none{}, "tol"_a.noconvert() = static_cast<T>(0),
"high_accuracy"_a = false, "compact_mode"_a = false, "t_events"_a = py::list{}, "nt_events"_a = py::list{},
"parallel_mode"_a = false, "opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false,
"fast_math"_a.noconvert() = false)
"slp_vectorize"_a.noconvert() = false, "fast_math"_a.noconvert() = false)
.def_property_readonly("decomposition", &hey::taylor_adaptive_batch<T>::get_decomposition)
.def_property_readonly("state_vars", &hey::taylor_adaptive_batch<T>::get_state_vars)
.def_property_readonly("rhs", &hey::taylor_adaptive_batch<T>::get_rhs)
Expand Down
8 changes: 5 additions & 3 deletions heyoka/taylor_add_jet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
name,
[](const U &sys, std::uint32_t order, std::uint32_t batch_size, bool high_accuracy, bool compact_mode,
const std::vector<hey::expression> &sv_funcs, bool parallel_mode, unsigned opt_level, bool force_avx512,
bool fast_math, long long prec) {
bool slp_vectorize, bool fast_math, long long prec) {
// Forbid batch sizes > 1 for everything but double.
if (!std::is_same_v<T, double> && batch_size > 1u) {
py_throw(PyExc_ValueError, "Batch sizes greater than 1 are not supported for this floating-point type");
Expand Down Expand Up @@ -255,7 +255,8 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
// Add the jet function.
using jptr_t = void (*)(T *, const T *, const T *);
jptr_t jptr = nullptr;
hey::llvm_state s{kw::opt_level = opt_level, kw::force_avx512 = force_avx512, kw::fast_math = fast_math};
hey::llvm_state s{kw::opt_level = opt_level, kw::force_avx512 = force_avx512,
kw::slp_vectorize = slp_vectorize, kw::fast_math = fast_math};

{
// NOTE: release the GIL during compilation.
Expand Down Expand Up @@ -381,7 +382,8 @@ void expose_taylor_add_jet_impl(py::module &m, const char *name)
},
"sys"_a, "order"_a, "batch_size"_a = 1u, "high_accuracy"_a = false, "compact_mode"_a = default_cm<T>,
"sv_funcs"_a = py::list{}, "parallel_mode"_a = false, "opt_level"_a.noconvert() = 3,
"force_avx512"_a.noconvert() = false, "fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0);
"force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false, "fast_math"_a.noconvert() = false,
"prec"_a.noconvert() = 0);
}

} // namespace
Expand Down
8 changes: 5 additions & 3 deletions heyoka/taylor_expose_integrator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#endif

#include <heyoka/expression.hpp>
#include <heyoka/llvm_state.hpp>
#include <heyoka/step_callback.hpp>
#include <heyoka/taylor.hpp>

Expand Down Expand Up @@ -91,7 +92,7 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
py::class_<hey::taylor_adaptive<T>> cl(m, (fmt::format("taylor_adaptive_{}", suffix)).c_str(), py::dynamic_attr{});
cl.def(py::init([](const sys_t &sys, std::vector<T> state, T time, std::vector<T> pars, T tol, bool high_accuracy,
bool compact_mode, std::vector<t_ev_t> tes, std::vector<nt_ev_t> ntes, bool parallel_mode,
unsigned opt_level, bool force_avx512, bool fast_math, long long prec) {
unsigned opt_level, bool force_avx512, bool slp_vectorize, bool fast_math, long long prec) {
return std::visit(
[&](const auto &val) {
// NOTE: GIL release is fine here even if the events contain
Expand All @@ -112,6 +113,7 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
kw::parallel_mode = parallel_mode,
kw::opt_level = opt_level,
kw::force_avx512 = force_avx512,
kw::slp_vectorize = slp_vectorize,
kw::fast_math = fast_math,
kw::prec = prec};
},
Expand All @@ -120,8 +122,8 @@ void expose_taylor_integrator_impl(py::module &m, const std::string &suffix)
"sys"_a, "state"_a.noconvert(), "time"_a.noconvert() = static_cast<T>(0), "pars"_a.noconvert() = py::list{},
"tol"_a.noconvert() = static_cast<T>(0), "high_accuracy"_a = false, "compact_mode"_a = default_cm<T>,
"t_events"_a = py::list{}, "nt_events"_a = py::list{}, "parallel_mode"_a = false,
"opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "fast_math"_a.noconvert() = false,
"prec"_a.noconvert() = 0)
"opt_level"_a.noconvert() = 3, "force_avx512"_a.noconvert() = false, "slp_vectorize"_a.noconvert() = false,
"fast_math"_a.noconvert() = false, "prec"_a.noconvert() = 0)
.def_property_readonly(
"state",
[](py::object &o) {
Expand Down

0 comments on commit fa55bf5

Please sign in to comment.