vectorize helmholtz_3d (#10)

sarah · mscroggs · tbetcke · web-flow · commit d82ccfe689b2 · 2024-05-15T22:04:27.000+01:00
* check dependency versions (#7) * check dependency versions * version++ * add release instructions (#8) * add -dev to version (#9) * add -dev to version * tweak release notes * vectorize helmholtz_3d --------- Co-authored-by: Matthew Scroggs <matthew.w.scroggs@gmail.com> Co-authored-by: sarah <> Co-authored-by: Timo Betcke <timo.betcke@gmail.com>
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -36,3 +36,22 @@ jobs:
         run: cargo test --examples --release ${{ matrix.feature-flags }}
       - name: Test benchmarks build
         run: cargo bench --no-run --features "strict"
+
+  check-dependencies:
+    name: Check dependencies
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        rust-version: ["stable"]
+    steps:
+      - name: Set up Rust
+        uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: ${{ matrix.rust-version }}
+          components: rustfmt
+      - name: Install cargo-upgrades
+        run: cargo install cargo-upgrades
+      - uses: actions/checkout@v3
+      - name: Check that dependencies are up to date
+        run:
+          cargo upgrades
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,10 +1,11 @@
 [features]
+nightly = ["pulp/nightly"]
 # Treat warnings as a build error.
 strict = []
 
 [package]
 name = "green-kernels"
-version = "0.1.0"
+version = "0.1.0-dev"
 edition = "2021"
 authors = ["Timo Betcke <timo.betcke@gmail.com>", "Matthew Scroggs <rust@mscroggs.co.uk>"]
 description = "Evaluation of Green's function kernels."
@@ -28,9 +29,11 @@ num = "0.4"
 num_cpus = "1"
 rlst = { git = "https://github.com/linalg-rs/rlst.git" }
 rand = "0.8.5"
-itertools = "0.12"
-pulp = "0.18.12"
-coe-rs = "0.1"
+itertools = { version = "0.12.1", default-features = false }
+coe-rs = "0.1.2"
+pulp = { version = "0.18.12" }
+bytemuck = "1.16.0"
+hexf = "0.2.1"
 
 [dev-dependencies]
 criterion = { version = "0.3", features = ["html_reports"] }
@@ -43,12 +46,12 @@ cargo-args = ["-Zunstable-options", "-Zrustdoc-scrape-examples"]
 wildcard_imports = "forbid"
 
 [target.aarch64-apple-darwin.dev-dependencies]
-blas-src = { version = "0.9", features = ["accelerate"]}
-lapack-src = { version = "0.9", features = ["accelerate"]}
+blas-src = { version = "0.10", features = ["accelerate"]}
+lapack-src = { version = "0.10", features = ["accelerate"]}
 
 [target.x86_64-unknown-linux-gnu.dev-dependencies]
-blas-src = { version = "0.9", features = ["blis"]}
-lapack-src = { version = "0.9", features = ["netlib"]}
+blas-src = { version = "0.10", features = ["blis"]}
+lapack-src = { version = "0.10", features = ["netlib"]}
 
 [[bench]]
 name = "laplace_f32"
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,36 @@
+# Making a release
+
+To make a new release of green-kernels, follow the following steps:
+
+0) If you are yet to make a release on your current computer, run `cargo login` and copy an API
+   key from https://crates.io/me
+
+1) Checkout the branch `release` and merge the branch `main` into the branch `release`:
+   ```bash
+   git checkout release
+   git merge main
+   ```
+
+2) Update the version number in `Cargo.toml`.
+   The version numbers have the format `[x].[y].[z]`. If you are releasing a major
+   version, you should increment `[x]` and set `[y]` and `[z]` to 0.
+   If you are releasing a minor version, you should increment `[y]` and set `[z]`
+   to zero. If you are releasing a bugfix, you should increment `[z]`.
+
+3) In `Cargo.toml`, check that the `rlst` dependency is at the latest version.
+
+4) Commit your changes and push to GitHub, and check that all the tests on CI pass.
+
+5) [Create a release on GitHub](https://github.com/bempp/green-kernels/releases/new) from the `release` branch.
+   The release tag and title should be `v[x].[y].[z]` (where `[x]`, `[y]` and `[z]` are as in step 2).
+   In the "Describe this release" box, you should bullet point the main changes since the last
+   release.
+
+6) Run `cargo publish --dry-run`, then run `cargo package --list` and
+   check that no unwanted extras have been included in the release.
+
+7) If everything is working as expected, run `cargo publish`. This will push the new version to
+   crates.io. Note: this cannot be undone, but you can use `cargo yank` to mark a version as
+   unsuitable for use.
+
+8) Open a pull request to `main` to update the version number in `Cargo.toml` to `[x].[y].[z]-dev`
diff --git a/src/helmholtz_3d.rs b/src/helmholtz_3d.rs
@@ -4,9 +4,15 @@ use crate::helpers::{
 };
 use crate::traits::Kernel;
 use crate::types::EvalType;
+use crate::RealScalar;
+use crate::{ComplexScalar, SimdFor};
 use num::traits::FloatConst;
+use num::One;
 use num::Zero;
+use pulp::Simd;
 use rayon::prelude::*;
+use rlst::c32;
+use rlst::c64;
 use rlst::RlstScalar;
 use std::marker::PhantomData;
 
@@ -251,8 +257,8 @@ where
 /// Evaluate Helmholtz kernel for one target
 pub fn evaluate_helmholtz_one_target<T: RlstScalar<Complex = T>>(
     eval_type: EvalType,
-    target: &[<T as RlstScalar>::Real],
-    sources: &[<T as RlstScalar>::Real],
+    target: &[T::Real],
+    sources: &[T::Real],
     charges: &[T],
     wavenumber: T::Real,
     result: &mut [T],
@@ -273,33 +279,159 @@ pub fn evaluate_helmholtz_one_target<T: RlstScalar<Complex = T>>(
 
     match eval_type {
         EvalType::Value => {
-            let mut my_result_real = <<T as RlstScalar>::Real as Zero>::zero();
-            let mut my_result_imag = <<T as RlstScalar>::Real as Zero>::zero();
-            for index in 0..nsources {
-                diff0 = sources0[index] - target[0];
-                diff1 = sources1[index] - target[1];
-                diff2 = sources2[index] - target[2];
-                let diff_norm = (diff0 * diff0 + diff1 * diff1 + diff2 * diff2).sqrt();
-                let inv_diff_norm = {
-                    if diff_norm == zero_real {
-                        zero_real
-                    } else {
-                        one_real / diff_norm
+            struct Impl<'a, T: RlstScalar> {
+                wavenumber: T::Real,
+                t0: T::Real,
+                t1: T::Real,
+                t2: T::Real,
+
+                sources0: &'a [T::Real],
+                sources1: &'a [T::Real],
+                sources2: &'a [T::Real],
+                charges: &'a [T],
+            }
+
+            impl<T: ComplexScalar> pulp::WithSimd for Impl<'_, T> {
+                type Output = (T::Real, T::Real);
+
+                #[inline(always)]
+                fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
+                    use coe::Coerce;
+
+                    let Self {
+                        wavenumber,
+                        t0,
+                        t1,
+                        t2,
+                        sources0,
+                        sources1,
+                        sources2,
+                        charges,
+                    } = self;
+
+                    let (s0_head, s0_tail) = T::Real::as_simd_slice::<S>(sources0);
+                    let (s1_head, s1_tail) = T::Real::as_simd_slice::<S>(sources1);
+                    let (s2_head, s2_tail) = T::Real::as_simd_slice::<S>(sources2);
+
+                    let len = s0_head.len();
+                    let n = std::mem::size_of::<<T::Real as RealScalar>::Scalars<S>>()
+                        / std::mem::size_of::<T::Real>();
+                    let (c_head, c_tail) = charges.split_at(len * n);
+                    let c_head: &[[<T::Real as RealScalar>::Scalars<S>; 2]] =
+                        bytemuck::cast_slice(c_head);
+                    let c_tail: &[[T::Real; 2]] = bytemuck::cast_slice(c_tail);
+
+                    #[inline(always)]
+                    fn impl_slice<T: ComplexScalar, S: Simd>(
+                        simd: S,
+                        wavenumber: T::Real,
+                        t0: T::Real,
+                        t1: T::Real,
+                        t2: T::Real,
+
+                        sources0: &[<T::Real as RealScalar>::Scalars<S>],
+                        sources1: &[<T::Real as RealScalar>::Scalars<S>],
+                        sources2: &[<T::Real as RealScalar>::Scalars<S>],
+                        charges: &[[<T::Real as RealScalar>::Scalars<S>; 2]],
+                    ) -> (T::Real, T::Real) {
+                        let simd = SimdFor::<T::Real, S>::new(simd);
+
+                        let t0 = simd.splat(t0);
+                        let t1 = simd.splat(t1);
+                        let t2 = simd.splat(t2);
+                        let zero = simd.splat(T::Real::zero());
+                        let wavenumber = simd.splat(wavenumber);
+                        let mut acc_re = simd.splat(T::Real::zero());
+                        let mut acc_im = simd.splat(T::Real::zero());
+
+                        for (&s0, &s1, &s2, &c) in
+                            itertools::izip!(sources0, sources1, sources2, charges)
+                        {
+                            let [c_re, c_im] = simd.deinterleave(c);
+
+                            let diff0 = simd.sub(s0, t0);
+                            let diff1 = simd.sub(s1, t1);
+                            let diff2 = simd.sub(s2, t2);
+
+                            let diff_norm = simd.sqrt(simd.mul_add(
+                                diff0,
+                                diff0,
+                                simd.mul_add(diff1, diff1, simd.mul(diff2, diff2)),
+                            ));
+
+                            let is_zero = simd.cmp_eq(diff_norm, zero);
+                            let inv_diff_norm = simd.select(
+                                is_zero,
+                                zero,
+                                simd.div(simd.splat(T::Real::one()), diff_norm),
+                            );
+                            let kr = simd.mul(wavenumber, diff_norm);
+
+                            let (g_re, g_im) = {
+                                let (s, c) = simd.sin_cos(kr);
+                                (simd.mul(c, inv_diff_norm), simd.mul(s, inv_diff_norm))
+                            };
+
+                            acc_re = simd.mul_add(
+                                g_re,
+                                c_re,
+                                simd.mul_add(simd.neg(g_im), c_im, acc_re),
+                            );
+                            acc_im = simd.mul_add(g_re, c_im, simd.mul_add(g_im, c_re, acc_im));
+                        }
+                        (simd.reduce_add(acc_re), simd.reduce_add(acc_im))
                     }
-                };
 
-                let kr = wavenumber * diff_norm;
+                    let (re0, im0) = impl_slice::<T, S>(
+                        simd, wavenumber, t0, t1, t2, s0_head, s1_head, s2_head, c_head,
+                    );
+                    let (re1, im1) = impl_slice::<T, pulp::Scalar>(
+                        pulp::Scalar::new(),
+                        wavenumber,
+                        t0,
+                        t1,
+                        t2,
+                        s0_tail.coerce(),
+                        s1_tail.coerce(),
+                        s2_tail.coerce(),
+                        c_tail.coerce(),
+                    );
 
-                let g_re = <T::Real as RlstScalar>::cos(kr) * inv_diff_norm;
-                let g_im = <T::Real as RlstScalar>::sin(kr) * inv_diff_norm;
-                let charge_re = charges[index].re();
-                let charge_im = charges[index].im();
+                    (re0 + re1, im0 + im1)
+                }
+            }
 
-                my_result_imag += g_re * charge_im + g_im * charge_re;
-                my_result_real += g_re * charge_re - g_im * charge_im;
+            use coe::coerce_static as to;
+            use coe::Coerce;
+            if coe::is_same::<T, c32>() {
+                let (re, im) = pulp::Arch::new().dispatch(Impl::<'_, c32> {
+                    wavenumber: to(wavenumber),
+                    t0: to(target[0]),
+                    t1: to(target[1]),
+                    t2: to(target[2]),
+                    sources0: sources0.coerce(),
+                    sources1: sources1.coerce(),
+                    sources2: sources2.coerce(),
+                    charges: charges.coerce(),
+                });
+                result[0] += T::Complex::complex(to::<_, T::Real>(re), to::<_, T::Real>(im))
+                    .mul_real(m_inv_4pi);
+            } else if coe::is_same::<T, c64>() {
+                let (re, im) = pulp::Arch::new().dispatch(Impl::<'_, c64> {
+                    wavenumber: to(wavenumber),
+                    t0: to(target[0]),
+                    t1: to(target[1]),
+                    t2: to(target[2]),
+                    sources0: sources0.coerce(),
+                    sources1: sources1.coerce(),
+                    sources2: sources2.coerce(),
+                    charges: charges.coerce(),
+                });
+                result[0] += T::Complex::complex(to::<_, T::Real>(re), to::<_, T::Real>(im))
+                    .mul_real(m_inv_4pi);
+            } else {
+                panic!()
             }
-            result[0] += <T::Complex as RlstScalar>::complex(my_result_real, my_result_imag)
-                .mul_real(m_inv_4pi);
         }
         EvalType::ValueDeriv => {
             // Cannot simply use an array my_result as this is not
diff --git a/src/lib.rs b/src/lib.rs