STEllAR-GROUP · NK-Nikunj · Aug 4, 2020 · Aug 4, 2020
diff --git a/examples/03_stencil/CMakeLists.txt b/examples/03_stencil/CMakeLists.txt
@@ -17,6 +17,9 @@ hpx_setup_target(stencil_serial)
 add_executable(stencil_parallel_0 stencil_parallel_0.cpp)
 hpx_setup_target(stencil_parallel_0)
 
+add_executable(stencil_parallel_vector stencil_parallel_vector.cpp)
+hpx_setup_target(stencil_parallel_vector)
+
 add_executable(stencil_parallel_1 stencil_parallel_1.cpp)
 hpx_setup_target(stencil_parallel_1)
 
@@ -33,6 +36,7 @@ hpx_setup_target(stencil_parallel_4)
 if (TARGET tutorial)
   add_hpx_pseudo_dependencies(tutorial.stencil stencil_serial)
   add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_0)
+  add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_parallel_vector)
   add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_1)
   add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_2)
   add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_3)

diff --git a/examples/03_stencil/README.md b/examples/03_stencil/README.md
@@ -0,0 +1,8 @@
+# Optimized 2D stencil code series
+
+This example section takes you through an approach to write a serial stencil
+and scale it to an optimized distributed 2D stencil.
+
+To be able to build this directory, you are required to have
+[HPX](https://github.com/STEllAR-GROUP/hpx) and
+[NSIMD](https://github.com/agenium-scale/nsimd) installed.
diff --git a/examples/03_stencil/include/grid.hpp b/examples/03_stencil/include/grid.hpp
@@ -0,0 +1,181 @@
+// Copyright (c) 2020 Nikunj Gupta
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef GRID
+#define GRID
+
+#include <hpx/include/future.hpp>
+
+#include <vector>
+#include <nsimd/nsimd-all.hpp>
+#include <type_traits>
+
+// Define some commonly used packs
+using vdouble = nsimd::pack<double>;
+using vfloat = nsimd::pack<float>;
+
+
+// Meta templates to extract type of a pack
+template <typename _Ty>
+struct get_type
+{
+    using type = _Ty;
+};
+
+template <typename _Ty>
+struct get_type<nsimd::pack<_Ty>>
+{
+    using type = _Ty;
+};
+
+
+// Our grid data structure encapsulates the stencil
+// and provide with api functions to use in for loops
+template <typename Container>
+class Grid
+{
+public:
+    using container_t = Container;
+    using type = typename container_t::value_type;
+    using alloc_t = typename container_t::allocator_type;
+    using iterator_t = typename container_t::iterator;
+
+    // Conform rule of 5
+    Grid() = default;
+    Grid(const Grid<Container>&) = default;
+    Grid(Grid<Container>&&) = default;
+    Grid<Container>& operator=(const Grid<Container>&) = default;
+    Grid<Container>& operator=(Grid<Container>&&) = default;
+
+    Grid(std::size_t Nx, std::size_t Ny, type value, alloc_t alloc)
+    :
+    nx_(Nx), ny_(Ny), grid_(container_t(Nx * Ny, value, alloc))
+    {}
+
+    // Define useful iterators
+    iterator_t begin()
+    {
+        return grid_.begin();
+    }
+
+    iterator_t end()
+    {
+        return grid_.end();
+    }
+
+    // Returns size of a row
+    std::size_t row_size()
+    {
+        return nx_;
+    }
+
+    // Returns number of columns
+    std::size_t column_size()
+    {
+        return ny_;
+    }
+
+    // Indexing function
+    type& in(std::size_t x, std::size_t y)
+    {
+        return grid_[y*nx_ + x];
+    }
+
+private:
+    container_t grid_;
+    std::size_t nx_;
+    std::size_t ny_;
+};
+
+
+// Template specializations to allow for correct shuffle and print operations
+template <
+        typename Container,
+        typename _Tx = typename Container::value_type,
+        typename _Ty = typename get_type<_Tx>::type
+        >
+struct helper
+{
+    static void shuffle(Grid<Container>& grid, std::size_t ny)
+    {
+        std::size_t len = std::size_t(nsimd::len(_Tx()));
+        std::vector<_Ty> vect(len+2, 0.0);
+
+        // Left shift the first simd register
+        nsimd::storeu(&vect[1], grid.in(1, ny));
+        grid.in(0, ny) = nsimd::loadu<_Tx>(&vect[0]);
+
+        // Right shift the last simd register
+        nsimd::storeu(&vect[1], grid.in(grid.row_size() - 2, ny));
+        grid.in(grid.row_size() - 1, ny) = nsimd::loadu<_Tx>(&vect[2]);
+    }
+
+    static void print(Grid<Container>& grid)
+    {
+        const std::size_t nx = grid.row_size();
+        const std::size_t ny = grid.column_size();
+
+        std::size_t len = std::size_t(nsimd::len(_Tx()));
+
+        const std::size_t nx_actual = (nx - 2) * len + 2;
+        std::vector<_Ty> vect(len, 0.0);
+
+        std::vector<_Ty> line(nx_actual);
+
+        std::cout << "{ ";
+        for (const auto& elem: line)
+            std::cout << 1.0 << " , ";
+        std::cout << "}" << std::endl;
+
+        for (std::size_t y = 1; y < ny-1; ++y)
+        {
+            for (std::size_t x = 1; x < nx - 1; ++x)
+            {
+                nsimd::storeu(&vect[0], grid.in(x, y));
+                for (std::size_t i = 0; i < len; ++i)
+                {
+                    line[x + (nx-2)*i] = vect[i];
+                }
+            }
+            line[0] = 0.0;
+            line[nx_actual - 1] = 0.0;
+
+            std::cout << "{ ";
+            for (const auto& elem: line)
+                std::cout << elem << " , ";
+            std::cout << "}" << std::endl;
+        }
+
+        std::cout << "{ ";
+        for (const auto& elem: line)
+            std::cout << 1.0 << " , ";
+        std::cout << "}" << std::endl;
+    }
+};
+
+template <typename Container, typename _Tx>
+struct helper<Container, _Tx, _Tx>
+{
+    static void shuffle(Grid<Container>& grid, std::size_t ny) {}
+
+    static void print(Grid<Container>& grid)
+    {
+        const std::size_t nx = grid.row_size();
+        const std::size_t ny = grid.column_size();
+
+        for (std::size_t y = 0; y < ny; ++y)
+        {
+            std::cout << "{ ";
+            for (std::size_t x = 0; x < nx; ++x)
+            {
+                std::cout << grid.in(x, y) << " , ";
+            }
+            std::cout << "}" << std::endl;
+        }
+    }
+};
+
+
+#endif
diff --git a/examples/03_stencil/include/stencil.hpp b/examples/03_stencil/include/stencil.hpp
@@ -0,0 +1,57 @@
+// Copyright (c) 2016 Thomas Heller
+// Copyright (c) 2020 Nikunj Gupta
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef STENCIL
+#define STENCIL
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+
+#include "grid.hpp"
+
+template <typename Container>
+using array_t = std::array<Grid<Container>, 2>;
+
+template <typename Container>
+void init(array_t<Container>& U, std::size_t Nx, std::size_t Ny)
+{
+    // Initialize the top row with 1
+    std::fill(U[0].begin(), U[0].begin() + Nx, 1.0);
+    std::fill(U[1].begin(), U[1].begin() + Nx, 1.0);
+
+    // Initialize bottom row with 1
+    std::fill(U[0].end() - Nx, U[0].end(), 1.0);
+    std::fill(U[1].end() - Nx, U[1].end(), 1.0);
+}
+
+template <typename Container>
+void stencil_update(array_t<Container>& U, const std::size_t ny,
+                    const std::size_t len, const std::size_t t)
+{
+    Grid<Container>& curr = U[t % 2];
+    Grid<Container>& next = U[(t + 1) % 2];
+
+    std::size_t row_length = curr.row_size();
+
+    #pragma unroll
+    for (std::size_t nx = 1; nx < row_length - 1; ++nx)
+    {
+        // Stencil operation
+        next.in(nx, ny) =
+            (curr.in(nx-1, ny) + curr.in(nx+1, ny) +
+            curr.in(nx, ny-1) + curr.in(nx, ny+1)) * 0.25f;
+    }
+
+    // Maintain the halo for next computation in case of simd
+    if (std::is_same<typename Container::value_type,
+        nsimd::pack<typename get_type<typename Container::value_type>::type>>::value)
+    {
+        helper<Container>::shuffle(next, ny);
+    }
+}
+
+#endif
diff --git a/examples/03_stencil/stencil_parallel_vector.cpp b/examples/03_stencil/stencil_parallel_vector.cpp
@@ -0,0 +1,122 @@
+// Copyright (c) 2016 Thomas Heller
+// Copyright (c) 2020 Nikunj Gupta
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <array>
+#include <vector>
+#include <iostream>
+
+#include <hpx/hpx_init.hpp>
+#include <hpx/include/compute.hpp>
+#include <hpx/include/parallel_for_loop.hpp>
+#include <hpx/program_options.hpp>
+#include <hpx/timing/high_resolution_timer.hpp>
+#include <hpx/parallel/executors/auto_chunk_size.hpp>
+#include <hpx/parallel/executors/static_chunk_size.hpp>
+#include <hpx/traits/executor_traits.hpp>
+#include <hpx/execution/executors/parallel_executor_aggregated.hpp>
+
+// Depends on NSIMD
+#include <nsimd/nsimd-all.hpp>
+
+#include "include/stencil.hpp"
+
+template <typename _Tx>
+void stencil_main(const std::size_t& Nx, const std::size_t& Ny,
+                const std::size_t& steps, const std::size_t& chunk_size)
+{
+    using allocator_type = hpx::compute::host::block_allocator<_Tx>;
+    using executor_type = hpx::compute::host::block_executor<>;
+    using data_type = hpx::compute::vector<_Tx, allocator_type>;
+
+    std::array<Grid<data_type>, 2> U;
+
+    auto numa_domains = hpx::compute::host::numa_domains();
+    allocator_type alloc(numa_domains);
+
+    // Get register length
+    // for float, length = 1
+    // for simd register, length = 2/4/8/..
+    std::size_t len = 1;
+    if (std::is_same<_Tx,
+            nsimd::pack<typename get_type<_Tx>::type>>::value)
+        len = static_cast<std::size_t>(nsimd::len(_Tx()));
+
+    // Create augmented matrix
+    std::size_t Nx_aug = Nx/len + 2;
+    std::size_t Ny_aug = Ny + 2;
+
+    U[0] = Grid<data_type>(Nx_aug, Ny_aug, 0.0, alloc);
+    U[1] = Grid<data_type>(Nx_aug, Ny_aug, 0.0, alloc);
+
+    // Initialize the stencil
+    init<data_type>(U, Nx_aug, Ny_aug);
+
+    // Range of Ny for stencil operation
+    auto range = boost::irange(static_cast<std::size_t>(1), Ny_aug - 1);
+
+    // Define HPX executor
+    executor_type executor(numa_domains);
+    auto policy = hpx::parallel::execution::par.on(executor);
+
+    hpx::util::high_resolution_timer t;
+    using hpx::parallel::execution::auto_chunk_size;
+
+    for (std::size_t t = 0; t < steps; ++t)
+    {
+        hpx::parallel::for_each(
+                policy,
+                std::begin(range), std::end(range),
+                [&U, len, t] (std::size_t i)
+                {
+                    stencil_update<data_type>(U, i, len, t);
+                });
+    }
+    double elapsed = t.elapsed();
+
+    std::cout << "Working with type: " <<
+                boost::typeindex::type_id<_Tx>().pretty_name() << std::endl;
+
+    std::cout << "Time elapsed: " << elapsed << std::endl;
+
+    double mlups = (Nx * Ny * steps) / 1e6 / elapsed;
+    std::cout << "MLUPS: " << mlups << "\n\n";
+}
+
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    // Get commandline arguments
+    std::size_t Nx = vm["Nx"].as<std::size_t>();
+    std::size_t Ny = vm["Ny"].as<std::size_t>();
+    std::size_t steps = vm["steps"].as<std::size_t>();
+    std::size_t chunk_size = vm["chunk-size"].as<std::size_t>();
+
+    stencil_main<float>(Nx, Ny, steps, chunk_size);
+    stencil_main<vfloat>(Nx, Ny, steps, chunk_size);
+    stencil_main<double>(Nx, Ny, steps, chunk_size);
+    stencil_main<vdouble>(Nx, Ny, steps, chunk_size);
+
+    return hpx::finalize();
+}
+
+int main(int argc, char* argv[])
+{
+    using namespace hpx::program_options;
+
+    options_description commandline;
+    commandline.add_options()
+        ("Nx", value<std::size_t>()->default_value(1024),
+         "Elements in the x direction")
+        ("Ny", value<std::size_t>()->default_value(1024),
+         "Elements in the y direction")
+        ("steps", value<std::size_t>()->default_value(100),
+         "Number of steps to apply the stencil")
+        ("chunk-size", value<std::size_t>()->default_value(2000),
+         "Number of row upgrades in a task")
+    ;
+
+    // Initialize the runtime
+    return hpx::init(commandline, argc, argv);
+}