Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds on-node vectorization variant of 2D stencil #6

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/03_stencil/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ hpx_setup_target(stencil_serial)
add_executable(stencil_parallel_0 stencil_parallel_0.cpp)
hpx_setup_target(stencil_parallel_0)

add_executable(stencil_parallel_vector stencil_parallel_vector.cpp)
hpx_setup_target(stencil_parallel_vector)

add_executable(stencil_parallel_1 stencil_parallel_1.cpp)
hpx_setup_target(stencil_parallel_1)

Expand All @@ -33,6 +36,7 @@ hpx_setup_target(stencil_parallel_4)
if (TARGET tutorial)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_serial)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_0)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_parallel_vector)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_1)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_2)
add_hpx_pseudo_dependencies(tutorial.stencil stencil_parallel_3)
Expand Down
8 changes: 8 additions & 0 deletions examples/03_stencil/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Optimized 2D stencil code series

This example section takes you through an approach to write a serial stencil
and scale it to an optimized distributed 2D stencil.

To be able to build this directory, you are required to have
[HPX](https://github.com/STEllAR-GROUP/hpx) and
[NSIMD](https://github.com/agenium-scale/nsimd) installed.
181 changes: 181 additions & 0 deletions examples/03_stencil/include/grid.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Copyright (c) 2020 Nikunj Gupta
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef GRID
#define GRID

#include <hpx/include/future.hpp>

#include <vector>
#include <nsimd/nsimd-all.hpp>
#include <type_traits>

// Define some commonly used packs
using vdouble = nsimd::pack<double>;
using vfloat = nsimd::pack<float>;


// Meta templates to extract type of a pack
template <typename _Ty>
struct get_type
{
using type = _Ty;
};

template <typename _Ty>
struct get_type<nsimd::pack<_Ty>>
{
using type = _Ty;
};


// Our grid data structure encapsulates the stencil
// and provide with api functions to use in for loops
template <typename Container>
class Grid
{
public:
using container_t = Container;
using type = typename container_t::value_type;
using alloc_t = typename container_t::allocator_type;
using iterator_t = typename container_t::iterator;

// Conform rule of 5
Grid() = default;
Grid(const Grid<Container>&) = default;
Grid(Grid<Container>&&) = default;
Grid<Container>& operator=(const Grid<Container>&) = default;
Grid<Container>& operator=(Grid<Container>&&) = default;

Grid(std::size_t Nx, std::size_t Ny, type value, alloc_t alloc)
:
nx_(Nx), ny_(Ny), grid_(container_t(Nx * Ny, value, alloc))
{}

// Define useful iterators
iterator_t begin()
{
return grid_.begin();
}

iterator_t end()
{
return grid_.end();
}

// Returns size of a row
std::size_t row_size()
{
return nx_;
}

// Returns number of columns
std::size_t column_size()
{
return ny_;
}

// Indexing function
type& in(std::size_t x, std::size_t y)
{
return grid_[y*nx_ + x];
}

private:
container_t grid_;
std::size_t nx_;
std::size_t ny_;
};


// Template specializations to allow for correct shuffle and print operations
template <
typename Container,
typename _Tx = typename Container::value_type,
typename _Ty = typename get_type<_Tx>::type
>
struct helper
{
static void shuffle(Grid<Container>& grid, std::size_t ny)
{
std::size_t len = std::size_t(nsimd::len(_Tx()));
std::vector<_Ty> vect(len+2, 0.0);

// Left shift the first simd register
nsimd::storeu(&vect[1], grid.in(1, ny));
grid.in(0, ny) = nsimd::loadu<_Tx>(&vect[0]);

// Right shift the last simd register
nsimd::storeu(&vect[1], grid.in(grid.row_size() - 2, ny));
grid.in(grid.row_size() - 1, ny) = nsimd::loadu<_Tx>(&vect[2]);
}

static void print(Grid<Container>& grid)
{
const std::size_t nx = grid.row_size();
const std::size_t ny = grid.column_size();

std::size_t len = std::size_t(nsimd::len(_Tx()));

const std::size_t nx_actual = (nx - 2) * len + 2;
std::vector<_Ty> vect(len, 0.0);

std::vector<_Ty> line(nx_actual);

std::cout << "{ ";
for (const auto& elem: line)
std::cout << 1.0 << " , ";
std::cout << "}" << std::endl;

for (std::size_t y = 1; y < ny-1; ++y)
{
for (std::size_t x = 1; x < nx - 1; ++x)
{
nsimd::storeu(&vect[0], grid.in(x, y));
for (std::size_t i = 0; i < len; ++i)
{
line[x + (nx-2)*i] = vect[i];
}
}
line[0] = 0.0;
line[nx_actual - 1] = 0.0;

std::cout << "{ ";
for (const auto& elem: line)
std::cout << elem << " , ";
std::cout << "}" << std::endl;
}

std::cout << "{ ";
for (const auto& elem: line)
std::cout << 1.0 << " , ";
std::cout << "}" << std::endl;
}
};

template <typename Container, typename _Tx>
struct helper<Container, _Tx, _Tx>
{
static void shuffle(Grid<Container>& grid, std::size_t ny) {}

static void print(Grid<Container>& grid)
{
const std::size_t nx = grid.row_size();
const std::size_t ny = grid.column_size();

for (std::size_t y = 0; y < ny; ++y)
{
std::cout << "{ ";
for (std::size_t x = 0; x < nx; ++x)
{
std::cout << grid.in(x, y) << " , ";
}
std::cout << "}" << std::endl;
}
}
};


#endif
57 changes: 57 additions & 0 deletions examples/03_stencil/include/stencil.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright (c) 2016 Thomas Heller
// Copyright (c) 2020 Nikunj Gupta
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef STENCIL
#define STENCIL

#include <algorithm>
#include <array>
#include <atomic>

#include "grid.hpp"

template <typename Container>
using array_t = std::array<Grid<Container>, 2>;

template <typename Container>
void init(array_t<Container>& U, std::size_t Nx, std::size_t Ny)
{
// Initialize the top row with 1
std::fill(U[0].begin(), U[0].begin() + Nx, 1.0);
std::fill(U[1].begin(), U[1].begin() + Nx, 1.0);

// Initialize bottom row with 1
std::fill(U[0].end() - Nx, U[0].end(), 1.0);
std::fill(U[1].end() - Nx, U[1].end(), 1.0);
}

template <typename Container>
void stencil_update(array_t<Container>& U, const std::size_t ny,
const std::size_t len, const std::size_t t)
{
Grid<Container>& curr = U[t % 2];
Grid<Container>& next = U[(t + 1) % 2];

std::size_t row_length = curr.row_size();

#pragma unroll
for (std::size_t nx = 1; nx < row_length - 1; ++nx)
{
// Stencil operation
next.in(nx, ny) =
(curr.in(nx-1, ny) + curr.in(nx+1, ny) +
curr.in(nx, ny-1) + curr.in(nx, ny+1)) * 0.25f;
}

// Maintain the halo for next computation in case of simd
if (std::is_same<typename Container::value_type,
nsimd::pack<typename get_type<typename Container::value_type>::type>>::value)
{
helper<Container>::shuffle(next, ny);
}
}

#endif
122 changes: 122 additions & 0 deletions examples/03_stencil/stencil_parallel_vector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Copyright (c) 2016 Thomas Heller
// Copyright (c) 2020 Nikunj Gupta
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <array>
#include <vector>
#include <iostream>

#include <hpx/hpx_init.hpp>
#include <hpx/include/compute.hpp>
#include <hpx/include/parallel_for_loop.hpp>
#include <hpx/program_options.hpp>
#include <hpx/timing/high_resolution_timer.hpp>
#include <hpx/parallel/executors/auto_chunk_size.hpp>
#include <hpx/parallel/executors/static_chunk_size.hpp>
#include <hpx/traits/executor_traits.hpp>
#include <hpx/execution/executors/parallel_executor_aggregated.hpp>

// Depends on NSIMD
#include <nsimd/nsimd-all.hpp>

#include "include/stencil.hpp"

template <typename _Tx>
void stencil_main(const std::size_t& Nx, const std::size_t& Ny,
const std::size_t& steps, const std::size_t& chunk_size)
{
using allocator_type = hpx::compute::host::block_allocator<_Tx>;
using executor_type = hpx::compute::host::block_executor<>;
using data_type = hpx::compute::vector<_Tx, allocator_type>;

std::array<Grid<data_type>, 2> U;

auto numa_domains = hpx::compute::host::numa_domains();
allocator_type alloc(numa_domains);

// Get register length
// for float, length = 1
// for simd register, length = 2/4/8/..
std::size_t len = 1;
if (std::is_same<_Tx,
nsimd::pack<typename get_type<_Tx>::type>>::value)
len = static_cast<std::size_t>(nsimd::len(_Tx()));

// Create augmented matrix
std::size_t Nx_aug = Nx/len + 2;
std::size_t Ny_aug = Ny + 2;

U[0] = Grid<data_type>(Nx_aug, Ny_aug, 0.0, alloc);
U[1] = Grid<data_type>(Nx_aug, Ny_aug, 0.0, alloc);

// Initialize the stencil
init<data_type>(U, Nx_aug, Ny_aug);

// Range of Ny for stencil operation
auto range = boost::irange(static_cast<std::size_t>(1), Ny_aug - 1);

// Define HPX executor
executor_type executor(numa_domains);
auto policy = hpx::parallel::execution::par.on(executor);

hpx::util::high_resolution_timer t;
using hpx::parallel::execution::auto_chunk_size;

for (std::size_t t = 0; t < steps; ++t)
{
hpx::parallel::for_each(
policy,
std::begin(range), std::end(range),
[&U, len, t] (std::size_t i)
{
stencil_update<data_type>(U, i, len, t);
});
}
double elapsed = t.elapsed();

std::cout << "Working with type: " <<
boost::typeindex::type_id<_Tx>().pretty_name() << std::endl;

std::cout << "Time elapsed: " << elapsed << std::endl;

double mlups = (Nx * Ny * steps) / 1e6 / elapsed;
std::cout << "MLUPS: " << mlups << "\n\n";
}

int hpx_main(hpx::program_options::variables_map& vm)
{
// Get commandline arguments
std::size_t Nx = vm["Nx"].as<std::size_t>();
std::size_t Ny = vm["Ny"].as<std::size_t>();
std::size_t steps = vm["steps"].as<std::size_t>();
std::size_t chunk_size = vm["chunk-size"].as<std::size_t>();

stencil_main<float>(Nx, Ny, steps, chunk_size);
stencil_main<vfloat>(Nx, Ny, steps, chunk_size);
stencil_main<double>(Nx, Ny, steps, chunk_size);
stencil_main<vdouble>(Nx, Ny, steps, chunk_size);

return hpx::finalize();
}

int main(int argc, char* argv[])
{
using namespace hpx::program_options;

options_description commandline;
commandline.add_options()
("Nx", value<std::size_t>()->default_value(1024),
"Elements in the x direction")
("Ny", value<std::size_t>()->default_value(1024),
"Elements in the y direction")
("steps", value<std::size_t>()->default_value(100),
"Number of steps to apply the stencil")
("chunk-size", value<std::size_t>()->default_value(2000),
"Number of row upgrades in a task")
;

// Initialize the runtime
return hpx::init(commandline, argc, argv);
}