From f407b7d7d8bb88e6e7cfd80ccb3907cbb3255728 Mon Sep 17 00:00:00 2001
From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com>
Date: Mon, 1 Jul 2024 14:47:40 +0200
Subject: [PATCH] Make `GridNeighborhoodSearch` with `FullGridCellList` run on
 the GPU (#45)

* Make code run on the GPU

* Use GPUArraysCore.jl to import `AbstractGPUArray`

* Implement suggestions
---
 Project.toml               |  4 ++
 src/PointNeighbors.jl      |  4 +-
 src/gpu.jl                 |  3 ++
 src/neighborhood_search.jl | 32 +++++++++++--
 src/nhs_grid.jl            | 46 +++++++++++-------
 src/util.jl                | 95 +++++++++++++++++++++++---------------
 6 files changed, 125 insertions(+), 59 deletions(-)

diff --git a/Project.toml b/Project.toml
index f01986f..139bace 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,8 @@ version = "0.4.1-dev"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
@@ -13,6 +15,8 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
 Atomix = "0.1"
+GPUArraysCore = "0.1"
+KernelAbstractions = "0.9"
 LinearAlgebra = "1"
 Polyester = "0.7.5"
 Reexport = "1"
diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl
index fff1c09..b7a8203 100644
--- a/src/PointNeighbors.jl
+++ b/src/PointNeighbors.jl
@@ -4,8 +4,10 @@ using Reexport: @reexport
 
 using Adapt: Adapt
 using Atomix: Atomix
+using GPUArraysCore: AbstractGPUArray
+using KernelAbstractions: KernelAbstractions, @kernel, @index
 using LinearAlgebra: dot
-using Polyester: @batch
+using Polyester: Polyester
 @reexport using StaticArrays: SVector
 
 include("util.jl")
diff --git a/src/gpu.jl b/src/gpu.jl
index 594ecad..70e3390 100644
--- a/src/gpu.jl
+++ b/src/gpu.jl
@@ -32,3 +32,6 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch)
     return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells,
                                   cell_size, update_buffer, nhs.update_strategy)
 end
+
+# This is useful to pass the backend directly to `@threaded`
+KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend
diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl
index 53e6d4d..dacbc10 100644
--- a/src/neighborhood_search.jl
+++ b/src/neighborhood_search.jl
@@ -130,19 +130,43 @@ Note that `system_coords` and `neighbor_coords` can be identical.
 See also [`initialize!`](@ref), [`update!`](@ref).
 """
 function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search;
-                                points = axes(system_coords, 2),
-                                parallel = true) where {T}
+                                parallel::Union{Bool, KernelAbstractions.Backend} = true,
+                                points = axes(system_coords, 2)) where {T}
     # The type annotation above is to make Julia specialize on the type of the function.
     # Otherwise, unspecialized code will cause a lot of allocations
     # and heavily impact performance.
     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
+    if parallel isa Bool
+        # When `false` is passed, run serially. When `true` is passed, run either a
+        # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array,
+        # launch the loop as a kernel on the GPU.
+        parallel_ = Val(parallel)
+    elseif parallel isa KernelAbstractions.Backend
+        # WARNING! Undocumented, experimental feature:
+        # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel
+        # on this backend. This is useful to test the GPU code on the CPU by passing
+        # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`.
+        parallel_ = parallel
+    end
+
     foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points,
-                           Val(parallel))
+                           parallel_)
 end
 
 @inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
                                         neighborhood_search, points, parallel::Val{true})
-    @threaded for point in points
+    @threaded system_coords for point in points
+        foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
+    end
+
+    return nothing
+end
+
+# When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend
+@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
+                                        neighborhood_search, points,
+                                        backend::KernelAbstractions.Backend)
+    @threaded backend for point in points
         foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point)
     end
 
diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index 66c6bf0..3699ab6 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -183,38 +183,45 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra
     return neighborhood_search
 end
 
+# WARNING! Undocumented, experimental feature:
+# By default, determine the parallelization backend from the type of `x`.
+# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code
+# on this backend. This can be useful to run GPU kernels on the CPU by passing
+# `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`.
 function update!(neighborhood_search::GridNeighborhoodSearch,
                  x::AbstractMatrix, y::AbstractMatrix;
-                 points_moving = (true, true))
+                 points_moving = (true, true), parallelization_backend = x)
     # The coordinates of the first set of points are irrelevant for this NHS.
     # Only update when the second set is moving.
     points_moving[2] || return neighborhood_search
 
-    update_grid!(neighborhood_search, y)
+    update_grid!(neighborhood_search, y; parallelization_backend)
 end
 
 # Update only with neighbor coordinates
 function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS},
-                      y::AbstractMatrix) where {NDIMS}
-    update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i))
+                      y::AbstractMatrix; parallelization_backend = y) where {NDIMS}
+    update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i);
+                 parallelization_backend)
 end
 
-# Serial and semi-parallel update
+# Serial and semi-parallel update.
+# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
 function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
                                                                         SerialUpdate},
                                                  GridNeighborhoodSearch{<:Any,
                                                                         SemiParallelUpdate}},
-                      coords_fun::Function)
+                      coords_fun::Function; parallelization_backend = nothing)
     (; cell_list, update_buffer) = neighborhood_search
 
     # Empty each thread's list
-    @threaded for i in eachindex(update_buffer)
+    @threaded parallelization_backend for i in eachindex(update_buffer)
         emptyat!(update_buffer, i)
     end
 
     # Find all cells containing points that now belong to another cell.
-    # This loop is threaded for `update_strategy == SemiParallelUpdate`.
-    mark_changed_cells!(neighborhood_search, coords_fun)
+    # This loop is threaded for `update_strategy == SemiParallelUpdate()`.
+    mark_changed_cells!(neighborhood_search, coords_fun, parallelization_backend)
 
     # Iterate over all marked cells and move the points into their new cells.
     # This is always a serial loop (hence "semi-parallel").
@@ -252,18 +259,22 @@ end
 # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
 @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                  SemiParallelUpdate},
-                                     coords_fun::T) where {T}
+                                     coords_fun::T, parallelization_backend) where {T}
+    (; cell_list) = neighborhood_search
+
     # `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed
     # first to be able to be used in a threaded loop. This function takes care of that.
-    @threaded for cell_index in each_cell_index_threadable(neighborhood_search.cell_list)
+    @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
         mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
     end
 end
 
 @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                  SerialUpdate},
-                                     coords_fun::T) where {T}
-    for cell_index in each_cell_index(neighborhood_search.cell_list)
+                                     coords_fun::T, _) where {T}
+    (; cell_list) = neighborhood_search
+
+    for cell_index in each_cell_index(cell_list)
         mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
     end
 end
@@ -285,9 +296,10 @@ end
     end
 end
 
-# Fully parallel update with atomic push
+# Fully parallel update with atomic push.
+# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
 function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate},
-                      coords_fun::Function)
+                      coords_fun::Function; parallelization_backend = nothing)
     (; cell_list) = neighborhood_search
 
     # Note that we need two separate loops for adding and removing points.
@@ -295,7 +307,7 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
     # simultaneously, but it does not work when `deleteat_cell!` is called at the same time.
 
     # Add points to new cells
-    @threaded for cell_index in each_cell_index_threadable(cell_list)
+    @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
         for point in cell_list[cell_index]
             cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)
 
@@ -309,7 +321,7 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle
     end
 
     # Remove points from old cells
-    @threaded for cell_index in each_cell_index_threadable(cell_list)
+    @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
         points = cell_list[cell_index]
 
         # WARNING!!!
diff --git a/src/util.jl b/src/util.jl
index c7dbf65..d979ac2 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -23,53 +23,74 @@ end
 end
 
 """
-    @threaded for ... end
+    @threaded x for ... end
 
+Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `x`.
 Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange`
 but without guarantee that the underlying implementation uses `Threads.@threads`
 or works for more general `for` loops.
-In particular, there may be an additional check whether only one thread is used
-to reduce the overhead of serial execution or the underlying threading capabilities
-might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
+
+The first argument must either be a `KernelAbstractions.Backend` or an array from which the
+backend can be derived to determine if the loop must be run threaded on the CPU
+or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU
+kernel on the CPU.
+
+In particular, the underlying threading capabilities might be provided by other packages
+such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
 
 !!! warn
     This macro does not necessarily work for general `for` loops. For example,
     it does not necessarily support general iterables such as `eachline(filename)`.
-
-Some discussion can be found at
-[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964)
-and
-[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435).
-
-Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl).
 """
-macro threaded(expr)
-    # Use `esc(quote ... end)` for nested macro calls as suggested in
-    # https://github.com/JuliaLang/julia/issues/23221
-    #
-    # The following code is a simple version using only `Threads.@threads` from the
-    # standard library with an additional check whether only a single thread is used
-    # to reduce some overhead (and allocations) for serial execution.
-    #
-    # return esc(quote
-    #   let
-    #     if Threads.nthreads() == 1
-    #       $(expr)
-    #     else
-    #       Threads.@threads $(expr)
-    #     end
-    #   end
-    # end)
-    #
-    # However, the code below using `@batch` from Polyester.jl is more efficient,
-    # since this packages provides threads with less overhead. Since it is written
-    # by Chris Elrod, the author of LoopVectorization.jl, we expect this package
-    # to provide the most efficient and useful implementation of threads (as we use
-    # them) available in Julia.
-    # !!! danger "Heisenbug"
-    #     Look at the comments for `wrap_array` when considering to change this macro.
+macro threaded(system, expr)
+    # Reverse-engineer the for loop.
+    # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
+    # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
+    # and `expr.args[1].args[1]` is the loop variable `i`.
+    iterator = expr.args[1].args[2]
+    i = expr.args[1].args[1]
+    inner_loop = expr.args[2]
 
+    # Assemble the `for` loop again as a call to `parallel_foreach`, using `$i` to use the
+    # same loop variable as used in the for loop.
     return esc(quote
-                   PointNeighbors.@batch $(expr)
+                   PointNeighbors.parallel_foreach($iterator, $system) do $i
+                       $inner_loop
+                   end
                end)
 end
+
+# Use `Polyester.@batch` for low-overhead threading
+@inline function parallel_foreach(f, iterator, x)
+    Polyester.@batch for i in iterator
+        @inline f(i)
+    end
+end
+
+# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
+@inline function parallel_foreach(f, iterator,
+                                  x::Union{AbstractGPUArray, KernelAbstractions.Backend})
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
+    indices = eachindex(iterator)
+    ndrange = length(indices)
+
+    # Skip empty loops
+    ndrange == 0 && return
+
+    backend = KernelAbstractions.get_backend(x)
+
+    # Call the generic kernel that is defined below, which only calls a function with
+    # the global GPU index.
+    generic_kernel(backend)(ndrange = ndrange) do i
+        @inline f(iterator[indices[i]])
+    end
+
+    KernelAbstractions.synchronize(backend)
+end
+
+@kernel function generic_kernel(f)
+    i = @index(Global)
+    @inline f(i)
+end