From f407b7d7d8bb88e6e7cfd80ccb3907cbb3255728 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:47:40 +0200 Subject: [PATCH] Make `GridNeighborhoodSearch` with `FullGridCellList` run on the GPU (#45) * Make code run on the GPU * Use GPUArraysCore.jl to import `AbstractGPUArray` * Implement suggestions --- Project.toml | 4 ++ src/PointNeighbors.jl | 4 +- src/gpu.jl | 3 ++ src/neighborhood_search.jl | 32 +++++++++++-- src/nhs_grid.jl | 46 +++++++++++------- src/util.jl | 95 +++++++++++++++++++++++--------------- 6 files changed, 125 insertions(+), 59 deletions(-) diff --git a/Project.toml b/Project.toml index f01986f..139bace 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,8 @@ version = "0.4.1-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -13,6 +15,8 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] Atomix = "0.1" +GPUArraysCore = "0.1" +KernelAbstractions = "0.9" LinearAlgebra = "1" Polyester = "0.7.5" Reexport = "1" diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index fff1c09..b7a8203 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -4,8 +4,10 @@ using Reexport: @reexport using Adapt: Adapt using Atomix: Atomix +using GPUArraysCore: AbstractGPUArray +using KernelAbstractions: KernelAbstractions, @kernel, @index using LinearAlgebra: dot -using Polyester: @batch +using Polyester: Polyester @reexport using StaticArrays: SVector include("util.jl") diff --git a/src/gpu.jl b/src/gpu.jl index 594ecad..70e3390 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -32,3 +32,6 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch) return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells, cell_size, update_buffer, nhs.update_strategy) end + +# This is useful to pass the backend directly to `@threaded` +KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 53e6d4d..dacbc10 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -130,19 +130,43 @@ Note that `system_coords` and `neighbor_coords` can be identical. See also [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; - points = axes(system_coords, 2), - parallel = true) where {T} + parallel::Union{Bool, KernelAbstractions.Backend} = true, + points = axes(system_coords, 2)) where {T} # The type annotation above is to make Julia specialize on the type of the function. # Otherwise, unspecialized code will cause a lot of allocations # and heavily impact performance. # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing + if parallel isa Bool + # When `false` is passed, run serially. When `true` is passed, run either a + # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array, + # launch the loop as a kernel on the GPU. + parallel_ = Val(parallel) + elseif parallel isa KernelAbstractions.Backend + # WARNING! Undocumented, experimental feature: + # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel + # on this backend. This is useful to test the GPU code on the CPU by passing + # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`. + parallel_ = parallel + end + foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, - Val(parallel)) + parallel_) end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, parallel::Val{true}) - @threaded for point in points + @threaded system_coords for point in points + foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + end + + return nothing +end + +# When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend +@inline function foreach_point_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, points, + backend::KernelAbstractions.Backend) + @threaded backend for point in points foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) end diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 66c6bf0..3699ab6 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -183,38 +183,45 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra return neighborhood_search end +# WARNING! Undocumented, experimental feature: +# By default, determine the parallelization backend from the type of `x`. +# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code +# on this backend. This can be useful to run GPU kernels on the CPU by passing +# `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`. function update!(neighborhood_search::GridNeighborhoodSearch, x::AbstractMatrix, y::AbstractMatrix; - points_moving = (true, true)) + points_moving = (true, true), parallelization_backend = x) # The coordinates of the first set of points are irrelevant for this NHS. # Only update when the second set is moving. points_moving[2] || return neighborhood_search - update_grid!(neighborhood_search, y) + update_grid!(neighborhood_search, y; parallelization_backend) end # Update only with neighbor coordinates function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS}, - y::AbstractMatrix) where {NDIMS} - update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i)) + y::AbstractMatrix; parallelization_backend = y) where {NDIMS} + update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i); + parallelization_backend) end -# Serial and semi-parallel update +# Serial and semi-parallel update. +# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`. function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, SerialUpdate}, GridNeighborhoodSearch{<:Any, SemiParallelUpdate}}, - coords_fun::Function) + coords_fun::Function; parallelization_backend = nothing) (; cell_list, update_buffer) = neighborhood_search # Empty each thread's list - @threaded for i in eachindex(update_buffer) + @threaded parallelization_backend for i in eachindex(update_buffer) emptyat!(update_buffer, i) end # Find all cells containing points that now belong to another cell. - # This loop is threaded for `update_strategy == SemiParallelUpdate`. - mark_changed_cells!(neighborhood_search, coords_fun) + # This loop is threaded for `update_strategy == SemiParallelUpdate()`. + mark_changed_cells!(neighborhood_search, coords_fun, parallelization_backend) # Iterate over all marked cells and move the points into their new cells. # This is always a serial loop (hence "semi-parallel"). @@ -252,18 +259,22 @@ end # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any, SemiParallelUpdate}, - coords_fun::T) where {T} + coords_fun::T, parallelization_backend) where {T} + (; cell_list) = neighborhood_search + # `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed # first to be able to be used in a threaded loop. This function takes care of that. - @threaded for cell_index in each_cell_index_threadable(neighborhood_search.cell_list) + @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list) mark_changed_cell!(neighborhood_search, cell_index, coords_fun) end end @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any, SerialUpdate}, - coords_fun::T) where {T} - for cell_index in each_cell_index(neighborhood_search.cell_list) + coords_fun::T, _) where {T} + (; cell_list) = neighborhood_search + + for cell_index in each_cell_index(cell_list) mark_changed_cell!(neighborhood_search, cell_index, coords_fun) end end @@ -285,9 +296,10 @@ end end end -# Fully parallel update with atomic push +# Fully parallel update with atomic push. +# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`. function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate}, - coords_fun::Function) + coords_fun::Function; parallelization_backend = nothing) (; cell_list) = neighborhood_search # Note that we need two separate loops for adding and removing points. @@ -295,7 +307,7 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle # simultaneously, but it does not work when `deleteat_cell!` is called at the same time. # Add points to new cells - @threaded for cell_index in each_cell_index_threadable(cell_list) + @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list) for point in cell_list[cell_index] cell_coords_ = cell_coords(coords_fun(point), neighborhood_search) @@ -309,7 +321,7 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle end # Remove points from old cells - @threaded for cell_index in each_cell_index_threadable(cell_list) + @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list) points = cell_list[cell_index] # WARNING!!! diff --git a/src/util.jl b/src/util.jl index c7dbf65..d979ac2 100644 --- a/src/util.jl +++ b/src/util.jl @@ -23,53 +23,74 @@ end end """ - @threaded for ... end + @threaded x for ... end +Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `x`. Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange` but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -In particular, there may be an additional check whether only one thread is used -to reduce the overhead of serial execution or the underlying threading capabilities -might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). + +The first argument must either be a `KernelAbstractions.Backend` or an array from which the +backend can be derived to determine if the loop must be run threaded on the CPU +or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU +kernel on the CPU. + +In particular, the underlying threading capabilities might be provided by other packages +such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). !!! warn This macro does not necessarily work for general `for` loops. For example, it does not necessarily support general iterables such as `eachline(filename)`. - -Some discussion can be found at -[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964) -and -[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435). - -Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl). """ -macro threaded(expr) - # Use `esc(quote ... end)` for nested macro calls as suggested in - # https://github.com/JuliaLang/julia/issues/23221 - # - # The following code is a simple version using only `Threads.@threads` from the - # standard library with an additional check whether only a single thread is used - # to reduce some overhead (and allocations) for serial execution. - # - # return esc(quote - # let - # if Threads.nthreads() == 1 - # $(expr) - # else - # Threads.@threads $(expr) - # end - # end - # end) - # - # However, the code below using `@batch` from Polyester.jl is more efficient, - # since this packages provides threads with less overhead. Since it is written - # by Chris Elrod, the author of LoopVectorization.jl, we expect this package - # to provide the most efficient and useful implementation of threads (as we use - # them) available in Julia. - # !!! danger "Heisenbug" - # Look at the comments for `wrap_array` when considering to change this macro. +macro threaded(system, expr) + # Reverse-engineer the for loop. + # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`. + # So, `expr.args[1].args[2]` is the iterator `eachindex(x)` + # and `expr.args[1].args[1]` is the loop variable `i`. + iterator = expr.args[1].args[2] + i = expr.args[1].args[1] + inner_loop = expr.args[2] + # Assemble the `for` loop again as a call to `parallel_foreach`, using `$i` to use the + # same loop variable as used in the for loop. return esc(quote - PointNeighbors.@batch $(expr) + PointNeighbors.parallel_foreach($iterator, $system) do $i + $inner_loop + end end) end + +# Use `Polyester.@batch` for low-overhead threading +@inline function parallel_foreach(f, iterator, x) + Polyester.@batch for i in iterator + @inline f(i) + end +end + +# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl +@inline function parallel_foreach(f, iterator, + x::Union{AbstractGPUArray, KernelAbstractions.Backend}) + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. + indices = eachindex(iterator) + ndrange = length(indices) + + # Skip empty loops + ndrange == 0 && return + + backend = KernelAbstractions.get_backend(x) + + # Call the generic kernel that is defined below, which only calls a function with + # the global GPU index. + generic_kernel(backend)(ndrange = ndrange) do i + @inline f(iterator[indices[i]]) + end + + KernelAbstractions.synchronize(backend) +end + +@kernel function generic_kernel(f) + i = @index(Global) + @inline f(i) +end