diff --git a/ext/PointNeighborsCUDAExt.jl b/ext/PointNeighborsCUDAExt.jl
new file mode 100644
index 00000000..0719305e
--- /dev/null
+++ b/ext/PointNeighborsCUDAExt.jl
@@ -0,0 +1,59 @@
+module PointNeighborsCUDAExt
+
+using PointNeighbors: PointNeighbors, generic_kernel, CUDAMultiGPUBackend, KernelAbstractions
+using CUDA: CUDA, CuArray, CUDABackend
+
+const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}
+
+# This is needed because TrixiParticles passes `get_backend(coords)` to distinguish between
+# `nothing` (Polyester.jl) and `KernelAbstractions.CPU`.
+PointNeighbors.get_backend(x::UnifiedCuArray) = CUDAMultiGPUBackend()
+
+# Convert input array to `CuArray` with unified memory
+function PointNeighbors.Adapt.adapt_structure(to::CUDAMultiGPUBackend, array::Array)
+    return CuArray{eltype(array), ndims(array), CUDA.UnifiedMemory}(array)
+end
+
+@inline function PointNeighbors.parallel_foreach(f, iterator, x::UnifiedCuArray)
+    PointNeighbors.parallel_foreach(f, iterator, CUDAMultiGPUBackend())
+end
+
+# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
+@inline function PointNeighbors.parallel_foreach(f, iterator, x::CUDAMultiGPUBackend)
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
+    indices = eachindex(iterator)
+
+    # Skip empty loops
+    length(indices) == 0 && return
+
+    # Partition `ndrange` to the GPUs
+    n_gpus = length(CUDA.devices())
+    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+    @assert length(indices_split) <= n_gpus
+
+    backend = CUDABackend()
+
+    # Spawn kernel on each device
+    for (i, indices_) in enumerate(indices_split)
+        # Select the correct device for this partition
+        CUDA.device!(i - 1)
+
+        # Call the generic kernel, which only calls a function with the global GPU index
+        generic_kernel(backend)(ndrange = length(indices_)) do j
+            @inbounds @inline f(iterator[indices_[j]])
+        end
+    end
+
+    # Synchronize each device
+    for i in 1:length(indices_split)
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select first device again
+    CUDA.device!(0)
+end
+
+end # module
diff --git a/src/gpu.jl b/src/gpu.jl
index 9293e19f..cd418c2e 100644
--- a/src/gpu.jl
+++ b/src/gpu.jl
@@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch)
     return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells,
                                   cell_size, update_buffer, nhs.update_strategy)
 end
-
-# This is useful to pass the backend directly to `@threaded`
-KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend
diff --git a/src/util.jl b/src/util.jl
index 7cfbfedc..0fdc1176 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -60,7 +60,9 @@ with `Threads.@threads :static`.
 """
 struct ThreadsStaticBackend <: AbstractThreadingBackend end
 
-const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend}
+struct CUDAMultiGPUBackend end
+
+const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend, CUDAMultiGPUBackend}
 
 """
     @threaded x for ... end
@@ -140,7 +142,9 @@ end
     # Skip empty loops
    ndrange == 0 && return
 
-    backend = KernelAbstractions.get_backend(x)
+    # Use our `get_backend`, which usually forwards to `KernelAbstractions.get_backend`,
+    # but also works when `x` is already a backend.
+    backend = get_backend(x)
 
     # Call the generic kernel that is defined below, which only calls a function with
     # the global GPU index.
@@ -155,3 +159,8 @@ end
     i = @index(Global)
     @inline f(i)
 end
+
+get_backend(x) = KernelAbstractions.get_backend(x)
+
+# This is useful to pass the backend directly to `@threaded`
+get_backend(backend::KernelAbstractions.Backend) = backend
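A note on the partitioning in `parallel_foreach` above: `ceil(Int, length(indices) / n_gpus)` rounds the chunk size up, so `Iterators.partition` can never yield more than `n_gpus` chunks, which is exactly what the `@assert` checks. A quick REPL sketch with hypothetical sizes (10 indices, 3 GPUs):

    indices = 1:10
    n_gpus = 3
    # Chunk size ceil(Int, 10 / 3) == 4 gives chunks 1:4, 5:8, 9:10
    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
    @assert length(indices_split) <= n_gpus
    collect(indices_split)  # [1:4, 5:8, 9:10]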
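For context, a minimal usage sketch (not part of the diff) of how the new backend gets selected, assuming the `PointNeighbors.parallel_foreach(f, iterator, x)` entry point shown above; the arrays `coords` and `out` and the loop body are hypothetical:

    using CUDA
    using PointNeighbors: PointNeighbors

    # Allocating in unified memory makes `PointNeighbors.get_backend` return
    # `CUDAMultiGPUBackend()`, so the loop below is partitioned across all visible GPUs.
    coords = CuArray{Float32, 2, CUDA.UnifiedMemory}(rand(Float32, 3, 10_000))
    out = CuArray{Float32, 1, CUDA.UnifiedMemory}(undef, size(coords, 2))

    # Dispatches to the `UnifiedCuArray` method, which forwards to the multi-GPU backend
    PointNeighbors.parallel_foreach(axes(coords, 2), coords) do i
        # Hypothetical per-point work: squared distance from the origin
        @inbounds out[i] = coords[1, i]^2 + coords[2, i]^2 + coords[3, i]^2
    end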