Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
883 changes: 449 additions & 434 deletions .buildkite/pipeline.yml

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

[sources]
GPUArrays = {rev = "reverse", url = "https://github.com/christiangnrd/GPUArrays.jl"}
KernelAbstractions = {rev = "intrinsics", url = "https://github.com/christiangnrd/KernelAbstractions.jl"}

[extensions]
ChainRulesCoreExt = "ChainRulesCore"
EnzymeCoreExt = "EnzymeCore"
Expand All @@ -67,7 +71,7 @@ ExprTools = "0.1"
GPUArrays = "11.2.4"
GPUCompiler = "1.4"
GPUToolbox = "0.3, 1"
KernelAbstractions = "0.9.38"
KernelAbstractions = "0.10"
LLVM = "9.3.1"
LLVMLoopInfo = "1"
LazyArtifacts = "1"
Expand Down
1 change: 1 addition & 0 deletions perf/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
230 changes: 115 additions & 115 deletions perf/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,37 +19,37 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
# group["construct"] = @benchmarkable CuArray{Int}(undef, 1)

group["copy"] = @async_benchmarkable copy($gpu_mat)
# group["copy"] = @async_benchmarkable copy($gpu_mat)

gpu_mat2 = copy(gpu_mat)
let group = addgroup!(group, "copyto!")
group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
end
# gpu_mat2 = copy(gpu_mat)
# let group = addgroup!(group, "copyto!")
# group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
# group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
# group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
# end

let group = addgroup!(group, "iteration")
group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
# let group = addgroup!(group, "iteration")
# group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]

group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
# group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]

let group = addgroup!(group, "findall")
group["bool"] = @benchmarkable findall($gpu_vec_bools)
group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
end
# let group = addgroup!(group, "findall")
# group["bool"] = @benchmarkable findall($gpu_vec_bools)
# group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
# end

let group = addgroup!(group, "findfirst")
group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
end
# let group = addgroup!(group, "findfirst")
# group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
# group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
# end

let group = addgroup!(group, "findmin") # findmax
group["1d"] = @async_benchmarkable findmin($gpu_vec)
group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
end
end
# let group = addgroup!(group, "findmin") # findmax
# group["1d"] = @async_benchmarkable findmin($gpu_vec)
# group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
# end
# end

let group = addgroup!(group, "reverse")
group["1d"] = @async_benchmarkable reverse($gpu_vec)
Expand All @@ -62,94 +62,94 @@ let group = addgroup!(group, "reverse")
group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
end

group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# no need to test inplace version, which performs the same operation (but with an alloc)
let group = addgroup!(group, "accumulate")
let group = addgroup!(group, "Float32")
group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)

group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints)
group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)

group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
end
end

let group = addgroup!(group, "reductions")
let group = addgroup!(group, "reduce")
let group = addgroup!(group, "Float32")
group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints)
group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
end
end

let group = addgroup!(group, "mapreduce")
let group = addgroup!(group, "Float32")
group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
end
end

# used by sum, prod, minimum, maximum, all, any, count
end

let group = addgroup!(group, "random")
let group = addgroup!(group, "rand")
group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
end

let group = addgroup!(group, "rand!")
group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
end

let group = addgroup!(group, "randn")
group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
end

let group = addgroup!(group, "randn!")
group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
end
end

let group = addgroup!(group, "sorting")
group["1d"] = @async_benchmarkable sort($gpu_vec)
group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
end

let group = addgroup!(group, "permutedims")
group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
end
# group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# # no need to test inplace version, which performs the same operation (but with an alloc)
# let group = addgroup!(group, "accumulate")
# let group = addgroup!(group, "Float32")
# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)

# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
# end
# let group = addgroup!(group, "Int64")
# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints)
# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)

# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
# end
# end

# let group = addgroup!(group, "reductions")
# let group = addgroup!(group, "reduce")
# let group = addgroup!(group, "Float32")
# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
# group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
# end
# let group = addgroup!(group, "Int64")
# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints)
# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
# group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
# end
# end

# let group = addgroup!(group, "mapreduce")
# let group = addgroup!(group, "Float32")
# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
# end
# let group = addgroup!(group, "Int64")
# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
# end
# end

# # used by sum, prod, minimum, maximum, all, any, count
# end

# let group = addgroup!(group, "random")
# let group = addgroup!(group, "rand")
# group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
# group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
# end

# let group = addgroup!(group, "rand!")
# group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
# group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
# end

# let group = addgroup!(group, "randn")
# group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
# end

# let group = addgroup!(group, "randn!")
# group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
# end
# end

# let group = addgroup!(group, "sorting")
# group["1d"] = @async_benchmarkable sort($gpu_vec)
# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
# end

# let group = addgroup!(group, "permutedims")
# group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
# group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
# group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
# end
26 changes: 13 additions & 13 deletions perf/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ end

# before anything else, run latency benchmarks. these spawn subprocesses, so we don't want
# to do so after regular benchmarks have caused the memory allocator to reserve memory.
@info "Running latency benchmarks"
latency_results = include("latency.jl")
# @info "Running latency benchmarks"
# latency_results = include("latency.jl")

SUITE = BenchmarkGroup()

include("cuda.jl")
include("kernel.jl")
# include("cuda.jl")
# include("kernel.jl")
include("array.jl")

@info "Preparing main benchmarks"
Expand All @@ -34,20 +34,20 @@ GC.gc(true)
CUDA.reclaim()

# benchmark groups that aren't part of the suite
addgroup!(SUITE, "integration")
# addgroup!(SUITE, "integration")

@info "Running main benchmarks"
results = run(SUITE, verbose=true)

# integration tests (that do nasty things, so need to be run last)
@info "Running integration benchmarks"
integration_results = BenchmarkGroup()
integration_results["volumerhs"] = include("volumerhs.jl")
integration_results["byval"] = include("byval.jl")
integration_results["cudadevrt"] = include("cudadevrt.jl")

results["latency"] = latency_results
results["integration"] = integration_results
# @info "Running integration benchmarks"
# integration_results = BenchmarkGroup()
# integration_results["volumerhs"] = include("volumerhs.jl")
# integration_results["byval"] = include("byval.jl")
# integration_results["cudadevrt"] = include("cudadevrt.jl")

# results["latency"] = latency_results
# results["integration"] = integration_results

# write out the results
result_file = length(ARGS) >= 1 ? ARGS[1] : "benchmarkresults.json"
Expand Down
1 change: 1 addition & 0 deletions src/CUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module CUDA
using GPUCompiler

using GPUArrays
import KernelAbstractions: KernelIntrinsics as KI

using GPUToolbox

Expand Down
Loading