JuliaGPU · christiangnrd · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
diff --git a/Project.toml b/Project.toml
@@ -44,6 +44,10 @@ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 
+[sources]
+GPUArrays = {rev = "reverse", url = "https://github.com/christiangnrd/GPUArrays.jl"}
+KernelAbstractions = {rev = "intrinsics", url = "https://github.com/christiangnrd/KernelAbstractions.jl"}
+
 [extensions]
 ChainRulesCoreExt = "ChainRulesCore"
 EnzymeCoreExt = "EnzymeCore"
@@ -67,7 +71,7 @@ ExprTools = "0.1"
 GPUArrays = "11.2.4"
 GPUCompiler = "1.4"
 GPUToolbox = "0.3, 1"
-KernelAbstractions = "0.9.38"
+KernelAbstractions = "0.10"
 LLVM = "9.3.1"
 LLVMLoopInfo = "1"
 LazyArtifacts = "1"

diff --git a/perf/Project.toml b/perf/Project.toml
@@ -2,5 +2,6 @@
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/perf/array.jl b/perf/array.jl
@@ -19,37 +19,37 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
 gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
 gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
 
-group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
+# group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
 
-group["copy"] = @async_benchmarkable copy($gpu_mat)
+# group["copy"] = @async_benchmarkable copy($gpu_mat)
 
-gpu_mat2 = copy(gpu_mat)
-let group = addgroup!(group, "copyto!")
-    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
-    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
-    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
-end
+# gpu_mat2 = copy(gpu_mat)
+# let group = addgroup!(group, "copyto!")
+#     group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
+#     group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
+#     group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
+# end
 
-let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
+# let group = addgroup!(group, "iteration")
+#     group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
 
-    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+#     group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
 
-    let group = addgroup!(group, "findall")
-        group["bool"] = @benchmarkable findall($gpu_vec_bools)
-        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
-    end
+#     let group = addgroup!(group, "findall")
+#         group["bool"] = @benchmarkable findall($gpu_vec_bools)
+#         group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
+#     end
 
-    let group = addgroup!(group, "findfirst")
-        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
-        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
-    end
+#     let group = addgroup!(group, "findfirst")
+#         group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+#         group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+#     end
 
-    let group = addgroup!(group, "findmin") # findmax
-        group["1d"] = @async_benchmarkable findmin($gpu_vec)
-        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
-    end
-end
+#     let group = addgroup!(group, "findmin") # findmax
+#         group["1d"] = @async_benchmarkable findmin($gpu_vec)
+#         group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+#     end
+# end
 
 let group = addgroup!(group, "reverse")
     group["1d"] = @async_benchmarkable reverse($gpu_vec)
@@ -62,94 +62,94 @@ let group = addgroup!(group, "reverse")
     group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
 end
 
-group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
-
-# no need to test inplace version, which performs the same operation (but with an alloc)
-let group = addgroup!(group, "accumulate")
-    let group = addgroup!(group, "Float32")
-        group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
-        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
-        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)
-
-        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
-        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
-    end
-    let group = addgroup!(group, "Int64")
-        group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints)
-        group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
-        group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)
-
-        group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
-        group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
-    end
-end
-
-let group = addgroup!(group, "reductions")
-    let group = addgroup!(group, "reduce")
-        let group = addgroup!(group, "Float32")
-            group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
-            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
-            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
-            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
-            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
-        end
-        let group = addgroup!(group, "Int64")
-            group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints)
-            group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
-            group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
-            group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
-            group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
-        end
-    end
-
-    let group = addgroup!(group, "mapreduce")
-        let group = addgroup!(group, "Float32")
-            group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
-            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
-            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
-            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
-            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
-        end
-        let group = addgroup!(group, "Int64")
-            group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
-            group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
-            group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
-            group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
-            group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
-        end
-    end
-
-    # used by sum, prod, minimum, maximum, all, any, count
-end
-
-let group = addgroup!(group, "random")
-    let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
-    end
-
-    let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
-    end
-
-    let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
-    end
-
-    let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
-    end
-end
-
-let group = addgroup!(group, "sorting")
-    group["1d"] = @async_benchmarkable sort($gpu_vec)
-    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-end
-
-let group = addgroup!(group, "permutedims")
-    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
-    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
-    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
-end
+# group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
+
+# # no need to test the in-place version separately: the out-of-place call below performs the same operation (plus an alloc)
+# let group = addgroup!(group, "accumulate")
+#     let group = addgroup!(group, "Float32")
+#         group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
+#         group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
+#         group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2)
+
+#         group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1)
+#         group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2)
+#     end
+#     let group = addgroup!(group, "Int64")
+#         group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints)
+#         group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1)
+#         group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2)
+
+#         group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1)
+#         group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2)
+#     end
+# end
+
+# let group = addgroup!(group, "reductions")
+#     let group = addgroup!(group, "reduce")
+#         let group = addgroup!(group, "Float32")
+#             group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
+#             group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+#             group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2)
+#             group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1)
+#             group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2)
+#         end
+#         let group = addgroup!(group, "Int64")
+#             group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints)
+#             group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1)
+#             group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2)
+#             group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1)
+#             group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2)
+#         end
+#     end
+
+#     let group = addgroup!(group, "mapreduce")
+#         let group = addgroup!(group, "Float32")
+#             group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
+#             group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
+#             group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2)
+#             group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1)
+#             group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2)
+#         end
+#         let group = addgroup!(group, "Int64")
+#             group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints)
+#             group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
+#             group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+#             group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1)
+#             group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2)
+#         end
+#     end
+
+#     # used by sum, prod, minimum, maximum, all, any, count
+# end
+
+# let group = addgroup!(group, "random")
+#     let group = addgroup!(group, "rand")
+#         group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
+#         group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
+#     end
+
+#     let group = addgroup!(group, "rand!")
+#         group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
+#         group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
+#     end
+
+#     let group = addgroup!(group, "randn")
+#         group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
+#     end
+
+#     let group = addgroup!(group, "randn!")
+#         group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
+#     end
+# end
+
+# let group = addgroup!(group, "sorting")
+#     group["1d"] = @async_benchmarkable sort($gpu_vec)
+#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+# end
+
+# let group = addgroup!(group, "permutedims")
+#     group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
+#     group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
+#     group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
+# end
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
@@ -16,13 +16,13 @@ end
 
 # before anything else, run latency benchmarks. these spawn subprocesses, so we don't want
 # to do so after regular benchmarks have caused the memory allocator to reserve memory.
-@info "Running latency benchmarks"
-latency_results = include("latency.jl")
+# @info "Running latency benchmarks"
+# latency_results = include("latency.jl")
 
 SUITE = BenchmarkGroup()
 
-include("cuda.jl")
-include("kernel.jl")
+# include("cuda.jl")
+# include("kernel.jl")
 include("array.jl")
 
 @info "Preparing main benchmarks"
@@ -34,20 +34,20 @@ GC.gc(true)
 CUDA.reclaim()
 
 # benchmark groups that aren't part of the suite
-addgroup!(SUITE, "integration")
+# addgroup!(SUITE, "integration")
 
 @info "Running main benchmarks"
 results = run(SUITE, verbose=true)
 
 # integration tests (that do nasty things, so need to be run last)
-@info "Running integration benchmarks"
-integration_results = BenchmarkGroup()
-integration_results["volumerhs"] = include("volumerhs.jl")
-integration_results["byval"] = include("byval.jl")
-integration_results["cudadevrt"] = include("cudadevrt.jl")
-
-results["latency"] = latency_results
-results["integration"] = integration_results
+# @info "Running integration benchmarks"
+# integration_results = BenchmarkGroup()
+# integration_results["volumerhs"] = include("volumerhs.jl")
+# integration_results["byval"] = include("byval.jl")
+# integration_results["cudadevrt"] = include("cudadevrt.jl")
+
+# results["latency"] = latency_results
+# results["integration"] = integration_results
 
 # write out the results
 result_file = length(ARGS) >= 1 ? ARGS[1] : "benchmarkresults.json"

diff --git a/src/CUDA.jl b/src/CUDA.jl
@@ -3,6 +3,7 @@ module CUDA
 using GPUCompiler
 
 using GPUArrays
+import KernelAbstractions: KernelIntrinsics as KI
 
 using GPUToolbox