diff --git a/Project.toml b/Project.toml
index f5bcd15..defe897 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8"
 CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d"
 CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 3da1a0a..8a98997 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -9,7 +9,7 @@ using CompilerCaching: CacheView, method_instance, results
 
 import Core.Compiler as CC
 
-using CUDA: CuModule, CuFunction, cudacall, device, capability
+using CUDA: CUDA, CuModule, CuFunction, cudacall, device, capability
 using CUDA_Compiler_jll
 
 public launch
@@ -54,6 +54,8 @@ function check_tile_ir_support()
     return VersionNumber(cuda_ver.major, cuda_ver.minor)
 end
 
+const EMIT_CODE_LOCK = ReentrantLock()
+
 """
     emit_binary(cache, mi; const_argtypes=nothing) -> Vector{UInt8}
 
@@ -61,7 +63,9 @@ Binary phase: compile Tile IR bytecode to CUBIN using tileiras.
 """
 function emit_binary(cache::CacheView, mi::Core.MethodInstance;
                      const_argtypes::Union{Vector{Any}, Nothing}=nothing)
-    bytecode = emit_code(cache, mi; const_argtypes)
+    bytecode = lock(EMIT_CODE_LOCK) do
+        emit_code(cache, mi; const_argtypes)
+    end
     ci = get(cache, mi)
     res = const_argtypes !== nothing ? results(cache, ci, const_argtypes) :
                                        results(cache, ci)
@@ -267,4 +271,6 @@ Other values pass through unchanged.
 """
 to_tile_arg(x) = x
 to_tile_arg(arr::AbstractArray) = TileArray(arr)
+include("autotune/autotune.jl")
+
 end
diff --git a/ext/autotune/autotune.jl b/ext/autotune/autotune.jl
new file mode 100644
index 0000000..653a7d2
--- /dev/null
+++ b/ext/autotune/autotune.jl
@@ -0,0 +1,300 @@
+import cuTile.Experimental: autotune_launch, clear_autotune_cache
+using cuTile.Experimental: AbstractSearchSpace, CartesianSpace, FixedSpace
+
+using Random
+
+const AUTOTUNE_LOCK = ReentrantLock()
+const AUTOTUNE_CACHE = Dict{Any, Dict{Any, Any}}()
+
+struct VerificationError <: Exception
+    msg::String
+end
+
+const TUNING_PRESETS = (
+    fast = (warmup=1, reps=3, refine_topk=0, refine_reps=2),
+    default = (warmup=2, reps=5, refine_topk=2, refine_reps=4),
+    thorough = (warmup=2, reps=7, refine_topk=4, refine_reps=6),
+)
+
+function normalize_tuning(tuning::NamedTuple)
+    preset = get(tuning, :preset, :default)
+    preset isa Symbol || throw(ArgumentError("tuning.preset must be a Symbol"))
+    hasproperty(TUNING_PRESETS, preset) ||
+        throw(ArgumentError("Unknown preset `$preset`; use :fast, :default, or :thorough"))
+
+    base = merge(getproperty(TUNING_PRESETS, preset),
+                 (seed=nothing, force=false, precompile_workers=Threads.nthreads()))
+
+    # Apply user overrides (excluding :preset)
+    overrides = NamedTuple(k => v for (k, v) in pairs(tuning) if k !== :preset)
+    return merge(base, overrides)
+end
+
+# Extract hint fields (occupancy, num_ctas) from a config for launch kwargs
+function hints_from_cfg(cfg)
+    n = hasproperty(cfg, :num_ctas) ? cfg.num_ctas : nothing
+    o = hasproperty(cfg, :occupancy) ? cfg.occupancy : nothing
+    return (num_ctas=n, occupancy=o)
+end
+
+function time_ms(run_once::Function, get_args::Function;
+                 warmup::Int, reps::Int, verify::Union{Nothing, Function}=nothing,
+                 reset::Union{Nothing, Function}=nothing)
+    CUDA.synchronize()
+    for _ in 1:max(warmup, verify !== nothing ? 1 : 0)
+        reset !== nothing && reset()
+        run_once(get_args())
+    end
+
+    if verify !== nothing
+        CUDA.synchronize()
+        verify() || throw(VerificationError("config produced incorrect output"))
+    end
+
+    best_ms = Inf32
+    for _ in 1:reps
+        reset !== nothing && reset()
+        args = get_args()
+        CUDA.synchronize()
+        elapsed_s = CUDA.@elapsed run_once(args)
+        CUDA.synchronize()
+        best_ms = min(best_ms, Float32(elapsed_s * 1000))
+    end
+    return best_ms
+end
+
+function eval_cfg(@nospecialize(f), cfg, grid_fn::Function, args_fn::Function;
+                  sm_arch::String, opt_level::Int, warmup::Int, reps::Int,
+                  verify::Union{Nothing, Function}=nothing,
+                  reset::Union{Nothing, Function}=nothing)
+    run_once = args -> cuTile.launch(f, grid_fn(cfg), args...;
+                                     sm_arch, opt_level, hints_from_cfg(cfg)...)
+    return time_ms(run_once, () -> args_fn(cfg); warmup, reps, verify, reset)
+end
+
+function precompile_cfg(@nospecialize(f), cfg, grid_fn::Function, args_fn::Function;
+                        sm_arch::String, opt_level::Int)
+    grid_fn(cfg)
+    args = args_fn(cfg)
+    tile_args = map(to_tile_arg, args)
+
+    # Mirror launch's Constant handling
+    unwrapped_types = map(tile_args) do arg
+        arg isa Constant ? constant_eltype(typeof(arg)) : typeof(arg)
+    end
+    argtypes = Tuple{unwrapped_types...}
+
+    world = Base.get_world_counter()
+    mi = method_instance(f, argtypes; world)
+    mi === nothing && throw(MethodError(f, argtypes))
+
+    has_consts = any(x -> x isa Constant, tile_args)
+    const_argtypes = if has_consts
+        cats = Any[CC.Const(f)]
+        for arg in tile_args
+            push!(cats, arg isa Constant ? CC.Const(arg[]) : typeof(arg))
+        end
+        cats
+    else
+        nothing
+    end
+
+    hints = hints_from_cfg(cfg)
+    bytecode_version = check_tile_ir_support()
+    opts = (sm_arch=sm_arch, opt_level=opt_level, num_ctas=hints.num_ctas, occupancy=hints.occupancy,
+            bytecode_version=bytecode_version)
+    cache = CacheView{CuTileResults}((:cuTile, opts), world)
+    emit_function(cache, mi; const_argtypes)
+end
+
+function precompile_candidates(@nospecialize(f), configs::Vector{Any},
+                               grid_fn::Function, args_fn::Function;
+                               sm_arch::String, opt_level::Int, workers::Int)
+    isempty(configs) && return configs, nothing
+    iszero(workers) && return configs, nothing
+
+    workers = min(workers, Threads.nthreads(), length(configs))
+    compiled = fill(true, length(configs))
+    errors = Vector{Any}(nothing, length(configs))
+    sem = Base.Semaphore(workers)
+    cancelled = Threads.Atomic{Bool}(false)
+
+    try
+        @sync for (i, cfg) in enumerate(configs)
+            Threads.@spawn begin
+                cancelled[] && return
+                Base.acquire(sem) do
+                    cancelled[] && return
+                    try
+                        precompile_cfg(f, cfg, grid_fn, args_fn; sm_arch, opt_level)
+                    catch err
+                        compiled[i] = false
+                        errors[i] = (cfg, err)
+                    end
+                end
+            end
+        end
+    catch e
+        cancelled[] = true
+        e isa InterruptException || rethrow()
+        @warn "Precompilation interrupted, waiting for in-flight workers…"
+        # @sync already waits for spawned tasks before propagating,
+        # but the atomic flag ensures queued ones exit early.
+        rethrow()
+    end
+
+    first_err = nothing
+    for e in errors
+        if e !== nothing
+            first_err = e
+            break
+        end
+    end
+
+    return configs[compiled], first_err
+end
+
+function measure_candidates(@nospecialize(f), configs::Vector{Any},
+                            grid_fn::Function, args_fn::Function;
+                            sm_arch::String, opt_level::Int, warmup::Int, reps::Int,
+                            verify::Union{Nothing, Function}=nothing,
+                            reset::Union{Nothing, Function}=nothing)
+    record = Tuple{Any, Float32}[]
+    first_error = nothing
+    for cfg in configs
+        ms = try
+            eval_cfg(f, cfg, grid_fn, args_fn; sm_arch, opt_level, warmup, reps, verify, reset)
+        catch err
+            if err isa InterruptException
+                @warn "Benchmarking interrupted after $(length(record)) configs"
+                break
+            end
+            err isa VerificationError && @warn "Config $cfg failed verification, skipping"
+            first_error === nothing && (first_error = (cfg, err))
+            continue
+        end
+        push!(record, (cfg, ms))
+    end
+    return record, first_error
+end
+
+function find_or_tune(@nospecialize(f), space::AbstractSearchSpace, rng::AbstractRNG,
+                      grid_fn::Function, args_fn::Function, tuning;
+                      sm_arch::String, opt_level::Int, kernel_key, arg_key,
+                      verify::Union{Nothing, Function}=nothing,
+                      setup::Union{Nothing, Function}=nothing)
+    if !tuning.force
+        entry = lock(AUTOTUNE_LOCK) do
+            per_kernel = get(AUTOTUNE_CACHE, kernel_key, nothing)
+            per_kernel !== nothing ? get(per_kernel, arg_key, nothing) : nothing
+        end
+        entry !== nothing && return entry, true, nothing
+    end
+
+    checker = verify !== nothing ? verify() : nothing
+    reset = setup !== nothing ? setup() : nothing
+
+    trials = collect(space)
+
+    trials = Any[trials...]
+    trials, precompile_error = Base.ScopedValues.with(cuTile._SCOPED_INF_CACHE => CC.InferenceResult[]) do
+        precompile_candidates(f, trials, grid_fn, args_fn;
+                              sm_arch, opt_level, workers=tuning.precompile_workers)
+    end
+
+    record, first_error = measure_candidates(f, trials, grid_fn, args_fn;
+        sm_arch, opt_level, warmup=tuning.warmup, reps=tuning.reps, verify=checker, reset)
+
+    if isempty(record)
+        # Prefer the benchmark error if any config reached execution; otherwise
+        # fall back to the (usually more informative) precompile error.
+        err_info = first_error !== nothing ? first_error : precompile_error
+        if err_info === nothing
+            throw(ArgumentError("No valid config found in search space"))
+        else
+            cfg, err = err_info
+            throw(ArgumentError(
+                "No valid config found. First failure for cfg=$cfg: $(sprint(showerror, err))"))
+        end
+    end
+
+    # Refinement: re-benchmark top K with more reps to stabilize the winner
+    if tuning.refine_topk > 0 && length(record) > 1
+        sort!(record, by=last)
+        top_configs = Any[first(r) for r in record[1:min(tuning.refine_topk, length(record))]]
+        refined, _ = measure_candidates(f, top_configs, grid_fn, args_fn;
+            sm_arch, opt_level, warmup=tuning.warmup, reps=tuning.refine_reps, reset)
+        if !isempty(refined)
+            record = refined
+        end
+    end
+
+    _, best_idx = findmin(last, record)
+    candidate = (; best_config=record[best_idx][1], tuning_record=record)
+
+    entry, cache_hit = lock(AUTOTUNE_LOCK) do
+        per_kernel = get!(Dict{Any,Any}, AUTOTUNE_CACHE, kernel_key)
+        if !tuning.force && haskey(per_kernel, arg_key)
+            per_kernel[arg_key], true
+        else
+            per_kernel[arg_key] = candidate
+            candidate, false
+        end
+    end
+    return entry, cache_hit, reset
+end
+
+function autotune_launch(@nospecialize(f), space::AbstractSearchSpace,
+                         grid_fn::Function, args_fn::Function;
+                         key=nothing,
+                         key_fn::Union{Nothing, Function}=nothing,
+                         launch_args_fn::Union{Nothing, Function}=nothing,
+                         verify::Union{Nothing, Function}=nothing,
+                         setup::Union{Nothing, Function}=nothing,
+                         tuning::NamedTuple=NamedTuple(),
+                         sm_arch::String=default_sm_arch(),
+                         opt_level::Int=3)
+    tuning = normalize_tuning(tuning)
+    rng = tuning.seed !== nothing ? MersenneTwister(tuning.seed) : Random.default_rng()
+
+    kernel_key = (f, sm_arch, opt_level)
+    arg_key = key !== nothing ? key : (key_fn !== nothing ? key_fn() : nothing)
+
+    entry, cache_hit, reset = find_or_tune(f, space, rng, grid_fn, args_fn, tuning;
+        sm_arch, opt_level, kernel_key, arg_key, verify, setup)
+
+    cfg = entry.best_config
+    grid = grid_fn(cfg)
+    args = launch_args_fn !== nothing ? launch_args_fn(cfg) : args_fn(cfg)
+
+    # Reset state before the final "real" launch
+    reset !== nothing && reset()
+
+    cuTile.launch(f, grid, args...; sm_arch, opt_level, hints_from_cfg(cfg)...)
+
+    return (; tuned_config=cfg, grid, tuning_record=copy(entry.tuning_record), cache_hit)
+end
+
+# Convenience: accept plain Vector (→ FixedSpace) or NamedTuple (→ CartesianSpace)
+function autotune_launch(@nospecialize(f), configs, grid_fn::Function, args_fn::Function; kwargs...)
+    space = configs isa NamedTuple ? CartesianSpace(configs) : FixedSpace(configs)
+    return autotune_launch(f, space, grid_fn, args_fn; kwargs...)
+end
+
+function clear_autotune_cache(; kernel=nothing, key=nothing)
+    lock(AUTOTUNE_LOCK) do
+        if kernel === nothing
+            key === nothing || throw(ArgumentError("`key` requires `kernel`"))
+            empty!(AUTOTUNE_CACHE)
+            return nothing
+        end
+
+        for kernel_key in collect(keys(AUTOTUNE_CACHE))
+            kernel_key isa Tuple || continue
+            kernel_key[1] === kernel || continue
+            per_kernel = AUTOTUNE_CACHE[kernel_key]
+            key === nothing ? empty!(per_kernel) : pop!(per_kernel, key, nothing)
+            isempty(per_kernel) && delete!(AUTOTUNE_CACHE, kernel_key)
+        end
+    end
+    return nothing
+end
diff --git a/src/Experimental.jl b/src/Experimental.jl
new file mode 100644
index 0000000..f6d5354
--- /dev/null
+++ b/src/Experimental.jl
@@ -0,0 +1,40 @@
+module Experimental
+
+autotune_launch(args...; kwargs...) =
+    error("Please import CUDA.jl before using `cuTile.autotune_launch`.")
+clear_autotune_cache(args...; kwargs...) =
+    error("Please import CUDA.jl before using `cuTile.clear_autotune_cache`.")
+
+abstract type AbstractSearchSpace end
+
+Base.length(s::AbstractSearchSpace) = count(_ -> true, s)
+
+struct FixedSpace{names,NT<:NamedTuple{names}} <: AbstractSearchSpace
+    elements::Vector{NT}
+end
+
+Base.iterate(space::FixedSpace, args...) = iterate(space.elements, args...)
+
+struct CartesianSpace{names,NT<:NamedTuple{names,<:Tuple{Vararg{Tuple}}}} <: AbstractSearchSpace
+    constraint::Function
+    axes::NT
+end
+
+CartesianSpace(axes::NamedTuple) = CartesianSpace(Returns(true), axes)
+CartesianSpace(; axes...) = CartesianSpace(NamedTuple(axes))
+CartesianSpace(constraint::Function; axes...) = CartesianSpace(constraint, NamedTuple(axes))
+
+function Base.iterate(space::CartesianSpace{names}, state=nothing) where names
+    to_cfg = vals -> NamedTuple{names}(vals)
+    inner = state === nothing ?
+        Iterators.filter(space.constraint ∘ to_cfg,
+                         Iterators.product(map(Tuple, values(space.axes))...)) :
+        state.inner
+    result = isnothing(state) ? iterate(inner) : iterate(inner, state.cursor)
+    isnothing(result) && return nothing
+    vals, cursor = result
+    cfg = to_cfg(vals)
+    return cfg, (; inner, cursor)
+end
+
+end
diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index 370918d..0abdff1 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -35,9 +35,15 @@ struct cuTileInterpreter <: CC.AbstractInterpreter
     opt_params::CC.OptimizationParams
 end
 
+# Scoped inference cache: reuse callee inference results (e.g. kwarg sorters)
+# across interpreter instances within a compilation scope. Set by autotuning
+# to share callee inference across configs; unset falls back to fresh cache.
+using Base.ScopedValues: ScopedValue, with
+const _SCOPED_INF_CACHE = ScopedValue{Vector{CC.InferenceResult}}()
+
 function cuTileInterpreter(cache::CacheView; always_inline::Bool=true)
     method_table = get_method_table_view(cache.world)
-    inf_cache = Vector{CC.InferenceResult}()
+    inf_cache = isassigned(_SCOPED_INF_CACHE) ? _SCOPED_INF_CACHE[] : Vector{CC.InferenceResult}()
     inf_params = CC.InferenceParams()
     opt_params = if always_inline
         CC.OptimizationParams(; inline_cost_threshold=typemax(Int))
diff --git a/src/cuTile.jl b/src/cuTile.jl
index 2e5fe39..23a050c 100644
--- a/src/cuTile.jl
+++ b/src/cuTile.jl
@@ -41,4 +41,6 @@ include("language/atomics.jl")
 public launch, ByTarget, @compiler_options
 launch(args...) = error("Please import CUDA.jl before using `cuTile.launch`.")
 
+include("Experimental.jl")
+
 end # module cuTile
diff --git a/test/execution/autotune.jl b/test/execution/autotune.jl
new file mode 100644
index 0000000..d6e705e
--- /dev/null
+++ b/test/execution/autotune.jl
@@ -0,0 +1,234 @@
+using CUDA
+
+const Exp = ct.Experimental
+
+@testset "Autotune" begin
+
+    function vadd_kernel(a::ct.TileArray{Float32,1},
+                         b::ct.TileArray{Float32,1},
+                         c::ct.TileArray{Float32,1},
+                         tile::Int)
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (tile[],))
+        tb = ct.load(b, pid, (tile[],))
+        ct.store(c, pid, ta + tb)
+        return nothing
+    end
+
+    function inplace_add_kernel(x::ct.TileArray{Float32,1},
+                                tile::Int)
+        pid = ct.bid(1)
+        tx = ct.load(x, pid, (tile[],))
+        ct.store(x, pid, tx .+ 1f0)
+        return nothing
+    end
+
+    n = 512
+    a = CUDA.fill(1f0, n)
+    b = CUDA.fill(2f0, n)
+    c = CUDA.zeros(Float32, n)
+
+    configs = [
+        (; tile=16, occupancy=nothing, num_ctas=nothing),
+        (; tile=32, occupancy=2, num_ctas=nothing),
+        (; tile=64, occupancy=4, num_ctas=2),
+    ]
+    args_fn = cfg -> (a, b, c, ct.Constant(cfg.tile))
+    grid_fn = cfg -> cld(n, cfg.tile)
+
+    @testset "basic tuning" begin
+        Exp.clear_autotune_cache()
+        result = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:basic, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test !result.cache_hit
+        @test result.tuned_config in configs
+        @test !isempty(result.tuning_record)
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "cache hit" begin
+        fill!(c, 0f0)
+        result = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:basic, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test result.cache_hit
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "force retune" begin
+        fill!(c, 0f0)
+        result = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:basic, n),
+            tuning=(preset=:fast, refine_topk=0, force=true),
+        )
+        @test !result.cache_hit
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "CartesianSpace" begin
+        Exp.clear_autotune_cache()
+        fill!(c, 0f0)
+        space = Exp.CartesianSpace(;
+            tile=(16, 32), occupancy=(nothing, 2), num_ctas=(nothing,))
+        result = Exp.autotune_launch(
+            vadd_kernel, space, grid_fn, args_fn;
+            key=(:cartesian, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test hasproperty(result.tuned_config, :tile)
+        @test hasproperty(result.tuned_config, :occupancy)
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "CartesianSpace with constraint" begin
+        Exp.clear_autotune_cache()
+        fill!(c, 0f0)
+        space = Exp.CartesianSpace(
+            cfg -> cfg.tile == 16;
+            tile=(16, 32, 64), occupancy=(nothing,), num_ctas=(nothing,))
+        result = Exp.autotune_launch(
+            vadd_kernel, space, grid_fn, args_fn;
+            key=(:constrained, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test result.tuned_config.tile == 16
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "NamedTuple convenience → CartesianSpace" begin
+        Exp.clear_autotune_cache()
+        fill!(c, 0f0)
+        result = Exp.autotune_launch(
+            vadd_kernel,
+            (tile=(16, 32), occupancy=(nothing,), num_ctas=(nothing,)),
+            grid_fn, args_fn;
+            key=(:nt_convenience, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test result.tuned_config.tile in (16, 32)
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "launch_args_fn (inplace kernel)" begin
+        x = CUDA.zeros(Float32, n)
+        original_x = Array(x)
+        Exp.clear_autotune_cache()
+        result = Exp.autotune_launch(
+            inplace_add_kernel,
+            [(; tile=16), (; tile=32)],
+            grid_fn,
+            cfg -> (copy(x), ct.Constant(cfg.tile));
+            launch_args_fn=cfg -> (x, ct.Constant(cfg.tile)),
+            key=(:inplace, n),
+            tuning=(preset=:fast, refine_topk=0),
+        )
+        @test !result.cache_hit
+        @test Array(x) == original_x .+ 1f0
+    end
+
+    @testset "refinement" begin
+        Exp.clear_autotune_cache()
+        fill!(c, 0f0)
+        result = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:refine, n),
+            tuning=(warmup=1, reps=2, refine_topk=2, refine_reps=4),
+        )
+        @test !result.cache_hit
+        # Refinement record replaces initial — has at most refine_topk entries
+        @test length(result.tuning_record) <= 2
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "verify" begin
+        Exp.clear_autotune_cache()
+        fill!(c, 0f0)
+        verify_called = Ref(false)
+        result = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:verify, n),
+            tuning=(preset=:fast, refine_topk=0),
+            verify=() -> let
+                ref = Array(a) .+ Array(b)
+                verify_called[] = true
+                () -> (CUDA.@allowscalar all(isapprox.(Array(c), ref, atol=1f-5)))
+            end,
+        )
+        @test verify_called[]
+        @test Array(c) ≈ fill(3f0, n)
+    end
+
+    @testset "clear cache per-kernel per-key" begin
+        Exp.clear_autotune_cache()
+        Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:k1, n), tuning=(preset=:fast, refine_topk=0))
+        Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:k2, n), tuning=(preset=:fast, refine_topk=0))
+
+        # Clear only one key
+        Exp.clear_autotune_cache(kernel=vadd_kernel, key=(:k1, n))
+        fill!(c, 0f0)
+        r1 = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:k1, n), tuning=(preset=:fast, refine_topk=0))
+        @test !r1.cache_hit  # was cleared
+
+        fill!(c, 0f0)
+        r2 = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=(:k2, n), tuning=(preset=:fast, refine_topk=0))
+        @test r2.cache_hit  # still cached
+    end
+
+    @testset "shared key across shapes" begin
+        Exp.clear_autotune_cache()
+        n2 = 1024
+        a2 = CUDA.fill(1f0, n2)
+        b2 = CUDA.fill(2f0, n2)
+        c2 = CUDA.zeros(Float32, n2)
+        shared_key = (:shape_agnostic, eltype(a))
+
+        Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key=shared_key, tuning=(preset=:fast, refine_topk=0))
+
+        fill!(c2, 0f0)
+        result = Exp.autotune_launch(
+            vadd_kernel, configs,
+            cfg -> cld(n2, cfg.tile),
+            cfg -> (a2, b2, c2, ct.Constant(cfg.tile));
+            key=shared_key, tuning=(preset=:fast, refine_topk=0))
+        @test result.cache_hit
+        @test result.grid == cld(n2, result.tuned_config.tile)
+        @test Array(c2) ≈ fill(3f0, n2)
+    end
+
+    @testset "key_fn" begin
+        Exp.clear_autotune_cache()
+        call_count = Ref(0)
+        my_key_fn = () -> begin
+            call_count[] += 1
+            return (:dynamic, Float32)
+        end
+
+        fill!(c, 0f0)
+        r1 = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key_fn=my_key_fn, tuning=(preset=:fast, refine_topk=0))
+        r2 = Exp.autotune_launch(
+            vadd_kernel, configs, grid_fn, args_fn;
+            key_fn=my_key_fn, tuning=(preset=:fast, refine_topk=0))
+        @test !r1.cache_hit
+        @test r2.cache_hit
+        @test call_count[] == 2
+        @test Array(c) ≈ fill(3f0, n)
+    end
+end
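A minimal usage sketch of the new `Experimental.autotune_launch` API, following the call shape exercised by the tests above. The kernel definition, the `ct` alias, and the array setup are illustrative assumptions; only the search-space constructors and the `autotune_launch` keywords come from this changeset.

```julia
using CUDA, cuTile
const ct = cuTile
const Exp = ct.Experimental

# Same kernel shape as `vadd_kernel` in the tests above.
function vadd(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
              c::ct.TileArray{Float32,1}, tile::Int)
    pid = ct.bid(1)
    ta = ct.load(a, pid, (tile[],))
    tb = ct.load(b, pid, (tile[],))
    ct.store(c, pid, ta + tb)
    return nothing
end

n = 4096
a, b, c = CUDA.rand(Float32, n), CUDA.rand(Float32, n), CUDA.zeros(Float32, n)

# Cartesian search space with an optional constraint; each axis is a tuple of candidates.
space = Exp.CartesianSpace(cfg -> cfg.tile <= n;
                           tile=(32, 64, 128), occupancy=(nothing, 2), num_ctas=(nothing,))

result = Exp.autotune_launch(vadd, space,
                             cfg -> cld(n, cfg.tile),                  # grid_fn
                             cfg -> (a, b, c, ct.Constant(cfg.tile));  # args_fn
                             key=(:vadd, n),
                             tuning=(preset=:fast,))

result.tuned_config   # winning config (a NamedTuple drawn from the space)
result.cache_hit      # false on the first call, true on later calls with the same key
```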