Inference failure when multiple structs are broadcasted via tuples #2623
What is the failing GPU version of that simple reproducer? Switching the inputs to GPU arrays works here: julia> gb = cu(b)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0
julia> ga = cu(a)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
julia> gbc = instantiate(broadcasted(foo, gb, p1, p2));
julia> materialize!(ga, gbc)
5×5 CuArray{Float32, 2, CUDA.DeviceMemory}:
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0 -1.0 In any case, the inference failure can manifest in the CPU case as well, it just executes with dynamic calls. |
I'll copy it from the gist here: #=
using Revise; include("cuda_broadcast_inference_reproducer.jl")
julia --project=.buildkite
julia --project=.buildkite cuda_broadcast_inference_reproducer.jl
julia +1.11 --project=.buildkite cuda_broadcast_inference_reproducer.jl
@static if !(VERSION ≥ v"1.11.0-beta")
using JET;
import CUDA # comment to run without CUDA
using Test
import Adapt
import Base
import Base.Broadcast: BroadcastStyle,
Broadcasted, instantiate, broadcasted, materialize, materialize!
struct VF{S <: AbstractFloat, Nv, A}
# Field-less broadcast style tag, parameterized by `Nv` and an array type `A`.
struct VFStyle{Nv, A} <: Base.BroadcastStyle
end
# Constructor from an existing 2-D array with element type T.
# Asserts that the first dimension equals Nv and that the second dimension equals
# typesize(T, S), then builds a VF{S, Nv, typeof(array)} from the array.
function VF{S, Nv}(array::AbstractArray{T, 2}) where {S, Nv, T}
@assert size(array, 1) == Nv
@assert size(array, 2) == typesize(T, S)
VF{S, Nv, typeof(array)}(array)
function VF{S}(
) where {S, ArrayType}
Nf = typesize(eltype(ArrayType), S)
array = similar(ArrayType, Nv, Nf)
fill!(array, 0)
VF{S, Nv}(array)
# Number of `T`-sized elements that fit in one value of type `S`:
# the integer quotient of `sizeof(S)` by `sizeof(T)`.
function typesize(::Type{T}, ::Type{S}) where {T, S}
    return sizeof(S) ÷ sizeof(T)
end
# Map any `Array{T, N}` type to the dimension-agnostic `Array{T}`.
function parent_array_type(::Type{<:Array{T}}) where {T}
    return Array{T}
end

# The element type of a `VF` is its first type parameter `S`.
function Base.eltype(::Type{<:VF{S}}) where {S}
    return S
end

# Return the value stored in the `array` field.
function Base.parent(data::VF)
    return getfield(data, :array)
end

# Forward to the two-argument `similar`, passing the container's own element type `S`.
function Base.similar(data::VF{S}) where {S}
    return similar(data, S)
end
# Size along dimension `i`, looked up in the tuple returned by `size(data)`.
@inline function Base.size(data::VF, i::Integer)
    dims = size(data)
    return dims[i]
end

# A `VF` always reports five dimensions; only the fourth one is `Nv`, the rest are 1.
@inline function Base.size(data::VF{S, Nv}) where {S, Nv}
    return (1, 1, 1, Nv, 1)
end

# The length of a `VF` is its `Nv` type parameter.
function Base.length(data::VF{S, Nv}) where {S, Nv}
    return Nv
end

# The last linear index coincides with the length.
function Base.lastindex(data::VF)
    return length(data)
end
# Copy a `VF` by copying its parent array and rewrapping it with the same `S` and `NV`.
function Base.copy(data::VF{S, NV}) where {S, NV}
    duplicated = copy(parent(data))
    return VF{S, NV}(duplicated)
end

# A `VF` type broadcasts with a `VFStyle` carrying its `Nv` and `parent_array_type(A)`.
function Base.Broadcast.BroadcastStyle(::Type{VF{S, Nv, A}}) where {S, Nv, A}
    return VFStyle{Nv, parent_array_type(A)}()
end

# Combining a tuple style (first argument) with a `VFStyle` yields the `VFStyle`.
function Base.Broadcast.BroadcastStyle(::Base.Broadcast.Style{<:Tuple}, ds::VFStyle)
    return ds
end

# A `VF` is handed to the broadcast machinery unchanged.
function Base.Broadcast.broadcastable(data::VF)
    return data
end

# Adapt the parent array to `to` and rewrap it with the same `S` and `NV`.
function Adapt.adapt_structure(to, data::VF{S, NV}) where {S, NV}
    adapted = Adapt.adapt(to, parent(data))
    return VF{S, NV}(adapted)
end
# Extract the array type parameter `A` from a fully-parameterized `VF` type.
@inline function parent_array_type(::Type{VF{S, Nv, A}}) where {S, Nv, A}
    return A
end

# Dimensionality of an instance is delegated to the method on its type.
function Base.ndims(data::VF)
    return Base.ndims(typeof(data))
end

# Dimensionality of a `VF` type is that of the type returned by `parent_array_type`.
function Base.ndims(::Type{T}) where {T <: VF}
    ParentType = parent_array_type(T)
    return Base.ndims(ParentType)
end
function Base.similar(
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
) where {Nv, A, S}
PA = parent_array_type(A)
array = similar(PA, (Nv, typesize(eltype(A), S)))
return VF{S, Nv}(array)
@inline function Base.getindex(
data::VF{S, Nv},
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
return parent(data)[I.I[4], 1]
@inline function Base.setindex!(
data::VF{S, Nv},
) where {S, Nv}
@boundscheck 1 <= I.I[4] <= Nv || throw(BoundsError(data, I))
parent(data)[I.I[4], 1] = val
function Base.copyto!(
bc::Union{VF, Base.Broadcast.Broadcasted},
) where {S}
Base.copyto!(dest, bc, parent(dest))
# Element-by-element copy into a VF from either a Broadcasted object with VFStyle{Nv, A}
# or another VF{S, Nv, A}. Walks v over 1:Nv, reads the source at that index, converts
# the value to S and stores it in dest. Returns dest.
function Base.copyto!(
dest::VF{S, Nv},
bc::Union{Base.Broadcast.Broadcasted{VFStyle{Nv, A}}, VF{S, Nv, A}},
) where {S, Nv, A}
@inbounds for v in 1:Nv
# Only the fourth index varies; the other four positions are fixed at 1.
idx = CartesianIndex(1, 1, 1, v, 1)
# Convert each source element to S (the VF element type) before storing it.
dest[idx] = convert(S, bc[idx])
return dest
# Extension
@static if @isdefined(CUDA)
parent_array_type(::Type{<:CUDA.CuArray{T, N, B} where {N}}) where {T, B} = CUDA.CuArray{T, N, B} where {N}
::Type{CUDA.CuArray{T, N′, B} where {N′}},
) where {T, N, B} = similar(CUDA.CuArray{T, N, B}, dims)
# Kernel function launched through CUDA.@cuda by the CuArray copyto! method.
# Each invocation derives one index v from its thread and block indices and, when v lies
# in 1..Nv, copies src[I] into dest[I]. Ends with `return nothing`.
function knl_copyto!(dest::VF{S, Nv}, src) where {S, Nv}
(tv,) = CUDA.threadIdx()
(bv,) = CUDA.blockIdx()
# 1-based linear index across blocks: thread index plus (block index - 1) * block size.
v = tv + (bv - 1) * CUDA.blockDim().x
I = CartesianIndex((1, 1, 1, v, 1))
# Guard against indices past Nv, which is what makes the @inbounds store below safe.
if 1 ≤ I.I[4] ≤ Nv
@inbounds dest[I] = src[I]
return nothing
# Three-argument copyto! selected when the third argument is a CUDA.CuArray.
# Builds the knl_copyto! kernel for (dest, bc), derives a thread/block configuration
# from Nv and the launch configuration, runs the kernel and returns dest.
# The `to` argument is not referenced in the body; it only takes part in dispatch.
function Base.copyto!(dest::VF{S, Nv}, bc, to::CUDA.CuArray) where {S, Nv}
# launch = false: obtain the kernel object first so its launch configuration can be queried.
kernel = CUDA.@cuda always_inline = true launch = false knl_copyto!(dest, bc)
config = CUDA.launch_configuration(kernel.fun)
# Never request more threads than there are elements (Nv).
n_max_threads = min(config.threads, Nv)
# NOTE(review): n_max_threads ≤ Nv, so this quotient appears to be 0 or 1 only; when it
# is 1 the launch below seems to use one thread per block (Nv blocks) — confirm intended.
Nvt = fld(n_max_threads, Nv)
Nv_thread = Nvt == 0 ? n_max_threads : min(Int(Nvt), Nv)
# Enough blocks to cover all Nv indices; surplus threads are filtered out in the kernel.
Nv_blocks = cld(Nv, Nv_thread)
@assert Nv_thread ≤ n_max_threads "threads,n_max_threads=($(Nv_thread),$n_max_threads)"
p = (; threads = (Nv_thread,), blocks = (Nv_blocks,))
kernel(dest, bc; threads = p.threads, blocks = p.blocks)
return dest
struct MyParams1{A}
struct MyParams2{B}
# Wrap a `MyParams1` in a 1-tuple so that it enters broadcasting as a single element.
function Base.Broadcast.broadcastable(x::MyParams1)
    return (x,)
end

# Same wrapping for `MyParams2`.
function Base.Broadcast.broadcastable(x::MyParams2)
    return (x,)
end
# Scalar function used in the broadcasts below: add `p1.a` to `f`, then subtract `p2.b`.
function foo(f, p1, p2)
    shifted = f + p1.a
    return shifted - p2.b
end

# Same computation as `foo`, with the field value `f` passed last instead of first.
function bar(p1, p2, f)
    shifted = f + p1.a
    return shifted - p2.b
end
FT = Float64;
p1 = MyParams1{FT}(1);
p2 = MyParams2{FT}(2);
@testset "Broken test" begin
b = zeros(FT, 5,5); # Ordinary CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
b = VF{FT}(Array{FT}; Nv=4); # VF with CPU array works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
# @code_warntype materialize!(a, bc) # looks fine
@static if !(VERSION ≥ v"1.11.0-beta")
@test_opt materialize!(a, bc) # also passes inference
@static if @isdefined(CUDA)
b = CUDA.zeros(FT, 5,5); # CUDA.CuArray works
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc)
b = VF{FT}(CUDA.CuArray{FT}; Nv=4); # VF with CUDA.CuArray fails
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
@test_throws CUDA.InvalidIRError materialize!(a, bc) # fails to compile
# CUDA.@device_code_warntype materialize!(a, bc)
# re-run the last, breaking, part:
b = VF{FT}(CUDA.CuArray{FT}; Nv=4); # VF with CUDA.CuArray fails
a = similar(b);
bc = instantiate(broadcasted(foo, b, p1, p2));
materialize!(a, bc) # fails to compile
nothing Note the |
I suppose that's possible, but I don't think it is because it passes |
I'm not sure if this is the best place for this issue, so please let me know and I can move it if it belongs somewhere else.
I'm running into inference failure when multiple structs are broadcasted via tuples. The CPU, ordinary array version of this looks like the following:
Here is a reproducer that has all 4 cases I'm looking at.
AFAICT, the actual error/issue seems to be an inference failure due to the tuple recursion depth limit in the recursive broadcast, but it's kind of surprising because the tuple that is being indexed is `((MyParams1,), (MyParams2,))`.
In summary, this is what is working / not working:
`CUDA.@device_code_warntype` does seem to detect the issue:
version info:
