From 7782216d8cc6b4656f79b3ed3018328fe4c81381 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 17 Apr 2024 13:40:21 +0200 Subject: [PATCH] Add support for dynamically-constructed opaque closures. --- src/compiler/compilation.jl | 229 +++++++++++++++++++++++++----------- test/core/execution.jl | 41 ++++++- 2 files changed, 200 insertions(+), 70 deletions(-) diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl index 0f7d21cd03..81fca77ede 100644 --- a/src/compiler/compilation.jl +++ b/src/compiler/compilation.jl @@ -435,22 +435,7 @@ end using Core.Compiler: IRCode using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode -struct OpaqueClosure{F, E, A, R} # func, env, args, ret - env::E -end - -# XXX: because we can't call functions from other CUDA modules, we effectively need to -# recompile when the target function changes. this, and because of how GPUCompiler's -# deferred compilation mechanism currently works, is why we have `F` as a type param. - -# XXX: because of GPU code requiring specialized signatures, we also need to recompile -# when the environment or argument types change. together with the above, this -# negates much of the benefit of opaque closures. - -# TODO: support for constructing an opaque closure from source code - -# TODO: complete support for passing an environment. this probably requires a split into -# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. 
+# helpers function compute_ir_rettype(ir::IRCode) rt = Union{} @@ -463,32 +448,25 @@ function compute_ir_rettype(ir::IRCode) return Core.Compiler.widenconst(rt) end -function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool) +function compute_oc_signature(ir::IRCode, nargs::Int) argtypes = Vector{Any}(undef, nargs) for i = 1:nargs argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1]) end - if isva - lastarg = pop!(argtypes) - if lastarg <: Tuple - append!(argtypes, lastarg.parameters) - else - push!(argtypes, Vararg{Any}) - end - end return Tuple{argtypes...} end -function OpaqueClosure(ir::IRCode, @nospecialize env...; - isva::Bool = false, - slotnames::Union{Nothing,Vector{Symbol}}=nothing) +function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing) # NOTE: we need ir.argtypes[1] == typeof(env) ir = Core.Compiler.copy(ir) - # if the user didn't specify a definition MethodInstance or filename Symbol to use for the debuginfo, set a filename now - ir.debuginfo.def === nothing && (ir.debuginfo.def = :var"generated IR for OpaqueClosure") + # if the user didn't specify a definition MethodInstance or filename Symbol to use + # for the debuginfo, set a filename now + if ir.debuginfo.def === nothing + ir.debuginfo.def = Symbol("IR for opaque gpu closure") + end nargtypes = length(ir.argtypes) nargs = nargtypes-1 - sig = compute_oc_signature(ir, nargs, isva) + sig = compute_oc_signature(ir, nargs) rt = compute_ir_rettype(ir) src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ()) if slotnames === nothing @@ -499,61 +477,39 @@ function OpaqueClosure(ir::IRCode, @nospecialize env...; end src.slotflags = Base.fill(zero(UInt8), nargtypes) src.slottypes = copy(ir.argtypes) - src = Core.Compiler.ir_to_codeinf!(src, ir) - config = compiler_config(device(); kernel=false) - return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...) 
-end - -function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs, isva=false) - config = compiler_config(device(); kernel=false) - return generate_opaque_closure(config, src, sig, rettype, nargs, isva, env...) + Core.Compiler.ir_to_codeinf!(src, ir) end -function generate_opaque_closure(config::CompilerConfig, src::CodeInfo, - @nospecialize(sig), @nospecialize(rt), - nargs::Int, isva::Bool, @nospecialize env...; - mod::Module=@__MODULE__, - file::Union{Nothing,Symbol}=nothing, line::Int=0) - # create a method (like `jl_make_opaque_closure_method`) +# create a method (like `jl_make_opaque_closure_method`) +function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age()) meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main) meth.sig = Tuple - meth.isva = isva # XXX: probably not supported? - meth.is_for_opaque_closure = 0 # XXX: do we want this? + meth.isva = false + meth.is_for_opaque_closure = 0 meth.name = Symbol("opaque gpu closure") meth.nargs = nargs + 1 meth.file = something(file, Symbol()) meth.line = line - ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src) - - # look up a method instance and create a compiler job - full_sig = Tuple{typeof(env), sig.parameters...} - mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, - (Any, Any, Any), meth, full_sig, Core.svec()) - job = CompilerJob(mi, config) # this captures the current world age - Base.@atomic meth.primary_world = job.world + Base.@atomic meth.primary_world = world Base.@atomic meth.deleted_world = typemax(UInt) + return meth +end - # create a code instance and store it in the cache - interp = GPUCompiler.get_interpreter(job) +function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt) owner = Core.Compiler.cache_owner(interp) exctype = Any inferred_const = C_NULL inferred = src const_flags = Int32(0) - min_world = meth.primary_world - max_world = meth.deleted_world + min_world = world + max_world = typemax(UInt) 
ipo_effects = UInt32(0) effects = UInt32(0) analysis_results = nothing relocatability = UInt8(0) - ci = CodeInstance(mi, owner, rt, exctype, inferred_const, inferred, - const_flags, min_world, max_world, ipo_effects, effects, - analysis_results, relocatability, src.debuginfo) - Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi) - - id = length(GPUCompiler.deferred_codegen_jobs) + 1 - GPUCompiler.deferred_codegen_jobs[id] = job - return OpaqueClosure{id, typeof(env), sig, rt}(env) + CodeInstance(mi, owner, rt, exctype, inferred_const, inferred, + const_flags, min_world, max_world, ipo_effects, effects, + analysis_results, relocatability, src.debuginfo) end # generated function `ccall`, working around the restriction that ccall type @@ -587,7 +543,60 @@ end return ex end -# device-side call to an opaque closure +# static opaque closures + +# XXX: because we can't call functions from other CUDA modules, we effectively need to +# recompile when the target function changes. this, and because of how GPUCompiler's +# deferred compilation mechanism currently works, is why we have `F` as a type param. + +# XXX: because of GPU code requiring specialized signatures, we also need to recompile +# when the environment or argument types change. together with the above, this +# negates much of the benefit of opaque closures. + +# TODO: support for constructing an opaque closure from source code + +# TODO: complete support for passing an environment. this probably requires a split into +# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray. + +struct OpaqueClosure{F, E, A, R} # func, env, args, ret + env::E +end + +function OpaqueClosure(ir::IRCode, @nospecialize env...; + slotnames::Union{Nothing,Vector{Symbol}}=nothing) + nargtypes = length(ir.argtypes) + nargs = nargtypes-1 + sig = compute_oc_signature(ir, nargs) + rt = compute_ir_rettype(ir) + src = make_oc_codeinfo(ir, env...; slotnames) + return create_static_oc(src, sig, rt, nargs, env...) 
+end + +function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs) + return create_static_oc(src, sig, rettype, nargs, env...) +end + +function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int, + @nospecialize env...; file=nothing, line=0) + config = compiler_config(device(); kernel=false) + meth = make_oc_method(nargs; file, line) + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), sig.parameters...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config, meth.primary_world) + + # create a callable object + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + oc = OpaqueClosure{id, typeof(env), sig, rt}(env) + + opaque_closure_jobs[job] = (; oc, src, rt) + return oc +end + +# device-side call (oc::OpaqueClosure)(args...) = call(oc, args...) ## NOTE: split into two to make `SciML.isinplace(oc)` work. ## it also resembles how kernels are called. @@ -597,3 +606,87 @@ end #ccall(ptr, R, (A...), args...) generated_ccall(ptr, R, A, args...) end + +# dynamic opaque closures + +const jit_opaque_closures = Dict() + +struct JITOpaqueClosure{B, T} + builder::B + tfunc::T + + function JITOpaqueClosure(builder, tfunc=Returns(nothing); nargs) + # the device and world are captured at closure construction time, but we only need + # them when creating the CompilerJob. as we cannot simply encode them in the + # JITOpaqueClosure object, we store them in a global dictionary instead. + config = compiler_config(device(); kernel=false) + meth = make_oc_method(nargs) + + # create a callable object + oc = new{typeof(builder), typeof(tfunc)}(builder, tfunc) + jit_opaque_closures[typeof(oc)] = (; env=(), meth, config, oc) + + return oc + end +end + +# device-side call +function (oc::JITOpaqueClosure)(args...) + rt = oc.tfunc(map(Core.Typeof, args)...) + call(oc, rt, args...) 
+end +@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R} + rt = R + (; env, meth, config, oc) = jit_opaque_closures[oct] + + # look up a method instance and create a compiler job + full_sig = Tuple{typeof(env), args...} + mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance}, + (Any, Any, Any), meth, full_sig, Core.svec()) + job = CompilerJob(mi, config, meth.primary_world) + opaque_closure_jobs[job] = (; oc, args, rt) + + # generate a deferred compilation call + id = length(GPUCompiler.deferred_codegen_jobs) + 1 + GPUCompiler.deferred_codegen_jobs[id] = job + quote + ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id) + assume(ptr != C_NULL) + #ccall(ptr, R, (A...), args...) + generated_ccall(ptr, $rt, $(Tuple{args...}), args...) + end +end + +# compilation of opaque closures + +const opaque_closure_jobs = Dict{CompilerJob,Any}() + +function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob)) + if haskey(opaque_closure_jobs, job) + rt = opaque_closure_jobs[job].rt + oc = opaque_closure_jobs[job].oc + if oc isa JITOpaqueClosure + args = opaque_closure_jobs[job].args + nargs = length(args) + + src = oc.builder(args...) 
+ if src isa IRCode + nargtypes = length(src.argtypes) + nargs = nargtypes-1 + sig = compute_oc_signature(src, nargs) + @assert compute_ir_rettype(src) == rt "Inferred return type does not match the provided return type" + src = make_oc_codeinfo(src) + end + else + src = opaque_closure_jobs[job].src + end + @assert src isa CodeInfo + + # create a code instance and store it in the cache + interp = GPUCompiler.get_interpreter(job) + ci = make_oc_codeinstance(job.source, src; interp, job.world, rt) + Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source) + end + + return +end diff --git a/test/core/execution.jl b/test/core/execution.jl index 3d23b97ca5..d0091adde1 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -1099,7 +1099,7 @@ end if VERSION >= v"1.12-" @testset "opaque closures" begin -# basic closure, constructed from IRCode +# static closure, constructed from IRCode let ir, rettyp = only(Base.code_ircode(+, (Int, Int))) oc = CUDA.OpaqueClosure(ir) @@ -1118,7 +1118,7 @@ let @test Array(c)[] == 3 end -# basic closure, constructed from CodeInfo +# static closure, constructed from CodeInfo let ir, rettype = only(Base.code_typed(*, (Int, Int, Int))) oc = CUDA.OpaqueClosure(ir; sig=Tuple{Int,Int,Int}, rettype, nargs=3) @@ -1138,6 +1138,43 @@ let @test Array(d)[] == 24 end +# dynamic closure, constructing IRCode based on argument types +let + tfunc(arg1, arg2) = Core.Compiler.return_type(+, Tuple{arg1,arg2}) + function builder(arg1, arg2) + ir, rettyp = only(Base.code_ircode(+, (arg1, arg2))) + return ir + end + + oc = CUDA.JITOpaqueClosure(builder, tfunc; nargs=2) + + function kernel(oc, c, a, b) + i = threadIdx().x + @inbounds c[i] = oc(a[i], b[i]) + return + end + + let + c = CuArray([0]) + a = CuArray([1]) + b = CuArray([2]) + + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 3 + end + + let + c = CuArray([3f0]) + a = CuArray([4f0]) + b = CuArray([5f0]) + + @cuda threads=1 kernel(oc, c, a, b) + + @test Array(c)[] == 9f0 + 
end +end + end end