From 7782216d8cc6b4656f79b3ed3018328fe4c81381 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 17 Apr 2024 13:40:21 +0200
Subject: [PATCH] Add support for dynamically-constructed opaque closures.

---
 src/compiler/compilation.jl | 229 +++++++++++++++++++++++++-----------
 test/core/execution.jl      |  41 ++++++-
 2 files changed, 200 insertions(+), 70 deletions(-)

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
index 0f7d21cd03..81fca77ede 100644
--- a/src/compiler/compilation.jl
+++ b/src/compiler/compilation.jl
@@ -435,22 +435,7 @@ end
 using Core.Compiler: IRCode
 using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode
 
-struct OpaqueClosure{F, E, A, R}    # func, env, args, ret
-    env::E
-end
-
-# XXX: because we can't call functions from other CUDA modules, we effectively need to
-#      recompile when the target function changes. this, and because of how GPUCompiler's
-#      deferred compilation mechanism currently works, is why we have `F` as a type param.
-
-# XXX: because of GPU code requiring specialized signatures, we also need to recompile
-#      when the environment or argument types change. together with the above, this
-#      negates much of the benefit of opaque closures.
-
-# TODO: support for constructing an opaque closure from source code
-
-# TODO: complete support for passing an environment. this probably requires a split into
-#       host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.
+# helpers
 
 function compute_ir_rettype(ir::IRCode)
     rt = Union{}
@@ -463,32 +448,25 @@ function compute_ir_rettype(ir::IRCode)
     return Core.Compiler.widenconst(rt)
 end
 
-function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool)
+function compute_oc_signature(ir::IRCode, nargs::Int)
     argtypes = Vector{Any}(undef, nargs)
     for i = 1:nargs
         argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1])
     end
-    if isva
-        lastarg = pop!(argtypes)
-        if lastarg <: Tuple
-            append!(argtypes, lastarg.parameters)
-        else
-            push!(argtypes, Vararg{Any})
-        end
-    end
     return Tuple{argtypes...}
 end
 
-function OpaqueClosure(ir::IRCode, @nospecialize env...;
-                       isva::Bool = false,
-                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
+function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing)
     # NOTE: we need ir.argtypes[1] == typeof(env)
     ir = Core.Compiler.copy(ir)
-    # if the user didn't specify a definition MethodInstance or filename Symbol to use for the debuginfo, set a filename now
-    ir.debuginfo.def === nothing && (ir.debuginfo.def = :var"generated IR for OpaqueClosure")
+    # if the user didn't specify a definition MethodInstance or filename Symbol to use
+    # for the debuginfo, set a filename now
+    if ir.debuginfo.def === nothing
+        ir.debuginfo.def = Symbol("IR for opaque gpu closure")
+    end
     nargtypes = length(ir.argtypes)
     nargs = nargtypes-1
-    sig = compute_oc_signature(ir, nargs, isva)
+    sig = compute_oc_signature(ir, nargs)
     rt = compute_ir_rettype(ir)
     src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ())
     if slotnames === nothing
@@ -499,61 +477,39 @@ function OpaqueClosure(ir::IRCode, @nospecialize env...;
     end
     src.slotflags = Base.fill(zero(UInt8), nargtypes)
     src.slottypes = copy(ir.argtypes)
-    src = Core.Compiler.ir_to_codeinf!(src, ir)
-    config = compiler_config(device(); kernel=false)
-    return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...)
-end
-
-function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs, isva=false)
-    config = compiler_config(device(); kernel=false)
-    return generate_opaque_closure(config, src, sig, rettype, nargs, isva, env...)
+    Core.Compiler.ir_to_codeinf!(src, ir)
 end
 
-function generate_opaque_closure(config::CompilerConfig, src::CodeInfo,
-                                 @nospecialize(sig), @nospecialize(rt),
-                                 nargs::Int, isva::Bool, @nospecialize env...;
-                                 mod::Module=@__MODULE__,
-                                 file::Union{Nothing,Symbol}=nothing, line::Int=0)
-    # create a method (like `jl_make_opaque_closure_method`)
+# create a method (like `jl_make_opaque_closure_method`)
+function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age())
     meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main)
     meth.sig = Tuple
-    meth.isva = isva                # XXX: probably not supported?
-    meth.is_for_opaque_closure = 0  # XXX: do we want this?
+    meth.isva = false
+    meth.is_for_opaque_closure = 0
     meth.name = Symbol("opaque gpu closure")
     meth.nargs = nargs + 1
     meth.file = something(file, Symbol())
     meth.line = line
-    ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src)
-
-    # look up a method instance and create a compiler job
-    full_sig = Tuple{typeof(env), sig.parameters...}
-    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
-               (Any, Any, Any), meth, full_sig, Core.svec())
-    job = CompilerJob(mi, config)   # this captures the current world age
-    Base.@atomic meth.primary_world = job.world
+    Base.@atomic meth.primary_world = world
     Base.@atomic meth.deleted_world = typemax(UInt)
+    return meth
+end
 
-    # create a code instance and store it in the cache
-    interp = GPUCompiler.get_interpreter(job)
+function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt)
     owner = Core.Compiler.cache_owner(interp)
     exctype = Any
     inferred_const = C_NULL
     inferred = src
     const_flags = Int32(0)
-    min_world = meth.primary_world
-    max_world = meth.deleted_world
+    min_world = world
+    max_world = typemax(UInt)
     ipo_effects = UInt32(0)
     effects = UInt32(0)
     analysis_results = nothing
     relocatability = UInt8(0)
-    ci = CodeInstance(mi, owner, rt, exctype, inferred_const, inferred,
-                      const_flags, min_world, max_world, ipo_effects, effects,
-                      analysis_results, relocatability, src.debuginfo)
-    Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi)
-
-    id = length(GPUCompiler.deferred_codegen_jobs) + 1
-    GPUCompiler.deferred_codegen_jobs[id] = job
-    return OpaqueClosure{id, typeof(env), sig, rt}(env)
+    CodeInstance(mi, owner, rt, exctype, inferred_const, inferred,
+                    const_flags, min_world, max_world, ipo_effects, effects,
+                    analysis_results, relocatability, src.debuginfo)
 end
 
 # generated function `ccall`, working around the restriction that ccall type
@@ -587,7 +543,60 @@ end
     return ex
 end
 
-# device-side call to an opaque closure
+# static opaque closures
+
+# XXX: because we can't call functions from other CUDA modules, we effectively need to
+#      recompile when the target function changes. this, and because of how GPUCompiler's
+#      deferred compilation mechanism currently works, is why we have `F` as a type param.
+
+# XXX: because of GPU code requiring specialized signatures, we also need to recompile
+#      when the environment or argument types change. together with the above, this
+#      negates much of the benefit of opaque closures.
+
+# TODO: support for constructing an opaque closure from source code
+
+# TODO: complete support for passing an environment. this probably requires a split into
+#       host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.
+
+struct OpaqueClosure{F, E, A, R}    # deferred-job id, env type, args Tuple type, ret type
+    env::E  # the captured environment values
+end
+
+# build a static closure from IRCode: signature and return type are derived from `ir`
+function OpaqueClosure(ir::IRCode, @nospecialize env...;
+                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
+    argcount = length(ir.argtypes) - 1
+    signature = compute_oc_signature(ir, argcount)
+    rettype = compute_ir_rettype(ir)
+    codeinfo = make_oc_codeinfo(ir, env...; slotnames)
+    return create_static_oc(codeinfo, signature, rettype, argcount, env...)
+end
+
+# build a static closure from CodeInfo: the caller supplies `rettype`, `sig` and `nargs`
+OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs) =
+    create_static_oc(src, sig, rettype, nargs, env...)
+
+# shared back end of the `OpaqueClosure` constructors: registers a deferred compilation
+# job for `src` and returns the closure object whose first type parameter is the job id
+function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int,
+                          @nospecialize env...; file=nothing, line=0)
+    cfg = compiler_config(device(); kernel=false)
+    ocmeth = make_oc_method(nargs; file, line)
+    # specialize the fresh method on (environment, arguments...) and wrap it in a job
+    spec_sig = Tuple{typeof(env), sig.parameters...}
+    instance = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+                     (Any, Any, Any), ocmeth, spec_sig, Core.svec())
+    job = CompilerJob(instance, cfg, ocmeth.primary_world)
+
+    # register the job for deferred codegen; its id becomes the closure's `F` parameter
+    jobid = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[jobid] = job
+    closure = OpaqueClosure{jobid, typeof(env), sig, rt}(env)
+    opaque_closure_jobs[job] = (; oc=closure, src, rt)   # consumed by `prepare_job!`
+    return closure
+end
+
+# device-side call
 (oc::OpaqueClosure)(args...) = call(oc, args...)
 ## NOTE: split into two to make `SciML.isinplace(oc)` work.
 ##       it also resembles how kernels are called.
@@ -597,3 +606,87 @@ end
     #ccall(ptr, R, (A...), args...)
     generated_ccall(ptr, R, A, args...)
 end
+
+# dynamic opaque closures
+
+# per-closure-type state, looked up by the `@generated` `call` through the object's type
+const jit_opaque_closures = Dict{DataType,Any}()
+
+struct JITOpaqueClosure{B, T}
+    builder::B  # called with the argument types to produce the IRCode/CodeInfo
+    tfunc::T    # called with the argument types; must return the return *type*
+
+    # `tfunc` defaults to `Returns(Nothing)`: its result is dispatched on as `::Type{R}`
+    # by `call`, so it has to be a type (the value `nothing` would match no method).
+    function JITOpaqueClosure(builder, tfunc=Returns(Nothing); nargs)
+        # the device and world are captured at construction time but only needed when
+        # creating the CompilerJob; as they cannot be encoded in the object itself,
+        # they are stored in a global dictionary keyed by the closure's type.
+        config = compiler_config(device(); kernel=false)
+        meth = make_oc_method(nargs)
+        oc = new{typeof(builder), typeof(tfunc)}(builder, tfunc)
+        jit_opaque_closures[typeof(oc)] = (; env=(), meth, config, oc)
+        return oc
+    end
+end
+
+# device-side call: ask `tfunc` for the return type, then hand over to `call`
+function (oc::JITOpaqueClosure)(args...)
+    rettype = oc.tfunc(map(Core.Typeof, args)...)
+    return call(oc, rettype, args...)
+end
+@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R}
+    rt = R  # generator body: `oct` and `args` are the argument *types* here
+    (; env, meth, config, oc) = jit_opaque_closures[oct]
+    # (state stored under `typeof(oc)` by the JITOpaqueClosure constructor)
+    # look up a method instance for this argument signature and create a compiler job
+    full_sig = Tuple{typeof(env), args...}
+    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+               (Any, Any, Any), meth, full_sig, Core.svec())
+    job = CompilerJob(mi, config, meth.primary_world)
+    opaque_closure_jobs[job] = (; oc, args, rt)  # read back by `prepare_job!`
+
+    # register the job and emit a `deferred_codegen` call carrying its id
+    id = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[id] = job
+    quote
+        ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
+        assume(ptr != C_NULL)
+        # conceptually: ccall(ptr, R, (A...), args...)
+        generated_ccall(ptr, $rt, $(Tuple{args...}), args...)
+    end
+end
+
+# compilation of opaque closures
+
+const opaque_closure_jobs = Dict{CompilerJob,Any}()  # job => (; oc, rt, src or args)
+
+# Method of `GPUCompiler.prepare_job!`: for jobs registered in `opaque_closure_jobs`,
+# obtain the closure's CodeInfo and store a matching CodeInstance in the job's cache.
+function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob))
+    entry = get(opaque_closure_jobs, job, nothing)
+    entry === nothing && return     # not an opaque closure job: nothing to prepare
+    (; oc, rt) = entry
+
+    if oc isa JITOpaqueClosure
+        # dynamic closure: build the source now, from the recorded argument types
+        src = oc.builder(entry.args...)
+        if src isa IRCode
+            # explicit check: `@assert` may be disabled at some optimization levels
+            compute_ir_rettype(src) == rt || throw(ArgumentError(
+                "Inferred return type does not match the provided return type"))
+            src = make_oc_codeinfo(src)
+        end
+    else
+        # static closure: the source was stored when the closure was constructed
+        src = entry.src
+    end
+    src isa CodeInfo ||
+        throw(ArgumentError("opaque closure source must be IRCode or CodeInfo"))
+
+    # create a code instance and store it in the cache
+    interp = GPUCompiler.get_interpreter(job)
+    ci = make_oc_codeinstance(job.source, src; interp, world=job.world, rt)
+    Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source)
+    return
+end
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 3d23b97ca5..d0091adde1 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -1099,7 +1099,7 @@ end
 if VERSION >= v"1.12-"
 @testset "opaque closures" begin
 
-# basic closure, constructed from IRCode
+# static closure, constructed from IRCode
 let
     ir, rettyp = only(Base.code_ircode(+, (Int, Int)))
     oc = CUDA.OpaqueClosure(ir)
@@ -1118,7 +1118,7 @@ let
     @test Array(c)[] == 3
 end
 
-# basic closure, constructed from CodeInfo
+# static closure, constructed from CodeInfo
 let
     ir, rettype = only(Base.code_typed(*, (Int, Int, Int)))
     oc = CUDA.OpaqueClosure(ir; sig=Tuple{Int,Int,Int}, rettype, nargs=3)
@@ -1138,6 +1138,43 @@ let
     @test Array(d)[] == 24
 end
 
+# dynamic closure, constructing IRCode based on argument types
+let
+    # both callbacks receive the argument *types* the closure is called with
+    infer_rt(T1, T2) = Core.Compiler.return_type(+, Tuple{T1,T2})
+    build_ir(T1, T2) = first(only(Base.code_ircode(+, (T1, T2))))
+
+    oc = CUDA.JITOpaqueClosure(build_ir, infer_rt; nargs=2)
+
+    function kernel(f, out, x, y)
+        idx = threadIdx().x
+        @inbounds out[idx] = f(x[idx], y[idx])
+        return
+    end
+
+    # integer arguments
+    let
+        x = CuArray([1])
+        y = CuArray([2])
+        out = CuArray([0])
+
+        @cuda threads=1 kernel(oc, out, x, y)
+
+        @test Array(out)[] == 3
+    end
+
+    # the same closure object, now called with Float32 arguments
+    let
+        x = CuArray([4f0])
+        y = CuArray([5f0])
+        out = CuArray([3f0])
+
+        @cuda threads=1 kernel(oc, out, x, y)
+
+        @test Array(out)[] == 9f0
+    end
+end
+
 end
 end