Add an index typevar to CuDeviceArray.
maleadt committed May 3, 2023
1 parent 594a8b6 commit db2723f
Showing 6 changed files with 48 additions and 39 deletions.
55 changes: 31 additions & 24 deletions src/device/array.jl
@@ -6,31 +6,38 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
## construction

"""
CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
then it must match the length of `dims`. The same applies to the element type `T`, which
should match the type of the pointer `ptr`.
pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
stored in the array, and is determined automatically if not specified. `I` is the integer
type used to store the size of the array, and is determined automatically if not specified.
"""
CuDeviceArray

# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
ptr::LLVMPtr{T,A}
maxsize::Int

dims::Dims{N}
len::Int
maxsize::I

dims::NTuple{N,I}
len::I

# determine an index type based on the size of the array.
# this is type unstable, so only use this constructor from the host side.
function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
if maxsize <= typemax(Int32)
CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
else
CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
end
end

# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
new(ptr, maxsize, dims, prod(dims))
# fully typed, for use in device code
CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
end

const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
@@ -224,18 +231,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
end
end

function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
err = _reinterpret_exception(T, a)
err === nothing || throw(err)

if sizeof(T) == sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end

isize = size(a)
size1 = div(isize[1]*sizeof(S), sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end


@@ -252,7 +259,7 @@ function Base.reshape(a::CuDeviceArray{T,M}, dims::NTuple{N,Int}) where {T,N,M}
end

# create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
@inline function _derived_array(::Type{T}, N::Int, a::CuDeviceArray{T,M,A},
osize::Dims) where {T, M, A}
return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
@inline function _derived_array(::Type{T}, N::Int, a::CuDeviceArray{T,M,A,I},
osize::Dims) where {T, M, A, I}
return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
end
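The host-side outer constructor added above is intentionally type-unstable: it inspects maxsize at run time and instantiates the array with either an Int32 or an Int64 index type `I`. A minimal sketch of the resulting types, assuming a device pointer `ptr::LLVMPtr{Float32,CUDA.AS.Global}` obtained elsewhere (the pointer and sizes are purely illustrative):

    using CUDA
    # ptr is a hypothetical LLVMPtr{Float32,CUDA.AS.Global} to existing device memory
    small = CuDeviceArray{Float32,1,CUDA.AS.Global}(ptr, (1024,))  # 4 KiB payload, fits in Int32
    # typeof(small) === CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}
    big   = CuDeviceArray{Float32,1,CUDA.AS.Global}(ptr, (2^30,))  # 4 GiB payload, exceeds typemax(Int32) bytes
    # typeof(big)   === CuDeviceArray{Float32,1,CUDA.AS.Global,Int64}

Device code, in contrast, should only see the fully-parameterized inner constructor (or arrays constructed on the host and converted at launch), so that the index type is fixed at compile time.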
6 changes: 4 additions & 2 deletions src/device/intrinsics/memory_shared.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_shmem(T, Val(len))
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))

@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
end
end
ptr = emit_shmem(T) + offset
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
CuDynamicSharedArray(T, (len,), offset)
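Both shared-memory constructors now hardcode Int32 as the index type: per-block shared memory tops out at a few hundred KiB, far below what a 32-bit size can describe, hence the tongue-in-cheek comments above. A hypothetical kernel sketch using the static variant (kernel name and sizes are illustrative, not from this commit):

    function reverse_block!(a)
        buf = CuStaticSharedArray(Float32, 256)  # a CuDeviceArray{Float32,1,AS.Shared,Int32}
        i = threadIdx().x
        @inbounds buf[i] = a[i]
        sync_threads()
        @inbounds a[i] = buf[257 - i]
        return
    end

    # launched with exactly 256 threads, e.g. @cuda threads=256 reverse_block!(d_a)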
6 changes: 3 additions & 3 deletions src/device/random.jl
@@ -17,7 +17,7 @@ import RandomNumbers
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

# shared memory with per-warp counters, incremented when generating numbers
@@ -31,7 +31,7 @@ end
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

@device_override Random.make_seed() = clock(UInt32)
@@ -190,7 +190,7 @@ end
for var in [:ki, :wi, :fi, :ke, :we, :fe]
val = getfield(Random, var)
gpu_var = Symbol("gpu_$var")
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
@eval @inline @generated function $gpu_var()
ptr = emit_constant_array($(QuoteNode(var)), $val)
Expr(:call, $arr_typ, ptr, $(size(val)))
2 changes: 1 addition & 1 deletion test/codegen.jl
@@ -153,7 +153,7 @@ end
return
end

asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@test !occursin(".local", asm)
end

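Since the index type is now part of the array type, reflection calls such as the one in this test must spell it out: CuDeviceArray{Float32,1,AS.Global} on its own is no longer a concrete type. A hedged example of inspecting PTX for a trivial kernel by hand (the kernel is illustrative):

    function scale!(a)
        i = threadIdx().x
        @inbounds a[i] *= 2f0
        return
    end

    CUDA.code_ptx(stdout, scale!, Tuple{CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}})

When launching through @cuda, the argument conversion from CuArray presumably goes through the host-side constructor above, so kernels operating on small arrays get 32-bit index arithmetic automatically.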
2 changes: 1 addition & 1 deletion test/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
@inbounds b[], c[] = @fastmath sincos(a[])
return
end
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@assert contains(asm, "sin.approx.f32")
@assert contains(asm, "cos.approx.f32")
@assert !contains(asm, "__nv") # from libdevice
16 changes: 8 additions & 8 deletions test/device/intrinsics/wmma.jl
@@ -7,7 +7,7 @@ map_ptx_to_jl_frag = Dict(
"s32" => Int32(42),
"f16" => ntuple(i -> VecElement{Float16}(42), 2),
"f32" => Float32(42)
)
# Return specific matrix shape given operation configuration
function get_array_shape(mat, mnk, layout)
if !(mat in ["a","b","c","d"])
@@ -46,13 +46,13 @@ end
# Type-dependent variables
array_ty = CUDA.WMMA.map_ptx_to_jl_array[elem_type]
expected = map_ptx_to_jl_frag[elem_type]

# Address-space dependent variables
do_shared_test = (addr_space == "_shared")

# Get the function name
func = Symbol("llvm_wmma_load_$(mat)_$(layout)_$(shape)$(addr_space)_stride_$(elem_type)")

input_shape = get_array_shape(mat, mnk, layout)
input = array_ty(42) * ones(array_ty, input_shape)
input_dev = CuArray(input)
@@ -96,7 +96,7 @@ end
elem_type in ops[3],
addr_space in ["", "_global", "_shared"],
stride in ["stride"]

# Skip all but d matrices
if mat != "d"
continue
@@ -171,7 +171,7 @@ end
# Int/subint mma functions are distinguished by the a/b element type
mma_sym = d_ty == Int32 ? Symbol("llvm_wmma_mma_$(a_layout)_$(b_layout)_$(shape)_$(ab_elem_type)") :
Symbol("llvm_wmma_mma_$(a_layout)_$(b_layout)_$(shape)_$(d_elem_type)_$(c_elem_type)")
mma_func = getfield(Main, mma_sym)
std_func = getfield(Main, Symbol("llvm_wmma_store_d_col_$(shape)_global_stride_$(d_elem_type)"))

a_shape = get_array_shape("a", mnk, a_layout)
@@ -207,7 +207,7 @@ end
# Alter test depending on a/b element Type
if ab_ty == Float16
@test new_a * new_b + c ≈ Array(d_dev) rtol=Base.rtoldefault(Float16)
else # Cast a and b to prevent UInt8 rollover of resultant data
@test Int32.(new_a) * Int32.(new_b) + c == Array(d_dev)
end
end
@@ -322,7 +322,7 @@ end
return
end

ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))

@test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
@test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
@@ -344,4 +344,4 @@ end
@test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
@test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
end
end
