Add an index typevar to CuDeviceArray.
maleadt committed May 3, 2023
1 parent 594a8b6 commit db2723f
Showing 6 changed files with 48 additions and 39 deletions.
55 changes: 31 additions & 24 deletions src/device/array.jl
@@ -6,31 +6,38 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
## construction

"""
CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
then it must match the length of `dims`. The same applies to the element type `T`, which
should match the type of the pointer `ptr`.
pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
stored in the array, and is determined automatically if not specified. `I` is the integer
type used to store the size of the array, and is determined automatically if not specified.
"""
CuDeviceArray

# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
ptr::LLVMPtr{T,A}
maxsize::Int

dims::Dims{N}
len::Int
maxsize::I

dims::NTuple{N,I}
len::I

# determine an index type based on the size of the array.
# this is type unstable, so only use this constructor from the host side.
function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
if maxsize <= typemax(Int32)
CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
else
CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
end
end

# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
new(ptr, maxsize, dims, prod(dims))
# fully typed, for use in device code
CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
end

const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
@@ -224,18 +231,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
end
end

function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
err = _reinterpret_exception(T, a)
err === nothing || throw(err)

if sizeof(T) == sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end

isize = size(a)
size1 = div(isize[1]*sizeof(S), sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end


@@ -252,7 +259,7 @@ function Base.reshape(a::CuDeviceArray{T,M}, dims::NTuple{N,Int}) where {T,N,M}
end

# create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
@inline function _derived_array(::Type{T}, N::Int, a::CuDeviceArray{T,M,A},
osize::Dims) where {T, M, A}
return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
@inline function _derived_array(::Type{T}, N::Int, a::CuDeviceArray{T,M,A,I},
osize::Dims) where {T, M, A, I}
return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
end
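The host-side outer constructor added above is intentionally type-unstable: it inspects maxsize at run time and instantiates the array with either an Int32 or an Int64 index type `I`. A minimal sketch of the resulting types, assuming a device pointer `ptr::LLVMPtr{Float32,CUDA.AS.Global}` obtained elsewhere (the pointer and sizes are purely illustrative):

    using CUDA
    # ptr is a hypothetical LLVMPtr{Float32,CUDA.AS.Global} to existing device memory
    small = CuDeviceArray{Float32,1,CUDA.AS.Global}(ptr, (1024,))  # 4 KiB payload, fits in Int32
    # typeof(small) === CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}
    big   = CuDeviceArray{Float32,1,CUDA.AS.Global}(ptr, (2^30,))  # 4 GiB payload, exceeds typemax(Int32) bytes
    # typeof(big)   === CuDeviceArray{Float32,1,CUDA.AS.Global,Int64}

Device code, in contrast, should only see the fully-parameterized inner constructor (or arrays constructed on the host and converted at launch), so that the index type is fixed at compile time.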
6 changes: 4 additions & 2 deletions src/device/intrinsics/memory_shared.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_shmem(T, Val(len))
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))

@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
end
end
ptr = emit_shmem(T) + offset
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
CuDynamicSharedArray(T, (len,), offset)
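Both shared-memory constructors now hardcode Int32 as the index type: per-block shared memory tops out at a few hundred KiB, far below what a 32-bit size can describe, hence the tongue-in-cheek comments above. A hypothetical kernel sketch using the static variant (kernel name and sizes are illustrative, not from this commit):

    function reverse_block!(a)
        buf = CuStaticSharedArray(Float32, 256)  # a CuDeviceArray{Float32,1,AS.Shared,Int32}
        i = threadIdx().x
        @inbounds buf[i] = a[i]
        sync_threads()
        @inbounds a[i] = buf[257 - i]
        return
    end

    # launched with exactly 256 threads, e.g. @cuda threads=256 reverse_block!(d_a)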
6 changes: 3 additions & 3 deletions src/device/random.jl
@@ -17,7 +17,7 @@ import RandomNumbers
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

# shared memory with per-warp counters, incremented when generating numbers
@@ -31,7 +31,7 @@ end
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

@device_override Random.make_seed() = clock(UInt32)
@@ -190,7 +190,7 @@ end
for var in [:ki, :wi, :fi, :ke, :we, :fe]
val = getfield(Random, var)
gpu_var = Symbol("gpu_$var")
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
@eval @inline @generated function $gpu_var()
ptr = emit_constant_array($(QuoteNode(var)), $val)
Expr(:call, $arr_typ, ptr, $(size(val)))
2 changes: 1 addition & 1 deletion test/codegen.jl
@@ -153,7 +153,7 @@ end
return
end

asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@test !occursin(".local", asm)
end

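Since the index type is now part of the array type, reflection calls such as the one in this test must spell it out: CuDeviceArray{Float32,1,AS.Global} on its own is no longer a concrete type. A hedged example of inspecting PTX for a trivial kernel by hand (the kernel is illustrative):

    function scale!(a)
        i = threadIdx().x
        @inbounds a[i] *= 2f0
        return
    end

    CUDA.code_ptx(stdout, scale!, Tuple{CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}})

When launching through @cuda, the argument conversion from CuArray presumably goes through the host-side constructor above, so kernels operating on small arrays get 32-bit index arithmetic automatically.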
2 changes: 1 addition & 1 deletion test/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
@inbounds b[], c[] = @fastmath sincos(a[])
return
end
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@assert contains(asm, "sin.approx.f32")
@assert contains(asm, "cos.approx.f32")
@assert !contains(asm, "__nv") # from libdevice
16 changes: 8 additions & 8 deletions test/device/intrinsics/wmma.jl
@@ -7,7 +7,7 @@ map_ptx_to_jl_frag = Dict(
"s32" => Int32(42),
"f16" => ntuple(i -> VecElement{Float16}(42), 2),
"f32" => Float32(42)
)
# Return specific matrix shape given operation configuration
function get_array_shape(mat, mnk, layout)
if !(mat in ["a","b","c","d"])
@@ -46,13 +46,13 @@ end
# Type-dependent variables
array_ty = CUDA.WMMA.map_ptx_to_jl_array[elem_type]
expected = map_ptx_to_jl_frag[elem_type]

# Address-space dependent variables
do_shared_test = (addr_space == "_shared")

# Get the function name
func = Symbol("llvm_wmma_load_$(mat)_$(layout)_$(shape)$(addr_space)_stride_$(elem_type)")

input_shape = get_array_shape(mat, mnk, layout)
input = array_ty(42) * ones(array_ty, input_shape)
input_dev = CuArray(input)
@@ -96,7 +96,7 @@ end
elem_type in ops[3],
addr_space in ["", "_global", "_shared"],
stride in ["stride"]

# Skip all but d matrices
if mat != "d"
continue
@@ -171,7 +171,7 @@ end
# Int/subint mma functions are distinguished by the a/b element type
mma_sym = d_ty == Int32 ? Symbol("llvm_wmma_mma_$(a_layout)_$(b_layout)_$(shape)_$(ab_elem_type)") :
Symbol("llvm_wmma_mma_$(a_layout)_$(b_layout)_$(shape)_$(d_elem_type)_$(c_elem_type)")
mma_func = getfield(Main, mma_sym)
std_func = getfield(Main, Symbol("llvm_wmma_store_d_col_$(shape)_global_stride_$(d_elem_type)"))

a_shape = get_array_shape("a", mnk, a_layout)
@@ -207,7 +207,7 @@ end
# Alter test depending on a/b element Type
if ab_ty == Float16
@test new_a * new_b + c ≈ Array(d_dev) rtol=Base.rtoldefault(Float16)
else # Cast a and b to prevent UInt8 rollover of resultant data
@test Int32.(new_a) * Int32.(new_b) + c == Array(d_dev)
end
end
@@ -322,7 +322,7 @@ end
return
end

ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))

@test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
@test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
@@ -344,4 +344,4 @@ end
@test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
@test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
end
end
