diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
index b759241..65e6356 100644
--- a/src/bytecode/encodings.jl
+++ b/src/bytecode/encodings.jl
@@ -1519,14 +1519,19 @@ function encode_XOrIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::V
 end
 
 """
-    encode_ShLIOp!(cb, result_type, lhs, rhs) -> Value
+    encode_ShLIOp!(cb, result_type, lhs, rhs; overflow) -> Value
 
 Shift left.
 Opcode: 96
+
+`overflow` is emitted with `encode_enum!` after the result type and before the operands
+(default: `OverflowNone`).
 """
-function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value)
+function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value;
+                        overflow::IntegerOverflow=OverflowNone)
     encode_varint!(cb.buf, Opcode.ShLIOp)
     encode_typeid!(cb.buf, result_type)
+    encode_enum!(cb.buf, overflow)
     encode_operand!(cb.buf, lhs)
     encode_operand!(cb.buf, rhs)
     return new_op!(cb)
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 803639c..31649e4 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -335,18 +335,7 @@ function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("andi: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("andi: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_AndIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_AndIOp!)
 end
 
 # cuda_tile.ori
@@ -361,18 +350,7 @@ function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("ori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("ori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_OrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_OrIOp!)
 end
 
 # cuda_tile.xori
@@ -380,16 +358,5 @@ end
 @intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("xori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("xori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_XOrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_XOrIOp!)
 end
diff --git a/src/compiler/intrinsics/julia.jl b/src/compiler/intrinsics/julia.jl
index 6ab683e..e348eb6 100644
--- a/src/compiler/intrinsics/julia.jl
+++ b/src/compiler/intrinsics/julia.jl
@@ -1,7 +1,8 @@
 # Julia intrinsics
 
 # Handle Julia Core.Intrinsics that IRStructurizer uses for control flow transformations.
-# These are: add_int (loop increments), slt_int, sle_int, ult_int (loop bounds).
+# These are: add_int / sub_int (loop increments), slt_int / sle_int / ult_int (loop bounds),
+# and not_int (bitwise NOT, used by `for` loop iteration).
 function emit_intrinsic!(ctx::CGCtx, func::Core.IntrinsicFunction, args)
     if func === Core.Intrinsics.add_int
         emit_intrinsic!(ctx, Intrinsics.addi, args)
@@ -13,11 +14,36 @@ function emit_intrinsic!(ctx::CGCtx, func::Core.IntrinsicFunction, args)
         emit_intrinsic!(ctx, Intrinsics.cmpi, [args..., CmpLessThanOrEqual, SignednessSigned])
     elseif func === Core.Intrinsics.ult_int
         emit_intrinsic!(ctx, Intrinsics.cmpi, [args..., CmpLessThan, SignednessUnsigned])
+    elseif func === Core.Intrinsics.not_int
+        emit_not_int!(ctx, args)
     else
         throw(IRError("Unhandled Julia intrinsic: $func"))
     end
 end
 
+# not_int(x) — bitwise NOT.
+# Julia's `for` loops generate `not_int` to negate loop-exit conditions.
+# Emitted as xori(x, allones), where allones has every bit of x's type set:
+#   Bool    → xori(x, true)      (logical negation)
+#   Integer → xori(x, ~zero(T))  (bitwise complement; -1 signed, typemax unsigned)
+function emit_not_int!(ctx::CGCtx, args)
+    cb = ctx.cb
+
+    operand = @something emit_value!(ctx, args[1]) throw(IRError("not_int: cannot resolve operand"))
+    jltype = CC.widenconst(operand.jltype)
+
+    # `~zero(T)` sets every bit of T: `true` for Bool, -1 for signed integers and
+    # typemax(T) for unsigned ones. (`T(-1)` would throw InexactError when T is unsigned.)
+    allones_val = ~zero(jltype)
+    type_id = tile_type_for_julia!(ctx, jltype)
+    allones_bytes = reinterpret(UInt8, [allones_val])
+    allones_v = encode_ConstantOp!(cb, type_id, collect(allones_bytes))
+
+    result = encode_XOrIOp!(cb, type_id, operand.v, allones_v)
+    CGVal(result, type_id, operand.jltype, operand.shape)
+end
+
 # built-in: ===
 function emit_intrinsic!(ctx::CGCtx, ::typeof(===), args)
     cb = ctx.cb
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
index 3483e93..24cd62e 100644
--- a/test/codegen/operations.jl
+++ b/test/codegen/operations.jl
@@ -1387,7 +1387,55 @@
     #=========================================================================
      8.9 Bitwise
     =========================================================================#
-    # TODO: andi - bitwise AND
+    @testset "Bitwise" begin
+        spec_i32 = ct.ArraySpec{1}(16, true)
+
+        @testset "andi, ori, xori" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}, ct.TileArray{Int32,1,spec_i32}}) do a, b
+                    pid = ct.bid(1)
+                    ta = ct.load(a, pid, (16,))
+                    tb = ct.load(b, pid, (16,))
+                    @check "andi"
+                    Base.donotdelete(map(&, ta, tb))
+                    @check "ori"
+                    Base.donotdelete(map(|, ta, tb))
+                    @check "xori"
+                    Base.donotdelete(map(xor, ta, tb))
+                    return
+                end
+            end
+        end
+
+        @testset "shli, shri" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "shli"
+                    Base.donotdelete(map(x -> x << Int32(4), tile))
+                    @check "shri"
+                    Base.donotdelete(map(x -> x >> Int32(8), tile))
+                    return
+                end
+            end
+        end
+
+        @testset "bitwise NOT (~)" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "xori"
+                    Base.donotdelete(map(~, tile))
+                    return
+                end
+            end
+        end
+    end
 
     #=========================================================================
      8.10 Atomics
diff --git a/test/execution/basic.jl b/test/execution/basic.jl
index 5c9a8ae..ee89621 100644
--- a/test/execution/basic.jl
+++ b/test/execution/basic.jl
@@ -1216,4 +1216,240 @@ end
     b3 = CUDA.zeros(Float32, 64)
     ct.launch(multi_early_return, 4, a, b3, Int32(1), Int32(0))
     @test all(Array(b3) .== 0.0f0)
+end
+
+@testset "bitwise operations" begin
+
+@testset "andi (bitwise AND)" begin
+    function bitwise_and_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(&, ta, tb))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_and_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .& Array(b)
+end
+
+@testset "ori (bitwise OR)" begin
+    function bitwise_or_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                               c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(|, ta, tb))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_or_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .| Array(b)
+end
+
+@testset "xori (bitwise XOR)" begin
+    function bitwise_xor_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(xor, ta, tb))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_xor_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .⊻ Array(b)
+end
+
+@testset "shli (shift left)" begin
+    function shift_left_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(x -> x << Int32(4), tile))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x0fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(shift_left_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == Array(a) .<< Int32(4)
+end
+
+@testset "shri (shift right)" begin
+    function shift_right_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(x -> x >> Int32(8), tile))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(shift_right_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == Array(a) .>> Int32(8)
+end
+
+@testset "combined bitwise ops" begin
+    # (a & b) | (a ⊻ b) — exercises all three ops in a single kernel
+    function combined_bitwise_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                     c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(|, map(&, ta, tb), map(xor, ta, tb)))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(combined_bitwise_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == (Array(a) .& Array(b)) .| (Array(a) .⊻ Array(b))
+end
+
+@testset "bitwise NOT (~)" begin
+    function bitwise_not_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(~, tile))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_not_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == .~Array(a)
+end
+
+end
+
+@testset "for-loop iteration" begin
+
+@testset "simple for loop (accumulate)" begin
+    function for_loop_sum_kernel(data::ct.TileArray{Float32,1},
+                                 out::ct.TileArray{Float32,1},
+                                 n_iters::Int32)
+        pid = ct.bid(1)
+        acc = ct.zeros((16,), Float32)
+        for i in Int32(1):n_iters
+            tile = ct.load(data, i, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, pid, acc)
+        return
+    end
+
+    n_iters = Int32(4)
+    data = CUDA.rand(Float32, 64)  # 4 tiles of 16
+    out = CUDA.zeros(Float32, 16)
+
+    ct.launch(for_loop_sum_kernel, 1, data, out, n_iters)
+
+    data_cpu = Array(data)
+    expected = zeros(Float32, 16)
+    for i in 1:4
+        expected .+= data_cpu[(i-1)*16+1 : i*16]
+    end
+    @test Array(out) ≈ expected
+end
+
+@testset "for loop with constant bound" begin
+    function for_loop_const_kernel(data::ct.TileArray{Float32,1},
+                                   out::ct.TileArray{Float32,1},
+                                   n_iters::Int)
+        pid = ct.bid(1)
+        acc = ct.zeros((16,), Float32)
+        for i in Int32(1):Int32(n_iters)
+            tile = ct.load(data, i, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, pid, acc)
+        return
+    end
+
+    data = CUDA.rand(Float32, 48)  # 3 tiles of 16
+    out = CUDA.zeros(Float32, 16)
+
+    ct.launch(for_loop_const_kernel, 1, data, out, ct.Constant(3))
+
+    data_cpu = Array(data)
+    expected = zeros(Float32, 16)
+    for i in 1:3
+        expected .+= data_cpu[(i-1)*16+1 : i*16]
+    end
+    @test Array(out) ≈ expected
+end
+
+@testset "for loop with dynamic bound" begin
+    # n_iters comes from scalar indexing (runtime value, not constant)
+    function for_loop_dynamic_kernel(data::ct.TileArray{Float32,1},
+                                     lengths::ct.TileArray{Int32,1},
+                                     out::ct.TileArray{Float32,1})
+        bid = ct.bid(1)
+        len = lengths[bid]
+        acc = ct.zeros((16,), Float32)
+        for j in Int32(1):len
+            tile = ct.load(data, j, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, bid, acc)
+        return
+    end
+
+    n_tiles = Int32[2, 3, 1]
+    data = CUDA.rand(Float32, 48)  # 3 tiles of 16
+    lengths = CuArray(n_tiles)
+    out = CUDA.zeros(Float32, 48)
+
+    ct.launch(for_loop_dynamic_kernel, 3, data, lengths, out)
+
+    data_cpu = Array(data)
+    out_cpu = Array(out)
+    for bid in 1:3
+        expected = zeros(Float32, 16)
+        for j in 1:n_tiles[bid]
+            expected .+= data_cpu[(j-1)*16+1 : j*16]
+        end
+        @test out_cpu[(bid-1)*16+1 : bid*16] ≈ expected
+    end
+end
+
 end
\ No newline at end of file