JuliaGPU · 0xtaruhi · Mar 6, 2026
diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
@@ -1519,14 +1519,16 @@ function encode_XOrIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::V
 end
 
 """
-    encode_ShLIOp!(cb, result_type, lhs, rhs) -> Value
+    encode_ShLIOp!(cb, result_type, lhs, rhs; overflow) -> Value
 
 Shift left.
 Opcode: 96
 """
-function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value)
+function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value;
+                        overflow::IntegerOverflow=OverflowNone)
     encode_varint!(cb.buf, Opcode.ShLIOp)
     encode_typeid!(cb.buf, result_type)
+    encode_enum!(cb.buf, overflow)
     encode_operand!(cb.buf, lhs)
     encode_operand!(cb.buf, rhs)
     return new_op!(cb)

diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
@@ -335,18 +335,7 @@ function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("andi: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("andi: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_AndIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_AndIOp!)
 end
 
 # cuda_tile.ori
@@ -361,35 +350,13 @@ function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("ori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("ori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_OrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_OrIOp!)
 end
 
 # cuda_tile.xori
 @intrinsic xori(x::T, y::T) where {T<:Integer}
 @intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("xori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("xori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_XOrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_XOrIOp!)
 end
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
@@ -1387,7 +1387,42 @@
     #=========================================================================
      8.9 Bitwise
     =========================================================================#
-    # TODO: andi - bitwise AND
+    @testset "Bitwise" begin
+        spec_i32 = ct.ArraySpec{1}(16, true)
+        # Each `@check` names the op the Tile-level `map` right after it is expected to emit.
+        @testset "andi, ori, xori" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}, ct.TileArray{Int32,1,spec_i32}}) do a, b
+                    pid = ct.bid(1)
+                    ta = ct.load(a, pid, (16,))
+                    tb = ct.load(b, pid, (16,))
+                    @check "andi"
+                    Base.donotdelete(map(&, ta, tb))
+                    @check "ori"
+                    Base.donotdelete(map(|, ta, tb))
+                    @check "xori"
+                    Base.donotdelete(map(xor, ta, tb))
+                    return
+                end
+            end
+        end
+        # Shifts by an Int32 constant; the output is expected to contain `shli`, then `shri`.
+        @testset "shli, shri" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "shli"
+                    Base.donotdelete(map(x -> x << Int32(4), tile))
+                    @check "shri"
+                    Base.donotdelete(map(x -> x >> Int32(8), tile))
+                    return
+                end
+            end
+        end
+    end
 
     #=========================================================================
      8.10 Atomics

diff --git a/test/execution/basic.jl b/test/execution/basic.jl
@@ -1216,4 +1216,129 @@ end
     b3 = CUDA.zeros(Float32, 64)
     ct.launch(multi_early_return, 4, a, b3, Int32(1), Int32(0))
     @test all(Array(b3) .== 0.0f0)
+end
+
+@testset "bitwise operations" begin
+
+@testset "andi (bitwise AND)" begin
+    # Kernel under test: AND one 16-element tile of `x` with the matching tile of `y`.
+    function bitwise_and_kernel(x::ct.TileArray{Int32,1}, y::ct.TileArray{Int32,1},
+                                out::ct.TileArray{Int32,1})
+        blk = ct.bid(1)
+        tx = ct.load(x, blk, (16,))
+        ty = ct.load(y, blk, (16,))
+        ct.store(out, blk, map(&, tx, ty))
+        return
+    end
+
+    len, tile_len = 1024, 16
+    nonneg = Int32(0):Int32(0x7fff_ffff)
+    lhs = CuArray(rand(nonneg, len))
+    rhs = CuArray(rand(nonneg, len))
+    res = CUDA.zeros(Int32, len)
+    ct.launch(bitwise_and_kernel, cld(len, tile_len), lhs, rhs, res)
+
+    @test Array(res) == Array(lhs) .& Array(rhs)
+end
+
+@testset "ori (bitwise OR)" begin
+    # Kernel under test: OR one 16-element tile of `x` with the matching tile of `y`.
+    function bitwise_or_kernel(x::ct.TileArray{Int32,1}, y::ct.TileArray{Int32,1},
+                               out::ct.TileArray{Int32,1})
+        blk = ct.bid(1)
+        tx = ct.load(x, blk, (16,))
+        ty = ct.load(y, blk, (16,))
+        ct.store(out, blk, map(|, tx, ty))
+        return
+    end
+
+    len, tile_len = 1024, 16
+    nonneg = Int32(0):Int32(0x7fff_ffff)
+    lhs = CuArray(rand(nonneg, len))
+    rhs = CuArray(rand(nonneg, len))
+    res = CUDA.zeros(Int32, len)
+    ct.launch(bitwise_or_kernel, cld(len, tile_len), lhs, rhs, res)
+
+    @test Array(res) == Array(lhs) .| Array(rhs)
+end
+
+@testset "xori (bitwise XOR)" begin
+    # Kernel under test: XOR one 16-element tile of `x` with the matching tile of `y`.
+    function bitwise_xor_kernel(x::ct.TileArray{Int32,1}, y::ct.TileArray{Int32,1},
+                                out::ct.TileArray{Int32,1})
+        blk = ct.bid(1)
+        tx = ct.load(x, blk, (16,))
+        ty = ct.load(y, blk, (16,))
+        ct.store(out, blk, map(xor, tx, ty))
+        return
+    end
+
+    len, tile_len = 1024, 16
+    nonneg = Int32(0):Int32(0x7fff_ffff)
+    lhs = CuArray(rand(nonneg, len))
+    rhs = CuArray(rand(nonneg, len))
+    res = CUDA.zeros(Int32, len)
+    ct.launch(bitwise_xor_kernel, cld(len, tile_len), lhs, rhs, res)
+
+    @test Array(res) == Array(lhs) .⊻ Array(rhs)
+end
+
+@testset "shli (shift left)" begin
+    # Kernel under test: shift every element of a 16-element tile left by 4 bits.
+    function shift_left_kernel(src::ct.TileArray{Int32,1}, dst::ct.TileArray{Int32,1})
+        blk = ct.bid(1)
+        shifted = map(v -> v << Int32(4), ct.load(src, blk, (16,)))
+        ct.store(dst, blk, shifted)
+        return
+    end
+
+    len, tile_len = 1024, 16
+    # Inputs fit in 28 bits, so a 4-bit shift drops no set bits (bit 27 lands on the sign bit).
+    input = CuArray(rand(Int32(0):Int32(0x0fff_ffff), len))
+    output = CUDA.zeros(Int32, len)
+    ct.launch(shift_left_kernel, cld(len, tile_len), input, output)
+
+    @test Array(output) == Array(input) .<< Int32(4)
+end
+
+@testset "shri (shift right)" begin
+    function shift_right_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(x -> x >> Int32(8), tile))
+        return
+    end
+
+    n = 1024
+    tile_size = 16
+    # Full Int32 range: negative inputs check that `>>` sign-extends (arithmetic shift).
+    a = CuArray(rand(Int32, n))
+    b = CUDA.zeros(Int32, n)
+    ct.launch(shift_right_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == Array(a) .>> Int32(8)
+end
+
+@testset "combined bitwise ops" begin
+    # Computes (x & y) | (x ⊻ y), so AND, OR and XOR all run inside a single kernel.
+    function combined_bitwise_kernel(x::ct.TileArray{Int32,1}, y::ct.TileArray{Int32,1},
+                                     out::ct.TileArray{Int32,1})
+        blk = ct.bid(1)
+        tx = ct.load(x, blk, (16,))
+        ty = ct.load(y, blk, (16,))
+        both = map(&, tx, ty)
+        differ = map(xor, tx, ty)
+        ct.store(out, blk, map(|, both, differ))
+        return
+    end
+
+    len, tile_len = 1024, 16
+    nonneg = Int32(0):Int32(0x7fff_ffff)
+    lhs = CuArray(rand(nonneg, len))
+    rhs = CuArray(rand(nonneg, len))
+    res = CUDA.zeros(Int32, len)
+    ct.launch(combined_bitwise_kernel, cld(len, tile_len), lhs, rhs, res)
+    @test Array(res) == (Array(lhs) .& Array(rhs)) .| (Array(lhs) .⊻ Array(rhs))
+end
+
 end