diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
index b759241..65e6356 100644
--- a/src/bytecode/encodings.jl
+++ b/src/bytecode/encodings.jl
@@ -1519,14 +1519,16 @@ function encode_XOrIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::V
 end
 
 """
-    encode_ShLIOp!(cb, result_type, lhs, rhs) -> Value
+    encode_ShLIOp!(cb, result_type, lhs, rhs; overflow) -> Value
 
 Shift left.
 Opcode: 96
 """
-function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value)
+function encode_ShLIOp!(cb::CodeBuilder, result_type::TypeId, lhs::Value, rhs::Value;
+                        overflow::IntegerOverflow=OverflowNone)
     encode_varint!(cb.buf, Opcode.ShLIOp)
     encode_typeid!(cb.buf, result_type)
+    encode_enum!(cb.buf, overflow)
     encode_operand!(cb.buf, lhs)
     encode_operand!(cb.buf, rhs)
     return new_op!(cb)
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 803639c..31649e4 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -335,18 +335,7 @@ function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("andi: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("andi: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_AndIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_AndIOp!)  # same helper call as ori/xori; only the encoder differs
 end
 
 # cuda_tile.ori
@@ -361,18 +350,7 @@ function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y
     return CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("ori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("ori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_OrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_OrIOp!)  # same helper call as andi/xori; only the encoder differs
 end
 
 # cuda_tile.xori
@@ -380,16 +358,5 @@ end
 @intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
-    cb = ctx.cb
-    tt = ctx.tt
-
-    lhs = @something emit_value!(ctx, args[1]) throw(IRError("xori: cannot resolve lhs"))
-    rhs = @something emit_value!(ctx, args[2]) throw(IRError("xori: cannot resolve rhs"))
-
-    lhs_type = CC.widenconst(lhs.jltype)
-    dtype = julia_to_tile_dtype!(tt, eltype(lhs_type))
-    result_type_id = tile_type!(tt, dtype, lhs.shape)
-
-    result = encode_XOrIOp!(cb, result_type_id, lhs.v, rhs.v)
-    CGVal(result, result_type_id, lhs.jltype, lhs.shape)
+    emit_binop!(ctx, args, encode_XOrIOp!)  # same helper call as andi/ori; only the encoder differs
 end
diff --git a/src/compiler/intrinsics/julia.jl b/src/compiler/intrinsics/julia.jl
index 6ab683e..e348eb6 100644
--- a/src/compiler/intrinsics/julia.jl
+++ b/src/compiler/intrinsics/julia.jl
@@ -1,7 +1,8 @@
 # Julia intrinsics
 
 # Handle Julia Core.Intrinsics that IRStructurizer uses for control flow transformations.
-# These are: add_int (loop increments), slt_int, sle_int, ult_int (loop bounds).
+# These are: add_int / sub_int (loop increments), slt_int / sle_int / ult_int (loop bounds),
+# and not_int (bitwise NOT, used by `for` loop iteration).
 function emit_intrinsic!(ctx::CGCtx, func::Core.IntrinsicFunction, args)
     if func === Core.Intrinsics.add_int
         emit_intrinsic!(ctx, Intrinsics.addi, args)
@@ -13,11 +14,35 @@ function emit_intrinsic!(ctx::CGCtx, func::Core.IntrinsicFunction, args)
         emit_intrinsic!(ctx, Intrinsics.cmpi, [args..., CmpLessThanOrEqual, SignednessSigned])
     elseif func === Core.Intrinsics.ult_int
         emit_intrinsic!(ctx, Intrinsics.cmpi, [args..., CmpLessThan, SignednessUnsigned])
+    elseif func === Core.Intrinsics.not_int
+        emit_not_int!(ctx, args)
     else
         throw(IRError("Unhandled Julia intrinsic: $func"))
     end
 end
 
+# not_int(x) — bitwise NOT, emitted as xori(x, allones).
+# Julia's `for` loops generate `not_int` to negate loop-exit conditions.
+# The all-ones mask is built as `~zero(T)` in the operand's own type:
+#   Bool → true, signed integers → -1, unsigned integers → typemax(T).
+# (`T(-1)` would throw InexactError when the operand type is unsigned.)
+function emit_not_int!(ctx::CGCtx, args)
+    cb = ctx.cb
+
+    # Throws IRError when the operand cannot be resolved to a value.
+    operand = @something emit_value!(ctx, args[1]) throw(IRError("not_int: cannot resolve operand"))
+    jltype = CC.widenconst(operand.jltype)
+
+    # The mask's raw bytes are the payload of the constant op (same tile type as the operand).
+    allones_val = ~zero(jltype)
+    type_id = tile_type_for_julia!(ctx, jltype)
+    allones_bytes = reinterpret(UInt8, [allones_val])
+    allones_v = encode_ConstantOp!(cb, type_id, collect(allones_bytes))
+
+    result = encode_XOrIOp!(cb, type_id, operand.v, allones_v)
+    CGVal(result, type_id, operand.jltype, operand.shape)
+end
+
 # built-in: ===
 function emit_intrinsic!(ctx::CGCtx, ::typeof(===), args)
     cb = ctx.cb
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
index 3483e93..24cd62e 100644
--- a/test/codegen/operations.jl
+++ b/test/codegen/operations.jl
@@ -1387,7 +1387,54 @@
     #=========================================================================
      8.9 Bitwise
     =========================================================================#
-    # TODO: andi - bitwise AND
+    @testset "Bitwise" begin
+        spec_i32 = ct.ArraySpec{1}(16, true)  # 1-D spec shared by every TileArray argument below
+        # NOTE(review): @check presumably does FileCheck-style matching on the emitted IR — confirm.
+        @testset "andi, ori, xori" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}, ct.TileArray{Int32,1,spec_i32}}) do a, b
+                    pid = ct.bid(1)
+                    ta = ct.load(a, pid, (16,))
+                    tb = ct.load(b, pid, (16,))
+                    @check "andi"
+                    Base.donotdelete(map(&, ta, tb))
+                    @check "ori"
+                    Base.donotdelete(map(|, ta, tb))
+                    @check "xori"
+                    Base.donotdelete(map(xor, ta, tb))
+                    return
+                end
+            end
+        end
+
+        @testset "shli, shri" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "shli"
+                    Base.donotdelete(map(x -> x << Int32(4), tile))
+                    @check "shri"
+                    Base.donotdelete(map(x -> x >> Int32(8), tile))
+                    return
+                end
+            end
+        end
+        @testset "bitwise NOT (~)" begin  # expects xori: not_int is emitted as xori(x, allones)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec_i32}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "xori"
+                    Base.donotdelete(map(~, tile))
+                    return
+                end
+            end
+        end
+    end
 
     #=========================================================================
      8.10 Atomics
diff --git a/test/execution/basic.jl b/test/execution/basic.jl
index 5c9a8ae..ee89621 100644
--- a/test/execution/basic.jl
+++ b/test/execution/basic.jl
@@ -1216,4 +1216,240 @@ end
     b3 = CUDA.zeros(Float32, 64)
     ct.launch(multi_early_return, 4, a, b3, Int32(1), Int32(0))
     @test all(Array(b3) .== 0.0f0)
+end
+
+@testset "bitwise operations" begin
+
+@testset "andi (bitwise AND)" begin
+    function bitwise_and_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(&, ta, tb))  # elementwise AND of the two loaded tiles
+        return
+    end
+    # Host side: random non-negative Int32 inputs; the result is checked against CPU `.&`.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_and_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .& Array(b)
+end
+
+@testset "ori (bitwise OR)" begin
+    function bitwise_or_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                               c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(|, ta, tb))  # elementwise OR of the two loaded tiles
+        return
+    end
+    # Host side: random non-negative Int32 inputs; the result is checked against CPU `.|`.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_or_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .| Array(b)
+end
+
+@testset "xori (bitwise XOR)" begin
+    function bitwise_xor_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(xor, ta, tb))  # elementwise XOR of the two loaded tiles
+        return
+    end
+    # Host side: random non-negative Int32 inputs; the result is checked against CPU `.⊻`.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_xor_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == Array(a) .⊻ Array(b)
+end
+
+@testset "shli (shift left)" begin
+    function shift_left_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(x -> x << Int32(4), tile))
+        return
+    end
+    # Inputs keep the top 4 bits clear, so shifting left by 4 drops no set bits.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x0fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(shift_left_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == Array(a) .<< Int32(4)
+end
+
+@testset "shri (shift right)" begin
+    function shift_right_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(x -> x >> Int32(8), tile))
+        return
+    end
+    # NOTE(review): inputs are non-negative, so sign-extending shifts of negatives are not exercised.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(shift_right_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == Array(a) .>> Int32(8)
+end
+
+@testset "combined bitwise ops" begin
+    # (a & b) | (a ⊻ b) — exercises all three ops in a single kernel
+    function combined_bitwise_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1},
+                                     c::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        ta = ct.load(a, pid, (16,))
+        tb = ct.load(b, pid, (16,))
+        ct.store(c, pid, map(|, map(&, ta, tb), map(xor, ta, tb)))
+        return
+    end
+    # Host side: random non-negative Int32 inputs; the CPU reference mirrors the kernel expression.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    c = CUDA.zeros(Int32, n)
+
+    ct.launch(combined_bitwise_kernel, cld(n, tile_size), a, b, c)
+
+    @test Array(c) == (Array(a) .& Array(b)) .| (Array(a) .⊻ Array(b))
+end
+
+@testset "bitwise NOT (~)" begin
+    function bitwise_not_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(b, pid, map(~, tile))  # elementwise complement of the loaded tile
+        return
+    end
+    # Host side: random non-negative Int32 inputs; the result is checked against CPU `.~`.
+    n = 1024
+    tile_size = 16  # matches the (16,) tile shape loaded in the kernel
+    a = CuArray(rand(Int32(0):Int32(0x7fff_ffff), n))
+    b = CUDA.zeros(Int32, n)
+
+    ct.launch(bitwise_not_kernel, cld(n, tile_size), a, b)
+
+    @test Array(b) == .~Array(a)
+end
+
+end
+
+@testset "for-loop iteration" begin
+
+@testset "simple for loop (accumulate)" begin
+    function for_loop_sum_kernel(data::ct.TileArray{Float32,1},
+                                 out::ct.TileArray{Float32,1},
+                                 n_iters::Int32)
+        pid = ct.bid(1)
+        acc = ct.zeros((16,), Float32)
+        for i in Int32(1):n_iters  # loop bound is a runtime Int32 kernel argument
+            tile = ct.load(data, i, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, pid, acc)
+        return
+    end
+    # Host side: 4 iterations over 16-element tiles cover all 64 elements of `data`.
+    n_iters = Int32(4)
+    data = CUDA.rand(Float32, 64)  # 4 tiles of 16
+    out = CUDA.zeros(Float32, 16)
+
+    ct.launch(for_loop_sum_kernel, 1, data, out, n_iters)
+
+    data_cpu = Array(data)
+    expected = zeros(Float32, 16)
+    for i in 1:4
+        expected .+= data_cpu[(i-1)*16+1 : i*16]
+    end
+    @test Array(out) ≈ expected
+end
+
+@testset "for loop with constant bound" begin
+    function for_loop_const_kernel(data::ct.TileArray{Float32,1},
+                                   out::ct.TileArray{Float32,1},
+                                   n_iters::Int)
+        pid = ct.bid(1)
+        acc = ct.zeros((16,), Float32)
+        for i in Int32(1):Int32(n_iters)  # Int bound converted to Int32 to match the loop start
+            tile = ct.load(data, i, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, pid, acc)
+        return
+    end
+    # NOTE(review): the bound is passed as ct.Constant(3), presumably a compile-time constant — confirm.
+    data = CUDA.rand(Float32, 48)  # 3 tiles of 16
+    out = CUDA.zeros(Float32, 16)
+
+    ct.launch(for_loop_const_kernel, 1, data, out, ct.Constant(3))
+
+    data_cpu = Array(data)
+    expected = zeros(Float32, 16)
+    for i in 1:3
+        expected .+= data_cpu[(i-1)*16+1 : i*16]
+    end
+    @test Array(out) ≈ expected
+end
+
+@testset "for loop with dynamic bound" begin
+    # n_iters comes from scalar indexing (runtime value, not constant)
+    function for_loop_dynamic_kernel(data::ct.TileArray{Float32,1},
+                                     lengths::ct.TileArray{Int32,1},
+                                     out::ct.TileArray{Float32,1})
+        bid = ct.bid(1)
+        len = lengths[bid]
+        acc = ct.zeros((16,), Float32)
+        for j in Int32(1):len
+            tile = ct.load(data, j, (16,))
+            acc = acc .+ tile
+        end
+        ct.store(out, bid, acc)
+        return
+    end
+    # Block `bid` sums the first n_tiles[bid] tiles of `data` into its own 16-element slice of `out`.
+    n_tiles = Int32[2, 3, 1]
+    data = CUDA.rand(Float32, 48)  # 3 tiles of 16
+    lengths = CuArray(n_tiles)
+    out = CUDA.zeros(Float32, 48)
+
+    ct.launch(for_loop_dynamic_kernel, 3, data, lengths, out)
+
+    data_cpu = Array(data)
+    out_cpu = Array(out)
+    for bid in 1:3
+        expected = zeros(Float32, 16)
+        for j in 1:n_tiles[bid]
+            expected .+= data_cpu[(j-1)*16+1 : j*16]
+        end
+        @test out_cpu[(bid-1)*16+1 : bid*16] ≈ expected
+    end
+end
+
 end
\ No newline at end of file