diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index bade2d1..093cef1 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -6,15 +6,16 @@ steps:
           - JuliaCI/julia#v1:
               version: "{{matrix.julia}}"
           - JuliaCI/julia-test#v1:
-              test_args: "--quickfail"
+              coverage: false
+        commands: |
+          unset LD_LIBRARY_PATH
         agents:
           queue: "juliagpu"
           cuda: "*"
           gpu: "a100"
-        timeout_in_minutes: 90
+        timeout_in_minutes: 15
         matrix:
           setup:
             julia:
               - "1.11"
               - "1.12"
-              - "1.13"
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
index 0ea7edc..8888a62 100644
--- a/ext/CUDAExt.jl
+++ b/ext/CUDAExt.jl
@@ -1,8 +1,8 @@
 module CUDAExt
 
 using cuTile
-using cuTile: TileArray, Constant, CGOpts, CuTileResults, emit_code, sanitize_name,
-              constant_eltype, constant_value, is_ghost_type
+using cuTile: TileArray, Constant, CGOpts, CuTileResults, DEFAULT_BYTECODE_VERSION,
+              emit_code, sanitize_name, constant_eltype, constant_value, is_ghost_type
 
 using CompilerCaching: CacheView, method_instance, results
 
@@ -13,6 +13,16 @@ using CUDA_Compiler_jll
 
 public launch
 
+function run_and_collect(cmd)
+    stdout = Pipe()
+    proc = run(pipeline(ignorestatus(cmd); stdout, stderr=stdout), wait=false)
+    close(stdout.in)
+    reader = Threads.@spawn String(read(stdout))
+    Base.wait(proc)
+    log = strip(fetch(reader))
+    return proc, log
+end
+
 """
     check_tile_ir_support()
 
@@ -38,6 +48,9 @@ function check_tile_ir_support()
     else
         error("Tile IR is not supported on compute capability $cap ($sm_arch)")
     end
+
+    # Return bytecode version matching the toolkit
+    return VersionNumber(cuda_ver.major, cuda_ver.minor)
 end
 
 """
@@ -58,12 +71,29 @@ function emit_binary(cache::CacheView, mi::Core.MethodInstance;
     # Run tileiras to produce CUBIN
     input_path = tempname() * ".tile"
     output_path = tempname() * ".cubin"
+    compiled = false
     try
         write(input_path, bytecode)
-        run(`$(CUDA_Compiler_jll.tileiras()) $input_path -o $output_path --gpu-name $(opts.sm_arch) -O$(opts.opt_level)`)
+        cmd = addenv(`$(CUDA_Compiler_jll.tileiras()) $input_path -o $output_path --gpu-name $(opts.sm_arch) -O$(opts.opt_level)`,
+                     "CUDA_ROOT" => CUDA_Compiler_jll.artifact_dir)
+        proc, log = run_and_collect(cmd)
+        if !success(proc)
+            reason = proc.termsignal > 0 ? "tileiras received signal $(proc.termsignal)" :
+                                           "tileiras exited with code $(proc.exitcode)"
+            msg = "Failed to compile Tile IR ($reason)"
+            if !isempty(log)
+                msg *= "\n" * log
+            end
+            msg *= "\nIf you think this is a bug, please file an issue and attach $(input_path)"
+            if parse(Bool, get(ENV, "BUILDKITE", "false"))
+                run(`buildkite-agent artifact upload $(input_path)`)
+            end
+            error(msg)
+        end
+        compiled = true
         res.cuda_bin = read(output_path)
     finally
-        rm(input_path, force=true)
+        compiled && rm(input_path, force=true)
         rm(output_path, force=true)
     end
 
@@ -135,7 +165,7 @@ function cuTile.launch(@nospecialize(f), grid, args...;
                        opt_level::Int=3,
                        num_ctas::Union{Int, Nothing}=nothing,
                        occupancy::Union{Int, Nothing}=nothing)
-    check_tile_ir_support()
+    bytecode_version = check_tile_ir_support()
 
     # Convert CuArray -> TileArray (and other conversions)
     tile_args = map(to_tile_arg, args)
@@ -166,7 +196,8 @@ function cuTile.launch(@nospecialize(f), grid, args...;
     end
 
     # Create cache view with compilation options as sharding keys
-    opts = (sm_arch=sm_arch, opt_level=opt_level, num_ctas=num_ctas, occupancy=occupancy)
+    opts = (sm_arch=sm_arch, opt_level=opt_level, num_ctas=num_ctas, occupancy=occupancy,
+            bytecode_version=bytecode_version)
     cache = CacheView{CuTileResults}((:cuTile, opts), world)
 
     # Run cached compilation
diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl
index b759241..22bd173 100644
--- a/src/bytecode/encodings.jl
+++ b/src/bytecode/encodings.jl
@@ -1122,9 +1122,14 @@ Example:
 function encode_ForOp!(body::Function, cb::CodeBuilder,
                        result_types::Vector{TypeId}, iv_type::TypeId,
                        lower::Value, upper::Value, step::Value,
-                       init_values::Vector{Value})
+                       init_values::Vector{Value};
+                       unsigned_cmp::Bool=false)
     encode_varint!(cb.buf, Opcode.ForOp)
     encode_typeid_seq!(cb.buf, result_types)
+    # Flags
+    if cb.version >= v"13.2"
+        encode_varint!(cb.buf, unsigned_cmp ? 1 : 0)
+    end
     # Operands: lower, upper, step, init_values...
     encode_varint!(cb.buf, 3 + length(init_values))
     encode_operand!(cb.buf, lower)
@@ -1558,7 +1563,9 @@ function encode_NegIOp!(cb::CodeBuilder, result_type::TypeId, source::Value;
                         overflow::IntegerOverflow=OverflowNone)
     encode_varint!(cb.buf, Opcode.NegIOp)
     encode_typeid!(cb.buf, result_type)
-    encode_enum!(cb.buf, overflow)
+    if cb.version >= v"13.2"
+        encode_enum!(cb.buf, overflow)
+    end
     encode_operand!(cb.buf, source)
     return new_op!(cb)
 end
@@ -1956,9 +1963,13 @@ end
 Element-wise hyperbolic tangent.
 Opcode: 106
 """
-function encode_TanHOp!(cb::CodeBuilder, result_type::TypeId, source::Value)
+function encode_TanHOp!(cb::CodeBuilder, result_type::TypeId, source::Value;
+                        rounding_mode::RoundingMode=RoundingFull)
     encode_varint!(cb.buf, Opcode.TanHOp)
     encode_typeid!(cb.buf, result_type)
+    if cb.version >= v"13.2"
+        encode_enum!(cb.buf, rounding_mode)
+    end
     encode_operand!(cb.buf, source)
     return new_op!(cb)
 end
diff --git a/src/bytecode/writer.jl b/src/bytecode/writer.jl
index acac1b8..d1ef9cf 100644
--- a/src/bytecode/writer.jl
+++ b/src/bytecode/writer.jl
@@ -1,7 +1,7 @@
 # Bytecode file writer - handles sections and overall structure
 
 # Bytecode version
-const BYTECODE_VERSION = (13, 1, 0)
+const DEFAULT_BYTECODE_VERSION = v"13.1"
 
 # Magic number
 const MAGIC = UInt8[0x7f, 0x54, 0x69, 0x6c, 0x65, 0x49, 0x52, 0x00]  # "\x7fTileIR\x00"
@@ -97,9 +97,11 @@ mutable struct CodeBuilder
     next_value_id::Int
     cur_debug_attr::DebugAttrId
     num_ops::Int
+    version::VersionNumber
 end
 
-function CodeBuilder(string_table::StringTable, constant_table::ConstantTable, type_table::TypeTable)
+function CodeBuilder(string_table::StringTable, constant_table::ConstantTable, type_table::TypeTable;
+                     version::VersionNumber=DEFAULT_BYTECODE_VERSION)
     CodeBuilder(
         UInt8[],
         string_table,
@@ -108,7 +110,8 @@ function CodeBuilder(string_table::StringTable, constant_table::ConstantTable, t
         DebugAttrId[],
         0,
         DebugAttrId(0),  # No debug info
-        0
+        0,
+        version
     )
 end
 
@@ -374,9 +377,10 @@ mutable struct BytecodeWriter
     debug_attr_table::DebugAttrTable
     debug_info::Vector{Vector{DebugAttrId}}
     num_functions::Int
+    version::VersionNumber
 end
 
-function BytecodeWriter()
+function BytecodeWriter(; version::VersionNumber=DEFAULT_BYTECODE_VERSION)
     string_table = StringTable()
     BytecodeWriter(
         UInt8[],
@@ -385,21 +389,21 @@ function BytecodeWriter()
         TypeTable(),
         DebugAttrTable(string_table),
         Vector{Vector{DebugAttrId}}[],
-        0
+        0,
+        version
     )
 end
 
 """
 Write the bytecode header.
 """
-function write_header!(buf::Vector{UInt8})
+function write_header!(buf::Vector{UInt8}, version::VersionNumber)
     append!(buf, MAGIC)
-    major, minor, tag = BYTECODE_VERSION
-    push!(buf, UInt8(major))
-    push!(buf, UInt8(minor))
-    # Tag as 2-byte little-endian
-    push!(buf, UInt8(tag & 0xff))
-    push!(buf, UInt8((tag >> 8) & 0xff))
+    push!(buf, UInt8(version.major))
+    push!(buf, UInt8(version.minor))
+    # Patch as 2-byte little-endian
+    push!(buf, UInt8(version.patch & 0xff))
+    push!(buf, UInt8((version.patch >> 8) & 0xff))
 end
 
 """
@@ -486,8 +490,9 @@ end
 Write complete bytecode to a buffer.
 Returns the buffer with all sections.
 """
-function write_bytecode!(f::Function, num_functions::Int)
-    writer = BytecodeWriter()
+function write_bytecode!(f::Function, num_functions::Int;
+                         version::VersionNumber=DEFAULT_BYTECODE_VERSION)
+    writer = BytecodeWriter(; version)
 
     # Function section content
     func_buf = UInt8[]
@@ -502,7 +507,7 @@ function write_bytecode!(f::Function, num_functions::Int)
 
     # Build final output
     buf = UInt8[]
-    write_header!(buf)
+    write_header!(buf, version)
 
     # Sections in order: Func, Global (if any), Constant, Debug, Type, String, End
     write_section!(buf, Section.Func, func_buf, 8)
@@ -574,7 +579,8 @@ function add_function!(writer::BytecodeWriter, func_buf::Vector{UInt8},
     end
 
     # Create code builder for function body
-    cb = CodeBuilder(writer.string_table, writer.constant_table, writer.type_table)
+    cb = CodeBuilder(writer.string_table, writer.constant_table, writer.type_table;
+                     version=writer.version)
 
     return cb
 end
diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index 1f0dee9..91571fb 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -291,7 +291,8 @@ const CGOpts = @NamedTuple{
     sm_arch::Union{String, Nothing},
     opt_level::Int,
     num_ctas::Union{Int, Nothing},
-    occupancy::Union{Int, Nothing}
+    occupancy::Union{Int, Nothing},
+    bytecode_version::VersionNumber
 }
 
 # Results struct for caching compilation phases
@@ -394,7 +395,7 @@ function emit_code(cache::CacheView, mi::Core.MethodInstance;
     opts = cache.owner[2]
 
     # Generate Tile IR bytecode
-    bytecode = write_bytecode!(1) do writer, func_buf
+    bytecode = write_bytecode!(1; version=opts.bytecode_version) do writer, func_buf
         emit_kernel!(writer, func_buf, sci, rettype;
             name = sanitize_name(string(mi.def.name)),
             sm_arch = opts.sm_arch,
@@ -508,6 +509,7 @@ function code_tiled(io::IO, @nospecialize(f), @nospecialize(argtypes);
                     opt_level::Int=3,
                     num_ctas::Union{Int, Nothing}=nothing,
                     occupancy::Union{Int, Nothing}=nothing,
+                    bytecode_version::VersionNumber=DEFAULT_BYTECODE_VERSION,
                     world::UInt=Base.get_world_counter())
     # Strip Constant types from argtypes for MI lookup, build const_argtypes
     stripped, const_argtypes = process_const_argtypes(f, argtypes)
@@ -518,7 +520,8 @@ function code_tiled(io::IO, @nospecialize(f), @nospecialize(argtypes);
     mi = @something(method_instance(f, stripped; world, method_table=cuTileMethodTable),
                     method_instance(f, stripped; world),
                     throw(MethodError(f, stripped)))
-    opts = (sm_arch=sm_arch, opt_level=opt_level, num_ctas=num_ctas, occupancy=occupancy)
+    opts = (sm_arch=sm_arch, opt_level=opt_level, num_ctas=num_ctas, occupancy=occupancy,
+            bytecode_version=bytecode_version)
     cache = CacheView{CuTileResults}((:cuTile, opts), world)
     bytecode = emit_code(cache, mi; const_argtypes)
     print(io, disassemble_tileir(bytecode))
diff --git a/src/language/atomics.jl b/src/language/atomics.jl
index 535d111..4942196 100644
--- a/src/language/atomics.jl
+++ b/src/language/atomics.jl
@@ -107,6 +107,14 @@ end
     S === () ? Intrinsics.to_scalar(result) : result
 end
 
+# Convert mismatched scalar/tile types to match array element type
+@inline function atomic_cas(array::TileArray{T}, indices,
+                            expected::TileOrScalar, desired::TileOrScalar;
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T}
+    atomic_cas(array, indices, T(expected), T(desired); memory_order, memory_scope)
+end
+
 # ============================================================================
 # Atomic RMW operations (atomic_add, atomic_xchg)
 # ============================================================================
@@ -150,4 +158,11 @@ for op in (:add, :xchg)
         result = Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope)
         S === () ? Intrinsics.to_scalar(result) : result
     end
+
+    # Convert mismatched scalar/tile types to match array element type
+    @eval @inline function $fname(array::TileArray{T}, indices, val::TileOrScalar;
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T}
+        $fname(array, indices, T(val); memory_order, memory_scope)
+    end
 end
diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl
index e757ed8..1cd5c26 100644
--- a/test/execution/atomics.jl
+++ b/test/execution/atomics.jl
@@ -43,7 +43,8 @@ end
     # Test atomic_xchg: each thread exchanges, last one wins
     function atomic_xchg_kernel(arr::ct.TileArray{Int,1})
         bid = ct.bid(1)
-        ct.atomic_xchg(arr, 1, bid + 1;
+        # bid is 1-indexed (1..n_blocks), val is auto-converted from Int32 to Int
+        ct.atomic_xchg(arr, 1, bid;
                       memory_order=ct.MemoryOrder.AcqRel)
         return
     end
diff --git a/test/execution/hints.jl b/test/execution/hints.jl
index 56d398e..d8993b3 100644
--- a/test/execution/hints.jl
+++ b/test/execution/hints.jl
@@ -18,9 +18,12 @@ using CUDA
     b = CUDA.ones(Float32, n) .* 2
     c = CUDA.zeros(Float32, n)
 
-    ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
-
-    @test Array(c) ≈ ones(Float32, n) .* 3
+    if capability(device()) >= v"10"
+        ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
+        @test Array(c) ≈ ones(Float32, n) .* 3
+    else
+        @test_throws "num_cta_in_cga" ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2)
+    end
 end
 
 @testset "launch with occupancy" begin
@@ -60,9 +63,12 @@ end
     b = CUDA.ones(Float32, n) .* 2
     c = CUDA.zeros(Float32, n)
 
-    ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
-
-    @test Array(c) ≈ ones(Float32, n) .* 3
+    if capability(device()) >= v"10"
+        ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
+        @test Array(c) ≈ ones(Float32, n) .* 3
+    else
+        @test_throws "num_cta_in_cga" ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8)
+    end
 end
 
 end