iree-org · ftynse · Feb 3, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 4, 2025
diff --git a/iree/turbine/kernel/compiler/vector_codegen.py b/iree/turbine/kernel/compiler/vector_codegen.py
@@ -864,6 +864,7 @@ def cast_vector(
         # Implicit scalar type promotion.
         proxy_value = ScalarBuilder.to_dtype(proxy_value, element_type)
 
+    # print(f"proxy_value {proxy_value}")
     value = proxy_value.ir_value
 
     # After scalar promotion, promote to vector.

diff --git a/iree/turbine/kernel/ops/wave_ops.py b/iree/turbine/kernel/ops/wave_ops.py
@@ -148,6 +148,10 @@ def minimum(lhs: "Register", rhs: "Register") -> "Register":
     ...
 
 
+def and_op(lhs: "Register", rhs: "Register") -> "Register":
+    ...  # Signature stub; see @define_interface_op("and_op") on BinaryPyOp.
+
+
 def broadcast(
     arg: "Register", target_shape: Optional[Sequence[IndexExpr | int]] = None
 ) -> "Register":
@@ -769,6 +773,7 @@ def infer_shape(self) -> Any:
 @define_py_op(operator.truediv)
 @define_interface_op("maximum")
 @define_interface_op("minimum")
+@define_interface_op("and_op")
 @dataclass
 class BinaryPyOp(BinaryOpBase, ABC):
     def infer_type(self):

diff --git a/iree/turbine/kernel/wave/codegen/handlers.py b/iree/turbine/kernel/wave/codegen/handlers.py
@@ -54,6 +54,7 @@
 from ...ops.wave_ops import (
     abs,
     allocate,
+    and_op,
     apply_expr,
     broadcast,
     cast,
@@ -261,7 +262,8 @@ def handle_set_symbol(emitter: WaveEmitter, node: fx.Node):
         raise ValidationError("Malformed arguments") from e
 
     register = cast_vector(emitter, register, element_type=IndexType.get())
-    emitter.dynamic_dims[symbol] = _to_scalar(register)
+    # emitter.dynamic_dims[symbol] = _to_scalar(register)
+    emitter.dynamic_dims[symbol] = register
 
 
 ###############################################################################
@@ -523,6 +525,16 @@ def handle_le(lhs: Value, rhs: Value) -> OpResult:
     return result
 
 
+@handle_binary_op(and_op)
+def handle_and_op(lhs: Value, rhs: Value) -> OpResult:
+    """Emit `arith_d.andi` for integer-like operands; raise ValidationError otherwise."""
+    element_type = get_type_or_element_type(lhs.type)
+    if not _is_integer_like_type(element_type):
+        msg = f"Found unhandled operand type for and_op: {element_type}"
+        raise ValidationError(msg)
+    return arith_d.andi(lhs, rhs)
+
+
 @handle_binary_op(maximum)
 def handle_maximum(lhs: Value, rhs: Value) -> OpResult:
     element_type = get_type_or_element_type(lhs.type)
@@ -545,7 +557,7 @@ def handle_minimum(lhs: Value, rhs: Value) -> OpResult:
     if _is_float_type(element_type):
         result = arith_d.minimumf(lhs, rhs)
     elif _is_integer_like_type(element_type) and (
-        element_type.is_signed() or element_type.is_signless()
+        element_type.is_signed or element_type.is_signless
     ):
         result = arith_d.minsi(lhs, rhs)
     else:
@@ -849,8 +861,12 @@ def handle_broadcast(emitter: WaveEmitter, node: fx.Node):
     # Get thread_shape/size for broadcast.
     get_thread_shape = lambda index: max(subs_idxc(x.size) for x in index.values())
 
-    src_thread_size = get_thread_shape(register.index) if register.index else None
-    target_thread_size = get_thread_shape(node.index)
+    src_thread_size = (
+        get_thread_shape(register.index)
+        if hasattr(register, "index") and register.index
+        else None
+    )
+    target_thread_size = get_thread_shape(node.index) if node.index else None
 
     # Check MLIR shape
     vector_src = cast_vector(emitter, register)

diff --git a/iree/turbine/kernel/wave/codegen/read_write.py b/iree/turbine/kernel/wave/codegen/read_write.py
@@ -7,6 +7,7 @@
 import sympy
 import functools
 from typing import Any, Callable, ClassVar, Optional, List, Type, Dict
+import math
 
 import torch.fx as fx
 
@@ -134,26 +135,17 @@ def _build_mask(
     return mask
 
 
-def _get_splat_const(vec_type: IrType, value: Any) -> Value:
-    splat = DenseElementsAttr.get_splat(
-        vec_type, get_constant_attr(value, vec_type.element_type)
-    )
-    return arith_d.constant(vec_type, splat)
-
-
-def _constant_mask(vec_type: IrType) -> Value:
-    return _get_splat_const(vec_type, 1)
-
-
 def _construct_gather_scatter_indices(
     emitter: WaveEmitter,
+    # TODO: fix the typo in the parameter name below (`symbolc_shape`).
     symbolc_shape: tuple[IndexExpr],
     index: tuple[IndexExpr],
     mapping: IndexMapping,
     elements_per_thread: int,
     is_read: bool,
     dynamic_vals: tuple[Any, ...],
     is_contiguous: bool,
+    vector_shaped_symbols={},
 ) -> tuple[OpResult, OpResult, OpResult]:
     # Apply symbolc_shape order to indices, e.g. if original mapping is
     # {M: iter(0), N: iter(1)} and symbolc_shape is (N, M), result will
@@ -189,7 +181,7 @@ def _construct_gather_scatter_indices(
         mask_vec_type = VectorType.get(
             [elements_per_thread], IntegerType.get_signless(1)
         )
-        mask = _constant_mask(mask_vec_type)
+        mask = vector_d.constant_mask(mask_vec_type, [elements_per_thread])
 
     def extract0(src):
         static_pos = [0] * src.type.rank
@@ -221,34 +213,46 @@ def extract0(src):
     offsets = []
     strides = strides_from_symbolic_shape(idxc, symbolc_shape, allow_mixed_shapes=True)
     start_indices_offset = _compute_offset(start_indices, strides)
-    for i in range(elements_per_thread):
-        # Update fastest dim, i.e. in case of identity mapping it will
-        # be equivalent to just vector.load
-        subs = [(sym, idx) for sym, idx in zip(iters.keys(), start_indices_orig)]
-        subs[fastest_dim] = (subs[fastest_dim][0], start_indices_orig[fastest_dim] + i)
-        indices = [i.subs(subs) for i in index_mapping]
 
-        # First, we build indices as if resulting gather/scatter `start_indices`
-        # are 0 as mapping expression may depend on absolute value of index
-        # (e.g. `index % 32`). Then we adjust for the non-0 `start_indices` by
-        # subtracting computed previously linear `start_indices_offset`. For
-        # simple cases like transpose, the resulting expression should fold into
-        # simple constant while more complex expressions may requires actual
-        # arith ops on dynamic values.
-        offset = _compute_offset(indices, strides) - start_indices_offset
-        offset = subs_idxc(offset)
-
-        if offset.is_number:
-            # If resulted offset sympy expr is convertible to int constant it
-            # will be directly encoded into `arith.constant`.
-            # For non-constant expressions, we will generate a real sequence of
-            # arith ops and then `vector.insertelement` them into offsets vec.
-            offset = int(offset)
-        else:
-            need_dynamic_offsets = True
-            break
+    # TODO: we don't necessarily care if they are vector shaped, but rather
+    # if they are indexed by the fastest varying dimension?
+    # Note that we may want to "expand" the symbol to per-element
+    # copies and trigger `need_dynamic_offsets` below.
+    if len(start_indices_offset.free_symbols.intersection(vector_shaped_symbols)) != 0:
+        need_dynamic_offsets = True
 
-        offsets.append(offset)
+    if not need_dynamic_offsets:
+        for i in range(elements_per_thread):
+            # Update fastest dim, i.e. in case of identity mapping it will
+            # be equivalent to just vector.load
+            subs = [(sym, idx) for sym, idx in zip(iters.keys(), start_indices_orig)]
+            subs[fastest_dim] = (
+                subs[fastest_dim][0],
+                start_indices_orig[fastest_dim] + i,
+            )
+            indices = [i.subs(subs) for i in index_mapping]
+
+            # First, we build indices as if resulting gather/scatter `start_indices`
+            # are 0 as mapping expression may depend on absolute value of index
+            # (e.g. `index % 32`). Then we adjust for the non-0 `start_indices` by
+            # subtracting computed previously linear `start_indices_offset`. For
+            # simple cases like transpose, the resulting expression should fold into
+            # simple constant while more complex expressions may requires actual
+            # arith ops on dynamic values.
+            offset = _compute_offset(indices, strides) - start_indices_offset
+            offset = subs_idxc(offset)
+
+            if offset.is_number:
+                # If resulted offset sympy expr is convertible to int constant it
+                # will be directly encoded into `arith.constant`.
+                # For non-constant expressions, we will generate a real sequence of
+                # arith ops and then `vector.insertelement` them into offsets vec.
+                offset = int(offset)
+            else:
+                need_dynamic_offsets = True
+                break
+
+            offsets.append(offset)
 
     offsets_vec_type = VectorType.get([elements_per_thread], IndexType.get())
     if need_dynamic_offsets:
@@ -260,13 +264,22 @@ def extract0(src):
         )
         subs = [(sym, idx) for sym, idx in zip(iters.keys(), start_indices_orig)]
         # Last item in `subs` corresponds to last item in `start_indices_orig`
-        # which is fastest changing dim.
-        # Replacing last element with `idxc.iota(elements_per_thread)` will
-        # generate vectorized index code, each element in it corresponding to
-        # individual vector element index.
+        # which is fastest changing dim. Replacing last element with
+        # `idxc.iota(elements_per_thread)` will generate vectorized index code,
+        # each element in it corresponding to individual vector element index.
+        #
+        # TODO: a vector-shaped symbol means we can't just use iota here.
+        # Instead we should take the value of the symbol, i.e. somehow get
+        # the different values of OFFSET (or other vector-shaped symbols)
+        # into the `get_sympy_index` below.
+        #
+        # We also need to take care if there are several symbols, especially a
+        # mix of constant and non-constant symbols...
+        #
+        # What happens when there are no symbols?
         subs[-1] = (
             subs[-1][0],
-            start_indices_orig[-1] + idxc.iota(elements_per_thread),
+            start_indices_orig[-1],  # + idxc.iota(elements_per_thread),
         )
         dynamic_vals_map = {
             sym: val
@@ -460,6 +473,15 @@ def handle_read(emitter: WaveEmitter, node: fx.Node):
         dyn_vals = tuple(
             cast_vector(emitter, reg, element_type=IndexType.get()) for reg in dyn_vals
         )
+
+        # TODO: we can sink this down, actually...
+        vector_shaped_symbols = set(
+            sym
+            for sym, value in emitter.dynamic_dims.items()
+            if isinstance(value.type, ShapedType)
+            and math.prod(ShapedType(value.type).shape) != 1
+        )
+
         start_indices, offsets_vec, mask = _construct_gather_scatter_indices(
             emitter=emitter,
             symbolc_shape=input_shape,
@@ -469,6 +491,7 @@ def handle_read(emitter: WaveEmitter, node: fx.Node):
             is_read=True,
             dynamic_vals=dyn_vals,
             is_contiguous=get_custom(node).is_contiguous_vec(),
+            vector_shaped_symbols=vector_shaped_symbols,
         )
         result = _create_vec_read(
             emitter,