-
Notifications
You must be signed in to change notification settings - Fork 7
Add Workload object and execution methods #18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,181 @@ | ||||||
| """ | ||||||
| Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. | ||||||
| """ | ||||||
| import numpy as np | ||||||
| from mlir import ir | ||||||
| from mlir.runtime.np_to_memref import get_ranked_memref_descriptor | ||||||
| from mlir.dialects import func, linalg, bufferization | ||||||
| from mlir.dialects import transform | ||||||
| from functools import cached_property | ||||||
| from lighthouse import Workload | ||||||
| from lighthouse.utils.mlir import ( | ||||||
| apply_registered_pass, | ||||||
| canonicalize, | ||||||
| cse, | ||||||
| match, | ||||||
| ) | ||||||
| from lighthouse.utils.execution import ( | ||||||
| lower_payload, | ||||||
| execute, | ||||||
| benchmark, | ||||||
| ) | ||||||
|
|
||||||
|
|
||||||
class ElementwiseSum(Workload):
    """
    Computes element-wise sum of (M, N) float32 arrays on CPU.

    We can construct the input arrays and compute the reference solution in
    Python with Numpy.

    We use @cached_property to store the inputs and reference solution in the
    object so that they are only computed once.
    """

    def __init__(self, M, N):
        """Store the problem size and create the MLIR context and location.

        Args:
            M: First dimension of the input and output arrays.
            N: Second dimension of the input and output arrays.
        """
        self.M = M
        self.N = N
        self.dtype = np.float32
        self.context = ir.Context()
        self.location = ir.Location.unknown(context=self.context)

    @cached_property
    def _input_arrays(self):
        """Return ``[A, B, C]``: two random inputs and a zero-filled output.

        The global Numpy RNG is seeded with a fixed value so the generated
        inputs are reproducible.
        """
        print(" * Generating input arrays...")
        np.random.seed(2)
        A = np.random.rand(self.M, self.N).astype(self.dtype)
        B = np.random.rand(self.M, self.N).astype(self.dtype)
        C = np.zeros((self.M, self.N), dtype=self.dtype)
        return [A, B, C]

    @cached_property
    def _reference_solution(self):
        """Return the Numpy reference result ``A + B``."""
        print(" * Computing reference solution...")
        A, B, _ = self._input_arrays
        return A + B

    def get_input_arrays(self, execution_engine):
        """Return ranked memref descriptors for the cached ``[A, B, C]``.

        ``execution_engine`` is not used by this implementation.
        """
        return [
            get_ranked_memref_descriptor(a) for a in self._input_arrays
        ]

    def verify(self, execution_engine, verbose: int = 0) -> bool:
        """Compare the output array ``C`` against the Numpy reference.

        Args:
            execution_engine: Not used by this implementation.
            verbose: Any truthy value prints PASSED/FAILED; a value greater
                than 1 additionally prints both arrays.

        Returns:
            True if ``np.allclose`` accepts the computed result.
        """
        # NOTE(review): a reviewer left a question on this signature; the
        # comment text is truncated in the source ("Should ...").
        C = self._input_arrays[2]
        C_ref = self._reference_solution
        if verbose > 1:
            print("Reference solution:")
            print(C_ref)
            print("Computed solution:")
            print(C)
        success = np.allclose(C, C_ref)
        if verbose:
            if success:
                print("PASSED")
            else:
                print("FAILED Result mismatch!")
        return success

    def requirements(self):
        """Return the list of requirements of this workload (none)."""
        return []

    def get_complexity(self):
        """Return ``(flop_count, memory_reads, memory_writes)``.

        The memory figures are in bytes, derived from the item size of
        ``self.dtype``.
        """
        nbytes = np.dtype(self.dtype).itemsize
        n_elements = self.M * self.N
        flop_count = n_elements  # one addition per element
        memory_reads = 2 * n_elements * nbytes  # read A and B
        memory_writes = n_elements * nbytes  # write C
        return (flop_count, memory_reads, memory_writes)

    def payload_module(self):
        """Build the MLIR module that holds the payload function.

        The function takes three (M, N) f32 memrefs (A, B, C), converts them
        to tensors, adds A and B with ``linalg.add`` and materializes the
        result into C. It is tagged with ``llvm.emit_c_interface``.

        Returns:
            The created ``ir.Module``.
        """
        # NOTE(review): a reviewer attached a "Suggested change" to the
        # context/location `with` statement below; its content is not
        # visible in the source.
        with self.context, self.location:
            float32_t = ir.F32Type.get()
            shape = (self.M, self.N)
            tensor_t = ir.RankedTensorType.get(shape, float32_t)
            memref_t = ir.MemRefType.get(shape, float32_t)
            mod = ir.Module.create()
            with ir.InsertionPoint(mod.body):
                arg_types = (memref_t, memref_t, memref_t)
                f = func.FuncOp(self.payload_function_name, (arg_types, ()))
                f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
                with ir.InsertionPoint(f.add_entry_block()):
                    A = f.arguments[0]
                    B = f.arguments[1]
                    C = f.arguments[2]
                    a_tensor = bufferization.ToTensorOp(
                        tensor_t, A, restrict=True
                    )
                    b_tensor = bufferization.ToTensorOp(
                        tensor_t, B, restrict=True
                    )
                    # C is the destination, so it must also be writable.
                    c_tensor = bufferization.ToTensorOp(
                        tensor_t, C, restrict=True, writable=True
                    )
                    # NOTE(review): a reviewer asked "Is there a reason to
                    # prefer doing the ..." here (truncated in the source).
                    add = linalg.add(a_tensor, b_tensor, outs=[c_tensor])
                    bufferization.MaterializeInDestinationOp(
                        None, add, C, restrict=True, writable=True
                    )
                    func.ReturnOp(())
        return mod

    def schedule_module(self, dump_kernel=None, parameters=None):
        """Build the transform-dialect schedule that lowers the payload.

        Args:
            dump_kernel: If equal to ``"bufferized"``, the schedule stops
                after bufferization, conversion to loops and cleanup, and
                the LLVM lowering passes are not added.
            parameters: Not used by this implementation.

        Returns:
            The ``ir.Module`` holding the ``__transform_main`` sequence.
        """
        # Passes that take the bufferized loop form down to the LLVM dialect.
        llvm_lowering_passes = (
            "convert-scf-to-cf",
            "finalize-memref-to-llvm",
            "convert-cf-to-llvm",
            "convert-arith-to-llvm",
            "convert-func-to-llvm",
            "reconcile-unrealized-casts",
        )
        with self.context, self.location:
            schedule = ir.Module.create()
            schedule.operation.attributes[
                "transform.with_named_sequence"
            ] = ir.UnitAttr.get()
            with ir.InsertionPoint(schedule.body):
                # NOTE(review): a reviewer attached a "Suggested change" to
                # the NamedSequenceOp construction; its content is not
                # visible in the source.
                named_sequence = transform.NamedSequenceOp(
                    "__transform_main",
                    [transform.AnyOpType.get()],
                    [],
                    arg_attrs=[{"transform.readonly": ir.UnitAttr.get()}],
                )
                with ir.InsertionPoint(named_sequence.body):
                    anytype = transform.AnyOpType.get()
                    # Named `func_handle` so the imported `func` dialect
                    # module is not shadowed inside this method.
                    func_handle = match(
                        named_sequence.bodyTarget, ops={"func.func"}
                    )
                    mod = transform.get_parent_op(
                        anytype,
                        func_handle,
                        op_name="builtin.module",
                        deduplicate=True,
                    )
                    mod = apply_registered_pass(mod, "one-shot-bufferize")
                    mod = apply_registered_pass(
                        mod, "convert-linalg-to-loops"
                    )
                    cse(mod)
                    canonicalize(mod)

                    # NOTE(review): a reviewer asked about the "bufferized"
                    # early exit: "Do we want this in the committed version?
                    # An alternative to consider is inserting ..." (truncated
                    # in the source).
                    if dump_kernel != "bufferized":
                        for pass_name in llvm_lowering_passes:
                            mod = apply_registered_pass(mod, pass_name)
                    transform.YieldOp()

        return schedule
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Demo driver: dump the bufferized kernel, run and verify twice, then
    # benchmark and report timing statistics.
    wload = ElementwiseSum(400, 400)

    print(" Dump kernel ".center(60, "-"))
    lower_payload(wload, dump_kernel="bufferized", dump_schedule=True)

    print(" Execute 1 ".center(60, "-"))
    execute(wload, verbose=2)

    print(" Execute 2 ".center(60, "-"))
    execute(wload, verbose=1)

    print(" Benchmark ".center(60, "-"))
    times = benchmark(wload)
    times *= 1e6  # convert to microseconds
    # compute statistics
    # (named t_min / t_max so the builtins min() and max() stay usable)
    mean = np.mean(times)
    t_min = np.min(times)
    t_max = np.max(times)
    std = np.std(times)
    print(f"Timings (us): "
          f"mean={mean:.2f}+/-{std:.2f} min={t_min:.2f} max={t_max:.2f}")
    flop_count = wload.get_complexity()[0]
    # mean is in microseconds: convert back to seconds, then scale to GFLOPS
    gflops = flop_count / (mean * 1e-6) / 1e9
    print(f"Throughput: {gflops:.2f} GFLOPS")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,217 @@ | ||
| """ | ||
| Workload example: Element-wise sum of two (M, N) float32 arrays on CPU. | ||
|
|
||
| In this example, allocation and deallocation of input arrays is done in MLIR. | ||
| """ | ||
| import numpy as np | ||
| from mlir import ir | ||
| from mlir.runtime.np_to_memref import ( | ||
| ranked_memref_to_numpy, | ||
| make_nd_memref_descriptor, | ||
| as_ctype, | ||
| ) | ||
| from mlir.dialects import func, linalg, bufferization, arith, memref | ||
| from mlir.dialects import transform | ||
| import ctypes | ||
| from contextlib import contextmanager | ||
| from lighthouse import Workload | ||
| from lighthouse.utils.mlir import ( | ||
| apply_registered_pass, | ||
| canonicalize, | ||
| cse, | ||
| match, | ||
| ) | ||
| from lighthouse.utils import get_packed_arg | ||
| from lighthouse.utils.execution import ( | ||
| lower_payload, | ||
| execute, | ||
| benchmark, | ||
| ) | ||
| from example import ElementwiseSum | ||
|
|
||
|
|
||
def emit_host_alloc(mod, suffix, element_type, rank=2):
    """Emit the function ``host_alloc_<suffix>`` into ``mod``.

    The emitted function takes ``rank`` i32 sizes, casts each one to
    ``index``, allocates a memref with ``rank`` dynamic dimensions of
    ``element_type`` and returns it. It is tagged with
    ``llvm.emit_c_interface``.

    Args:
        mod: Module whose body receives the new function.
        suffix: Appended to ``"host_alloc_"`` to form the function name.
        element_type: Element type of the allocated memref.
        rank: Number of dimensions (and of i32 size arguments).
    """
    index_t = ir.IndexType.get()
    i32_t = ir.IntegerType.get_signless(32)
    dynamic_shape = rank * (ir.ShapedType.get_dynamic_size(),)
    memref_dyn_t = ir.MemRefType.get(dynamic_shape, element_type)
    signature = (rank * (i32_t,), (memref_dyn_t,))
    with ir.InsertionPoint(mod.body):
        f = func.FuncOp("host_alloc_" + suffix, signature)
        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(f.add_entry_block()):
            # One index cast per size argument, in argument order.
            dims = []
            for size_arg in f.arguments:
                dims.append(arith.IndexCastOp(index_t, size_arg))
            alloc = memref.alloc(memref_dyn_t, dims, [])
            func.ReturnOp((alloc,))
|
|
||
|
|
||
def emit_host_dealloc(mod, suffix, element_type, rank=2):
    """Emit the function ``host_dealloc_<suffix>`` into ``mod``.

    The emitted function takes one memref with ``rank`` dynamic dimensions
    of ``element_type``, deallocates it and returns nothing. It is tagged
    with ``llvm.emit_c_interface``.

    Args:
        mod: Module whose body receives the new function.
        suffix: Appended to ``"host_dealloc_"`` to form the function name.
        element_type: Element type of the memref argument.
        rank: Number of dimensions of the memref argument.
    """
    dynamic_shape = rank * (ir.ShapedType.get_dynamic_size(),)
    memref_dyn_t = ir.MemRefType.get(dynamic_shape, element_type)
    signature = ((memref_dyn_t,), ())
    with ir.InsertionPoint(mod.body):
        f = func.FuncOp("host_dealloc_" + suffix, signature)
        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(f.add_entry_block()):
            buffer = f.arguments[0]
            memref.dealloc(buffer)
            func.ReturnOp(())
|
|
||
|
|
||
def emit_fill_constant(mod, suffix, value, element_type, rank=2):
    """Emit the function ``host_fill_constant_<suffix>`` into ``mod``.

    The emitted function takes one memref with ``rank`` dynamic dimensions
    of ``element_type`` and fills it with ``value`` using ``linalg.fill``.
    It is tagged with ``llvm.emit_c_interface``.

    Args:
        mod: Module whose body receives the new function.
        suffix: Appended to ``"host_fill_constant_"`` to form the name.
        value: Constant written to every element.
        element_type: Element type of the memref and of the constant.
        rank: Number of dimensions of the memref argument.
    """
    dynamic_shape = rank * (ir.ShapedType.get_dynamic_size(),)
    memref_dyn_t = ir.MemRefType.get(dynamic_shape, element_type)
    signature = ((memref_dyn_t,), ())
    with ir.InsertionPoint(mod.body):
        f = func.FuncOp("host_fill_constant_" + suffix, signature)
        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(f.add_entry_block()):
            target = f.arguments[0]
            const = arith.constant(element_type, value)
            linalg.fill(const, outs=[target])
            func.ReturnOp(())
|
|
||
|
|
||
def emit_fill_random(mod, suffix, element_type, min=0.0, max=1.0, seed=2):
    """Emit the function ``host_fill_random_<suffix>`` into ``mod``.

    The emitted function takes one 2-D dynamically shaped memref of
    ``element_type`` and fills it with ``linalg.fill_rng_2d``. The rank is
    fixed to 2 because the 2-D variant of the op is used. ``min``, ``max``
    and ``seed`` are baked into the function as constants (f64, f64, i32).
    It is tagged with ``llvm.emit_c_interface``.

    Args:
        mod: Module whose body receives the new function.
        suffix: Appended to ``"host_fill_random_"`` to form the name.
        element_type: Element type of the memref argument.
        min: Lower bound passed to ``fill_rng_2d``.
        max: Upper bound passed to ``fill_rng_2d``.
        seed: Seed passed to ``fill_rng_2d``.
    """
    i32_t = ir.IntegerType.get_signless(32)
    f64_t = ir.F64Type.get()
    dynamic_shape = 2 * (ir.ShapedType.get_dynamic_size(),)
    memref_dyn_t = ir.MemRefType.get(dynamic_shape, element_type)
    signature = ((memref_dyn_t,), ())
    with ir.InsertionPoint(mod.body):
        f = func.FuncOp("host_fill_random_" + suffix, signature)
        f.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
        with ir.InsertionPoint(f.add_entry_block()):
            target = f.arguments[0]
            min_cst = arith.constant(f64_t, min)
            max_cst = arith.constant(f64_t, max)
            seed_cst = arith.constant(i32_t, seed)
            linalg.fill_rng_2d(min_cst, max_cst, seed_cst, outs=[target])
            func.ReturnOp(())
|
|
||
|
|
||
class ElementwiseSumMLIRAlloc(ElementwiseSum):
    """
    Computes element-wise sum of (M, N) float32 arrays on CPU.

    Extends ElementwiseSum by allocating input arrays in MLIR.
    """

    def __init__(self, M, N):
        """Initialize the base workload and the registry of live memrefs."""
        super().__init__(M, N)
        # keep track of allocated memrefs
        self.memrefs = {}

    @staticmethod
    def _packed_memref_arg(mref):
        """Pack one memref descriptor as the argument of a compiled call."""
        return get_packed_arg([ctypes.pointer(ctypes.pointer(mref))])

    def _allocate_array(self, name, execution_engine):
        """Return the memref registered under ``name``, allocating if needed.

        Allocation calls the compiled ``host_alloc_f32`` function with the
        (M, N) shape and records the resulting descriptor in
        ``self.memrefs``.
        """
        if name in self.memrefs:
            return self.memrefs[name]
        alloc_func = execution_engine.lookup("host_alloc_f32")
        shape = (self.M, self.N)
        # Empty descriptor that the compiled function fills in as its result.
        mref = make_nd_memref_descriptor(len(shape), as_ctype(self.dtype))()
        ptr_mref = ctypes.pointer(ctypes.pointer(mref))
        ptr_dims = [ctypes.pointer(ctypes.c_int32(d)) for d in shape]
        alloc_func(get_packed_arg([ptr_mref, *ptr_dims]))
        self.memrefs[name] = mref
        return mref

    def _allocate_inputs(self, execution_engine):
        """Allocate the memrefs "A", "B" and "C" (no-op if already present)."""
        for name in ("A", "B", "C"):
            self._allocate_array(name, execution_engine)

    def _deallocate_all(self, execution_engine):
        """Deallocate every registered memref and clear the registry."""
        if self.memrefs:
            # The symbol is the same for every memref: look it up once.
            dealloc_func = execution_engine.lookup("host_dealloc_f32")
            for mref in self.memrefs.values():
                dealloc_func(self._packed_memref_arg(mref))
        self.memrefs = {}

    @contextmanager
    def allocate(self, execution_engine):
        """Context manager that allocates the inputs and frees them on exit.

        All registered memrefs are deallocated when the context exits, also
        when the body raises.
        """
        try:
            self._allocate_inputs(execution_engine)
            yield None
        finally:
            self._deallocate_all(execution_engine)

    def get_input_arrays(self, execution_engine):
        """Return the memrefs ``[A, B, C]`` after (re)initializing them.

        C is filled with zeros and A and B with random values, all by
        calling compiled MLIR functions.
        """
        A = self._allocate_array("A", execution_engine)
        B = self._allocate_array("B", execution_engine)
        C = self._allocate_array("C", execution_engine)

        # initialize with MLIR
        fill_zero_func = execution_engine.lookup("host_fill_constant_zero_f32")
        fill_random_func = execution_engine.lookup("host_fill_random_f32")
        fill_zero_func(self._packed_memref_arg(C))
        # NOTE(review): both calls use the same compiled function, whose seed
        # is a constant, so A and B presumably get identical contents -
        # TODO confirm this is intended.
        fill_random_func(self._packed_memref_arg(A))
        fill_random_func(self._packed_memref_arg(B))

        return [A, B, C]

    def verify(self, execution_engine, verbose: int = 0) -> bool:
        """Compare the computed C against ``A + B`` evaluated with Numpy.

        Reads the memrefs "A", "B" and "C" from ``self.memrefs``, so they
        must currently be allocated.

        Args:
            execution_engine: Not used by this implementation.
            verbose: Any truthy value prints PASSED/FAILED; a value greater
                than 1 additionally prints both arrays.

        Returns:
            True if ``np.allclose`` accepts the computed result.
        """
        # compute reference solution with numpy
        A = ranked_memref_to_numpy([self.memrefs["A"]])
        B = ranked_memref_to_numpy([self.memrefs["B"]])
        C = ranked_memref_to_numpy([self.memrefs["C"]])
        C_ref = A + B
        if verbose > 1:
            print("Reference solution:")
            print(C_ref)
            print("Computed solution:")
            print(C)
        success = np.allclose(C, C_ref)

        # Alternatively we could have done the verification in MLIR by
        # emitting a check function, or by allocating a "C_ref" memref,
        # calling the "payload" function again on (A, B, C_ref) and comparing
        # C with C_ref using numpy.

        if verbose:
            if success:
                print("PASSED")
            else:
                print("FAILED Result mismatch!")
        return success

    def payload_module(self):
        """Return the base payload module extended with host helpers.

        Adds the f32 alloc, dealloc, zero-fill and random-fill functions
        that the other methods look up by name.
        """
        mod = super().payload_module()
        # extend the payload module with de/alloc/fill functions
        with self.context, self.location:
            float32_t = ir.F32Type.get()
            emit_host_alloc(mod, "f32", float32_t)
            emit_host_dealloc(mod, "f32", float32_t)
            emit_fill_constant(mod, "zero_f32", 0.0, float32_t)
            emit_fill_random(mod, "f32", float32_t, min=-1.0, max=1.0)
        return mod
|
|
||
|
|
||
if __name__ == "__main__":
    # Demo driver: dump the bufferized kernel, run and verify once, then
    # benchmark and report timing statistics.
    wload = ElementwiseSumMLIRAlloc(400, 400)

    print(" Dump kernel ".center(60, "-"))
    lower_payload(wload, dump_kernel="bufferized", dump_schedule=False)

    print(" Execute ".center(60, "-"))
    execute(wload, verbose=2)

    print(" Benchmark ".center(60, "-"))
    times = benchmark(wload)
    times *= 1e6  # convert to microseconds
    # compute statistics
    # (named t_min / t_max so the builtins min() and max() stay usable)
    mean = np.mean(times)
    t_min = np.min(times)
    t_max = np.max(times)
    std = np.std(times)
    print(f"Timings (us): "
          f"mean={mean:.2f}+/-{std:.2f} min={t_min:.2f} max={t_max:.2f}")
    flop_count = wload.get_complexity()[0]
    # mean is in microseconds: convert back to seconds, then scale to GFLOPS
    gflops = flop_count / (mean * 1e-6) / 1e9
    print(f"Throughput: {gflops:.2f} GFLOPS")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Left a comment elsewhere about it being mostly possible to elide thinking about
Contexts and unknown Locations in most code.