diff --git a/iris/ccl/all_gather.py b/iris/ccl/all_gather.py index 7a177f1c0..7918729f9 100644 --- a/iris/ccl/all_gather.py +++ b/iris/ccl/all_gather.py @@ -6,9 +6,12 @@ Gathers tensors from all ranks and concatenates them along the last dimension. """ +import logging + import triton import triton.language as tl import iris +from iris.host.logging.logging import _log_rank from iris.host.tracing.kernel_artifacts import iris_launch from .config import Config from .utils import extract_group_info @@ -484,6 +487,18 @@ def all_gather( rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, ctx) M, N = input_tensor.shape[:2] + _log_rank( + logging.DEBUG, + "all_gather: shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s", + M, + N, + input_tensor.dtype, + rank_global, + world_size, + async_op, + rank=rank_global, + num_ranks=world_size, + ) expected_output_shape = (world_size * M, N) if output_tensor.shape[:2] != expected_output_shape: diff --git a/iris/ccl/all_reduce.py b/iris/ccl/all_reduce.py index cc301f609..adbe8f8c2 100644 --- a/iris/ccl/all_reduce.py +++ b/iris/ccl/all_reduce.py @@ -6,6 +6,7 @@ Supports multiple variants: atomic, spinlock, ring, two-shot, and one-shot. """ +import logging from dataclasses import dataclass from typing import Optional, Tuple @@ -13,6 +14,7 @@ import triton.language as tl import torch import iris +from iris.host.logging.logging import _log_rank from iris.host.tracing.kernel_artifacts import iris_launch from .config import Config from .utils import chiplet_transform_chunked, ReduceOp, extract_group_info @@ -75,6 +77,16 @@ def all_reduce_preamble( M, N = input_tensor.shape[:2] dtype = input_tensor.dtype + _log_rank( + logging.DEBUG, + "all_reduce_preamble: variant=%s shape=(%d,%d) dtype=%s", + variant, + M, + N, + dtype, + rank=shmem.cur_rank, + num_ranks=shmem.num_ranks, + ) if workspace is None: workspace = AllReduceWorkspace() @@ -767,6 +779,19 @@ def all_reduce( stride_out_m, stride_out_n = output_tensor.stride(0), output_tensor.stride(1) variant = config.all_reduce_variant.lower() + _log_rank( + logging.DEBUG, + "all_reduce: variant=%s shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s", + variant, + M, + N, + input_tensor.dtype, + rank_global, + world_size, + async_op, + rank=rank_global, + num_ranks=world_size, + ) if variant not in [ VARIANT_ATOMIC, VARIANT_SPINLOCK, diff --git a/iris/ccl/all_to_all.py b/iris/ccl/all_to_all.py index 47db0e823..255a952d3 100644 --- a/iris/ccl/all_to_all.py +++ b/iris/ccl/all_to_all.py @@ -6,9 +6,12 @@ Supports both Triton and Gluon implementations based on config. """ +import logging + import triton import triton.language as tl import iris +from iris.host.logging.logging import _log_rank from iris.host.tracing.kernel_artifacts import iris_launch from .config import Config from .utils import chiplet_transform_chunked, extract_group_info @@ -368,6 +371,19 @@ def all_to_all( M, total_N = input_tensor.shape[:2] N = total_N // world_size + _log_rank( + logging.DEBUG, + "all_to_all: shape=(%d,%d) N_per_rank=%d dtype=%s rank=%d/%d async_op=%s", + M, + total_N, + N, + input_tensor.dtype, + rank_global, + world_size, + async_op, + rank=rank_global, + num_ranks=world_size, + ) stride_in_m, stride_in_n = input_tensor.stride(0), input_tensor.stride(1) stride_out_m, stride_out_n = output_tensor.stride(0), output_tensor.stride(1) diff --git a/iris/ccl/reduce_scatter.py b/iris/ccl/reduce_scatter.py index 10d95e85e..772f43ce9 100644 --- a/iris/ccl/reduce_scatter.py +++ b/iris/ccl/reduce_scatter.py @@ -6,9 +6,12 @@ Uses the two-shot approach: reduce assigned tiles and store only to own rank. """ +import logging + import triton import triton.language as tl import iris +from iris.host.logging.logging import _log_rank from iris.host.tracing.kernel_artifacts import iris_launch from .config import Config from .utils import chiplet_transform_chunked, ReduceOp, extract_group_info @@ -214,6 +217,18 @@ def reduce_scatter( # rank_global: global rank in iris context - passed as iris_rank to kernel for RMA operations rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, shmem) M, N = input_tensor.shape[:2] + _log_rank( + logging.DEBUG, + "reduce_scatter: shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s", + M, + N, + input_tensor.dtype, + rank_global, + world_size, + async_op, + rank=rank_global, + num_ranks=world_size, + ) # Validate output shape matches input shape if output_tensor.shape[:2] != (M, N): diff --git a/iris/host/distributed/fd_passing.py b/iris/host/distributed/fd_passing.py index f0c911138..24cc52b4e 100644 --- a/iris/host/distributed/fd_passing.py +++ b/iris/host/distributed/fd_passing.py @@ -156,6 +156,18 @@ def setup_fd_infrastructure(cur_rank: int, num_ranks: int): if num_ranks <= 1: return None + import logging + from iris.host.logging.logging import _log_rank + + _log_rank( + logging.DEBUG, + "setup_fd_infrastructure: rank=%d num_ranks=%d", + cur_rank, + num_ranks, + rank=cur_rank, + num_ranks=num_ranks, + ) + import torch.distributed as dist from iris.host.distributed.helpers import distributed_barrier diff --git a/iris/host/distributed/helpers.py b/iris/host/distributed/helpers.py index 0354d73a9..7da47bbe1 100644 --- a/iris/host/distributed/helpers.py +++ b/iris/host/distributed/helpers.py @@ -2,11 +2,14 @@ # Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +import logging + import torch import torch.distributed as dist import numpy as np import triton import triton.language as tl +from iris.host.logging.logging import _log_rank from iris.host.tracing.kernel_artifacts import iris_launch @@ -58,6 +61,15 @@ def distributed_allgather(data): world_size = dist.get_world_size() device = _infer_device() backend = str(dist.get_backend()).lower() + _log_rank( + logging.DEBUG, + "distributed_allgather: shape=%s backend=%s world_size=%d", + data.shape, + backend, + world_size, + rank=dist.get_rank(), + num_ranks=world_size, + ) # Fast path: tensor all_gather if dtype is NCCL-supported or backend != nccl data_tensor = torch.from_numpy(data) @@ -180,6 +192,14 @@ def distributed_broadcast_tensor(value_to_broadcast=None, root=0): rank = dist.get_rank() device = _infer_device() backend = str(dist.get_backend()).lower() + _log_rank( + logging.DEBUG, + "distributed_broadcast_tensor: src=%d rank=%d", + root, + rank, + rank=rank, + num_ranks=dist.get_world_size(), + ) if rank == root: if value_to_broadcast is None: @@ -291,6 +311,13 @@ def distributed_barrier(group=None): """ if not dist.is_initialized(): raise RuntimeError("PyTorch distributed is not initialized") + _log_rank( + logging.DEBUG, + "distributed_barrier: group=%s", + group, + rank=dist.get_rank(), + num_ranks=dist.get_world_size(), + ) dist.barrier(group=group) diff --git a/iris/host/iris.py b/iris/host/iris.py index 74f798358..a0851595b 100644 --- a/iris/host/iris.py +++ b/iris/host/iris.py @@ -105,6 +105,12 @@ def __init__(self, heap_size=1 << 30, allocator_type="torch"): self.gpu_id = gpu_id self.heap_size = heap_size + if logger.isEnabledFor(logging.INFO): + self._log_with_rank( + logging.INFO, + f"init: heap_size={heap_size / (1 << 30):.1f}GB rank={cur_rank}/{num_ranks} allocator={allocator_type}", + ) + # Initialize symmetric heap with specified allocator self.heap = SymmetricHeap(heap_size, gpu_id, cur_rank, num_ranks, allocator_type) self.device = f"cuda:{gpu_id}" @@ -997,6 +1003,7 @@ def barrier(self, stream=None, group=None): >>> ctx.barrier() # Synchronize all ranks >>> ctx.barrier(group=my_group) # Synchronize only ranks in my_group """ + self._log_with_rank(logging.DEBUG, "barrier: start") # Wait for all GPUs to finish work if stream is None: torch.cuda.synchronize() diff --git a/iris/host/logging/logging.py b/iris/host/logging/logging.py index 3df2dddb5..23edd221d 100644 --- a/iris/host/logging/logging.py +++ b/iris/host/logging/logging.py @@ -6,6 +6,8 @@ """ import logging +import os +import sys # Logging constants (compatible with Python logging levels) DEBUG = logging.DEBUG @@ -15,20 +17,23 @@ class IrisFormatter(logging.Formatter): - """Custom formatter that automatically includes rank information when available.""" + """Custom formatter that includes timestamp, level, rank, and module information.""" def __init__(self): super().__init__() def format(self, record): - # Check if rank information is available in the record - if hasattr(record, "iris_rank") and hasattr(record, "iris_num_ranks"): - prefix = f"[Iris] [{record.iris_rank}/{record.iris_num_ranks}]" - else: - prefix = "[Iris]" - - # Format the message with the appropriate prefix - return f"{prefix} {record.getMessage()}" + rank = getattr(record, "iris_rank", "?") + num_ranks = getattr(record, "iris_num_ranks", "?") + ts = self.formatTime(record, "%H:%M:%S") + level = record.levelname + # Only show [module] for internal iris logs (set by _log_rank), + # not for user-facing ctx.info()/ctx.debug() etc. + iris_internal = getattr(record, "iris_internal", False) + if iris_internal: + module = record.module + return f"{ts} {level:<5s} [Iris] [{rank}/{num_ranks}] [{module}] {record.getMessage()}" + return f"{ts} {level:<5s} [Iris] [{rank}/{num_ranks}] {record.getMessage()}" # Logger instance that can be accessed as iris.logger @@ -37,6 +42,11 @@ def format(self, record): # Set up iris logger logger.setLevel(logging.INFO) # Default level +# Override from environment +_env_level = os.environ.get("IRIS_LOG_LEVEL", "").upper() +if _env_level in ("DEBUG", "INFO", "WARNING", "ERROR"): + logger.setLevel(getattr(logging, _env_level)) + # Add a console handler if none exists if not logger.handlers: _console_handler = logging.StreamHandler() @@ -45,6 +55,28 @@ def format(self, record): logger.addHandler(_console_handler) +def _log_rank(level, msg, *args, rank=None, num_ranks=None): + """Log with optional rank injection. Captures caller's module automatically.""" + if logger.isEnabledFor(level): + # Capture caller's file/line so the formatter can show [module] + frame = sys._getframe(1) + record = logging.LogRecord( + name=logger.name, + level=level, + pathname=frame.f_code.co_filename, + lineno=frame.f_lineno, + msg=msg, + args=args, + exc_info=None, + ) + record.iris_internal = True + if rank is not None: + record.iris_rank = rank + if num_ranks is not None: + record.iris_num_ranks = num_ranks + logger.handle(record) + + def set_logger_level(level): """ Set the logging level for the iris logger. diff --git a/iris/host/memory/allocators/torch_allocator.py b/iris/host/memory/allocators/torch_allocator.py index a0e245516..bbffb6f00 100644 --- a/iris/host/memory/allocators/torch_allocator.py +++ b/iris/host/memory/allocators/torch_allocator.py @@ -8,6 +8,7 @@ sub-allocations within it using bump allocation. """ +import logging import math import numpy as np import torch @@ -15,6 +16,7 @@ import struct from .base import BaseAllocator +from iris.host.logging.logging import _log_rank from iris.host.platform.hip import export_dmabuf_handle, import_dmabuf_handle, destroy_external_memory from iris.host.distributed.fd_passing import send_fd, recv_fd, managed_fd from iris.host.platform.utils import is_simulation_env @@ -41,6 +43,14 @@ def __init__(self, heap_size: int, device_id: int, cur_rank: int, num_ranks: int super().__init__(heap_size, device_id, cur_rank, num_ranks) self.device = f"cuda:{device_id}" + _log_rank( + logging.INFO, + "TorchAllocator: init heap_size=%.1fGB device=%d", + heap_size / (1 << 30), + device_id, + rank=cur_rank, + num_ranks=num_ranks, + ) if is_simulation_env(): import json @@ -93,7 +103,26 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) size_in_bytes = num_elements * element_size aligned_size = math.ceil(size_in_bytes / alignment) * alignment + _log_rank( + logging.DEBUG, + "TorchAllocator.allocate: num_elements=%d dtype=%s size_bytes=%d offset=%d", + num_elements, + dtype, + size_in_bytes, + self.heap_offset, + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) + if self.heap_offset + aligned_size > self.heap_size: + _log_rank( + logging.ERROR, + "TorchAllocator: OOM requested=%d available=%d", + aligned_size, + self.heap_size - self.heap_offset, + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) raise MemoryError("Heap out of memory") start = self.heap_offset diff --git a/iris/host/memory/allocators/vmem_allocator.py b/iris/host/memory/allocators/vmem_allocator.py index 5bbfbd67d..1c39a6312 100644 --- a/iris/host/memory/allocators/vmem_allocator.py +++ b/iris/host/memory/allocators/vmem_allocator.py @@ -8,12 +8,14 @@ enabling features like memory oversubscription and on-demand paging. """ +import logging import torch import os from typing import Dict from threading import Lock from .base import BaseAllocator +from iris.host.logging.logging import _log_rank from iris.host.platform.hip import ( get_allocation_granularity, get_address_range, @@ -59,6 +61,14 @@ def __init__( va_multiplier: float = 1.0, ): super().__init__(heap_size, device_id, rank, world_size) + _log_rank( + logging.INFO, + "VMemAllocator: init heap_size=%.1fGB device=%d", + heap_size / (1 << 30), + device_id, + rank=rank, + num_ranks=world_size, + ) self.va_multiplier = va_multiplier self.device = torch.device(f"cuda:{device_id}") self.lock = Lock() @@ -143,6 +153,17 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) aligned_size = (actual_size_bytes + self.granularity - 1) & ~(self.granularity - 1) aligned_offset = (self.current_offset + alignment - 1) & ~(alignment - 1) + _log_rank( + logging.DEBUG, + "VMemAllocator.allocate: num_elements=%d dtype=%s size_bytes=%d offset=%d", + num_elements, + dtype, + size_bytes, + aligned_offset, + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) + if aligned_offset + aligned_size > self.aligned_heap_size: raise RuntimeError( f"Out of VMem address space for allocation: " diff --git a/iris/host/memory/symmetric_heap.py b/iris/host/memory/symmetric_heap.py index e4e68959c..ff5b6a3cb 100644 --- a/iris/host/memory/symmetric_heap.py +++ b/iris/host/memory/symmetric_heap.py @@ -8,10 +8,13 @@ hiding the details of allocators and inter-process memory sharing. """ +import logging + import numpy as np import torch import os +from iris.host.logging.logging import _log_rank, logger from iris.host.memory.allocators import TorchAllocator, VMemAllocator from iris.host.distributed.fd_passing import setup_fd_infrastructure from iris.host.distributed.helpers import distributed_allgather @@ -54,6 +57,15 @@ def __init__( self.cur_rank = cur_rank self.num_ranks = num_ranks allocator_type = os.environ.get("IRIS_ALLOCATOR", allocator_type).lower() + _log_rank( + logging.INFO, + "SymmetricHeap: init heap_size=%.1fGB device=%d allocator=%s", + heap_size / (1 << 30), + device_id, + allocator_type, + rank=cur_rank, + num_ranks=num_ranks, + ) if is_simulation_env(): allocator_type = "torch" @@ -100,6 +112,14 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024) This should be called collectively across all ranks to maintain symmetric heap consistency. After allocation, peer access is refreshed. """ + _log_rank( + logging.DEBUG, + "SymmetricHeap.allocate: num_elements=%d dtype=%s", + num_elements, + dtype, + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) min_bytes = self.allocator.get_minimum_allocation_size() element_size = torch.tensor([], dtype=dtype).element_size() min_elements = max(1, (min_bytes + element_size - 1) // element_size) @@ -169,6 +189,15 @@ def refresh_peer_access(self): hipMemAccessFlagsProtReadWrite, ) + _log_rank( + logging.DEBUG, + "refresh_peer_access: start rank=%d num_ranks=%d", + self.cur_rank, + self.num_ranks, + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) + if dist.is_initialized(): dist.barrier() @@ -211,6 +240,14 @@ def refresh_peer_access(self): for peer, sock in self.fd_conns.items(): if peer == self.cur_rank: continue + _log_rank( + logging.DEBUG, + "refresh_peer_access: peer=%d segments=%d", + peer, + len(my_exported_fds), + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) if not hasattr(self, "_peer_va_ranges"): self._peer_va_ranges = {} @@ -272,6 +309,15 @@ def refresh_peer_access(self): os.close(fd) + if logger.isEnabledFor(logging.DEBUG): + _log_rank( + logging.DEBUG, + "refresh_peer_access: done heap_bases=[%s]", + ", ".join(f"0x{int(self.heap_bases[r].item()):x}" for r in range(min(3, self.num_ranks))), + rank=self.cur_rank, + num_ranks=self.num_ranks, + ) + if dist.is_initialized(): dist.barrier() diff --git a/iris/host/platform/hip.py b/iris/host/platform/hip.py index e6dc598d8..a23ea8f9e 100644 --- a/iris/host/platform/hip.py +++ b/iris/host/platform/hip.py @@ -2,11 +2,14 @@ # Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. import ctypes +import logging import numpy as np import torch import subprocess import os +from iris.host.logging.logging import _log_rank + # Auto-detect backend _is_amd_backend = True try: @@ -27,10 +30,12 @@ def gpu_try(err): if _is_amd_backend: gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + _log_rank(logging.ERROR, "HIP error code %d: %s", err, error_string) raise RuntimeError(f"HIP error code {err}: {error_string}") else: gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + _log_rank(logging.ERROR, "CUDA error code %d: %s", err, error_string) raise RuntimeError(f"CUDA error code {err}: {error_string}") @@ -280,6 +285,8 @@ def export_dmabuf_handle(ptr, size): if not _is_amd_backend: raise RuntimeError("DMA-BUF export only supported on AMD/HIP backend") + _log_rank(logging.DEBUG, "export_dmabuf_handle: ptr=0x%x size=%d", ptr if isinstance(ptr, int) else ptr.value, size) + ptr_int = ptr if isinstance(ptr, int) else ptr.value ptr_arg = ctypes.c_void_p(ptr_int) diff --git a/iris/host/tracing/kernel_artifacts.py b/iris/host/tracing/kernel_artifacts.py index 064e636c0..c6215e7f9 100644 --- a/iris/host/tracing/kernel_artifacts.py +++ b/iris/host/tracing/kernel_artifacts.py @@ -27,11 +27,14 @@ import hashlib import json +import logging import os from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, Optional +from iris.host.logging.logging import _log_rank + _artifacts_dir: Optional[Path] = None _enabled: bool = False @@ -64,6 +67,15 @@ def iris_launch(kernel_fn, grid, *args, algorithm: str, rank: int, dtype=None, * dtype: Optional tensor dtype for the spec directory name. **kwargs: Keyword arguments forwarded to the kernel (num_warps, etc.). """ + _log_rank( + logging.DEBUG, + "iris_launch: algorithm=%s kernel=%s grid=%s rank=%d", + algorithm, + _get_kernel_name(kernel_fn), + grid, + rank, + rank=rank, + ) compiled = kernel_fn[grid](*args, **kwargs) if _enabled and compiled is not None: kernel_name = _get_kernel_name(kernel_fn) diff --git a/iris/ops/all_gather_matmul.py b/iris/ops/all_gather_matmul.py index c39fc2ede..317c37042 100644 --- a/iris/ops/all_gather_matmul.py +++ b/iris/ops/all_gather_matmul.py @@ -9,6 +9,7 @@ tiles from remote ranks on-demand during GEMM computation. """ +import logging from typing import Optional import torch import triton @@ -203,6 +204,21 @@ def all_gather_matmul( world_size = shmem.get_num_ranks() rank = shmem.get_rank() + from iris.host.logging.logging import _log_rank + + _log_rank( + logging.DEBUG, + "all_gather_matmul: shape=(%d,%d,%d) dtype=%s rank=%d/%d", + M, + N, + K, + A_sharded.dtype, + rank, + world_size, + rank=rank, + num_ranks=world_size, + ) + expected_K = world_size * K_local assert K == expected_K, f"K ({K}) must equal world_size ({world_size}) * K_local ({K_local})" assert output_tensor.shape == (M, N), f"Output must be ({M}, {N}), got {output_tensor.shape}" diff --git a/iris/ops/matmul_all_gather.py b/iris/ops/matmul_all_gather.py index b011367fa..43b2cc8c1 100644 --- a/iris/ops/matmul_all_gather.py +++ b/iris/ops/matmul_all_gather.py @@ -10,6 +10,7 @@ This is useful for tensor-parallel workloads where outputs need to be gathered. """ +import logging from typing import Optional import torch import triton @@ -174,6 +175,21 @@ def matmul_all_gather( world_size = shmem.get_num_ranks() rank = shmem.get_rank() + from iris.host.logging.logging import _log_rank + + _log_rank( + logging.DEBUG, + "matmul_all_gather: shape=(%d,%d,%d) dtype=%s rank=%d/%d", + M_local * world_size, + N, + K, + A.dtype, + rank, + world_size, + rank=rank, + num_ranks=world_size, + ) + assert K == K2, f"Inner dimensions must match: A has K={K}, B has K={K2}" M = M_local * world_size diff --git a/iris/ops/matmul_all_reduce.py b/iris/ops/matmul_all_reduce.py index 634026cf6..29b84b6e3 100644 --- a/iris/ops/matmul_all_reduce.py +++ b/iris/ops/matmul_all_reduce.py @@ -8,6 +8,7 @@ automatically inferring dimensions, strides, and hardware parameters. """ +import logging from typing import Optional import torch import triton @@ -286,6 +287,22 @@ def matmul_all_reduce( rank = shmem.get_rank() world_size = shmem.get_num_ranks() + from iris.host.logging.logging import _log_rank + + _log_rank( + logging.DEBUG, + "matmul_all_reduce: shape=(%d,%d,%d) dtype=%s variant=%s rank=%d/%d", + M, + N, + K, + A.dtype, + config.all_reduce_variant, + rank, + world_size, + rank=rank, + num_ranks=world_size, + ) + # Prepare workspace if needed needs_prepare = workspace is None or not workspace.matches( "matmul_all_reduce", (M, N, K), A.dtype, world_size, config.all_reduce_variant diff --git a/iris/ops/matmul_reduce_scatter.py b/iris/ops/matmul_reduce_scatter.py index 2842fb27c..47454d630 100644 --- a/iris/ops/matmul_reduce_scatter.py +++ b/iris/ops/matmul_reduce_scatter.py @@ -8,6 +8,7 @@ automatically inferring dimensions, strides, and hardware parameters. """ +import logging from typing import Optional import torch import triton @@ -228,6 +229,21 @@ def matmul_reduce_scatter( rank = shmem.get_rank() world_size = shmem.get_num_ranks() + from iris.host.logging.logging import _log_rank + + _log_rank( + logging.DEBUG, + "matmul_reduce_scatter: shape=(%d,%d,%d) dtype=%s rank=%d/%d", + M, + N, + K, + A.dtype, + rank, + world_size, + rank=rank, + num_ranks=world_size, + ) + num_pid_m = (M + config.block_size_m - 1) // config.block_size_m num_pid_n = (N + config.block_size_n - 1) // config.block_size_n total_tiles = num_pid_m * num_pid_n diff --git a/tests/unittests/test_logging.py b/tests/unittests/test_logging.py index 399d2c369..911bc53b9 100644 --- a/tests/unittests/test_logging.py +++ b/tests/unittests/test_logging.py @@ -109,8 +109,8 @@ def test_logger_api_usage(): iris.logger.debug("Test debug message (should be visible)") output = log_capture.getvalue() - assert "[Iris] Test info message" in output - assert "[Iris] Test debug message (should be visible)" in output + assert "[Iris]" in output and "Test info message" in output + assert "[Iris]" in output and "Test debug message (should be visible)" in output # The hidden debug message should not appear lines = output.split("\n") hidden_debug_count = sum(1 for line in lines if "should be hidden" in line) @@ -130,7 +130,8 @@ def test_iris_formatter(): ) formatted_no_rank = formatter.format(record_no_rank) - assert formatted_no_rank == "[Iris] Test message without rank" + # Format: "HH:MM:SS LEVEL [Iris] [?/?] message" + assert "[Iris] [?/?] Test message without rank" in formatted_no_rank # Test record with rank information record_with_rank = logging.LogRecord( @@ -140,7 +141,19 @@ def test_iris_formatter(): record_with_rank.iris_num_ranks = 4 formatted_with_rank = formatter.format(record_with_rank) - assert formatted_with_rank == "[Iris] [2/4] Test message with rank" + assert "[Iris] [2/4] Test message with rank" in formatted_with_rank + + # Test internal record shows [module] + record_internal = logging.LogRecord( + name="iris", level=logging.INFO, pathname="all_gather.py", lineno=0, msg="Internal log", args=(), exc_info=None + ) + record_internal.iris_internal = True + record_internal.iris_rank = 0 + record_internal.iris_num_ranks = 4 + + formatted_internal = formatter.format(record_internal) + assert "[all_gather]" in formatted_internal + assert "[Iris] [0/4]" in formatted_internal def test_api_import():