diff --git a/iris/ccl/all_gather.py b/iris/ccl/all_gather.py
index 7a177f1c0..7918729f9 100644
--- a/iris/ccl/all_gather.py
+++ b/iris/ccl/all_gather.py
@@ -6,9 +6,12 @@
 Gathers tensors from all ranks and concatenates them along the last dimension.
 """
 
+import logging
+
 import triton
 import triton.language as tl
 import iris
+from iris.host.logging.logging import _log_rank
 from iris.host.tracing.kernel_artifacts import iris_launch
 from .config import Config
 from .utils import extract_group_info
@@ -484,6 +487,18 @@ def all_gather(
     rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, ctx)
 
     M, N = input_tensor.shape[:2]
+    _log_rank(
+        logging.DEBUG,
+        "all_gather: shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s",
+        M,
+        N,
+        input_tensor.dtype,
+        rank_global,
+        world_size,
+        async_op,
+        rank=rank_global,
+        num_ranks=world_size,
+    )
     expected_output_shape = (world_size * M, N)
 
     if output_tensor.shape[:2] != expected_output_shape:
diff --git a/iris/ccl/all_reduce.py b/iris/ccl/all_reduce.py
index cc301f609..adbe8f8c2 100644
--- a/iris/ccl/all_reduce.py
+++ b/iris/ccl/all_reduce.py
@@ -6,6 +6,7 @@
 Supports multiple variants: atomic, spinlock, ring, two-shot, and one-shot.
 """
 
+import logging
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
@@ -13,6 +14,7 @@
 import triton.language as tl
 import torch
 import iris
+from iris.host.logging.logging import _log_rank
 from iris.host.tracing.kernel_artifacts import iris_launch
 from .config import Config
 from .utils import chiplet_transform_chunked, ReduceOp, extract_group_info
@@ -75,6 +77,16 @@ def all_reduce_preamble(
 
     M, N = input_tensor.shape[:2]
     dtype = input_tensor.dtype
+    _log_rank(
+        logging.DEBUG,
+        "all_reduce_preamble: variant=%s shape=(%d,%d) dtype=%s",
+        variant,
+        M,
+        N,
+        dtype,
+        rank=shmem.cur_rank,
+        num_ranks=shmem.num_ranks,
+    )
 
     if workspace is None:
         workspace = AllReduceWorkspace()
@@ -767,6 +779,19 @@ def all_reduce(
     stride_out_m, stride_out_n = output_tensor.stride(0), output_tensor.stride(1)
 
     variant = config.all_reduce_variant.lower()
+    _log_rank(
+        logging.DEBUG,
+        "all_reduce: variant=%s shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s",
+        variant,
+        M,
+        N,
+        input_tensor.dtype,
+        rank_global,
+        world_size,
+        async_op,
+        rank=rank_global,
+        num_ranks=world_size,
+    )
     if variant not in [
         VARIANT_ATOMIC,
         VARIANT_SPINLOCK,
diff --git a/iris/ccl/all_to_all.py b/iris/ccl/all_to_all.py
index 47db0e823..255a952d3 100644
--- a/iris/ccl/all_to_all.py
+++ b/iris/ccl/all_to_all.py
@@ -6,9 +6,12 @@
 Supports both Triton and Gluon implementations based on config.
 """
 
+import logging
+
 import triton
 import triton.language as tl
 import iris
+from iris.host.logging.logging import _log_rank
 from iris.host.tracing.kernel_artifacts import iris_launch
 from .config import Config
 from .utils import chiplet_transform_chunked, extract_group_info
@@ -368,6 +371,19 @@ def all_to_all(
 
     M, total_N = input_tensor.shape[:2]
     N = total_N // world_size
+    _log_rank(
+        logging.DEBUG,
+        "all_to_all: shape=(%d,%d) N_per_rank=%d dtype=%s rank=%d/%d async_op=%s",
+        M,
+        total_N,
+        N,
+        input_tensor.dtype,
+        rank_global,
+        world_size,
+        async_op,
+        rank=rank_global,
+        num_ranks=world_size,
+    )
 
     stride_in_m, stride_in_n = input_tensor.stride(0), input_tensor.stride(1)
     stride_out_m, stride_out_n = output_tensor.stride(0), output_tensor.stride(1)
diff --git a/iris/ccl/reduce_scatter.py b/iris/ccl/reduce_scatter.py
index 10d95e85e..772f43ce9 100644
--- a/iris/ccl/reduce_scatter.py
+++ b/iris/ccl/reduce_scatter.py
@@ -6,9 +6,12 @@
 Uses the two-shot approach: reduce assigned tiles and store only to own rank.
 """
 
+import logging
+
 import triton
 import triton.language as tl
 import iris
+from iris.host.logging.logging import _log_rank
 from iris.host.tracing.kernel_artifacts import iris_launch
 from .config import Config
 from .utils import chiplet_transform_chunked, ReduceOp, extract_group_info
@@ -214,6 +217,18 @@ def reduce_scatter(
     # rank_global: global rank in iris context - passed as iris_rank to kernel for RMA operations
     rank_in_group, rank_global, world_size, rank_start, rank_stride = extract_group_info(group, shmem)
     M, N = input_tensor.shape[:2]
+    _log_rank(
+        logging.DEBUG,
+        "reduce_scatter: shape=(%d,%d) dtype=%s rank=%d/%d async_op=%s",
+        M,
+        N,
+        input_tensor.dtype,
+        rank_global,
+        world_size,
+        async_op,
+        rank=rank_global,
+        num_ranks=world_size,
+    )
 
     # Validate output shape matches input shape
     if output_tensor.shape[:2] != (M, N):
diff --git a/iris/host/distributed/fd_passing.py b/iris/host/distributed/fd_passing.py
index f0c911138..24cc52b4e 100644
--- a/iris/host/distributed/fd_passing.py
+++ b/iris/host/distributed/fd_passing.py
@@ -156,6 +156,18 @@ def setup_fd_infrastructure(cur_rank: int, num_ranks: int):
     if num_ranks <= 1:
         return None
 
+    import logging
+    from iris.host.logging.logging import _log_rank
+
+    _log_rank(
+        logging.DEBUG,
+        "setup_fd_infrastructure: rank=%d num_ranks=%d",
+        cur_rank,
+        num_ranks,
+        rank=cur_rank,
+        num_ranks=num_ranks,
+    )
+
     import torch.distributed as dist
     from iris.host.distributed.helpers import distributed_barrier
 
diff --git a/iris/host/distributed/helpers.py b/iris/host/distributed/helpers.py
index 0354d73a9..7da47bbe1 100644
--- a/iris/host/distributed/helpers.py
+++ b/iris/host/distributed/helpers.py
@@ -2,11 +2,14 @@
 # Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 
 
+import logging
+
 import torch
 import torch.distributed as dist
 import numpy as np
 import triton
 import triton.language as tl
+from iris.host.logging.logging import _log_rank
 from iris.host.tracing.kernel_artifacts import iris_launch
 
 
@@ -58,6 +61,15 @@ def distributed_allgather(data):
     world_size = dist.get_world_size()
     device = _infer_device()
     backend = str(dist.get_backend()).lower()
+    _log_rank(
+        logging.DEBUG,
+        "distributed_allgather: shape=%s backend=%s world_size=%d",
+        data.shape,
+        backend,
+        world_size,
+        rank=dist.get_rank(),
+        num_ranks=world_size,
+    )
 
     # Fast path: tensor all_gather if dtype is NCCL-supported or backend != nccl
     data_tensor = torch.from_numpy(data)
@@ -180,6 +192,14 @@ def distributed_broadcast_tensor(value_to_broadcast=None, root=0):
     rank = dist.get_rank()
     device = _infer_device()
     backend = str(dist.get_backend()).lower()
+    _log_rank(
+        logging.DEBUG,
+        "distributed_broadcast_tensor: src=%d rank=%d",
+        root,
+        rank,
+        rank=rank,
+        num_ranks=dist.get_world_size(),
+    )
 
     if rank == root:
         if value_to_broadcast is None:
@@ -291,6 +311,13 @@ def distributed_barrier(group=None):
     """
     if not dist.is_initialized():
         raise RuntimeError("PyTorch distributed is not initialized")
+    _log_rank(
+        logging.DEBUG,
+        "distributed_barrier: group=%s",
+        group,
+        rank=dist.get_rank(),
+        num_ranks=dist.get_world_size(),
+    )
     dist.barrier(group=group)
 
 
diff --git a/iris/host/iris.py b/iris/host/iris.py
index 74f798358..a0851595b 100644
--- a/iris/host/iris.py
+++ b/iris/host/iris.py
@@ -105,6 +105,12 @@ def __init__(self, heap_size=1 << 30, allocator_type="torch"):
         self.gpu_id = gpu_id
         self.heap_size = heap_size
 
+        if logger.isEnabledFor(logging.INFO):
+            self._log_with_rank(
+                logging.INFO,
+                f"init: heap_size={heap_size / (1 << 30):.1f}GB rank={cur_rank}/{num_ranks} allocator={allocator_type}",
+            )
+
         # Initialize symmetric heap with specified allocator
         self.heap = SymmetricHeap(heap_size, gpu_id, cur_rank, num_ranks, allocator_type)
         self.device = f"cuda:{gpu_id}"
@@ -997,6 +1003,7 @@ def barrier(self, stream=None, group=None):
             >>> ctx.barrier()  # Synchronize all ranks
             >>> ctx.barrier(group=my_group)  # Synchronize only ranks in my_group
         """
+        self._log_with_rank(logging.DEBUG, "barrier: start")
         # Wait for all GPUs to finish work
         if stream is None:
             torch.cuda.synchronize()
diff --git a/iris/host/logging/logging.py b/iris/host/logging/logging.py
index 3df2dddb5..23edd221d 100644
--- a/iris/host/logging/logging.py
+++ b/iris/host/logging/logging.py
@@ -6,6 +6,8 @@
 """
 
 import logging
+import os
+import sys
 
 # Logging constants (compatible with Python logging levels)
 DEBUG = logging.DEBUG
@@ -15,20 +17,23 @@
 
 
 class IrisFormatter(logging.Formatter):
-    """Custom formatter that automatically includes rank information when available."""
+    """Custom formatter that includes timestamp, level, rank, and module information."""
 
     def __init__(self):
         super().__init__()
 
     def format(self, record):
-        # Check if rank information is available in the record
-        if hasattr(record, "iris_rank") and hasattr(record, "iris_num_ranks"):
-            prefix = f"[Iris] [{record.iris_rank}/{record.iris_num_ranks}]"
-        else:
-            prefix = "[Iris]"
-
-        # Format the message with the appropriate prefix
-        return f"{prefix} {record.getMessage()}"
+        rank = getattr(record, "iris_rank", "?")
+        num_ranks = getattr(record, "iris_num_ranks", "?")
+        ts = self.formatTime(record, "%H:%M:%S")
+        level = record.levelname
+        # Only show [module] for internal iris logs (set by _log_rank),
+        # not for user-facing ctx.info()/ctx.debug() etc.
+        iris_internal = getattr(record, "iris_internal", False)
+        if iris_internal:
+            module = record.module
+            return f"{ts} {level:<5s} [Iris] [{rank}/{num_ranks}] [{module}] {record.getMessage()}"
+        return f"{ts} {level:<5s} [Iris] [{rank}/{num_ranks}] {record.getMessage()}"
 
 
 # Logger instance that can be accessed as iris.logger
@@ -37,6 +42,11 @@ def format(self, record):
 # Set up iris logger
 logger.setLevel(logging.INFO)  # Default level
 
+# Override from environment
+_env_level = os.environ.get("IRIS_LOG_LEVEL", "").upper()
+if _env_level in ("DEBUG", "INFO", "WARNING", "ERROR"):
+    logger.setLevel(getattr(logging, _env_level))
+
 # Add a console handler if none exists
 if not logger.handlers:
     _console_handler = logging.StreamHandler()
@@ -45,6 +55,28 @@ def format(self, record):
     logger.addHandler(_console_handler)
 
 
+def _log_rank(level, msg, *args, rank=None, num_ranks=None):
+    """Log with optional rank injection. Captures caller's module automatically."""
+    if logger.isEnabledFor(level):
+        # Capture caller's file/line so the formatter can show [module]
+        frame = sys._getframe(1)
+        record = logging.LogRecord(
+            name=logger.name,
+            level=level,
+            pathname=frame.f_code.co_filename,
+            lineno=frame.f_lineno,
+            msg=msg,
+            args=args,
+            exc_info=None,
+        )
+        record.iris_internal = True
+        if rank is not None:
+            record.iris_rank = rank
+        if num_ranks is not None:
+            record.iris_num_ranks = num_ranks
+        logger.handle(record)
+
+
 def set_logger_level(level):
     """
     Set the logging level for the iris logger.
diff --git a/iris/host/memory/allocators/torch_allocator.py b/iris/host/memory/allocators/torch_allocator.py
index a0e245516..bbffb6f00 100644
--- a/iris/host/memory/allocators/torch_allocator.py
+++ b/iris/host/memory/allocators/torch_allocator.py
@@ -8,6 +8,7 @@
 sub-allocations within it using bump allocation.
 """
 
+import logging
 import math
 import numpy as np
 import torch
@@ -15,6 +16,7 @@
 import struct
 
 from .base import BaseAllocator
+from iris.host.logging.logging import _log_rank
 from iris.host.platform.hip import export_dmabuf_handle, import_dmabuf_handle, destroy_external_memory
 from iris.host.distributed.fd_passing import send_fd, recv_fd, managed_fd
 from iris.host.platform.utils import is_simulation_env
@@ -41,6 +43,14 @@ def __init__(self, heap_size: int, device_id: int, cur_rank: int, num_ranks: int
         super().__init__(heap_size, device_id, cur_rank, num_ranks)
 
         self.device = f"cuda:{device_id}"
+        _log_rank(
+            logging.INFO,
+            "TorchAllocator: init heap_size=%.1fGB device=%d",
+            heap_size / (1 << 30),
+            device_id,
+            rank=cur_rank,
+            num_ranks=num_ranks,
+        )
         if is_simulation_env():
             import json
 
@@ -93,7 +103,26 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024)
         size_in_bytes = num_elements * element_size
         aligned_size = math.ceil(size_in_bytes / alignment) * alignment
 
+        _log_rank(
+            logging.DEBUG,
+            "TorchAllocator.allocate: num_elements=%d dtype=%s size_bytes=%d offset=%d",
+            num_elements,
+            dtype,
+            size_in_bytes,
+            self.heap_offset,
+            rank=self.cur_rank,
+            num_ranks=self.num_ranks,
+        )
+
         if self.heap_offset + aligned_size > self.heap_size:
+            _log_rank(
+                logging.ERROR,
+                "TorchAllocator: OOM requested=%d available=%d",
+                aligned_size,
+                self.heap_size - self.heap_offset,
+                rank=self.cur_rank,
+                num_ranks=self.num_ranks,
+            )
             raise MemoryError("Heap out of memory")
 
         start = self.heap_offset
diff --git a/iris/host/memory/allocators/vmem_allocator.py b/iris/host/memory/allocators/vmem_allocator.py
index 5bbfbd67d..1c39a6312 100644
--- a/iris/host/memory/allocators/vmem_allocator.py
+++ b/iris/host/memory/allocators/vmem_allocator.py
@@ -8,12 +8,14 @@
 enabling features like memory oversubscription and on-demand paging.
 """
 
+import logging
 import torch
 import os
 from typing import Dict
 from threading import Lock
 
 from .base import BaseAllocator
+from iris.host.logging.logging import _log_rank
 from iris.host.platform.hip import (
     get_allocation_granularity,
     get_address_range,
@@ -59,6 +61,14 @@ def __init__(
         va_multiplier: float = 1.0,
     ):
         super().__init__(heap_size, device_id, rank, world_size)
+        _log_rank(
+            logging.INFO,
+            "VMemAllocator: init heap_size=%.1fGB device=%d",
+            heap_size / (1 << 30),
+            device_id,
+            rank=rank,
+            num_ranks=world_size,
+        )
         self.va_multiplier = va_multiplier
         self.device = torch.device(f"cuda:{device_id}")
         self.lock = Lock()
@@ -143,6 +153,17 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024)
             aligned_size = (actual_size_bytes + self.granularity - 1) & ~(self.granularity - 1)
             aligned_offset = (self.current_offset + alignment - 1) & ~(alignment - 1)
 
+            _log_rank(
+                logging.DEBUG,
+                "VMemAllocator.allocate: num_elements=%d dtype=%s size_bytes=%d offset=%d",
+                num_elements,
+                dtype,
+                size_bytes,
+                aligned_offset,
+                rank=self.cur_rank,
+                num_ranks=self.num_ranks,
+            )
+
             if aligned_offset + aligned_size > self.aligned_heap_size:
                 raise RuntimeError(
                     f"Out of VMem address space for allocation: "
diff --git a/iris/host/memory/symmetric_heap.py b/iris/host/memory/symmetric_heap.py
index e4e68959c..ff5b6a3cb 100644
--- a/iris/host/memory/symmetric_heap.py
+++ b/iris/host/memory/symmetric_heap.py
@@ -8,10 +8,13 @@
 hiding the details of allocators and inter-process memory sharing.
 """
 
+import logging
+
 import numpy as np
 import torch
 import os
 
+from iris.host.logging.logging import _log_rank, logger
 from iris.host.memory.allocators import TorchAllocator, VMemAllocator
 from iris.host.distributed.fd_passing import setup_fd_infrastructure
 from iris.host.distributed.helpers import distributed_allgather
@@ -54,6 +57,15 @@ def __init__(
         self.cur_rank = cur_rank
         self.num_ranks = num_ranks
         allocator_type = os.environ.get("IRIS_ALLOCATOR", allocator_type).lower()
+        _log_rank(
+            logging.INFO,
+            "SymmetricHeap: init heap_size=%.1fGB device=%d allocator=%s",
+            heap_size / (1 << 30),
+            device_id,
+            allocator_type,
+            rank=cur_rank,
+            num_ranks=num_ranks,
+        )
 
         if is_simulation_env():
             allocator_type = "torch"
@@ -100,6 +112,14 @@ def allocate(self, num_elements: int, dtype: torch.dtype, alignment: int = 1024)
             This should be called collectively across all ranks to maintain
             symmetric heap consistency. After allocation, peer access is refreshed.
         """
+        _log_rank(
+            logging.DEBUG,
+            "SymmetricHeap.allocate: num_elements=%d dtype=%s",
+            num_elements,
+            dtype,
+            rank=self.cur_rank,
+            num_ranks=self.num_ranks,
+        )
         min_bytes = self.allocator.get_minimum_allocation_size()
         element_size = torch.tensor([], dtype=dtype).element_size()
         min_elements = max(1, (min_bytes + element_size - 1) // element_size)
@@ -169,6 +189,15 @@ def refresh_peer_access(self):
             hipMemAccessFlagsProtReadWrite,
         )
 
+        _log_rank(
+            logging.DEBUG,
+            "refresh_peer_access: start rank=%d num_ranks=%d",
+            self.cur_rank,
+            self.num_ranks,
+            rank=self.cur_rank,
+            num_ranks=self.num_ranks,
+        )
+
         if dist.is_initialized():
             dist.barrier()
 
@@ -211,6 +240,14 @@ def refresh_peer_access(self):
         for peer, sock in self.fd_conns.items():
             if peer == self.cur_rank:
                 continue
+            _log_rank(
+                logging.DEBUG,
+                "refresh_peer_access: peer=%d segments=%d",
+                peer,
+                len(my_exported_fds),
+                rank=self.cur_rank,
+                num_ranks=self.num_ranks,
+            )
 
             if not hasattr(self, "_peer_va_ranges"):
                 self._peer_va_ranges = {}
@@ -272,6 +309,15 @@ def refresh_peer_access(self):
 
             os.close(fd)
 
+        if logger.isEnabledFor(logging.DEBUG):
+            _log_rank(
+                logging.DEBUG,
+                "refresh_peer_access: done heap_bases=[%s]",
+                ", ".join(f"0x{int(self.heap_bases[r].item()):x}" for r in range(min(3, self.num_ranks))),
+                rank=self.cur_rank,
+                num_ranks=self.num_ranks,
+            )
+
         if dist.is_initialized():
             dist.barrier()
 
diff --git a/iris/host/platform/hip.py b/iris/host/platform/hip.py
index e6dc598d8..a23ea8f9e 100644
--- a/iris/host/platform/hip.py
+++ b/iris/host/platform/hip.py
@@ -2,11 +2,14 @@
 # Copyright (c) 2025-2026 Advanced Micro Devices, Inc. All rights reserved.
 
 import ctypes
+import logging
 import numpy as np
 import torch
 import subprocess
 import os
 
+from iris.host.logging.logging import _log_rank
+
 # Auto-detect backend
 _is_amd_backend = True
 try:
@@ -27,10 +30,12 @@ def gpu_try(err):
         if _is_amd_backend:
             gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p
             error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8")
+            _log_rank(logging.ERROR, "HIP error code %d: %s", err, error_string)
             raise RuntimeError(f"HIP error code {err}: {error_string}")
         else:
             gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p
             error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8")
+            _log_rank(logging.ERROR, "CUDA error code %d: %s", err, error_string)
             raise RuntimeError(f"CUDA error code {err}: {error_string}")
 
 
@@ -280,6 +285,8 @@ def export_dmabuf_handle(ptr, size):
     if not _is_amd_backend:
         raise RuntimeError("DMA-BUF export only supported on AMD/HIP backend")
 
+    _log_rank(logging.DEBUG, "export_dmabuf_handle: ptr=0x%x size=%d", ptr if isinstance(ptr, int) else ptr.value, size)
+
     ptr_int = ptr if isinstance(ptr, int) else ptr.value
     ptr_arg = ctypes.c_void_p(ptr_int)
 
diff --git a/iris/host/tracing/kernel_artifacts.py b/iris/host/tracing/kernel_artifacts.py
index 064e636c0..c6215e7f9 100644
--- a/iris/host/tracing/kernel_artifacts.py
+++ b/iris/host/tracing/kernel_artifacts.py
@@ -27,11 +27,14 @@
 
 import hashlib
 import json
+import logging
 import os
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+from iris.host.logging.logging import _log_rank
+
 _artifacts_dir: Optional[Path] = None
 _enabled: bool = False
 
@@ -64,6 +67,15 @@ def iris_launch(kernel_fn, grid, *args, algorithm: str, rank: int, dtype=None, *
         dtype: Optional tensor dtype for the spec directory name.
         **kwargs: Keyword arguments forwarded to the kernel (num_warps, etc.).
     """
+    _log_rank(
+        logging.DEBUG,
+        "iris_launch: algorithm=%s kernel=%s grid=%s rank=%d",
+        algorithm,
+        _get_kernel_name(kernel_fn),
+        grid,
+        rank,
+        rank=rank,
+    )
     compiled = kernel_fn[grid](*args, **kwargs)
     if _enabled and compiled is not None:
         kernel_name = _get_kernel_name(kernel_fn)
diff --git a/iris/ops/all_gather_matmul.py b/iris/ops/all_gather_matmul.py
index c39fc2ede..317c37042 100644
--- a/iris/ops/all_gather_matmul.py
+++ b/iris/ops/all_gather_matmul.py
@@ -9,6 +9,7 @@
 tiles from remote ranks on-demand during GEMM computation.
 """
 
+import logging
 from typing import Optional
 import torch
 import triton
@@ -203,6 +204,21 @@ def all_gather_matmul(
     world_size = shmem.get_num_ranks()
     rank = shmem.get_rank()
 
+    from iris.host.logging.logging import _log_rank
+
+    _log_rank(
+        logging.DEBUG,
+        "all_gather_matmul: shape=(%d,%d,%d) dtype=%s rank=%d/%d",
+        M,
+        N,
+        K,
+        A_sharded.dtype,
+        rank,
+        world_size,
+        rank=rank,
+        num_ranks=world_size,
+    )
+
     expected_K = world_size * K_local
     assert K == expected_K, f"K ({K}) must equal world_size ({world_size}) * K_local ({K_local})"
     assert output_tensor.shape == (M, N), f"Output must be ({M}, {N}), got {output_tensor.shape}"
diff --git a/iris/ops/matmul_all_gather.py b/iris/ops/matmul_all_gather.py
index b011367fa..43b2cc8c1 100644
--- a/iris/ops/matmul_all_gather.py
+++ b/iris/ops/matmul_all_gather.py
@@ -10,6 +10,7 @@
 This is useful for tensor-parallel workloads where outputs need to be gathered.
 """
 
+import logging
 from typing import Optional
 import torch
 import triton
@@ -174,6 +175,21 @@ def matmul_all_gather(
     world_size = shmem.get_num_ranks()
     rank = shmem.get_rank()
 
+    from iris.host.logging.logging import _log_rank
+
+    _log_rank(
+        logging.DEBUG,
+        "matmul_all_gather: shape=(%d,%d,%d) dtype=%s rank=%d/%d",
+        M_local * world_size,
+        N,
+        K,
+        A.dtype,
+        rank,
+        world_size,
+        rank=rank,
+        num_ranks=world_size,
+    )
+
     assert K == K2, f"Inner dimensions must match: A has K={K}, B has K={K2}"
 
     M = M_local * world_size
diff --git a/iris/ops/matmul_all_reduce.py b/iris/ops/matmul_all_reduce.py
index 634026cf6..29b84b6e3 100644
--- a/iris/ops/matmul_all_reduce.py
+++ b/iris/ops/matmul_all_reduce.py
@@ -8,6 +8,7 @@
 automatically inferring dimensions, strides, and hardware parameters.
 """
 
+import logging
 from typing import Optional
 import torch
 import triton
@@ -286,6 +287,22 @@ def matmul_all_reduce(
     rank = shmem.get_rank()
     world_size = shmem.get_num_ranks()
 
+    from iris.host.logging.logging import _log_rank
+
+    _log_rank(
+        logging.DEBUG,
+        "matmul_all_reduce: shape=(%d,%d,%d) dtype=%s variant=%s rank=%d/%d",
+        M,
+        N,
+        K,
+        A.dtype,
+        config.all_reduce_variant,
+        rank,
+        world_size,
+        rank=rank,
+        num_ranks=world_size,
+    )
+
     # Prepare workspace if needed
     needs_prepare = workspace is None or not workspace.matches(
         "matmul_all_reduce", (M, N, K), A.dtype, world_size, config.all_reduce_variant
diff --git a/iris/ops/matmul_reduce_scatter.py b/iris/ops/matmul_reduce_scatter.py
index 2842fb27c..47454d630 100644
--- a/iris/ops/matmul_reduce_scatter.py
+++ b/iris/ops/matmul_reduce_scatter.py
@@ -8,6 +8,7 @@
 automatically inferring dimensions, strides, and hardware parameters.
 """
 
+import logging
 from typing import Optional
 import torch
 import triton
@@ -228,6 +229,21 @@ def matmul_reduce_scatter(
     rank = shmem.get_rank()
     world_size = shmem.get_num_ranks()
 
+    from iris.host.logging.logging import _log_rank
+
+    _log_rank(
+        logging.DEBUG,
+        "matmul_reduce_scatter: shape=(%d,%d,%d) dtype=%s rank=%d/%d",
+        M,
+        N,
+        K,
+        A.dtype,
+        rank,
+        world_size,
+        rank=rank,
+        num_ranks=world_size,
+    )
+
     num_pid_m = (M + config.block_size_m - 1) // config.block_size_m
     num_pid_n = (N + config.block_size_n - 1) // config.block_size_n
     total_tiles = num_pid_m * num_pid_n
diff --git a/tests/unittests/test_logging.py b/tests/unittests/test_logging.py
index 399d2c369..911bc53b9 100644
--- a/tests/unittests/test_logging.py
+++ b/tests/unittests/test_logging.py
@@ -109,8 +109,8 @@ def test_logger_api_usage():
     iris.logger.debug("Test debug message (should be visible)")
 
     output = log_capture.getvalue()
-    assert "[Iris] Test info message" in output
-    assert "[Iris] Test debug message (should be visible)" in output
+    assert "[Iris]" in output and "Test info message" in output
+    assert "[Iris]" in output and "Test debug message (should be visible)" in output
     # The hidden debug message should not appear
     lines = output.split("\n")
     hidden_debug_count = sum(1 for line in lines if "should be hidden" in line)
@@ -130,7 +130,8 @@ def test_iris_formatter():
     )
 
     formatted_no_rank = formatter.format(record_no_rank)
-    assert formatted_no_rank == "[Iris] Test message without rank"
+    # Format: "HH:MM:SS LEVEL [Iris] [?/?] message"
+    assert "[Iris] [?/?] Test message without rank" in formatted_no_rank
 
     # Test record with rank information
     record_with_rank = logging.LogRecord(
@@ -140,7 +141,19 @@ def test_iris_formatter():
     record_with_rank.iris_num_ranks = 4
 
     formatted_with_rank = formatter.format(record_with_rank)
-    assert formatted_with_rank == "[Iris] [2/4] Test message with rank"
+    assert "[Iris] [2/4] Test message with rank" in formatted_with_rank
+
+    # Test internal record shows [module]
+    record_internal = logging.LogRecord(
+        name="iris", level=logging.INFO, pathname="all_gather.py", lineno=0, msg="Internal log", args=(), exc_info=None
+    )
+    record_internal.iris_internal = True
+    record_internal.iris_rank = 0
+    record_internal.iris_num_ranks = 4
+
+    formatted_internal = formatter.format(record_internal)
+    assert "[all_gather]" in formatted_internal
+    assert "[Iris] [0/4]" in formatted_internal
 
 
 def test_api_import():