Fix #8366: Add strict shape validation to sliding_window_inference

林旻佑 · 林旻佑 · commit 02106a1a2558 · 2025-11-30T14:09:02.000+08:00
diff --git a/monai/inferers/utils.py b/monai/inferers/utils.py
@@ -33,12 +33,51 @@
     optional_import,
 )
 
+
 tqdm, _ = optional_import("tqdm", name="tqdm")
 _nearest_mode = "nearest-exact"
 
 __all__ = ["sliding_window_inference"]
 
 
+
+def assert_channel_first(
+    t: torch.Tensor,
+    name: str,
+    num_classes: int | None = None,
+    allow_binary_two_channel: bool = False,
+) -> None:
+    """
+    Check that dim=1 of a 4D (NCHW) or 5D (NCDHW) tensor holds ``num_classes`` channels.
+
+    The layout is never guessed or permuted. Non-tensor inputs, tensors that are not 4D/5D,
+    and calls with ``num_classes=None`` are accepted without any check.
+
+    Args:
+        t: tensor to validate.
+        name: label used as the prefix of the error message.
+        num_classes: expected size of dim=1; ``None`` disables the check.
+        allow_binary_two_channel: NOTE(review): currently has no effect, because the only case
+            it would permit (``num_classes == 2`` with two channels) already passes; confirm intent.
+
+    Raises:
+        ValueError: if ``t`` is a 4D/5D tensor and ``num_classes`` is given but differs from ``t.shape[1]``.
+    """
+    if not isinstance(t, torch.Tensor) or t.ndim not in (4, 5) or num_classes is None:
+        return
+
+    c = int(t.shape[1])
+    if c == num_classes:
+        return
+
+    layout, layout_last = ("NCHW", "NHWC") if t.ndim == 4 else ("NCDHW", "NDHWC")
+    raise ValueError(
+        f"{name}: expected {layout} with C(dim=1)==num_classes, "
+        f"but got shape={tuple(t.shape)} (C={c}) and num_classes={num_classes}. "
+        f"If your data is {layout_last}, please apply EnsureChannelFirst/EnsureChannelFirstd upstream."
+    )
+
+
 def sliding_window_inference(
     inputs: torch.Tensor | MetaTensor,
     roi_size: Sequence[int] | int,
@@ -131,9 +170,29 @@ def sliding_window_inference(
         kwargs: optional keyword args to be passed to ``predictor``.
 
     Note:
-        - input must be channel-first and have a batch dim, supports N-D sliding window.
-
+        - Inputs must be channel-first and have a batch dim (e.g. NCHW / NCDHW); supports N-D sliding window.
+        - If your data is NHWC/NDHWC, please apply `EnsureChannelFirst` / `EnsureChannelFirstd` upstream.
+
     """
+    num_spatial_dims = len(inputs.shape) - 2
+    
+    # Validate dimensionality only when roi_size is a sequence: its length gives the expected
+    # number of spatial dims. An integer roi_size is broadcast to all spatial dims, so it
+    # carries no dimensionality to check against.
+    if not isinstance(roi_size, int):
+        roi_dims = len(roi_size)
+        if num_spatial_dims != roi_dims:
+            raise ValueError(
+                f"inputs must have {roi_dims + 2} dimensions for {roi_dims}D roi_size "
+                f"(Batch, Channel, {', '.join(['Spatial'] * roi_dims)}), "
+                f"but got inputs shape {inputs.shape}.\n"
+                "If you have channel-last data (e.g. B, D, H, W, C), please use "
+                "monai.transforms.EnsureChannelFirst or EnsureChannelFirstd upstream."
+            )
+    # Channel-first contract: the layout is never guessed or permuted here.
+    # NOTE(review): without ``num_classes`` this call performs no check -- confirm intent.
+    if isinstance(inputs, torch.Tensor):
+        assert_channel_first(inputs, "inputs")
     buffered = buffer_steps is not None and buffer_steps > 0
     num_spatial_dims = len(inputs.shape) - 2
     if buffered:
diff --git a/tests/inferers/test_sliding_window_inference.py b/tests/inferers/test_sliding_window_inference.py
@@ -372,6 +372,26 @@ def compute_dict(data):
         for rr, _ in zip(result_dict, expected_dict):
             np.testing.assert_allclose(result_dict[rr].cpu().numpy(), expected_dict[rr], rtol=1e-4)
 
+    def test_strict_shape_validation(self):
+        """
+        Reject inputs whose number of spatial dims does not match a tuple ``roi_size``.
+
+        A 3-element ``roi_size`` needs 5D inputs (batch, channel, 3 spatial dims).
+        """
+        roi_size = (16, 16, 16)
+        sw_batch_size = 4
+        expected_message = "inputs must have 5 dimensions"
+        # 4D: batch or channel dim is missing; 3D: both are missing.
+        malformed_shapes = ((1, 16, 16, 16), (16, 16, 16))
+
+        def identity(patch):
+            return patch
+
+        for shape in malformed_shapes:
+            bad_inputs = torch.randn(shape, device="cpu")
+            with self.assertRaisesRegex(ValueError, expected_message):
+                sliding_window_inference(bad_inputs, roi_size, sw_batch_size, identity)
+
 
 class TestSlidingWindowInferenceCond(unittest.TestCase):
     @parameterized.expand(TEST_CASES)