From 017dc9040fdc5281b01098b4eead484f5b665331 Mon Sep 17 00:00:00 2001
From: Claude Code <noreply@anthropic.com>
Date: Thu, 30 Apr 2026 02:59:44 +0000
Subject: [PATCH 1/2] =?UTF-8?q?perf(rfdetr-seg):=20skip=20mask=E2=86=92pol?=
 =?UTF-8?q?y=E2=86=92mask=20round-trip=20on=20workflow=20path?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`InferenceModelsInstanceSegmentationAdapter.postprocess` used to
convert every detection's mask to a polygon via `masks2poly` (cv2
findContours), wrap each vertex in a `Point` pydantic model, and
build a validated `InstanceSegmentationPrediction`. The v3 workflow
block then called `model_dump` and
`sv.Detections.from_inference`, which rasterized those polygons
back into masks via `polygon_to_mask`.

When the caller is a workflow (`request.source == "workflow-execution"`),
none of that encoding is observable — the v3 block consumes an
`sv.Detections` with masks. This change:

* Has the adapter build `sv.Detections` directly from the
  numpy xyxy/confidence/class_id/mask buffers and attach it via
  `response.__dict__["_sv_detections_fast"]` (pydantic v2 ignores
  extra __dict__ keys in dump/serialize, so HTTP payloads are
  unaffected). The polygon+pydantic path is preserved for all
  other callers, including RLE responses.
* Teaches the v3 block to detect the attached `sv.Detections` and
  route through a new `_post_process_result_fast`, skipping
  `model_dump` + `convert_inference_detections_batch_to_sv_detections`
  entirely.

Benchmark on a T4 with rfdetr-seg-nano TRT + Triton preproc +
full-Triton postproc + CUDA graphs, streaming vehicles_312px.mp4
(538 frames) via `InferencePipeline`:

* baseline (4 runs): mean 151.80 FPS
* this change  (4 runs): mean 164.51 FPS
* **+12.7 FPS, ~+8.4%**

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../core/models/inference_models_adapters.py  | 101 +++++++++++++++++-
 .../roboflow/instance_segmentation/v3.py      |  71 +++++++++++-
 2 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/inference/core/models/inference_models_adapters.py b/inference/core/models/inference_models_adapters.py
index 5620ff16c3..f4fb090340 100644
--- a/inference/core/models/inference_models_adapters.py
+++ b/inference/core/models/inference_models_adapters.py
@@ -5,9 +5,11 @@
 from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np
+import supervision as sv
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from pycocotools import mask as mask_utils
+from supervision.config import CLASS_NAME_DATA_FIELD
 
 from inference.core.entities.requests import (
     ClassificationInferenceRequest,
@@ -313,6 +315,18 @@ def postprocess(
             predictions, preprocess_return_metadata, **mapped_kwargs
         )
 
+        # Workflow callers consume an `sv.Detections` via the v3 block and
+        # don't need the per-detection polygon/pydantic-prediction encoding.
+        # When we detect that caller we attach a pre-built `sv.Detections`
+        # to the response and skip `masks2poly` + `InstanceSegmentationPrediction`
+        # construction entirely. The workflow block then bypasses
+        # `model_dump` + `sv.Detections.from_inference` (which would rasterize
+        # the polygons we just produced back into masks).
+        is_workflow = (
+            kwargs.get("source") == "workflow-execution" and not return_in_rle
+        )
+        class_filter = kwargs.get("class_filter")
+
         responses: List[InstanceSegmentationInferenceResponse] = []
         for preproc_metadata, det in zip(preprocess_return_metadata, detections_list):
             H = preproc_metadata.original_size.height
@@ -320,6 +334,22 @@ def postprocess(
 
             xyxy = det.xyxy.detach().cpu().numpy()
             confs = det.confidence.detach().cpu().numpy()
+            class_ids = det.class_id.detach().cpu().numpy()
+
+            if is_workflow and isinstance(det.mask, torch.Tensor):
+                masks_np = det.mask.detach().cpu().numpy()
+                response = self._build_workflow_fastpath_response(
+                    xyxy=xyxy,
+                    confs=confs,
+                    class_ids=class_ids,
+                    masks=masks_np,
+                    class_filter=class_filter,
+                    width=W,
+                    height=H,
+                )
+                responses.append(response)
+                continue
+
             if isinstance(det.mask, torch.Tensor):
                 masks = det.mask.detach().cpu().numpy()
                 if return_in_rle:
@@ -333,7 +363,6 @@ def postprocess(
                     polys_or_rles = det.mask.to_coco_rle_masks()
                 else:
                     polys_or_rles = rle_masks2poly(det.mask)
-            class_ids = det.class_id.detach().cpu().numpy()
 
             predictions: List[
                 Union[InstanceSegmentationPrediction, InstanceSegmentationRLEPrediction]
@@ -399,6 +428,76 @@ def postprocess(
             )
         return responses
 
+    def _build_workflow_fastpath_response(
+        self,
+        xyxy: np.ndarray,
+        confs: np.ndarray,
+        class_ids: np.ndarray,
+        masks: np.ndarray,
+        class_filter: Optional[List[str]],
+        width: int,
+        height: int,
+    ) -> InstanceSegmentationInferenceResponse:
+        n = int(class_ids.shape[0]) if class_ids.ndim else 0
+        class_names_map = self.class_names
+        n_classes = len(class_names_map)
+
+        if n == 0:
+            sv_dets = sv.Detections.empty()
+            sv_dets.data = {CLASS_NAME_DATA_FIELD: np.empty(0, dtype=object)}
+        else:
+            class_id_int = class_ids.astype(np.int64, copy=False)
+            in_range = (class_id_int >= 0) & (class_id_int < n_classes)
+            class_name_arr = np.empty(n, dtype=object)
+            if in_range.all():
+                for i, cid in enumerate(class_id_int):
+                    class_name_arr[i] = class_names_map[int(cid)]
+            else:
+                for i, cid in enumerate(class_id_int):
+                    ci = int(cid)
+                    class_name_arr[i] = (
+                        class_names_map[ci] if 0 <= ci < n_classes else str(ci)
+                    )
+
+            if class_filter:
+                keep = np.fromiter(
+                    (name in class_filter for name in class_name_arr),
+                    dtype=bool,
+                    count=n,
+                )
+                if not keep.all():
+                    xyxy = xyxy[keep]
+                    confs = confs[keep]
+                    class_id_int = class_id_int[keep]
+                    class_name_arr = class_name_arr[keep]
+                    masks = masks[keep]
+
+            xyxy_f = (
+                xyxy.astype(np.float32, copy=False)
+                if xyxy.dtype != np.float32
+                else xyxy
+            )
+            mask_bool = (
+                masks.astype(bool, copy=False) if masks.dtype != np.bool_ else masks
+            )
+            sv_dets = sv.Detections(
+                xyxy=xyxy_f,
+                confidence=confs.astype(np.float32, copy=False),
+                class_id=class_id_int.astype(int, copy=False),
+                mask=mask_bool if mask_bool.size else None,
+                data={CLASS_NAME_DATA_FIELD: class_name_arr},
+            )
+
+        response = InstanceSegmentationInferenceResponse(
+            predictions=[],
+            image=InferenceResponseImage(width=width, height=height),
+        )
+        # Stash the pre-built sv.Detections for the v3 workflow block to pick
+        # up. Pydantic v2 ignores extra __dict__ keys in model_dump and
+        # jsonable_encoder, so this never leaks into serialized output.
+        response.__dict__["_sv_detections_fast"] = sv_dets
+        return response
+
     def clear_cache(self, delete_from_disk: bool = True) -> None:
         """Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model.
 
diff --git a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
index 4e9e62eb16..ca0cc2e676 100644
--- a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
+++ b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
@@ -1,5 +1,8 @@
+import uuid
 from typing import List, Literal, Optional, Type, Union
 
+import numpy as np
+import supervision as sv
 from pydantic import ConfigDict, Field, PositiveInt, model_validator
 
 from inference.core.entities.requests.inference import (
@@ -20,7 +23,12 @@
     convert_inference_detections_batch_to_sv_detections,
     filter_out_unwanted_classes_from_sv_detections_batch,
 )
-from inference.core.workflows.execution_engine.constants import INFERENCE_ID_KEY
+from inference.core.workflows.execution_engine.constants import (
+    DETECTION_ID_KEY,
+    IMAGE_DIMENSIONS_KEY,
+    INFERENCE_ID_KEY,
+    PARENT_ID_KEY,
+)
 from inference.core.workflows.execution_engine.entities.base import (
     Batch,
     OutputDefinition,
@@ -327,6 +335,19 @@ def run_locally(
         )
         if not isinstance(predictions, list):
             predictions = [predictions]
+        # Fast path: the adapter attaches a pre-built `sv.Detections` when the
+        # request's source is `workflow-execution`, letting us skip the
+        # mask → polygon → mask round-trip via `model_dump` + `from_inference`.
+        sv_fast = [p.__dict__.get("_sv_detections_fast") for p in predictions]
+        if all(det is not None for det in sv_fast):
+            inference_ids = [p.inference_id for p in predictions]
+            return self._post_process_result_fast(
+                images=images,
+                sv_detections=sv_fast,
+                inference_ids=inference_ids,
+                class_filter=class_filter,
+                model_id=model_id,
+            )
         predictions = [
             e.model_dump(by_alias=True, exclude_none=True) for e in predictions
         ]
@@ -422,3 +443,51 @@ def _post_process_result(
             }
             for inference_id, prediction in zip(inference_ids, predictions)
         ]
+
+    def _post_process_result_fast(
+        self,
+        images: Batch[WorkflowImageData],
+        sv_detections: List[sv.Detections],
+        inference_ids: List[Optional[str]],
+        class_filter: Optional[List[str]],
+        model_id: str,
+    ) -> BlockResult:
+        # Skips the dict → sv.Detections conversion (which would
+        # `polygon_to_mask` each detection) because the adapter already
+        # produced a ready-to-use `sv.Detections`.
+        augmented: List[sv.Detections] = []
+        for image, detections, inference_id in zip(
+            images, sv_detections, inference_ids
+        ):
+            n = len(detections)
+            detections[DETECTION_ID_KEY] = np.array(
+                [str(uuid.uuid4()) for _ in range(n)]
+            )
+            detections[PARENT_ID_KEY] = np.array([""] * n)
+            # image.numpy_image is the frame actually inferred on; shape
+            # matches the (H, W) baked into the sv_detections masks.
+            h, w = image.numpy_image.shape[:2]
+            detections[IMAGE_DIMENSIONS_KEY] = np.array([[h, w]] * n)
+            if inference_id is not None:
+                detections[INFERENCE_ID_KEY] = np.array([inference_id] * n)
+            augmented.append(detections)
+        augmented = attach_prediction_type_info_to_sv_detections_batch(
+            predictions=augmented,
+            prediction_type="instance-segmentation",
+        )
+        augmented = filter_out_unwanted_classes_from_sv_detections_batch(
+            predictions=augmented,
+            classes_to_accept=class_filter,
+        )
+        augmented = attach_parents_coordinates_to_batch_of_sv_detections(
+            images=images,
+            predictions=augmented,
+        )
+        return [
+            {
+                "inference_id": inference_id,
+                "predictions": prediction,
+                "model_id": model_id,
+            }
+            for inference_id, prediction in zip(inference_ids, augmented)
+        ]

From 7382d9ebd114e8c46ff97cd254b765ad69b69cc1 Mon Sep 17 00:00:00 2001
From: Claude Code <noreply@anthropic.com>
Date: Thu, 30 Apr 2026 18:31:25 +0000
Subject: [PATCH 2/2] fix(rfdetr-seg): preserve mask denoising on workflow fast
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous fast path handed the raw model masks (copied to host as
numpy arrays) straight to `sv.Detections`, skipping the `masks2poly` →
`polygon_to_mask` round-trip that the slow path runs. That round-trip
has two behavioral side effects that the fast path was inadvertently
dropping:

1. Single-contour-only: `findContours(RETR_EXTERNAL)` followed by
   picking the contour with the most vertices (not necessarily the one
   enclosing the largest area) drops all other disconnected mask
   fragments.
2. Hole-filling: `RETR_EXTERNAL` ignores inner contours, so
   `fillPoly(largest_contour)` fills any holes inside the shape.

In addition, `filter_out_invalid_polygons` and the `>= 3` vertex check
in `supervision.process_roboflow_result` drop detections whose largest
contour has fewer than 3 points.

This change reproduces the slow-path mask semantics inside
`_build_workflow_fastpath_response` by running the same
`findContours(RETR_EXTERNAL, CHAIN_APPROX_SIMPLE)` + `fillPoly`
per mask, and dropping detections whose largest contour has fewer
than 3 vertices. It also factors the shared `__dict__` key into
`SV_DETECTIONS_FAST_ATTR` in `inference/core/entities/responses/inference.py`,
and removes the adapter-side `class_filter` handling; the v3 block
still applies the class filter to the fast-path result via
`filter_out_unwanted_classes_from_sv_detections_batch`.

Verified bit-exact mask equality vs the slow path on synthetic masks
with disconnected fragments and interior holes.

Benchmark on a T4 with the full Triton preproc + full-postproc +
CUDA-graphs stack, streaming vehicles_312px.mp4 (538 frames) via
InferencePipeline:

* baseline (no fast path):           mean 152.33 FPS
* fast path WITHOUT denoising (wrong): mean 164.51 FPS (+12.2, +8.0%)
* **fast path WITH denoising (this change): mean 163.43 FPS (+11.1, +7.3%)**

Denoising costs only ~1 FPS (~0.7%): the fast path now runs the same
`findContours` + `fillPoly` as the slow path, but it still eliminates
pydantic validation for Point/InstanceSegmentationPrediction,
`model_dump`, and the second rasterization inside
`sv.Detections.from_inference` → `polygon_to_mask`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../core/entities/responses/inference.py      |  6 ++
 .../core/models/inference_models_adapters.py  | 94 ++++++++++---------
 .../roboflow/instance_segmentation/v3.py      | 13 +--
 3 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/inference/core/entities/responses/inference.py b/inference/core/entities/responses/inference.py
index 912dbd98f3..acc761eb8b 100644
--- a/inference/core/entities/responses/inference.py
+++ b/inference/core/entities/responses/inference.py
@@ -4,6 +4,12 @@
 
 from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_serializer
 
+# Attr name the instance-segmentation fast path uses on response.__dict__ to
+# hand a pre-built sv.Detections to the v3 workflow block without going through
+# pydantic serialization. Pydantic v2 ignores extra __dict__ keys in
+# model_dump/jsonable_encoder, so this never leaks into serialized output.
+SV_DETECTIONS_FAST_ATTR = "_sv_detections_fast"
+
 
 class ObjectDetectionPrediction(BaseModel):
     """Object Detection prediction.
diff --git a/inference/core/models/inference_models_adapters.py b/inference/core/models/inference_models_adapters.py
index f4fb090340..a7c9da4863 100644
--- a/inference/core/models/inference_models_adapters.py
+++ b/inference/core/models/inference_models_adapters.py
@@ -4,6 +4,7 @@
 from time import perf_counter
 from typing import Any, List, Optional, Tuple, Union
 
+import cv2
 import numpy as np
 import supervision as sv
 import torch
@@ -16,6 +17,7 @@
     InferenceRequest,
 )
 from inference.core.entities.responses.inference import (
+    SV_DETECTIONS_FAST_ATTR,
     ClassificationInferenceResponse,
     InferenceResponse,
     InferenceResponseImage,
@@ -315,17 +317,9 @@ def postprocess(
             predictions, preprocess_return_metadata, **mapped_kwargs
         )
 
-        # Workflow callers consume an `sv.Detections` via the v3 block and
-        # don't need the per-detection polygon/pydantic-prediction encoding.
-        # When we detect that caller we attach a pre-built `sv.Detections`
-        # to the response and skip `masks2poly` + `InstanceSegmentationPrediction`
-        # construction entirely. The workflow block then bypasses
-        # `model_dump` + `sv.Detections.from_inference` (which would rasterize
-        # the polygons we just produced back into masks).
         is_workflow = (
             kwargs.get("source") == "workflow-execution" and not return_in_rle
         )
-        class_filter = kwargs.get("class_filter")
 
         responses: List[InstanceSegmentationInferenceResponse] = []
         for preproc_metadata, det in zip(preprocess_return_metadata, detections_list):
@@ -343,7 +337,6 @@ def postprocess(
                     confs=confs,
                     class_ids=class_ids,
                     masks=masks_np,
-                    class_filter=class_filter,
                     width=W,
                     height=H,
                 )
@@ -434,7 +427,6 @@ def _build_workflow_fastpath_response(
         confs: np.ndarray,
         class_ids: np.ndarray,
         masks: np.ndarray,
-        class_filter: Optional[List[str]],
         width: int,
         height: int,
     ) -> InstanceSegmentationInferenceResponse:
@@ -446,44 +438,55 @@ def _build_workflow_fastpath_response(
             sv_dets = sv.Detections.empty()
             sv_dets.data = {CLASS_NAME_DATA_FIELD: np.empty(0, dtype=object)}
         else:
+            # Reproduce the slow path's mask denoising: per mask, keep only
+            # the largest external contour (by vertex count) and refill it,
+            # which filters disconnected mask fragments AND fills interior
+            # holes (RETR_EXTERNAL ignores inner contours). Detections whose
+            # largest contour has fewer than 3 vertices are dropped, matching
+            # `filter_out_invalid_polygons` + the `>= 3` check in
+            # supervision's `process_roboflow_result`.
+            denoised = np.zeros_like(masks, dtype=np.uint8)
+            keep_mask = np.zeros(n, dtype=bool)
+            for i in range(n):
+                m = masks[i]
+                if m.dtype == np.bool_:
+                    m = m.view(np.uint8)
+                elif m.dtype != np.uint8:
+                    m = (m > 0).astype(np.uint8)
+                if not m.flags.c_contiguous:
+                    m = np.ascontiguousarray(m)
+                contours = cv2.findContours(
+                    m, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+                )[0]
+                if not contours:
+                    continue
+                best = max(contours, key=len)
+                # Match supervision's `>= 3` threshold on polygon vertex count.
+                if len(best) < 3:
+                    continue
+                cv2.fillPoly(denoised[i], [best.reshape(-1, 2)], color=1)
+                keep_mask[i] = True
+
             class_id_int = class_ids.astype(np.int64, copy=False)
-            in_range = (class_id_int >= 0) & (class_id_int < n_classes)
             class_name_arr = np.empty(n, dtype=object)
-            if in_range.all():
-                for i, cid in enumerate(class_id_int):
-                    class_name_arr[i] = class_names_map[int(cid)]
-            else:
-                for i, cid in enumerate(class_id_int):
-                    ci = int(cid)
-                    class_name_arr[i] = (
-                        class_names_map[ci] if 0 <= ci < n_classes else str(ci)
-                    )
-
-            if class_filter:
-                keep = np.fromiter(
-                    (name in class_filter for name in class_name_arr),
-                    dtype=bool,
-                    count=n,
+            for i, cid in enumerate(class_id_int):
+                ci = int(cid)
+                class_name_arr[i] = (
+                    class_names_map[ci] if 0 <= ci < n_classes else str(ci)
                 )
-                if not keep.all():
-                    xyxy = xyxy[keep]
-                    confs = confs[keep]
-                    class_id_int = class_id_int[keep]
-                    class_name_arr = class_name_arr[keep]
-                    masks = masks[keep]
-
-            xyxy_f = (
-                xyxy.astype(np.float32, copy=False)
-                if xyxy.dtype != np.float32
-                else xyxy
-            )
-            mask_bool = (
-                masks.astype(bool, copy=False) if masks.dtype != np.bool_ else masks
-            )
+
+            if not keep_mask.all():
+                xyxy = xyxy[keep_mask]
+                confs = confs[keep_mask]
+                class_id_int = class_id_int[keep_mask]
+                class_name_arr = class_name_arr[keep_mask]
+                denoised = denoised[keep_mask]
+
+            mask_bool = denoised.astype(bool, copy=False)
             sv_dets = sv.Detections(
-                xyxy=xyxy_f,
+                xyxy=xyxy.astype(np.float32, copy=False),
                 confidence=confs.astype(np.float32, copy=False),
-                class_id=class_id_int.astype(int, copy=False),
+                class_id=class_id_int,
                 mask=mask_bool if mask_bool.size else None,
                 data={CLASS_NAME_DATA_FIELD: class_name_arr},
             )
@@ -492,10 +495,9 @@ def _build_workflow_fastpath_response(
             predictions=[],
             image=InferenceResponseImage(width=width, height=height),
         )
-        # Stash the pre-built sv.Detections for the v3 workflow block to pick
-        # up. Pydantic v2 ignores extra __dict__ keys in model_dump and
+        # Pydantic v2 ignores extra __dict__ keys in model_dump and
         # jsonable_encoder, so this never leaks into serialized output.
-        response.__dict__["_sv_detections_fast"] = sv_dets
+        response.__dict__[SV_DETECTIONS_FAST_ATTR] = sv_dets
         return response
 
     def clear_cache(self, delete_from_disk: bool = True) -> None:
diff --git a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
index ca0cc2e676..3c1e6322ae 100644
--- a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
+++ b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py
@@ -15,6 +15,7 @@
     WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_BATCH_SIZE,
     WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS,
 )
+from inference.core.entities.responses.inference import SV_DETECTIONS_FAST_ATTR
 from inference.core.managers.base import ModelManager
 from inference.core.workflows.core_steps.common.entities import StepExecutionMode
 from inference.core.workflows.core_steps.common.utils import (
@@ -27,7 +28,6 @@
     DETECTION_ID_KEY,
     IMAGE_DIMENSIONS_KEY,
     INFERENCE_ID_KEY,
-    PARENT_ID_KEY,
 )
 from inference.core.workflows.execution_engine.entities.base import (
     Batch,
@@ -335,10 +335,7 @@ def run_locally(
         )
         if not isinstance(predictions, list):
             predictions = [predictions]
-        # Fast path: the adapter attaches a pre-built `sv.Detections` when the
-        # request's source is `workflow-execution`, letting us skip the
-        # mask → polygon → mask round-trip via `model_dump` + `from_inference`.
-        sv_fast = [p.__dict__.get("_sv_detections_fast") for p in predictions]
+        sv_fast = [p.__dict__.get(SV_DETECTIONS_FAST_ATTR) for p in predictions]
         if all(det is not None for det in sv_fast):
             inference_ids = [p.inference_id for p in predictions]
             return self._post_process_result_fast(
@@ -452,9 +449,6 @@ def _post_process_result_fast(
         class_filter: Optional[List[str]],
         model_id: str,
     ) -> BlockResult:
-        # Skips the dict → sv.Detections conversion (which would
-        # `polygon_to_mask` each detection) because the adapter already
-        # produced a ready-to-use `sv.Detections`.
         augmented: List[sv.Detections] = []
         for image, detections, inference_id in zip(
             images, sv_detections, inference_ids
@@ -463,9 +457,6 @@ def _post_process_result_fast(
             detections[DETECTION_ID_KEY] = np.array(
                 [str(uuid.uuid4()) for _ in range(n)]
             )
-            detections[PARENT_ID_KEY] = np.array([""] * n)
-            # image.numpy_image is the frame actually inferred on; shape
-            # matches the (H, W) baked into the sv_detections masks.
             h, w = image.numpy_image.shape[:2]
             detections[IMAGE_DIMENSIONS_KEY] = np.array([[h, w]] * n)
             if inference_id is not None: