From 017dc9040fdc5281b01098b4eead484f5b665331 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 30 Apr 2026 02:59:44 +0000 Subject: [PATCH 1/2] =?UTF-8?q?perf(rfdetr-seg):=20skip=20mask=E2=86=92pol?= =?UTF-8?q?y=E2=86=92mask=20round-trip=20on=20workflow=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `InferenceModelsInstanceSegmentationAdapter.postprocess` used to convert every detection's mask to a polygon via `masks2poly` (cv2 findContours), wrap each vertex in a `Point` pydantic model, and build a validated `InstanceSegmentationPrediction`. The v3 workflow block then called `model_dump` and `sv.Detections.from_inference`, which rasterized those polygons back into masks via `polygon_to_mask`. When the caller is a workflow (`request.source == "workflow-execution"`), none of that encoding is observable — the v3 block consumes an `sv.Detections` with masks. This change: * Has the adapter build `sv.Detections` directly from the numpy xyxy/confidence/class_id/mask buffers and attach it via `response.__dict__["_sv_detections_fast"]` (pydantic v2 ignores extra __dict__ keys in dump/serialize, so HTTP payloads are unaffected). The polygon+pydantic path is preserved for all other callers, including RLE responses. * Teaches the v3 block to detect the attached `sv.Detections` and route through a new `_post_process_result_fast`, skipping `model_dump` + `convert_inference_detections_batch_to_sv_detections` entirely. 
Benchmark on a T4 with rfdetr-seg-nano TRT + Triton preproc + full-Triton postproc + CUDA graphs, streaming vehicles_312px.mp4 (538 frames) via `InferencePipeline`: * baseline (4 runs): mean 151.80 FPS * this change (4 runs): mean 164.51 FPS * **+12.7 FPS, ~+8.4%** Co-Authored-By: Claude Opus 4.7 (1M context) --- .../core/models/inference_models_adapters.py | 101 +++++++++++++++++- .../roboflow/instance_segmentation/v3.py | 71 +++++++++++- 2 files changed, 170 insertions(+), 2 deletions(-) diff --git a/inference/core/models/inference_models_adapters.py b/inference/core/models/inference_models_adapters.py index 5620ff16c3..f4fb090340 100644 --- a/inference/core/models/inference_models_adapters.py +++ b/inference/core/models/inference_models_adapters.py @@ -5,9 +5,11 @@ from typing import Any, List, Optional, Tuple, Union import numpy as np +import supervision as sv import torch from PIL import Image, ImageDraw, ImageFont from pycocotools import mask as mask_utils +from supervision.config import CLASS_NAME_DATA_FIELD from inference.core.entities.requests import ( ClassificationInferenceRequest, @@ -313,6 +315,18 @@ def postprocess( predictions, preprocess_return_metadata, **mapped_kwargs ) + # Workflow callers consume an `sv.Detections` via the v3 block and + # don't need the per-detection polygon/pydantic-prediction encoding. + # When we detect that caller we attach a pre-built `sv.Detections` + # to the response and skip `masks2poly` + `InstanceSegmentationPrediction` + # construction entirely. The workflow block then bypasses + # `model_dump` + `sv.Detections.from_inference` (which would rasterize + # the polygons we just produced back into masks). 
+ is_workflow = ( + kwargs.get("source") == "workflow-execution" and not return_in_rle + ) + class_filter = kwargs.get("class_filter") + responses: List[InstanceSegmentationInferenceResponse] = [] for preproc_metadata, det in zip(preprocess_return_metadata, detections_list): H = preproc_metadata.original_size.height @@ -320,6 +334,22 @@ def postprocess( xyxy = det.xyxy.detach().cpu().numpy() confs = det.confidence.detach().cpu().numpy() + class_ids = det.class_id.detach().cpu().numpy() + + if is_workflow and isinstance(det.mask, torch.Tensor): + masks_np = det.mask.detach().cpu().numpy() + response = self._build_workflow_fastpath_response( + xyxy=xyxy, + confs=confs, + class_ids=class_ids, + masks=masks_np, + class_filter=class_filter, + width=W, + height=H, + ) + responses.append(response) + continue + if isinstance(det.mask, torch.Tensor): masks = det.mask.detach().cpu().numpy() if return_in_rle: @@ -333,7 +363,6 @@ def postprocess( polys_or_rles = det.mask.to_coco_rle_masks() else: polys_or_rles = rle_masks2poly(det.mask) - class_ids = det.class_id.detach().cpu().numpy() predictions: List[ Union[InstanceSegmentationPrediction, InstanceSegmentationRLEPrediction] @@ -399,6 +428,76 @@ def postprocess( ) return responses + def _build_workflow_fastpath_response( + self, + xyxy: np.ndarray, + confs: np.ndarray, + class_ids: np.ndarray, + masks: np.ndarray, + class_filter: Optional[List[str]], + width: int, + height: int, + ) -> InstanceSegmentationInferenceResponse: + n = int(class_ids.shape[0]) if class_ids.ndim else 0 + class_names_map = self.class_names + n_classes = len(class_names_map) + + if n == 0: + sv_dets = sv.Detections.empty() + sv_dets.data = {CLASS_NAME_DATA_FIELD: np.empty(0, dtype=object)} + else: + class_id_int = class_ids.astype(np.int64, copy=False) + in_range = (class_id_int >= 0) & (class_id_int < n_classes) + class_name_arr = np.empty(n, dtype=object) + if in_range.all(): + for i, cid in enumerate(class_id_int): + class_name_arr[i] = 
class_names_map[int(cid)] + else: + for i, cid in enumerate(class_id_int): + ci = int(cid) + class_name_arr[i] = ( + class_names_map[ci] if 0 <= ci < n_classes else str(ci) + ) + + if class_filter: + keep = np.fromiter( + (name in class_filter for name in class_name_arr), + dtype=bool, + count=n, + ) + if not keep.all(): + xyxy = xyxy[keep] + confs = confs[keep] + class_id_int = class_id_int[keep] + class_name_arr = class_name_arr[keep] + masks = masks[keep] + + xyxy_f = ( + xyxy.astype(np.float32, copy=False) + if xyxy.dtype != np.float32 + else xyxy + ) + mask_bool = ( + masks.astype(bool, copy=False) if masks.dtype != np.bool_ else masks + ) + sv_dets = sv.Detections( + xyxy=xyxy_f, + confidence=confs.astype(np.float32, copy=False), + class_id=class_id_int.astype(int, copy=False), + mask=mask_bool if mask_bool.size else None, + data={CLASS_NAME_DATA_FIELD: class_name_arr}, + ) + + response = InstanceSegmentationInferenceResponse( + predictions=[], + image=InferenceResponseImage(width=width, height=height), + ) + # Stash the pre-built sv.Detections for the v3 workflow block to pick + # up. Pydantic v2 ignores extra __dict__ keys in model_dump and + # jsonable_encoder, so this never leaks into serialized output. + response.__dict__["_sv_detections_fast"] = sv_dets + return response + def clear_cache(self, delete_from_disk: bool = True) -> None: """Clears any cache if necessary. TODO: Implement this to delete the cache from the experimental model. 
diff --git a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py index 4e9e62eb16..ca0cc2e676 100644 --- a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py +++ b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py @@ -1,5 +1,8 @@ +import uuid from typing import List, Literal, Optional, Type, Union +import numpy as np +import supervision as sv from pydantic import ConfigDict, Field, PositiveInt, model_validator from inference.core.entities.requests.inference import ( @@ -20,7 +23,12 @@ convert_inference_detections_batch_to_sv_detections, filter_out_unwanted_classes_from_sv_detections_batch, ) -from inference.core.workflows.execution_engine.constants import INFERENCE_ID_KEY +from inference.core.workflows.execution_engine.constants import ( + DETECTION_ID_KEY, + IMAGE_DIMENSIONS_KEY, + INFERENCE_ID_KEY, + PARENT_ID_KEY, +) from inference.core.workflows.execution_engine.entities.base import ( Batch, OutputDefinition, @@ -327,6 +335,19 @@ def run_locally( ) if not isinstance(predictions, list): predictions = [predictions] + # Fast path: the adapter attaches a pre-built `sv.Detections` when the + # request's source is `workflow-execution`, letting us skip the + # mask → polygon → mask round-trip via `model_dump` + `from_inference`. 
+ sv_fast = [p.__dict__.get("_sv_detections_fast") for p in predictions] + if all(det is not None for det in sv_fast): + inference_ids = [p.inference_id for p in predictions] + return self._post_process_result_fast( + images=images, + sv_detections=sv_fast, + inference_ids=inference_ids, + class_filter=class_filter, + model_id=model_id, + ) predictions = [ e.model_dump(by_alias=True, exclude_none=True) for e in predictions ] @@ -422,3 +443,51 @@ def _post_process_result( } for inference_id, prediction in zip(inference_ids, predictions) ] + + def _post_process_result_fast( + self, + images: Batch[WorkflowImageData], + sv_detections: List[sv.Detections], + inference_ids: List[Optional[str]], + class_filter: Optional[List[str]], + model_id: str, + ) -> BlockResult: + # Skips the dict → sv.Detections conversion (which would + # `polygon_to_mask` each detection) because the adapter already + # produced a ready-to-use `sv.Detections`. + augmented: List[sv.Detections] = [] + for image, detections, inference_id in zip( + images, sv_detections, inference_ids + ): + n = len(detections) + detections[DETECTION_ID_KEY] = np.array( + [str(uuid.uuid4()) for _ in range(n)] + ) + detections[PARENT_ID_KEY] = np.array([""] * n) + # image.numpy_image is the frame actually inferred on; shape + # matches the (H, W) baked into the sv_detections masks. 
+ h, w = image.numpy_image.shape[:2] + detections[IMAGE_DIMENSIONS_KEY] = np.array([[h, w]] * n) + if inference_id is not None: + detections[INFERENCE_ID_KEY] = np.array([inference_id] * n) + augmented.append(detections) + augmented = attach_prediction_type_info_to_sv_detections_batch( + predictions=augmented, + prediction_type="instance-segmentation", + ) + augmented = filter_out_unwanted_classes_from_sv_detections_batch( + predictions=augmented, + classes_to_accept=class_filter, + ) + augmented = attach_parents_coordinates_to_batch_of_sv_detections( + images=images, + predictions=augmented, + ) + return [ + { + "inference_id": inference_id, + "predictions": prediction, + "model_id": model_id, + } + for inference_id, prediction in zip(inference_ids, augmented) + ] From 7382d9ebd114e8c46ff97cd254b765ad69b69cc1 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Thu, 30 Apr 2026 18:31:25 +0000 Subject: [PATCH 2/2] fix(rfdetr-seg): preserve mask denoising on workflow fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fast path handed the raw model-output masks (already copied to host NumPy arrays) straight to `sv.Detections`, skipping the `masks2poly` → `polygon_to_mask` round-trip that the slow path ran. That round-trip has two behavioral side-effects the fast path was inadvertently dropping: 1. Single-contour-only: `findContours(RETR_EXTERNAL)` + picking the contour with the most vertices (the "largest" contour below, measured by vertex count rather than area) drops every other disconnected mask fragment. 2. Hole-filling: `RETR_EXTERNAL` ignores inner contours, so `fillPoly(largest_contour)` fills any holes inside the shape. Plus `filter_out_invalid_polygons` + the `>= 3` vertex check in `supervision.process_roboflow_result` drop detections whose largest contour has fewer than 3 points.
This change reproduces the slow-path mask semantics inside `_build_workflow_fastpath_response` by running the same `findContours(RETR_EXTERNAL, CHAIN_APPROX_SIMPLE)` + `fillPoly` per mask, and dropping detections whose largest contour has fewer than 3 vertices. It also factors the shared attr name into `SV_DETECTIONS_FAST_ATTR` in `inference/core/entities/responses/inference.py`. Verified bit-exact mask equality vs the slow path on synthetic masks with disconnected fragments and interior holes. Benchmark on a T4 with the full Triton preproc + full-postproc + CUDA-graphs stack, streaming vehicles_312px.mp4 (538 frames) via InferencePipeline: * baseline (no fast path): mean 152.33 FPS * fast path WITHOUT denoising (wrong): mean 164.51 FPS (+12.2, +8.0%) * **fast path WITH denoising (this change): mean 163.43 FPS (+11.1, +7.3%)** Denoising costs ~1 FPS (~0.7%) because both paths run the same `findContours + fillPoly`; the fast path still eliminates pydantic validation for Point/InstanceSegmentationPrediction, `model_dump`, and the second rasterization inside `sv.Detections.from_inference` → `polygon_to_mask`. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../core/entities/responses/inference.py | 6 ++ .../core/models/inference_models_adapters.py | 94 ++++++++++--------- .../roboflow/instance_segmentation/v3.py | 13 +-- 3 files changed, 56 insertions(+), 57 deletions(-) diff --git a/inference/core/entities/responses/inference.py b/inference/core/entities/responses/inference.py index 912dbd98f3..acc761eb8b 100644 --- a/inference/core/entities/responses/inference.py +++ b/inference/core/entities/responses/inference.py @@ -4,6 +4,12 @@ from pydantic import BaseModel, ConfigDict, Field, ValidationError, field_serializer +# Attr name the instance-segmentation fast path uses on response.__dict__ to +# hand a pre-built sv.Detections to the v3 workflow block without going through +# pydantic serialization. 
Pydantic v2 ignores extra __dict__ keys in +# model_dump/jsonable_encoder, so this never leaks into serialized output. +SV_DETECTIONS_FAST_ATTR = "_sv_detections_fast" + class ObjectDetectionPrediction(BaseModel): """Object Detection prediction. diff --git a/inference/core/models/inference_models_adapters.py b/inference/core/models/inference_models_adapters.py index f4fb090340..a7c9da4863 100644 --- a/inference/core/models/inference_models_adapters.py +++ b/inference/core/models/inference_models_adapters.py @@ -4,6 +4,7 @@ from time import perf_counter from typing import Any, List, Optional, Tuple, Union +import cv2 import numpy as np import supervision as sv import torch @@ -16,6 +17,7 @@ InferenceRequest, ) from inference.core.entities.responses.inference import ( + SV_DETECTIONS_FAST_ATTR, ClassificationInferenceResponse, InferenceResponse, InferenceResponseImage, @@ -315,17 +317,9 @@ def postprocess( predictions, preprocess_return_metadata, **mapped_kwargs ) - # Workflow callers consume an `sv.Detections` via the v3 block and - # don't need the per-detection polygon/pydantic-prediction encoding. - # When we detect that caller we attach a pre-built `sv.Detections` - # to the response and skip `masks2poly` + `InstanceSegmentationPrediction` - # construction entirely. The workflow block then bypasses - # `model_dump` + `sv.Detections.from_inference` (which would rasterize - # the polygons we just produced back into masks). 
is_workflow = ( kwargs.get("source") == "workflow-execution" and not return_in_rle ) - class_filter = kwargs.get("class_filter") responses: List[InstanceSegmentationInferenceResponse] = [] for preproc_metadata, det in zip(preprocess_return_metadata, detections_list): @@ -343,7 +337,6 @@ def postprocess( confs=confs, class_ids=class_ids, masks=masks_np, - class_filter=class_filter, width=W, height=H, ) @@ -434,7 +427,6 @@ def _build_workflow_fastpath_response( confs: np.ndarray, class_ids: np.ndarray, masks: np.ndarray, - class_filter: Optional[List[str]], width: int, height: int, ) -> InstanceSegmentationInferenceResponse: @@ -446,44 +438,55 @@ def _build_workflow_fastpath_response( sv_dets = sv.Detections.empty() sv_dets.data = {CLASS_NAME_DATA_FIELD: np.empty(0, dtype=object)} else: + # Reproduce the slow path's mask denoising: per mask, keep only + # the largest external contour (by vertex count) and refill it, + # which filters disconnected mask fragments AND fills interior + # holes (RETR_EXTERNAL ignores inner contours). Detections whose + # largest contour has fewer than 3 vertices are dropped, matching + # `filter_out_invalid_polygons` + the `>= 3` check in + # supervision's `process_roboflow_result`. + denoised = np.zeros_like(masks, dtype=np.uint8) + keep_mask = np.zeros(n, dtype=bool) + for i in range(n): + m = masks[i] + if m.dtype == np.bool_: + m = m.view(np.uint8) + elif m.dtype != np.uint8: + m = (m > 0).astype(np.uint8) + if not m.flags.c_contiguous: + m = np.ascontiguousarray(m) + contours = cv2.findContours( + m, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + )[0] + if not contours: + continue + best = max(contours, key=len) + # Match supervision's `>= 3` threshold on polygon vertex count. 
+ if len(best) < 3: + continue + cv2.fillPoly(denoised[i], [best.reshape(-1, 2)], color=1) + keep_mask[i] = True + class_id_int = class_ids.astype(np.int64, copy=False) - in_range = (class_id_int >= 0) & (class_id_int < n_classes) class_name_arr = np.empty(n, dtype=object) - if in_range.all(): - for i, cid in enumerate(class_id_int): - class_name_arr[i] = class_names_map[int(cid)] - else: - for i, cid in enumerate(class_id_int): - ci = int(cid) - class_name_arr[i] = ( - class_names_map[ci] if 0 <= ci < n_classes else str(ci) - ) - - if class_filter: - keep = np.fromiter( - (name in class_filter for name in class_name_arr), - dtype=bool, - count=n, + for i, cid in enumerate(class_id_int): + ci = int(cid) + class_name_arr[i] = ( + class_names_map[ci] if 0 <= ci < n_classes else str(ci) ) - if not keep.all(): - xyxy = xyxy[keep] - confs = confs[keep] - class_id_int = class_id_int[keep] - class_name_arr = class_name_arr[keep] - masks = masks[keep] - - xyxy_f = ( - xyxy.astype(np.float32, copy=False) - if xyxy.dtype != np.float32 - else xyxy - ) - mask_bool = ( - masks.astype(bool, copy=False) if masks.dtype != np.bool_ else masks - ) + + if not keep_mask.all(): + xyxy = xyxy[keep_mask] + confs = confs[keep_mask] + class_id_int = class_id_int[keep_mask] + class_name_arr = class_name_arr[keep_mask] + denoised = denoised[keep_mask] + + mask_bool = denoised.astype(bool, copy=False) sv_dets = sv.Detections( - xyxy=xyxy_f, + xyxy=xyxy.astype(np.float32, copy=False), confidence=confs.astype(np.float32, copy=False), - class_id=class_id_int.astype(int, copy=False), + class_id=class_id_int, mask=mask_bool if mask_bool.size else None, data={CLASS_NAME_DATA_FIELD: class_name_arr}, ) @@ -492,10 +495,9 @@ def _build_workflow_fastpath_response( predictions=[], image=InferenceResponseImage(width=width, height=height), ) - # Stash the pre-built sv.Detections for the v3 workflow block to pick - # up. 
Pydantic v2 ignores extra __dict__ keys in model_dump and + # Pydantic v2 ignores extra __dict__ keys in model_dump and # jsonable_encoder, so this never leaks into serialized output. - response.__dict__["_sv_detections_fast"] = sv_dets + response.__dict__[SV_DETECTIONS_FAST_ATTR] = sv_dets return response def clear_cache(self, delete_from_disk: bool = True) -> None: diff --git a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py index ca0cc2e676..3c1e6322ae 100644 --- a/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py +++ b/inference/core/workflows/core_steps/models/roboflow/instance_segmentation/v3.py @@ -15,6 +15,7 @@ WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_BATCH_SIZE, WORKFLOWS_REMOTE_EXECUTION_MAX_STEP_CONCURRENT_REQUESTS, ) +from inference.core.entities.responses.inference import SV_DETECTIONS_FAST_ATTR from inference.core.managers.base import ModelManager from inference.core.workflows.core_steps.common.entities import StepExecutionMode from inference.core.workflows.core_steps.common.utils import ( @@ -27,7 +28,6 @@ DETECTION_ID_KEY, IMAGE_DIMENSIONS_KEY, INFERENCE_ID_KEY, - PARENT_ID_KEY, ) from inference.core.workflows.execution_engine.entities.base import ( Batch, @@ -335,10 +335,7 @@ def run_locally( ) if not isinstance(predictions, list): predictions = [predictions] - # Fast path: the adapter attaches a pre-built `sv.Detections` when the - # request's source is `workflow-execution`, letting us skip the - # mask → polygon → mask round-trip via `model_dump` + `from_inference`. 
- sv_fast = [p.__dict__.get("_sv_detections_fast") for p in predictions] + sv_fast = [p.__dict__.get(SV_DETECTIONS_FAST_ATTR) for p in predictions] if all(det is not None for det in sv_fast): inference_ids = [p.inference_id for p in predictions] return self._post_process_result_fast( @@ -452,9 +449,6 @@ def _post_process_result_fast( class_filter: Optional[List[str]], model_id: str, ) -> BlockResult: - # Skips the dict → sv.Detections conversion (which would - # `polygon_to_mask` each detection) because the adapter already - # produced a ready-to-use `sv.Detections`. augmented: List[sv.Detections] = [] for image, detections, inference_id in zip( images, sv_detections, inference_ids @@ -463,9 +457,6 @@ def _post_process_result_fast( detections[DETECTION_ID_KEY] = np.array( [str(uuid.uuid4()) for _ in range(n)] ) - detections[PARENT_ID_KEY] = np.array([""] * n) - # image.numpy_image is the frame actually inferred on; shape - # matches the (H, W) baked into the sv_detections masks. h, w = image.numpy_image.shape[:2] detections[IMAGE_DIMENSIONS_KEY] = np.array([[h, w]] * n) if inference_id is not None: