
[Validate] add warning for unspecified arguments, and change default values #277


Open · wants to merge 3 commits into master
10 changes: 10 additions & 0 deletions nucleus/metrics/base.py
@@ -11,6 +11,8 @@
)
from nucleus.prediction import PredictionList

EPSILON = 10 ** -4 # 0.0001


class MetricResult(ABC):
"""Base MetricResult class"""
@@ -41,6 +43,14 @@ def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
value = total_value / max(total_weight, sys.float_info.epsilon)
return ScalarResult(value, total_weight)

def __eq__(self, other):
if not isinstance(other, self.__class__):
return False
return (
abs(self.value - other.value) < EPSILON
and self.weight == other.weight
)


class Metric(ABC):
"""Abstract class for defining a metric, which takes a list of annotations
60 changes: 47 additions & 13 deletions nucleus/metrics/cuboid_metrics.py
@@ -1,4 +1,5 @@
import sys
import warnings
from abc import abstractmethod
from typing import List, Optional, Union

@@ -10,6 +11,9 @@
from .filtering import ListOfAndFilters, ListOfOrAndFilters
from .filters import confidence_filter

DEFAULT_IOU_THRESHOLD = 0.1
Contributor
Why are we changing the default from 0.0 to 0.1? I'm not against it; it would just be nice to have a comment explaining the choice of the default.

Contributor Author
The default of 0.0 is really confusing in my opinion, in particular because it makes predictions with zero overlap count as true positives. 0.1 seemed reasonable for 3D IoU, though I wouldn't be opposed to 0.5 (to match the 2D default threshold) - what do you think?

DEFAULT_CONFIDENCE_THRESHOLD = 0.0


class CuboidMetric(Metric):
"""Abstract class for metrics of cuboids.
@@ -27,8 +31,9 @@ class CuboidMetric(Metric):

def __init__(
self,
iou_threshold: float,
enforce_label_match: bool = False,
confidence_threshold: float = 0.0,
confidence_threshold: Optional[float] = None,
Contributor
Doesn't it make sense to also add the IoU threshold to the base class? How would a metric make sense without computing the IoU for the matches?

Contributor Author
I also noticed that the iou_threshold wasn't used in the base class; I can add it.

annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
] = None,
@@ -54,6 +59,11 @@ def __init__(
(AND), forming a more selective and multiple column predicate. Finally, the most outer list combines
these filters as a disjunction (OR).
"""
if confidence_threshold is None:
confidence_threshold = DEFAULT_CONFIDENCE_THRESHOLD
warnings.warn(
f"Got confidence_threshold value of `None`. In this case, we set the confidence_threshold to {confidence_threshold} (include all predictions, regardless of confidence). Consider specifying this value explicitly during metric initialization"
)
self.enforce_label_match = enforce_label_match
assert 0 <= confidence_threshold <= 1
self.confidence_threshold = confidence_threshold
@@ -99,8 +109,8 @@ class CuboidIOU(CuboidMetric):
def __init__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
confidence_threshold: float = 0.0,
iou_threshold: Optional[float] = None,
confidence_threshold: Optional[float] = None,
iou_2d: bool = False,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -127,13 +137,19 @@ def __init__(
interpreted as a conjunction (AND), forming a more selective and multiple column predicate.
Finally, the most outer list combines these filters as a disjunction (OR).
"""
if iou_threshold is None:
iou_threshold = DEFAULT_IOU_THRESHOLD
warnings.warn(
f"The IoU threshold used for matching was initialized to `None`. In this case, the value of iou_threshold defaults to {iou_threshold}. If this value produces unexpected behavior, consider specifying the iou_threshold argument during metric initialization"
)
assert (
0 <= iou_threshold <= 1
), "IoU threshold must be between 0 and 1."
self.iou_threshold = iou_threshold
self.iou_2d = iou_2d
super().__init__(
enforce_label_match=enforce_label_match,
iou_threshold=iou_threshold,
confidence_threshold=confidence_threshold,
annotation_filters=annotation_filters,
prediction_filters=prediction_filters,
@@ -147,14 +163,16 @@ def eval(
iou_3d_metric, iou_2d_metric = detection_iou(
predictions,
annotations,
threshold_in_overlap_ratio=self.iou_threshold,
self.iou_threshold,
)

weight = max(len(annotations), len(predictions))
# If there are zero IoU matches, avg_iou defaults to value 0
if self.iou_2d:
avg_iou = iou_2d_metric.sum() / max(weight, sys.float_info.epsilon)
weight = len(iou_2d_metric)
avg_iou = iou_2d_metric.sum() / weight if weight > 0 else 0.0
else:
avg_iou = iou_3d_metric.sum() / max(weight, sys.float_info.epsilon)
weight = len(iou_3d_metric)
avg_iou = iou_3d_metric.sum() / weight if weight > 0 else 0.0

return ScalarResult(avg_iou, weight)

@@ -166,8 +184,8 @@ class CuboidPrecision(CuboidMetric):
def __init__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
confidence_threshold: float = 0.0,
iou_threshold: Optional[float] = None,
confidence_threshold: Optional[float] = None,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
] = None,
@@ -192,12 +210,18 @@ def __init__(
interpreted as a conjunction (AND), forming a more selective and multiple column predicate.
Finally, the most outer list combines these filters as a disjunction (OR).
"""
if iou_threshold is None:
iou_threshold = DEFAULT_IOU_THRESHOLD
warnings.warn(
f"The IoU threshold used for matching was initialized to `None`. In this case, the value of iou_threshold defaults to {iou_threshold}. If this value produces unexpected behavior, consider specifying the iou_threshold argument during metric initialization"
)
assert (
0 <= iou_threshold <= 1
), "IoU threshold must be between 0 and 1."
self.iou_threshold = iou_threshold
super().__init__(
enforce_label_match=enforce_label_match,
iou_threshold=iou_threshold,
confidence_threshold=confidence_threshold,
annotation_filters=annotation_filters,
prediction_filters=prediction_filters,
@@ -211,7 +235,9 @@ def eval(
stats = recall_precision(
predictions,
annotations,
threshold_in_overlap_ratio=self.iou_threshold,
self.iou_threshold,
self.confidence_threshold,
self.enforce_label_match,
)
weight = stats["tp_sum"] + stats["fp_sum"]
precision = stats["tp_sum"] / max(weight, sys.float_info.epsilon)
@@ -225,8 +251,8 @@ class CuboidRecall(CuboidMetric):
def __init__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
confidence_threshold: float = 0.0,
iou_threshold: Optional[float] = None,
confidence_threshold: Optional[float] = None,
Comment on lines +254 to +255
Contributor
Changes to default params need to be reflected in nucleus/validate/eval_functions/available_eval_functions.py configurations as well, since that is the client-facing code. The defaults there override the defaults provided here if not kept in sync.

annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
] = None,
@@ -241,12 +267,18 @@ def __init__(
iou_threshold: IOU threshold to consider detection as valid. Must be in [0, 1]. Default 0.0
confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
"""
if iou_threshold is None:
iou_threshold = DEFAULT_IOU_THRESHOLD
warnings.warn(
f"The IoU threshold used for matching was initialized to `None`. In this case, the value of iou_threshold defaults to {iou_threshold}. If this value produces unexpected behavior, consider specifying the iou_threshold argument during metric initialization"
)
assert (
0 <= iou_threshold <= 1
), "IoU threshold must be between 0 and 1."
self.iou_threshold = iou_threshold
super().__init__(
enforce_label_match=enforce_label_match,
iou_threshold=iou_threshold,
confidence_threshold=confidence_threshold,
annotation_filters=annotation_filters,
prediction_filters=prediction_filters,
@@ -260,7 +292,9 @@ def eval(
stats = recall_precision(
predictions,
annotations,
threshold_in_overlap_ratio=self.iou_threshold,
self.iou_threshold,
self.confidence_threshold,
self.enforce_label_match,
)
weight = stats["tp_sum"] + stats["fn_sum"]
recall = stats["tp_sum"] / max(weight, sys.float_info.epsilon)
80 changes: 48 additions & 32 deletions nucleus/metrics/cuboid_utils.py
@@ -1,4 +1,5 @@
from functools import wraps
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
@@ -35,6 +36,14 @@ def __init__(self, *args, **kwargs):
from .base import ScalarResult


@dataclass
class ProcessedCuboids:
xyz: np.ndarray
wlh: np.ndarray
yaw: np.ndarray
labels: List[str]


def group_cuboids_by_label(
annotations: List[CuboidAnnotation],
predictions: List[CuboidPrediction],
@@ -101,19 +110,25 @@ def wrapper(
return wrapper


def process_dataitem(dataitem):
processed_item = {}
processed_item["xyz"] = np.array(
[[ann.position.x, ann.position.y, ann.position.z] for ann in dataitem]
def process_cuboids(item_list, confidence_threshold=None):
if confidence_threshold:
item_list = [
item
for item in item_list
if item.confidence >= confidence_threshold
]
xyz = np.array(
[[ann.position.x, ann.position.y, ann.position.z] for ann in item_list]
)
processed_item["wlh"] = np.array(
wlh = np.array(
[
[ann.dimensions.x, ann.dimensions.y, ann.dimensions.z]
for ann in dataitem
for ann in item_list
]
)
processed_item["yaw"] = np.array([ann.yaw for ann in dataitem])
return processed_item
yaw = np.array([ann.yaw for ann in item_list])
labels = [ann.label for ann in item_list]
return ProcessedCuboids(xyz, wlh, yaw, labels)


def compute_outer_iou(
@@ -178,7 +193,6 @@ def compute_outer_iou(
.intersection(polygon_1)
.area
)

intersection = height_intersection * area_intersection
area_0 = wlh_0[:, 0] * wlh_0[:, 1]
area_1 = wlh_1[:, 0] * wlh_1[:, 1]
@@ -278,6 +292,8 @@ def recall_precision(
prediction: List[CuboidPrediction],
groundtruth: List[CuboidAnnotation],
threshold_in_overlap_ratio: float,
confidence_threshold: float,
enforce_label_match: bool,
) -> Dict[str, float]:
"""
Calculates the precision and recall of each lidar frame.
@@ -294,23 +310,23 @@ def recall_precision(
num_predicted = 0
num_instances = 0

gt_items = process_dataitem(groundtruth)
pred_items = process_dataitem(prediction)
gt_items = process_cuboids(groundtruth)
pred_items = process_cuboids(prediction, confidence_threshold)

num_predicted += pred_items["xyz"].shape[0]
num_instances += gt_items["xyz"].shape[0]
num_predicted += pred_items.xyz.shape[0]
num_instances += gt_items.xyz.shape[0]

tp = np.zeros(pred_items["xyz"].shape[0])
fp = np.ones(pred_items["xyz"].shape[0])
fn = np.ones(gt_items["xyz"].shape[0])
tp = np.zeros(pred_items.xyz.shape[0])
fp = np.ones(pred_items.xyz.shape[0])
fn = np.ones(gt_items.xyz.shape[0])

mapping = associate_cuboids_on_iou(
pred_items["xyz"],
pred_items["wlh"],
pred_items["yaw"] + np.pi / 2,
gt_items["xyz"],
gt_items["wlh"],
gt_items["yaw"] + np.pi / 2,
pred_items.xyz,
pred_items.wlh,
pred_items.yaw + np.pi / 2,
gt_items.xyz,
gt_items.wlh,
gt_items.yaw + np.pi / 2,
threshold_in_overlap_ratio=threshold_in_overlap_ratio,
)

@@ -351,27 +367,27 @@ def detection_iou(
:param threshold: IOU threshold to consider detection as valid. Must be in [0, 1].
"""

gt_items = process_dataitem(groundtruth)
pred_items = process_dataitem(prediction)
gt_items = process_cuboids(groundtruth)
pred_items = process_cuboids(prediction)

meter_2d = []
meter_3d = []

if gt_items["xyz"].shape[0] == 0 or pred_items["xyz"].shape[0] == 0:
if gt_items.xyz.shape[0] == 0 or pred_items.xyz.shape[0] == 0:
return np.array([0.0]), np.array([0.0])

iou_3d, iou_2d = compute_outer_iou(
gt_items["xyz"],
gt_items["wlh"],
gt_items["yaw"],
pred_items["xyz"],
pred_items["wlh"],
pred_items["yaw"],
gt_items.xyz,
gt_items.wlh,
gt_items.yaw,
pred_items.xyz,
pred_items.wlh,
pred_items.yaw,
)

for i, m in enumerate(iou_3d.max(axis=1)):
j = iou_3d[i].argmax()
if m >= threshold_in_overlap_ratio:
j = iou_3d[i].argmax()
meter_3d.append(iou_3d[i, j])
meter_2d.append(iou_2d[i, j])
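
The ProcessedCuboids dataclass replaces the old dict returned by process_dataitem, so downstream code reads pred_items.xyz instead of pred_items["xyz"], and prediction lists can now be pre-filtered by confidence. A small sketch of the new helper; the confidence keyword on CuboidPrediction is an assumption here, mirroring the item.confidence attribute the filter relies on:

from nucleus.annotation import Point3D
from nucleus.metrics.cuboid_utils import process_cuboids
from nucleus.prediction import CuboidPrediction

preds = [
    CuboidPrediction(label="car", position=Point3D(0, 0, 0), dimensions=Point3D(10, 10, 5),
                     yaw=0.0, reference_id="item_A", confidence=0.9),
    CuboidPrediction(label="car", position=Point3D(5, 5, 5), dimensions=Point3D(2, 2, 2),
                     yaw=0.0, reference_id="item_A", confidence=0.2),
]
items = process_cuboids(preds, confidence_threshold=0.5)  # drops the 0.2-confidence box
assert items.xyz.shape == (1, 3) and items.labels == ["car"]
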

19 changes: 11 additions & 8 deletions nucleus/validate/eval_functions/available_eval_functions.py
@@ -10,12 +10,15 @@
from ..data_transfer_objects.eval_function import EvalFunctionEntry
from ..errors import EvalFunctionNotAvailableError

DEFAULT_2D_IOU_THRESHOLD = 0.5
DEFAULT_3D_IOU_THRESHOLD = 0.1


class PolygonIOUConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = False,
iou_threshold: float = 0.0,
iou_threshold: float = DEFAULT_2D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -77,7 +80,7 @@ def expected_name(cls) -> str:
class PolygonMAPConfig(EvalFunctionConfig):
def __call__(
self,
iou_threshold: float = 0.5,
iou_threshold: float = DEFAULT_2D_IOU_THRESHOLD,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
] = None,
@@ -135,7 +138,7 @@ class PolygonRecallConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = False,
iou_threshold: float = 0.5,
iou_threshold: float = DEFAULT_2D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -198,7 +201,7 @@ class PolygonPrecisionConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = False,
iou_threshold: float = 0.5,
iou_threshold: float = DEFAULT_2D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -261,7 +264,7 @@ class CuboidIOU2DConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
iou_threshold: float = DEFAULT_2D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -315,7 +318,7 @@ class CuboidIOU3DConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
iou_threshold: float = DEFAULT_3D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -370,7 +373,7 @@ class CuboidPrecisionConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
iou_threshold: float = DEFAULT_3D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
@@ -424,7 +427,7 @@ class CuboidRecallConfig(EvalFunctionConfig):
def __call__(
self,
enforce_label_match: bool = True,
iou_threshold: float = 0.0,
iou_threshold: float = DEFAULT_3D_IOU_THRESHOLD,
confidence_threshold: float = 0.0,
annotation_filters: Optional[
Union[ListOfOrAndFilters, ListOfAndFilters]
375 changes: 374 additions & 1 deletion tests/metrics/test_cuboid_metrics.py
@@ -1,5 +1,14 @@
import pytest

from nucleus.annotation import CuboidAnnotation, Point3D, AnnotationList
from nucleus.metrics.base import ScalarResult
from nucleus.metrics.cuboid_metrics import (
CuboidIOU,
CuboidPrecision,
CuboidRecall,
)
from nucleus.prediction import CuboidPrediction, PredictionList

try:
import shapely
except ModuleNotFoundError:
@@ -8,4 +17,368 @@
allow_module_level=True,
)

# TODO(gunnar): Add Cuboid tests!
CAR_LABEL = "car"
PEDESTRIAN_LABEL = "pedestrian"
DEFAULT_90_DEGREE_ROTATION = 1.57079


def test_cuboid_metrics_simple():
# single item, perfect predictions
annotations = AnnotationList(
cuboid_annotations=[
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
]
)
predictions = PredictionList(
cuboid_predictions=[
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
]
)
assert CuboidIOU()(annotations, predictions) == ScalarResult(
1.0, len(annotations)
), "Unexpected Cuboid IoU result"
assert CuboidPrecision()(annotations, predictions) == ScalarResult(
1.0, len(annotations)
), "Unexpected Cuboid Precision result"
assert CuboidRecall()(annotations, predictions) == ScalarResult(
1.0, len(annotations)
), "Unexpected Cuboid Recall result"


def test_cuboid_metrics_numerical_check():
# single item, realistic predictions w/ matches and non-matches
annotations = AnnotationList(
cuboid_annotations=[
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(-100, -100, -100),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
), # false negative
]
)
predictions = PredictionList(
cuboid_predictions=[
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(1.0, 1.0, 1.0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(999, 999, 999),
dimensions=Point3D(8, 8, 6),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(250, 250, 250),
dimensions=Point3D(2, 2, 2),
yaw=0.0,
reference_id="item_A",
), # false positive
]
)
cuboid_iou_result = CuboidIOU()(annotations, predictions)
cuboid_precision_result = CuboidPrecision()(annotations, predictions)
cuboid_recall_result = CuboidRecall()(annotations, predictions)
assert cuboid_iou_result == ScalarResult(
0.4316, 2
), f"Unexpected Cuboid IoU result: {cuboid_iou_result}"
assert cuboid_precision_result == ScalarResult(
2.0 / 3.0, len(predictions)
), f"Unexpected Cuboid Precision result {cuboid_precision_result}"
assert cuboid_recall_result == ScalarResult(
2.0 / 3.0, len(annotations)
), f"Unexpected Cuboid Recall result {cuboid_recall_result}"


def test_cuboid_metrics_numerical_check_rotation():
annotations = AnnotationList(
cuboid_annotations=[
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 5, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(8, 4, 6),
yaw=DEFAULT_90_DEGREE_ROTATION,
reference_id="item_A",
),
]
)
predictions = PredictionList(
cuboid_predictions=[
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 5, 5),
yaw=DEFAULT_90_DEGREE_ROTATION,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(8, 4, 6),
yaw=0.0,
reference_id="item_A",
),
]
)
cuboid_iou_result = CuboidIOU()(annotations, predictions)
cuboid_precision_result = CuboidPrecision()(annotations, predictions)
cuboid_recall_result = CuboidRecall()(annotations, predictions)
assert cuboid_iou_result == ScalarResult(
1.0 / 3.0, 2
), f"Unexpected Cuboid IoU result: {cuboid_iou_result}"


def test_cuboid_metrics_class_labels():
annotations = AnnotationList(
cuboid_annotations=[
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(-100, -100, -100),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
), # false negative
]
)
predictions = PredictionList(
cuboid_predictions=[
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(1.0, 1.0, 1.0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=PEDESTRIAN_LABEL,
position=Point3D(999, 999, 999),
dimensions=Point3D(8, 8, 6),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(250, 250, 250),
dimensions=Point3D(2, 2, 2),
yaw=0.0,
reference_id="item_A",
), # false positive
]
)

cuboid_iou_result1 = CuboidIOU(enforce_label_match=True)(
annotations, predictions
)
cuboid_precision_result1 = CuboidPrecision(enforce_label_match=True)(
annotations, predictions
)
cuboid_recall_result1 = CuboidRecall(enforce_label_match=True)(
annotations, predictions
)
assert cuboid_iou_result1 == ScalarResult(
0.47928, 1
), f"Unexpected Cuboid IoU result: {cuboid_iou_result1}"
assert cuboid_precision_result1 == ScalarResult(
1.0 / 3.0, len(predictions)
), f"Unexpected Cuboid Precision result {cuboid_precision_result1}"
assert cuboid_recall_result1 == ScalarResult(
1.0 / 3.0, len(annotations)
), f"Unexpected Cuboid Recall result {cuboid_recall_result1}"

cuboid_iou_result2 = CuboidIOU(enforce_label_match=False)(
annotations, predictions
)
cuboid_precision_result2 = CuboidPrecision(enforce_label_match=False)(
annotations, predictions
)
cuboid_recall_result2 = CuboidRecall(enforce_label_match=False)(
annotations, predictions
)
assert cuboid_iou_result2 == ScalarResult(
0.4316, 2
), f"Unexpected Cuboid IoU result: {cuboid_iou_result2}"
assert cuboid_precision_result2 == ScalarResult(
2.0 / 3.0, len(predictions)
), f"Unexpected Cuboid Precision result {cuboid_precision_result2}"
assert cuboid_recall_result2 == ScalarResult(
2.0 / 3.0, len(annotations)
), f"Unexpected Cuboid Recall result {cuboid_recall_result2}"


def test_cuboid_metrics_multi_item():
# two items, mixed matches across items
annotations = AnnotationList(
cuboid_annotations=[
# first item
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(1000, 1000, 1000),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(-100, -100, -100),
dimensions=Point3D(10, 10, 10),
yaw=0.0,
reference_id="item_A",
), # false negative
# second item
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(0, 0, 0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_B",
),
CuboidAnnotation(
label=CAR_LABEL,
position=Point3D(30, 50, 120),
dimensions=Point3D(1, 2.5, 3),
yaw=0.0,
reference_id="item_B",
),
]
)
predictions = PredictionList(
cuboid_predictions=[
# first item
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(1.0, 1.0, 1.0),
dimensions=Point3D(10, 10, 5),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=PEDESTRIAN_LABEL,
position=Point3D(999, 999, 999),
dimensions=Point3D(8, 8, 6),
yaw=0.0,
reference_id="item_A",
),
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(250, 250, 250),
dimensions=Point3D(2, 2, 2),
yaw=0.0,
reference_id="item_A",
), # false positive
# second item
CuboidPrediction(
label=CAR_LABEL,
position=Point3D(250, 250, 250),
dimensions=Point3D(2, 2, 2),
yaw=0.0,
reference_id="item_B",
), # false positive
]
)

cuboid_iou_result1 = CuboidIOU()(annotations, predictions)
cuboid_precision_result1 = CuboidPrecision()(annotations, predictions)
cuboid_recall_result1 = CuboidRecall()(annotations, predictions)
assert cuboid_iou_result1 == ScalarResult(
0.47928, 1
), f"Unexpected Cuboid IoU result: {cuboid_iou_result1}"
assert cuboid_precision_result1 == ScalarResult(
1.0 / len(predictions), len(predictions)
), f"Unexpected Cuboid Precision result {cuboid_precision_result1}"
assert cuboid_recall_result1 == ScalarResult(
1.0 / len(annotations), len(annotations)
), f"Unexpected Cuboid Recall result {cuboid_recall_result1}"

cuboid_iou_result2 = CuboidIOU(enforce_label_match=False)(
annotations, predictions
)
cuboid_precision_result2 = CuboidPrecision(enforce_label_match=False)(
annotations, predictions
)
cuboid_recall_result2 = CuboidRecall(enforce_label_match=False)(
annotations, predictions
)
assert cuboid_iou_result2 == ScalarResult(
0.4316, 2
), f"Unexpected Cuboid IoU result: {cuboid_iou_result2}"
assert cuboid_precision_result2 == ScalarResult(
2.0 / len(predictions), len(predictions)
), f"Unexpected Cuboid Precision result {cuboid_precision_result2}"
assert cuboid_recall_result2 == ScalarResult(
2.0 / len(annotations), len(annotations)
), f"Unexpected Cuboid Recall result {cuboid_recall_result2}"