add label filter

antoinedaurat · antoinedaurat · commit 64a16bffd8e3 · 2023-12-16T09:45:45.000+01:00
diff --git a/mimikit/extract/__init__.py b/mimikit/extract/__init__.py
@@ -2,6 +2,7 @@
 from .samplify import *
 from .segment import *
 from .from_neighbors import *
+from .label_filter import *
 
 
 __all__ = [_ for _ in dir() if not _.startswith("_")]
diff --git a/mimikit/extract/from_neighbors.py b/mimikit/extract/from_neighbors.py
@@ -10,11 +10,11 @@
 ]
 
 
-def nearest_neighbor(X, Y):
+def nearest_neighbor(X, Y, metric=AngularDistance()):
     """
     computes nearest neighbor by angular distance
     """
-    D_xy = AngularDistance()(X, Y)
+    D_xy = metric(X, Y)
     dists, nn = torch.min(D_xy, dim=-1)
     return dists, nn
 
diff --git a/mimikit/extract/label_filter.py b/mimikit/extract/label_filter.py
@@ -0,0 +1,109 @@
+import numpy as np
+import dataclasses as dtc
+from ..features.functionals import Functional
+
+__all__ = [
+    "label_filter",
+    "LabelFilter"
+]
+
+
+def _get_counts(labels, R):
+    K = R + 1
+    if K % 2 == 0:
+        K += 1
+    l2d = np.lib.stride_tricks.sliding_window_view(np.pad(labels, (K//2, K//2), constant_values=-1), (K,))
+    where_l, where_r = np.ones(l2d.shape[0], dtype=bool), np.ones(l2d.shape[0], dtype=bool)
+    counts = np.ones(l2d.shape[0], dtype=int)
+    center = K//2
+    for r in range(1, K//2 + 1):
+        where_l[where_l] = (l2d[where_l, center] == l2d[where_l, center - r])
+        where_r[where_r] = (l2d[where_r, center] == l2d[where_r, center + r])
+        counts[where_l] += 1
+        counts[where_r] += 1
+    flagged = counts < R
+    c2d = np.lib.stride_tricks.sliding_window_view(np.pad(counts, (K//2, K//2), constant_values=0), (K,))
+    return l2d, c2d, flagged
+
+
+def _filter_window(w, elem_counts, glob_counts, label_undecidable):
+    e_i = w.shape[0]//2
+    elem = w[e_i]
+    elem_count = elem_counts[e_i]
+    glob_elem_count = glob_counts[elem]
+    c_max_i = elem_counts.argmax()
+    c_max = elem_counts[c_max_i]
+    if c_max == 1:
+        # w = np.sort(w)
+        w_hat = w[elem_counts == 1]
+        gc = glob_counts[w_hat]
+        gc_max_i = gc.argmax()
+        gc_max = gc[gc_max_i]
+        if gc_max == 1 or (gc == gc_max).all():
+            # every candidate label is a global singleton, or they all share the same global count
+            if label_undecidable:
+                v = -1
+            elif w[e_i-1] == w[e_i+1]:
+                # elem is surrounded by the same element
+                v = w[e_i-1]
+            else:
+                v = elem
+        elif (gc == gc_max).sum() > 1:
+            # tie between labels
+            if gc_max == glob_elem_count:
+                # elem's own global count ties for the maximum: keep it
+                v = elem
+            else:
+                # otherwise take the first label that reaches the maximum global count
+                v = w_hat[gc_max_i]
+        else:
+            v = w_hat[gc_max_i]
+    else:
+        if elem_count == c_max:
+            v = elem
+        else:
+            v = w[c_max_i]
+    return v
+
+
+def label_filter(
+        labels: np.ndarray,
+        min_repetition: int,
+        label_undecidable: bool = True,
+        relabel_output: bool = True
+) -> np.ndarray:
+    if min_repetition == 1:
+        return labels
+    glob_counts = np.r_[np.bincount(labels), 0]  # for -1 labels
+    l2d, c2d, flagged = _get_counts(labels, min_repetition)
+    while np.any(flagged):
+        out = np.zeros_like(labels)
+        for i in flagged.nonzero()[0]:
+            out[i] = _filter_window(l2d[i], c2d[i], glob_counts, label_undecidable)
+        if np.all(labels[flagged] == out[flagged]):
+            break
+        out[~flagged] = labels[~flagged]
+        labels = out
+        l2d, c2d, flagged = _get_counts(labels, min_repetition)
+    if relabel_output:
+        _, labels = np.unique(labels, return_inverse=True)
+    return labels
+
+
+@dtc.dataclass
+class LabelFilter(Functional):
+    min_repetition: int = 1
+    label_undecidable: bool = False
+
+    def np_func(self, inputs):
+        return label_filter(inputs,
+                            self.min_repetition,
+                            self.label_undecidable,
+                            relabel_output=False)
+
+    def torch_func(self, inputs):
+        pass
+
+    @property
+    def inv(self) -> "Functional":
+        return None
diff --git a/tests/test_label_filter.py b/tests/test_label_filter.py
@@ -0,0 +1,86 @@
+from mimikit.extract.label_filter import label_filter
+import numpy as np
+from assertpy import assert_that
+import pytest
+
+
+def test_should_extend_repetition_on_the_edges():
+    given_labels = np.r_[0, 0, 1, 2, 3, 4, 4]
+    given_min_rep = 2
+    expected_result = np.r_[0, 0, 0, 0, 4, 4, 4]
+
+    result = label_filter(given_labels, given_min_rep, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_extend_edges_and_replace_single_labels_with_undecidable_label():
+    given_labels = np.r_[0, 0, 1, 2, 3, 4, 5, 5]
+    given_min_rep = 2
+    # if min_rep were 3, the -1s in the middle would be absorbed by 0 and 5
+    expected_result = np.r_[0, 0, 0, -1, -1, 5, 5, 5]
+
+    result = label_filter(given_labels, given_min_rep, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_extend_edges_and_not_replace_single_labels_without_undecidable_label():
+    given_labels = np.r_[0, 0, 1, 2, 3, 4, 5, 5]
+    given_min_rep = 2
+    # with label_undecidable=False the middle singletons are absorbed by 0 and 5 instead of becoming -1
+    expected_result = np.r_[0, 0, 0, 0, 5, 5, 5, 5]
+
+    result = label_filter(given_labels, given_min_rep, label_undecidable=False, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_replace_undecidable_with_minus_one():
+    given_labels = np.r_[0, 1, 2, 1, 2, 0]
+    given_min_rep = 2
+    expected_result = np.r_[-1, -1, -1, -1, -1, -1]
+
+    result = label_filter(given_labels, given_min_rep, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_replace_with_surrounding_elem_without_undecidable_label():
+    given_labels = np.r_[0, 1, 2, 1, 2, 0]
+    given_min_rep = 2
+    expected_result = np.r_[1, 1, 1, 2, 2, 2]
+
+    result = label_filter(given_labels, given_min_rep, label_undecidable=False, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_return_input_if_undecidable():
+    given_labels = np.r_[0, 1, 2, 3, 1, 2, 3, 0]
+    given_min_rep = 2
+    expected_result = given_labels
+
+    result = label_filter(given_labels, given_min_rep, label_undecidable=False, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_fallback_to_global_counts():
+    given_labels = np.r_[0, 1, 2, 1, 2]
+    given_min_rep = 2
+    expected_result = np.r_[1, 1, -1, -1, -1]
+
+    result = label_filter(given_labels, given_min_rep, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()
+
+
+def test_should_handle_edges_correctly():
+    given_labels = np.r_[0, 0, 1, 1, 1]
+    given_min_rep = 3
+    expected_result = np.r_[1, 1, 1, 1, 1]
+
+    result = label_filter(given_labels, given_min_rep, relabel_output=False)
+
+    assert_that(np.all(result == expected_result)).is_true()