Skip to content

Commit

Permalink
add propagate sub cluster label option
Browse files Browse the repository at this point in the history
  • Loading branch information
JelmerBot committed Dec 28, 2024
1 parent d40ab87 commit 388590b
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 1 deletion.
41 changes: 40 additions & 1 deletion fast_hdbscan/sub_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,36 @@ def update_labels(
return labels, probabilities, sub_labels, sub_probabilities, lens_values


def propagate_sub_cluster_labels(
    labels, cluster_labels, sub_labels, graph_list, points_list
):
    """Assign sub-cluster noise points the majority label of their neighbours.

    For every cluster that contains both noise (``-1``) and at least one
    sub-cluster, noise points repeatedly adopt the most frequent non-noise
    sub-label among their neighbours in the cluster's graph until no noise
    point is left or a full pass changes nothing. Ties go to the smallest
    label. Updates made earlier in a pass are visible to points visited
    later in the same pass.

    Parameters
    ----------
    labels : array
        Per-point output labels. Entries of resolved points in processed
        clusters are overwritten in place with ``sub_label + offset``.
    cluster_labels : array
        Per-point cluster label. A neighbour only votes when its value here
        equals the position of the cluster in ``points_list``.
    sub_labels : array
        Per-point sub-cluster label, ``-1`` marking noise. Modified in place.
    graph_list : sequence
        One graph per cluster exposing ``indptr`` and ``indices`` arrays in
        CSR layout; row/column numbers index into that cluster's ``points``.
    points_list : sequence of integer arrays
        For each cluster, the indices of its points in the per-point arrays.

    Returns
    -------
    labels, sub_labels
        The same (mutated) arrays that were passed in.

    Notes
    -----
    Noise points that cannot reach any labelled point through the graph
    keep ``-1`` in ``sub_labels`` and their entry in ``labels`` is left
    untouched.
    """
    running_id = 0
    for cluster, (points, core_graph) in enumerate(zip(points_list, graph_list)):
        unique_sub_labels = np.unique(sub_labels[points])
        if unique_sub_labels.size == 0:
            # Empty cluster: nothing to relabel and no label ids consumed.
            continue
        if unique_sub_labels[0] != -1 or len(unique_sub_labels) == 1:
            # Nothing to propagate (no noise, or nothing but noise). The
            # cluster keeps its existing labels, but the ids it occupies must
            # still be counted so later clusters do not reuse them.
            # NOTE(review): assumes the labels already stored for skipped
            # clusters are numbered contiguously in cluster order, one id per
            # sub-cluster (one id for a cluster without sub-clusters) --
            # confirm against the code that builds ``labels``.
            running_id += len(unique_sub_labels)
            continue

        while True:
            noise_idx = np.flatnonzero(sub_labels[points] == -1)
            if noise_idx.size == 0:
                break
            changed = False
            for idx in noise_idx:
                start = core_graph.indptr[idx]
                end = core_graph.indptr[idx + 1]
                neighbors = points[core_graph.indices[start:end]]
                # Only neighbours in this cluster that already carry a
                # sub-label get a vote.
                votes = sub_labels[neighbors][
                    (cluster_labels[neighbors] == cluster)
                    & (sub_labels[neighbors] != -1)
                ]
                if votes.size == 0:
                    continue
                values, counts = np.unique(votes, return_counts=True)
                sub_labels[points[idx]] = values[np.argmax(counts)]
                changed = True
            if not changed:
                # The remaining noise points are not connected to any
                # labelled point; another pass could never make progress.
                break

        # Unresolved noise must not be written: -1 + running_id would alias
        # the last label of the preceding cluster.
        resolved = points[sub_labels[points] != -1]
        labels[resolved] = sub_labels[resolved] + running_id
        running_id += len(unique_sub_labels) - 1
    return labels, sub_labels


def remap_results(
labels,
probabilities,
Expand Down Expand Up @@ -192,6 +222,7 @@ def find_sub_clusters(
cluster_selection_method=None,
cluster_selection_epsilon=0.0,
cluster_selection_persistence=0.0,
propagate_labels=False,
):
check_is_fitted(
clusterer,
Expand Down Expand Up @@ -323,6 +354,12 @@ def find_sub_clusters(
data.shape[0],
)

# Propagate labels if requested
if propagate_labels:
labels, sub_labels = propagate_sub_cluster_labels(
labels, cluster_labels, sub_labels, core_graphs, points
)

# Reset for infinite data points
if last_outlier > 0:
(
Expand Down Expand Up @@ -377,6 +414,7 @@ def __init__(
cluster_selection_method="eom",
cluster_selection_epsilon=0.0,
cluster_selection_persistence=0.0,
propagate_labels=False,
):
self.lens_values = lens_values
self.min_cluster_size = min_cluster_size
Expand All @@ -385,6 +423,7 @@ def __init__(
self.cluster_selection_method = cluster_selection_method
self.cluster_selection_epsilon = cluster_selection_epsilon
self.cluster_selection_persistence = cluster_selection_persistence
self.propagate_labels = propagate_labels

def fit(self, clusterer, labels=None, probabilities=None, lens_callback=None):
"""labels and probabilities override the clusterer's values."""
Expand Down Expand Up @@ -413,6 +452,7 @@ def fit(self, clusterer, labels=None, probabilities=None, lens_callback=None):
cluster_selection_method=self.cluster_selection_method,
cluster_selection_epsilon=self.cluster_selection_epsilon,
cluster_selection_persistence=self.cluster_selection_persistence,
propagate_labels=self.propagate_labels,
)
# also store the core distances and raw data for the member functions
self._raw_data = clusterer._raw_data
Expand Down Expand Up @@ -458,7 +498,6 @@ def _make_approximation_graph(self, lens_name=None, sub_cluster_name=None):
raw_data=self._raw_data,
)


@property
def condensed_trees_(self):
"""See :class:`~hdbscan.plots.CondensedTree` for documentation."""
Expand Down
6 changes: 6 additions & 0 deletions fast_hdbscan/tests/test_sub_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ def test_selection_method():
check_detected_groups(b, n_clusters=2, n_subs=7)


def test_label_propagation():
    """Propagation must leave no point with a noise sub-cluster label."""
    detector = SubClusterDetector(lens_values=centrality, propagate_labels=True)
    fitted = detector.fit(c)
    propagated = fitted.sub_cluster_labels_
    assert np.all(propagated >= 0)
    check_detected_groups(fitted, n_clusters=2, n_subs=5)


def test_min_cluster_size():
b = SubClusterDetector(lens_values=centrality, min_cluster_size=7).fit(c)
labels, counts = np.unique(b.labels_[b.sub_cluster_labels_ >= 0], return_counts=True)
Expand Down

0 comments on commit 388590b

Please sign in to comment.