Commit ace595b

Fix: avoid unnecessary allocation and a redundant second sort of steps
1 parent bedb249 commit ace595b

File tree

4 files changed (+168, -13 lines)

jitter-test.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
import timeit
from statistics import mean
from typing import Callable

import numpy as np

try:
    from numba import njit
except ImportError as exc:
    raise SystemExit(
        "Numba is required for this benchmark. Install it with `pip install numba` before running the script."
    ) from exc


NUM_LISTS = 4
LIST_LENGTH = 100000
REPEAT = 5
NUMBER = 1000

SENTINEL = np.int64(np.iinfo(np.int64).max)


def make_sorted_arrays(num_lists: int, list_length: int) -> list[np.ndarray]:
    rng = np.random.default_rng(seed=0)
    return [
        np.sort(rng.integers(0, 10_000, size=list_length, dtype=np.int64))
        for _ in range(num_lists)
    ]


@njit(cache=True)
def _merge_numba_impl(data: np.ndarray) -> np.ndarray:
    # k-way merge of pre-sorted rows with duplicate elimination.
    num_lists, list_len = data.shape
    total = num_lists * list_len
    indices = np.zeros(num_lists, dtype=np.int64)
    merged = np.empty(total, dtype=np.int64)

    out_idx = 0
    last_val = SENTINEL

    while True:
        # Find the smallest value currently at the head of any row.
        best_val = SENTINEL
        best_list = -1
        for list_idx in range(num_lists):
            pos = indices[list_idx]
            if pos < list_len:
                value = data[list_idx, pos]
                if value < best_val:
                    best_val = value
                    best_list = list_idx

        if best_list == -1:
            break  # every row is exhausted

        # Emit each distinct value exactly once.
        if best_val != last_val:
            merged[out_idx] = best_val
            out_idx += 1
            last_val = best_val

        # Advance every row past all copies of the emitted value.
        for list_idx in range(num_lists):
            pos = indices[list_idx]
            if pos < list_len:
                value = data[list_idx, pos]
                if value == best_val:
                    pos += 1
                    while pos < list_len and data[list_idx, pos] == best_val:
                        pos += 1
                    indices[list_idx] = pos

    return merged[:out_idx]


def merge_numba(data: list[np.ndarray], precomputed: np.ndarray | None = None) -> np.ndarray:
    stacked = precomputed if precomputed is not None else np.vstack(data)
    return _merge_numba_impl(stacked)


def merge_numpy(data: list[np.ndarray]) -> np.ndarray:
    return np.unique(np.concatenate(data))


def time_function(action: Callable[[], np.ndarray]) -> float:
    timer = timeit.Timer(action)
    runs = timer.repeat(repeat=REPEAT, number=NUMBER)
    return mean(runs) / NUMBER


if __name__ == "__main__":
    dataset = make_sorted_arrays(NUM_LISTS, LIST_LENGTH)

    stacked_dataset = np.vstack(dataset)
    numpy_result = merge_numpy(dataset)
    numba_result = merge_numba(dataset, stacked_dataset)
    assert np.array_equal(numpy_result, numba_result)

    # Ensure Numba compilation happens before timing.
    merge_numba(dataset, stacked_dataset)

    benchmarks = {
        "NumPy sort": lambda: merge_numpy(dataset),
        "Numba merge": lambda: merge_numba(dataset, stacked_dataset),
    }

    for label, action in benchmarks.items():
        per_call = time_function(action)
        print(f"{label}: {per_call * 1_000_000:.2f} microseconds per merge")
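
For a reference point outside the two timed variants, the standard library's heapq.merge performs the same k-way merge lazily in pure Python. The sketch below is not part of the benchmark above and is typically far slower than either timed function, but it makes the merge-with-dedup semantics explicit:

import heapq

import numpy as np


def merge_heapq(data: list[np.ndarray]) -> np.ndarray:
    # heapq.merge streams the pre-sorted inputs in ascending order;
    # duplicates are dropped by comparing against the last emitted value.
    out: list[int] = []
    last = None
    for value in heapq.merge(*data):
        if last is None or value != last:
            out.append(int(value))
            last = value
    return np.asarray(out, dtype=np.int64)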

src/neptune_query/internal/output_format.py

Lines changed: 7 additions & 12 deletions
@@ -13,8 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import sys
-import time
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import (
@@ -488,21 +486,18 @@ def from_observed_steps(
         total_rows_count = sum(len(steps) for steps in observed_steps.values())
         display_names: list[str] = [""] * total_rows_count
         step_values: np.ndarray = np.empty(shape=(total_rows_count,), dtype=np.float64)
-
         row_num: int = 0
-        for display_name in sorted(observed_steps.keys()):
+        sorted_observed_steps = sorted(observed_steps.items(), key=lambda x: x[0])
+        for display_name, steps in sorted_observed_steps:
             sys_id = display_name_to_sys_id[display_name]
-            sorted_steps = np.sort(observed_steps[display_name], kind="stable")
-            for i, step in enumerate(sorted_steps, start=row_num):
-                display_names[i] = display_name
-                step_values[i] = step
-
+            step_values[row_num:row_num + steps.size] = steps
+            display_names[row_num:row_num + steps.size] = [display_name] * steps.size
             if sys_id_ranges is not None:
-                sys_id_ranges[sys_id] = (row_num, row_num + sorted_steps.size)
+                sys_id_ranges[sys_id] = (row_num, row_num + steps.size)
             if row_dict_lookup is not None:
-                row_dict_lookup[sys_id] = {float(step): idx for idx, step in enumerate(sorted_steps, start=row_num)}
+                row_dict_lookup[sys_id] = {float(step): idx for idx, step in enumerate(steps, start=row_num)}
+            row_num += steps.size
 
-            row_num += sorted_steps.size
 
         return cls(
             display_names=display_names,
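
The change above assumes the step arrays arrive already sorted (the "second sort" the commit message refers to), so the per-name np.sort and the element-by-element copy were redundant; NumPy slice assignment fills each block in one operation. A minimal, self-contained sketch of the before/after behavior follows (standalone variable names are hypothetical, not the library's API):

import numpy as np

# Assumed input shape: each display name maps to an already sorted array.
observed_steps = {
    "run-b": np.array([0.0, 1.0, 2.0]),
    "run-a": np.array([0.5, 1.5]),
}
total = sum(a.size for a in observed_steps.values())

# Before: re-sorts each array and copies values one element at a time.
names_old = [""] * total
values_old = np.empty(total, dtype=np.float64)
row = 0
for name in sorted(observed_steps):
    for i, step in enumerate(np.sort(observed_steps[name]), start=row):
        names_old[i] = name
        values_old[i] = step
    row += observed_steps[name].size

# After: skips the redundant sort and lets NumPy copy whole blocks.
names_new = [""] * total
values_new = np.empty(total, dtype=np.float64)
row = 0
for name, steps in sorted(observed_steps.items(), key=lambda item: item[0]):
    values_new[row:row + steps.size] = steps
    names_new[row:row + steps.size] = [name] * steps.size
    row += steps.size

assert names_old == names_new and np.array_equal(values_old, values_new)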

src/neptune_query/internal/retrieval/metrics.py

Lines changed: 2 additions & 1 deletion
@@ -192,10 +192,11 @@ def _process_metrics_page(
 ) -> util.Page[tuple[identifiers.RunAttributeDefinition, MetricDatapoints]]:
     result = {}
     for series in data.series:
+        pass
         metric_values = MetricDatapoints.allocate(
             size=len(series.series.values), include_timestamp=include_timestamp, include_preview=include_preview
         )
-
+
         for i, point in enumerate(series.series.values):
             idx = metric_values.length - 1 - i if reverse_order else i

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import numpy as np
import pytest

from neptune_query.internal.identifiers import (
    AttributeDefinition,
    ProjectIdentifier,
    RunAttributeDefinition,
    RunIdentifier,
    SysId,
)
from neptune_query.internal.output_format import create_metrics_dataframe
from neptune_query.internal.retrieval.metrics import MetricDatapoints


def test_create_metrics_dataframe_large_debug_workload():
    num_runs = 1000000
    num_metrics = 1
    num_datapoints = 1

    project_identifier = ProjectIdentifier("debug/project")
    metrics_data: dict[RunAttributeDefinition, MetricDatapoints] = {}
    sys_id_label_mapping: dict[SysId, str] = {}
    base_steps = np.arange(num_datapoints, dtype=np.float64)

    for run_idx in range(num_runs):
        sys_id = SysId(f"sys{run_idx:04d}")
        sys_id_label_mapping[sys_id] = f"run-{run_idx:04d}"
        run_identifier = RunIdentifier(project_identifier, sys_id)

        for metric_idx in range(num_metrics):
            attribute_definition = AttributeDefinition(f"metric_{metric_idx:02d}", "float_series")
            run_attribute_definition = RunAttributeDefinition(run_identifier, attribute_definition)

            datapoints = MetricDatapoints.allocate(
                size=num_datapoints, include_timestamp=False, include_preview=False
            )
            step_offset = metric_idx + 1
            base_value = run_idx * num_metrics * num_datapoints + metric_idx * num_datapoints

            shifted_steps = base_steps + step_offset
            for idx, base_step in enumerate(base_steps):
                datapoints.append(step=float(shifted_steps[idx]), value=float(base_value + base_step))

            metrics_data[run_attribute_definition] = datapoints.compile()

    dataframe = create_metrics_dataframe(
        metrics_data=metrics_data,
        sys_id_label_mapping=sys_id_label_mapping,
        type_suffix_in_column_names=False,
        include_point_previews=False,
        index_column_name="run",
        timestamp_column_name=None,
    )
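
To exercise this stress test on its own, a pytest invocation along these lines should work (-k selects the test by function name; --durations=0 prints per-test runtimes; the test file's path is not shown in this diff, so run from the repository root):

    pytest -k test_create_metrics_dataframe_large_debug_workload --durations=0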
