
Commit 65d2e81

refactor: emit LLM token / request metrics via RunHooks
Splits metric emission by what each layer can see:

- agentex.lib.core.observability.llm_metrics_hooks.LLMMetricsHooks (a
  RunHooks subclass) emits agentex.llm.requests plus the four token counters
  in on_llm_end. Works for any RunHooks-aware path.
- TemporalStreamingHooks now inherits from LLMMetricsHooks, so the async path
  picks up the same metrics automatically.
- TemporalStreamingModel keeps only the streaming-only metrics (ttft, ttat,
  tps) — those need the per-chunk visibility that hooks can't provide. Its
  failure path uses the new record_llm_failure helper.

This makes adding the sync ACP path trivial later: pass LLMMetricsHooks() to
Runner.run from services/adk/providers/openai.py and it'll emit the same
metrics with no double-counting.

Tests cover:

- classify_status branches (rate_limit / timeout / server_error /
  network_error / client_error / other_error / success)
- get_llm_metrics singleton behavior + instrument presence
- LLMMetricsHooks.on_llm_end emits requests + token counters with the right
  model attribute
- Both the hooks path and record_llm_failure swallow exporter exceptions, so
  callers don't break when metrics fail
1 parent 1935aa9 commit 65d2e81
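
For context, the "trivial later" sync wiring described above would look roughly like the sketch below. None of it ships in this commit; `agent` and `user_input` are placeholders, and the only assumed API is the `hooks=` parameter of `Runner.run` in the openai-agents SDK.

    # Hypothetical future wiring for services/adk/providers/openai.py; not
    # part of this commit. `agent` and `user_input` are placeholders.
    from agents import Agent, Runner

    from agentex.lib.core.observability.llm_metrics_hooks import LLMMetricsHooks


    async def run_with_metrics(agent: Agent, user_input: str):
        # Passing the hooks here emits agentex.llm.requests plus the token
        # counters without double-counting against the streaming path.
        return await Runner.run(agent, user_input, hooks=LLMMetricsHooks())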

6 files changed: 279 additions & 31 deletions

File tree

src/agentex/lib/core/observability/llm_metrics_hooks.py (new)
src/agentex/lib/core/observability/tests/__init__.py (new)
src/agentex/lib/core/observability/tests/test_llm_metrics.py (new)
src/agentex/lib/core/observability/tests/test_llm_metrics_hooks.py (new)
src/agentex/lib/core/temporal/plugins/openai_agents/hooks/hooks.py
src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

src/agentex/lib/core/observability/llm_metrics_hooks.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
"""``RunHooks`` adapter that emits per-call LLM metrics.

Used by the sync ACP path and as a base class for ``TemporalStreamingHooks``
on the async path, so token / request / cache metrics emit consistently
across both. Streaming-only metrics (ttft, ttat, tps) are emitted from the
streaming model itself, not here — hooks don't see individual chunks.
"""

from __future__ import annotations

from typing import Any

from agents import Agent, RunHooks, ModelResponse, RunContextWrapper

from agentex.lib.core.observability.llm_metrics import classify_status, get_llm_metrics


class LLMMetricsHooks(RunHooks):
    """Emits ``agentex.llm.requests`` + token counters on every LLM call."""

    async def on_llm_end(
        self,
        context: RunContextWrapper[Any],
        agent: Agent[Any],
        response: ModelResponse,
    ) -> None:
        del context  # part of the RunHooks contract; unused here
        m = get_llm_metrics()
        attrs = {"model": str(agent.model) if agent.model else "unknown"}
        try:
            usage = response.usage
            m.requests.add(1, {**attrs, "status": "success"})
            m.input_tokens.add(usage.input_tokens or 0, attrs)
            m.output_tokens.add(usage.output_tokens or 0, attrs)
            m.cached_input_tokens.add(usage.input_tokens_details.cached_tokens or 0, attrs)
            m.reasoning_tokens.add(usage.output_tokens_details.reasoning_tokens or 0, attrs)
        except Exception:
            pass


def record_llm_failure(model: str, exc: BaseException) -> None:
    """Best-effort counter bump for an LLM call that raised before ``on_llm_end``."""
    try:
        get_llm_metrics().requests.add(1, {"model": model, "status": classify_status(exc)})
    except Exception:
        pass
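
llm_metrics.py itself is untouched by this commit, so LLMMetrics and get_llm_metrics appear here only as imports. Below is a hypothetical sketch of their shape, inferred from the call sites above and the tests that follow; the meter name and every instrument name except agentex.llm.requests are assumptions, and the OpenTelemetry metrics API is assumed because the removed comments in temporal_streaming_model.py mention a MeterProvider.

    # Hypothetical reconstruction of llm_metrics.py for illustration only;
    # the real module is not part of this diff and may differ.
    from opentelemetry import metrics


    class LLMMetrics:
        """Bundles the instruments the hooks and streaming model emit to."""

        def __init__(self) -> None:
            meter = metrics.get_meter("agentex.llm")  # meter name is an assumption
            self.requests = meter.create_counter("agentex.llm.requests")
            self.input_tokens = meter.create_counter("agentex.llm.input_tokens")
            self.output_tokens = meter.create_counter("agentex.llm.output_tokens")
            self.cached_input_tokens = meter.create_counter("agentex.llm.cached_input_tokens")
            self.reasoning_tokens = meter.create_counter("agentex.llm.reasoning_tokens")
            self.ttft_ms = meter.create_histogram("agentex.llm.ttft_ms", unit="ms")
            self.ttat_ms = meter.create_histogram("agentex.llm.ttat_ms", unit="ms")
            self.tps = meter.create_histogram("agentex.llm.tps")


    _llm_metrics: LLMMetrics | None = None


    def get_llm_metrics() -> LLMMetrics:
        """Lazily build the singleton the tests reset via ``_llm_metrics``."""
        global _llm_metrics
        if _llm_metrics is None:
            _llm_metrics = LLMMetrics()
        return _llm_metrics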

src/agentex/lib/core/observability/tests/__init__.py

Whitespace-only changes.
src/agentex/lib/core/observability/tests/test_llm_metrics.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
"""Tests for ``agentex.lib.core.observability.llm_metrics``."""

from __future__ import annotations

import agentex.lib.core.observability.llm_metrics as llm_metrics
from agentex.lib.core.observability.llm_metrics import (
    LLMMetrics,
    classify_status,
    get_llm_metrics,
)


class TestClassifyStatus:
    def test_none_is_success(self):
        assert classify_status(None) == "success"

    def test_rate_limit(self):
        class RateLimitError(Exception):
            pass

        assert classify_status(RateLimitError()) == "rate_limit"

    def test_timeout(self):
        class APITimeoutError(Exception):
            pass

        assert classify_status(APITimeoutError()) == "timeout"

    def test_server_error(self):
        class InternalServerError(Exception):
            pass

        assert classify_status(InternalServerError()) == "server_error"

        class ServiceUnavailable(Exception):
            pass

        assert classify_status(ServiceUnavailable()) == "server_error"

    def test_network_error(self):
        class APIConnectionError(Exception):
            pass

        assert classify_status(APIConnectionError()) == "network_error"

    def test_client_error(self):
        for cls_name in ("BadRequestError", "AuthenticationError", "PermissionError"):
            cls = type(cls_name, (Exception,), {})
            assert classify_status(cls()) == "client_error"

    def test_unknown_falls_back(self):
        class WeirdProviderException(Exception):
            pass

        assert classify_status(WeirdProviderException()) == "other_error"


class TestGetLLMMetrics:
    def test_returns_llm_metrics_instance(self, monkeypatch):
        monkeypatch.setattr(llm_metrics, "_llm_metrics", None)
        m = get_llm_metrics()
        assert isinstance(m, LLMMetrics)

    def test_singleton_returns_same_instance(self, monkeypatch):
        monkeypatch.setattr(llm_metrics, "_llm_metrics", None)
        first = get_llm_metrics()
        second = get_llm_metrics()
        assert first is second

    def test_instruments_exist(self, monkeypatch):
        monkeypatch.setattr(llm_metrics, "_llm_metrics", None)
        m = get_llm_metrics()
        for name in (
            "requests",
            "ttft_ms",
            "ttat_ms",
            "tps",
            "input_tokens",
            "output_tokens",
            "cached_input_tokens",
            "reasoning_tokens",
        ):
            assert hasattr(m, name), f"missing instrument: {name}"
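
classify_status also lives in the untouched llm_metrics.py, so its body isn't shown in this diff. The tests above do pin down its branch behavior, though. Here is a minimal sketch consistent with them, assuming classification by exception class name; the exact substrings are guesses, not the real implementation.

    # Sketch only: satisfies the test expectations above, not necessarily
    # the real implementation in llm_metrics.py.
    def classify_status(exc: BaseException | None) -> str:
        if exc is None:
            return "success"
        name = type(exc).__name__.lower()
        if "ratelimit" in name:
            return "rate_limit"
        if "timeout" in name:
            return "timeout"
        if "internalserver" in name or "serviceunavailable" in name:
            return "server_error"
        if "connection" in name:
            return "network_error"
        if any(s in name for s in ("badrequest", "authentication", "permission")):
            return "client_error"
        return "other_error"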
src/agentex/lib/core/observability/tests/test_llm_metrics_hooks.py

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
"""Tests for ``agentex.lib.core.observability.llm_metrics_hooks``."""

from __future__ import annotations

from unittest.mock import MagicMock

import pytest

import agentex.lib.core.observability.llm_metrics_hooks as hooks_module
from agentex.lib.core.observability.llm_metrics_hooks import (
    LLMMetricsHooks,
    record_llm_failure,
)


def _mock_response(
    *,
    input_tokens: int = 100,
    output_tokens: int = 50,
    cached_tokens: int = 30,
    reasoning_tokens: int = 10,
) -> MagicMock:
    response = MagicMock()
    response.usage.input_tokens = input_tokens
    response.usage.output_tokens = output_tokens
    response.usage.input_tokens_details.cached_tokens = cached_tokens
    response.usage.output_tokens_details.reasoning_tokens = reasoning_tokens
    return response


def _mock_agent(model: str = "gpt-5") -> MagicMock:
    agent = MagicMock()
    agent.model = model
    return agent


class TestLLMMetricsHooksOnLLMEnd:
    @pytest.mark.asyncio
    async def test_emits_success_request_counter(self, monkeypatch):
        m = MagicMock()
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        await LLMMetricsHooks().on_llm_end(
            context=MagicMock(),
            agent=_mock_agent("gpt-5"),
            response=_mock_response(),
        )

        m.requests.add.assert_called_once_with(1, {"model": "gpt-5", "status": "success"})

    @pytest.mark.asyncio
    async def test_emits_token_counters(self, monkeypatch):
        m = MagicMock()
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        await LLMMetricsHooks().on_llm_end(
            context=MagicMock(),
            agent=_mock_agent("gpt-5"),
            response=_mock_response(
                input_tokens=200,
                output_tokens=75,
                cached_tokens=50,
                reasoning_tokens=20,
            ),
        )

        attrs = {"model": "gpt-5"}
        m.input_tokens.add.assert_called_once_with(200, attrs)
        m.output_tokens.add.assert_called_once_with(75, attrs)
        m.cached_input_tokens.add.assert_called_once_with(50, attrs)
        m.reasoning_tokens.add.assert_called_once_with(20, attrs)

    @pytest.mark.asyncio
    async def test_zero_tokens_emit_zero_not_skip(self, monkeypatch):
        m = MagicMock()
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        await LLMMetricsHooks().on_llm_end(
            context=MagicMock(),
            agent=_mock_agent(),
            response=_mock_response(input_tokens=0, output_tokens=0, cached_tokens=0, reasoning_tokens=0),
        )

        m.input_tokens.add.assert_called_once_with(0, {"model": "gpt-5"})
        m.output_tokens.add.assert_called_once_with(0, {"model": "gpt-5"})

    @pytest.mark.asyncio
    async def test_unknown_model_falls_back(self, monkeypatch):
        m = MagicMock()
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        agent = MagicMock()
        agent.model = None

        await LLMMetricsHooks().on_llm_end(
            context=MagicMock(),
            agent=agent,
            response=_mock_response(),
        )

        m.requests.add.assert_called_once_with(1, {"model": "unknown", "status": "success"})

    @pytest.mark.asyncio
    async def test_swallows_exporter_failure(self, monkeypatch):
        m = MagicMock()
        m.requests.add.side_effect = RuntimeError("exporter exploded")
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        # Should not raise — caller's flow must not break on metric failure.
        await LLMMetricsHooks().on_llm_end(
            context=MagicMock(),
            agent=_mock_agent(),
            response=_mock_response(),
        )


class TestRecordLLMFailure:
    def test_emits_classified_status(self, monkeypatch):
        m = MagicMock()
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        class RateLimitError(Exception):
            pass

        record_llm_failure("gpt-5", RateLimitError())

        m.requests.add.assert_called_once_with(1, {"model": "gpt-5", "status": "rate_limit"})

    def test_swallows_exporter_failure(self, monkeypatch):
        m = MagicMock()
        m.requests.add.side_effect = RuntimeError("exporter exploded")
        monkeypatch.setattr(hooks_module, "get_llm_metrics", lambda: m)

        # Should not raise.
        record_llm_failure("gpt-5", Exception("upstream"))

src/agentex/lib/core/temporal/plugins/openai_agents/hooks/hooks.py

Lines changed: 3 additions & 2 deletions
@@ -8,18 +8,19 @@
 from typing import Any, override
 from datetime import timedelta

-from agents import Tool, Agent, RunHooks, RunContextWrapper
+from agents import Tool, Agent, RunContextWrapper
 from temporalio import workflow
 from agents.tool_context import ToolContext

 from agentex.types.text_content import TextContent
 from agentex.types.task_message_content import ToolRequestContent, ToolResponseContent
+from agentex.lib.core.observability.llm_metrics_hooks import LLMMetricsHooks
 from agentex.lib.core.temporal.plugins.openai_agents.hooks.activities import stream_lifecycle_content

 logger = logging.getLogger(__name__)


-class TemporalStreamingHooks(RunHooks):
+class TemporalStreamingHooks(LLMMetricsHooks):
     """Convenience hooks class for streaming OpenAI Agent lifecycle events to the AgentEx UI.

     This class automatically streams agent lifecycle events (tool calls, handoffs) to the

src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

Lines changed: 12 additions & 29 deletions
@@ -32,7 +32,8 @@
 # Re-export the canonical StreamingMode literal from the streaming service so
 # all layers share a single definition.
 from agentex.lib.core.services.adk.streaming import StreamingMode as StreamingMode
-from agentex.lib.core.observability.llm_metrics import classify_status, get_llm_metrics
+from agentex.lib.core.observability.llm_metrics import get_llm_metrics
+from agentex.lib.core.observability.llm_metrics_hooks import record_llm_failure

 try:
     from agents.tool import ShellTool  # type: ignore[attr-defined]
@@ -1026,34 +1027,24 @@ async def get_response(

             span.output = output_data

-            # Emit LLM metrics derived from the captured stream. The meter is a
-            # no-op if the application hasn't configured a MeterProvider, so this
-            # is safe to do unconditionally. We only emit ttft / tps when their
-            # input data is actually meaningful (got a content delta, got tokens).
+            # Streaming-only metrics. Token counters and the success request
+            # counter are emitted by LLMMetricsHooks.on_llm_end so they fire
+            # consistently across streaming and non-streaming paths.
             m = get_llm_metrics()
             metric_attrs = {"model": self.model_name}
-            m.requests.add(1, {**metric_attrs, "status": "success"})
-            m.input_tokens.add(usage.input_tokens or 0, metric_attrs)
-            m.output_tokens.add(usage.output_tokens or 0, metric_attrs)
-            m.cached_input_tokens.add(usage.input_tokens_details.cached_tokens or 0, metric_attrs)
-            m.reasoning_tokens.add(usage.output_tokens_details.reasoning_tokens or 0, metric_attrs)
             if first_token_at is not None:
                 m.ttft_ms.record((first_token_at - stream_start_perf) * 1000, metric_attrs)
             if first_answer_at is not None:
                 m.ttat_ms.record((first_answer_at - stream_start_perf) * 1000, metric_attrs)
-            # tps denominator is the generation window (first→last delta), not
-            # total stream wall time — see LLMMetrics for rationale. Single-token
-            # responses (where first_token_at == last_token_at, e.g. a one-token
-            # tool-result acknowledgement) collapse the window to 0 and are
-            # intentionally skipped — TPS is undefined in that case.
+            # Single-token responses collapse the generation window to 0; tps
+            # is undefined and skipped.
             if (
                 first_token_at is not None
                 and last_token_at is not None
                 and last_token_at > first_token_at
                 and (usage.output_tokens or 0) > 0
             ):
-                generation_window_s = last_token_at - first_token_at
-                m.tps.record(usage.output_tokens / generation_window_s, metric_attrs)
+                m.tps.record(usage.output_tokens / (last_token_at - first_token_at), metric_attrs)

             # Return the response. response_id is the server-issued id from
             # ResponseCompletedEvent.response.id, or None when the stream ended
@@ -1070,18 +1061,10 @@ async def get_response(

         except Exception as e:
             logger.error(f"Error using Responses API: {e}")
-            # Emit a request-counter event so 429s, 5xxs, timeouts, etc. are
-            # observable on the SDK side. Status histograms / token counters
-            # only fire on successful completion above. Wrapped in a bare
-            # try/except so a misbehaving exporter can't shadow the original
-            # LLM exception — callers (retry logic, circuit breakers) need
-            # to see the typed RateLimitError / APITimeoutError / etc.
-            try:
-                get_llm_metrics().requests.add(
-                    1, {"model": self.model_name, "status": classify_status(e)}
-                )
-            except Exception:
-                pass
+            # LLMMetricsHooks.on_llm_end doesn't fire on error, so emit the
+            # failure counter here. Best-effort so the typed LLM exception
+            # always propagates intact for retry / circuit-breaker logic.
+            record_llm_failure(self.model_name, e)
             raise

         # The _get_response_with_responses_api method has been merged into get_response above
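
One deployment note carried over from the removed comment: every instrument here is a no-op until the application installs a MeterProvider. A minimal setup sketch, assuming the standard opentelemetry-sdk and OTLP gRPC exporter packages; no such configuration ships in this commit.

    # Hypothetical application-side bootstrap so the agentex.llm.* metrics
    # actually export; requires opentelemetry-sdk and
    # opentelemetry-exporter-otlp-proto-grpc to be installed.
    from opentelemetry import metrics
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter

    reader = PeriodicExportingMetricReader(OTLPMetricExporter())
    metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))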
