
Commit da85d7b

review (stas): extract llm metrics to core/observability + add request counter
Two follow-up changes from the PR review:

1. Move the LLM metric instruments from _StreamingMetrics in temporal_streaming_model.py to a new module: agentex.lib.core.observability.llm_metrics. Public API: get_llm_metrics() returns a singleton LLMMetrics with the same six instruments (ttft, tps, input_tokens, output_tokens, cached_input_tokens, reasoning_tokens) plus a new requests counter. This makes the temporal+openai_agents plugin one of several future call sites — the sync ACP path and the Claude SDK plugin can record to the same instruments without redefining names, units, or descriptions. Keeps cross-provider naming consistent.

2. Add an agentex.llm.requests counter with a status label so 429s, 5xxs, timeouts, and other failures are observable on the SDK side without scraping logs. classify_status() maps exception types to a small fixed set (success / rate_limit / server_error / client_error / timeout / network_error / other_error) by class name, so it works across OpenAI, Anthropic, and other provider SDKs that use similar exception naming. It is recorded in two places: the success path (alongside the token counters) and the existing get_response except handler (so terminal failures emit a counter event before re-raising).

Cardinality remains bounded — model + status (7 values) on the counter; all other metrics keep just `model`.
1 parent 6209b20 commit da85d7b
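To make the classify_status() mapping in point 2 concrete, here is a small illustrative sketch (not part of the commit); the exception classes below are local stand-ins whose names mirror common provider SDK exceptions, not imports from OpenAI or Anthropic:

from agentex.lib.core.observability.llm_metrics import classify_status

# Stand-in classes defined locally for illustration; real call sites pass
# whatever exception the provider SDK actually raised.
class RateLimitError(Exception): ...
class APITimeoutError(Exception): ...
class InternalServerError(Exception): ...
class APIConnectionError(Exception): ...

assert classify_status(None) == "success"
assert classify_status(RateLimitError()) == "rate_limit"
assert classify_status(APITimeoutError()) == "timeout"
assert classify_status(InternalServerError()) == "server_error"
assert classify_status(APIConnectionError()) == "network_error"
assert classify_status(ValueError("anything else")) == "other_error"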

3 files changed

Lines changed: 128 additions & 61 deletions

File tree

src/agentex/lib/core/observability/__init__.py
src/agentex/lib/core/observability/llm_metrics.py
src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

src/agentex/lib/core/observability/__init__.py

Whitespace-only changes.
src/agentex/lib/core/observability/llm_metrics.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+"""OTel metrics for LLM calls.
+
+Single source of truth for LLM-call instrumentation across all agentex code
+paths — temporal+openai_agents streaming today, sync ACP and the Claude SDK
+plugin in future PRs. Centralizing the instrument definitions here means
+those follow-ups don't need to redefine the metric names, units, or
+description strings; they import ``get_llm_metrics()`` and record values.
+
+The meter is no-op when the application hasn't configured a ``MeterProvider``,
+so importing this module is safe for runtimes that don't use OTel. Instruments
+are created lazily on first ``get_llm_metrics()`` call so a ``MeterProvider``
+configured *after* this module is imported still binds correctly.
+
+Cardinality is bounded:
+- All metrics carry only ``model`` (the LLM model name).
+- ``requests`` additionally carries ``status``, drawn from a small fixed set
+  (see ``classify_status``).
+
+Resource attributes (``service.name``, ``k8s.*``, etc.) come from the
+application's OTel resource configuration and are added to every series
+automatically.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from opentelemetry import metrics
+
+
+class LLMMetrics:
+    """Lazily-created OTel instruments for LLM call telemetry."""
+
+    def __init__(self) -> None:
+        meter = metrics.get_meter("agentex.llm")
+        self.requests = meter.create_counter(
+            name="agentex.llm.requests",
+            unit="1",
+            description=(
+                "LLM call count tagged with status (success / rate_limit / "
+                "server_error / client_error / timeout / network_error / "
+                "other_error). Use to alert on 429s, 5xxs, etc."
+            ),
+        )
+        self.ttft_ms = meter.create_histogram(
+            name="agentex.llm.ttft",
+            unit="ms",
+            description="Time from request submission to first content token (ms)",
+        )
+        # Note: TPS denominator is the model-generation window
+        # (last_token_time - first_token_time), not total stream wall time.
+        # This isolates raw model throughput from event-loop / tool-call latency.
+        self.tps = meter.create_histogram(
+            name="agentex.llm.tps",
+            unit="tokens/s",
+            description="Output tokens per second over the generation window",
+        )
+        self.input_tokens = meter.create_counter(
+            name="agentex.llm.input_tokens",
+            unit="tokens",
+            description="Total input tokens sent to the LLM",
+        )
+        self.output_tokens = meter.create_counter(
+            name="agentex.llm.output_tokens",
+            unit="tokens",
+            description="Total output tokens returned by the LLM",
+        )
+        self.cached_input_tokens = meter.create_counter(
+            name="agentex.llm.cached_input_tokens",
+            unit="tokens",
+            description="Subset of input tokens served from prompt cache",
+        )
+        self.reasoning_tokens = meter.create_counter(
+            name="agentex.llm.reasoning_tokens",
+            unit="tokens",
+            description="Output tokens spent on reasoning (subset of output_tokens)",
+        )
+
+
+_llm_metrics: Optional[LLMMetrics] = None
+
+
+def get_llm_metrics() -> LLMMetrics:
+    """Return the LLM metrics singleton, creating it on first use."""
+    global _llm_metrics
+    if _llm_metrics is None:
+        _llm_metrics = LLMMetrics()
+    return _llm_metrics
+
+
+def classify_status(exc: Optional[BaseException]) -> str:
+    """Categorize an LLM call's outcome into a small fixed set of status labels.
+
+    A successful call returns ``"success"``. Exceptions are mapped by type name
+    so we don't depend on a specific provider SDK's exception class hierarchy:
+    OpenAI, Anthropic, and other providers all use names like ``RateLimitError``,
+    ``APITimeoutError``, ``InternalServerError``, etc.
+    """
+    if exc is None:
+        return "success"
+    name = type(exc).__name__
+    if "RateLimit" in name:
+        return "rate_limit"
+    if "Timeout" in name:
+        return "timeout"
+    if any(s in name for s in ("ServerError", "InternalServer", "ServiceUnavailable", "BadGateway")):
+        return "server_error"
+    if "Connection" in name:
+        return "network_error"
+    if any(s in name for s in ("BadRequest", "Authentication", "Permission", "NotFound", "Conflict", "UnprocessableEntity")):
+        return "client_error"
+    return "other_error"

src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

Lines changed: 16 additions & 61 deletions
@@ -27,12 +27,12 @@
     CodeInterpreterTool,
     ImageGenerationTool,
 )
-from opentelemetry import metrics
 from agents.computer import Computer, AsyncComputer
 
 # Re-export the canonical StreamingMode literal from the streaming service so
 # all layers share a single definition.
 from agentex.lib.core.services.adk.streaming import StreamingMode as StreamingMode
+from agentex.lib.core.observability.llm_metrics import classify_status, get_llm_metrics
 
 try:
     from agents.tool import ShellTool  # type: ignore[attr-defined]
@@ -80,61 +80,9 @@
 logger = make_logger("agentex.temporal.streaming")
 
 
-# OTel metrics for LLM streaming behavior. Instruments are created lazily on
-# first use so the meter resolves to whatever MeterProvider the application
-# eventually configures, even if that happens after this module is imported.
-# All metrics carry only a ``model`` attribute to keep cardinality bounded;
-# resource attributes (service.name, k8s.*, etc.) come from the application's
-# OTel resource configuration.
-class _StreamingMetrics:
-    """Lazily-created OTel instruments for streaming LLM telemetry."""
-
-    def __init__(self) -> None:
-        meter = metrics.get_meter("agentex.openai_agents.streaming")
-        self.ttft_ms = meter.create_histogram(
-            name="agentex.llm.ttft",
-            unit="ms",
-            description="Time from request submission to first content token (ms)",
-        )
-        # Note: TPS denominator is the model-generation window
-        # (last_token_time - first_token_time), not total stream wall time.
-        # This isolates raw model throughput from event-loop / tool-call latency.
-        self.tps = meter.create_histogram(
-            name="agentex.llm.tps",
-            unit="tokens/s",
-            description="Output tokens per second over the generation window",
-        )
-        self.input_tokens = meter.create_counter(
-            name="agentex.llm.input_tokens",
-            unit="tokens",
-            description="Total input tokens sent to the LLM",
-        )
-        self.output_tokens = meter.create_counter(
-            name="agentex.llm.output_tokens",
-            unit="tokens",
-            description="Total output tokens returned by the LLM",
-        )
-        self.cached_input_tokens = meter.create_counter(
-            name="agentex.llm.cached_input_tokens",
-            unit="tokens",
-            description="Subset of input tokens served from prompt cache",
-        )
-        self.reasoning_tokens = meter.create_counter(
-            name="agentex.llm.reasoning_tokens",
-            unit="tokens",
-            description="Output tokens spent on reasoning (subset of output_tokens)",
-        )
-
-
-_streaming_metrics: Optional[_StreamingMetrics] = None
-
-
-def _get_streaming_metrics() -> _StreamingMetrics:
-    """Return the streaming metrics singleton, creating it on first use."""
-    global _streaming_metrics
-    if _streaming_metrics is None:
-        _streaming_metrics = _StreamingMetrics()
-    return _streaming_metrics
+# LLM metrics live in agentex.lib.core.observability.llm_metrics so other
+# code paths (sync ACP, Claude SDK plugin, future provider integrations)
+# can share the same instrument definitions without redefining names.
 
 
 def _serialize_item(item: Any) -> dict[str, Any]:
@@ -1070,19 +1018,20 @@ async def get_response(
             # no-op if the application hasn't configured a MeterProvider, so this
             # is safe to do unconditionally. We only emit ttft / tps when their
             # input data is actually meaningful (got a content delta, got tokens).
-            m = _get_streaming_metrics()
+            m = get_llm_metrics()
             metric_attrs = {"model": self.model_name}
+            m.requests.add(1, {**metric_attrs, "status": "success"})
             m.input_tokens.add(usage.input_tokens or 0, metric_attrs)
             m.output_tokens.add(usage.output_tokens or 0, metric_attrs)
             m.cached_input_tokens.add(usage.input_tokens_details.cached_tokens or 0, metric_attrs)
             m.reasoning_tokens.add(usage.output_tokens_details.reasoning_tokens or 0, metric_attrs)
             if first_token_at is not None:
                 m.ttft_ms.record((first_token_at - stream_start_perf) * 1000, metric_attrs)
             # tps denominator is the generation window (first→last delta), not
-            # total stream wall time — see _StreamingMetrics for rationale.
-            # Note: single-token responses (where first_token_at == last_token_at,
-            # e.g. a one-token tool-result acknowledgement) collapse the window
-            # to 0 and are intentionally skipped — TPS is undefined in that case.
+            # total stream wall time — see LLMMetrics for rationale. Single-token
+            # responses (where first_token_at == last_token_at, e.g. a one-token
+            # tool-result acknowledgement) collapse the window to 0 and are
+            # intentionally skipped — TPS is undefined in that case.
             if (
                 first_token_at is not None
                 and last_token_at is not None
@@ -1107,6 +1056,12 @@ async def get_response(
 
         except Exception as e:
             logger.error(f"Error using Responses API: {e}")
+            # Emit a request-counter event so 429s, 5xxs, timeouts, etc. are
+            # observable on the SDK side. Status histograms / token counters
+            # only fire on successful completion above.
+            get_llm_metrics().requests.add(
+                1, {"model": self.model_name, "status": classify_status(e)}
+            )
             raise
 
     # The _get_response_with_responses_api method has been merged into get_response above
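The two recording sites above (success path plus except handler) are the pattern the commit message says future call sites should mirror. A hypothetical sketch of that pattern for a non-streaming call site; the wrapper name, the injected call, and the result-dict shape are assumptions, not agentex APIs:

from typing import Awaitable, Callable

from agentex.lib.core.observability.llm_metrics import classify_status, get_llm_metrics


async def call_with_llm_metrics(model: str, call: Callable[[], Awaitable[dict]]) -> dict:
    """Run call() and record agentex.llm.* request/token metrics around it (sketch only)."""
    m = get_llm_metrics()
    attrs = {"model": model}
    try:
        result = await call()
    except Exception as e:
        # Mirror the get_response except handler: count the failure, then re-raise.
        m.requests.add(1, {**attrs, "status": classify_status(e)})
        raise
    # Mirror the success path: one request event plus token counters.
    m.requests.add(1, {**attrs, "status": "success"})
    m.input_tokens.add(result.get("input_tokens", 0) or 0, attrs)
    m.output_tokens.add(result.get("output_tokens", 0) or 0, attrs)
    return result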
