
Commit b08e48f

feat(streaming): add ttat (time-to-first-answering-token)
ttft fires on the first content delta of any kind, which for reasoning models means the first reasoning chunk; that chunk arrives quickly even when the user-perceived latency is much longer. ttat fires only on the first user-visible answer token (a text delta or tool-call arguments delta), excluding reasoning chunks. For non-reasoning models the two are equal; for gpt-5-class / o-series models they differ by the reasoning duration. ttat thus pairs with ttft to answer "did the model start thinking quickly?" versus "how long did the user wait for an answer?", two signals that mean different things on reasoning workloads. Implementation: a third bookmark variable (``first_answer_at``) is set inside the same up-front event-type check, restricted to ResponseTextDeltaEvent / ResponseFunctionCallArgumentsDeltaEvent. Adds one new histogram (``agentex.llm.ttat``) with the same labels and units as ttft.
1 parent da85d7b commit b08e48f

2 files changed

Lines changed: 25 additions & 2 deletions


src/agentex/lib/core/observability/llm_metrics.py

Lines changed: 9 additions & 0 deletions
@@ -47,6 +47,15 @@ def __init__(self) -> None:
             unit="ms",
             description="Time from request submission to first content token (ms)",
         )
+        # ttat (time-to-first-answering-token) is distinct from ttft for reasoning
+        # models: ttft fires on the first reasoning chunk (which arrives quickly),
+        # while ttat fires on the first user-visible answer token (text or tool
+        # call). For non-reasoning models the two are equal.
+        self.ttat_ms = meter.create_histogram(
+            name="agentex.llm.ttat",
+            unit="ms",
+            description="Time from request submission to first answering token (text or tool-call delta) — excludes reasoning chunks",
+        )
         # Note: TPS denominator is the model-generation window
         # (last_token_time - first_token_time), not total stream wall time.
         # This isolates raw model throughput from event-loop / tool-call latency.
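The ``meter.create_histogram`` calls in this file come from an OpenTelemetry-style metrics API. As a self-contained sketch of the ttft/ttat pairing (the ``Histogram`` class and the ``{"model": "gpt-5"}`` attributes below are hypothetical stand-ins, not the real instrument or label set):

```python
from dataclasses import dataclass, field
from typing import Dict, List, Tuple

@dataclass
class Histogram:
    # Hypothetical stand-in for an OpenTelemetry histogram instrument.
    name: str
    unit: str
    description: str
    points: List[Tuple[float, Dict[str, str]]] = field(default_factory=list)

    def record(self, value: float, attributes: Dict[str, str]) -> None:
        self.points.append((value, attributes))

class LLMMetrics:
    def __init__(self) -> None:
        self.ttft_ms = Histogram(
            name="agentex.llm.ttft", unit="ms",
            description="Time from request submission to first content token (ms)",
        )
        # ttat uses the same unit and labels as ttft so the two series can be
        # compared directly; on reasoning models ttat - ttft is roughly the
        # reasoning duration.
        self.ttat_ms = Histogram(
            name="agentex.llm.ttat", unit="ms",
            description="Time from request submission to first answering token (ms)",
        )

m = LLMMetrics()
attrs = {"model": "gpt-5"}          # illustrative label set
m.ttft_ms.record(120.0, attrs)      # first reasoning chunk arrived at 120 ms
m.ttat_ms.record(4800.0, attrs)     # first text delta arrived at 4.8 s
```

Because both histograms share units and attributes, a dashboard can plot the ttat/ttft gap per model to see reasoning overhead directly.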

src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py

Lines changed: 16 additions & 2 deletions
@@ -653,12 +653,16 @@ async def get_response(
         reasoning_summaries = []
         reasoning_contents = []
         event_count = 0
-        # ttft / tps instrumentation. ``stream_start_perf`` is set above,
-        # before the responses.create() await, so it captures the full
+        # ttft / ttat / tps instrumentation. ``stream_start_perf`` is set
+        # above, before the responses.create() await, so it captures the full
         # request-to-first-token latency. ``first_token_at`` and
         # ``last_token_at`` bracket the model-generation window for tps.
+        # ``first_answer_at`` is set on the first user-visible answer token
+        # (text or tool-call delta) and excludes reasoning chunks, so ttat
+        # measures the latency users actually perceive on reasoning models.
         first_token_at: Optional[float] = None
         last_token_at: Optional[float] = None
+        first_answer_at: Optional[float] = None

         # We expect task_id to always be provided for streaming
         if not task_id:
@@ -686,6 +690,14 @@ async def get_response(
             if first_token_at is None:
                 first_token_at = now_perf
             last_token_at = now_perf
+            # ttat: first user-visible answer token (text or tool call),
+            # excluding reasoning chunks. Equal to ttft for non-reasoning
+            # models; differs by reasoning duration for reasoning models.
+            if first_answer_at is None and isinstance(event, (
+                ResponseTextDeltaEvent,
+                ResponseFunctionCallArgumentsDeltaEvent,
+            )):
+                first_answer_at = now_perf

             # Handle different event types using isinstance for type safety
             if isinstance(event, ResponseOutputItemAddedEvent):
@@ -1027,6 +1039,8 @@ async def get_response(
             m.reasoning_tokens.add(usage.output_tokens_details.reasoning_tokens or 0, metric_attrs)
         if first_token_at is not None:
             m.ttft_ms.record((first_token_at - stream_start_perf) * 1000, metric_attrs)
+        if first_answer_at is not None:
+            m.ttat_ms.record((first_answer_at - stream_start_perf) * 1000, metric_attrs)
         # tps denominator is the generation window (first→last delta), not
         # total stream wall time — see LLMMetrics for rationale. Single-token
         # responses (where first_token_at == last_token_at, e.g. a one-token

0 commit comments
