Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/google/adk/cli/adk_web_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
from .utils.base_agent_loader import BaseAgentLoader
from .utils.shared_value import SharedValue
from .utils.state import create_empty_state
from ..utils.pydantic_v2_compatibility import patch_types_for_pydantic_v2, create_robust_openapi_function

logger = logging.getLogger("google_adk." + __name__)

Expand Down Expand Up @@ -686,6 +687,13 @@ async def internal_lifespan(app: FastAPI):
tracer_provider = trace.get_tracer_provider()
register_processors(tracer_provider)

# Apply Pydantic v2 compatibility patches before creating FastAPI app
patches_applied = patch_types_for_pydantic_v2()
if patches_applied:
logger.info("Pydantic v2 compatibility patches applied successfully")
else:
logger.warning("Pydantic v2 compatibility patches could not be applied")

# Run the FastAPI server.
app = FastAPI(lifespan=internal_lifespan)

Expand All @@ -698,6 +706,10 @@ async def internal_lifespan(app: FastAPI):
allow_headers=["*"],
)

# Replace default OpenAPI function with robust version
app.openapi = create_robust_openapi_function(app)
logger.info("Robust OpenAPI generation enabled with Pydantic v2 error handling")

# Route handler registered on `app` for GET /list-apps.
# Takes no request parameters and returns the result of the agent loader's
# `list_agents()` call, annotated as a list of strings.
@app.get("/list-apps")
async def list_apps() -> list[str]:
    # `self` is not a parameter here, so it must come from the enclosing
    # scope (presumably the server instance building the app; confirm).
    return self.agent_loader.list_agents()
Expand Down
70 changes: 70 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,76 @@ class HallucinationsCriterion(BaseCriterion):
)


class ToolTrajectoryCriterion(BaseCriterion):
    """Criterion to use when evaluating agent's tool trajectories with a reference one."""

    class MatchType(Enum):
        """The type of Match between actual and expected tool call trajectories."""

        EXACT = 0
        """The actual tool calls must be identical to the expected tool calls."""

        IN_ORDER = 1
        """Every expected tool call must appear in the actual calls, in order.

        Additional tool calls are allowed anywhere in the actual trajectory.
        Use this to assert that certain key tool calls happen in a particular
        sequence while leaving room for other tool calls around them.

        Example 1 (criterion satisfied):

          Expected tool calls: [T1, T2, T3]
          Actual tool calls:   [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

          T1, T2 and T3 all occur in the actual calls, in the expected order.

        Example 2 (criterion NOT satisfied):

          Expected tool calls: [T1, T2, T3, T4]
          Actual tool calls:   [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

          T1, T2 and T3 occur in the expected order, but T4 is missing.
        """

        ANY_ORDER = 2
        """Every expected tool call must appear in the actual calls, in any order.

        Additional tool calls are allowed anywhere in the actual trajectory.
        Use this when several tool calls about the same concept are expected
        (for example, five search queries) and only their presence matters,
        not the order in which they were issued.

        Example 1 (criterion satisfied):

          Expected tool calls: [T1, T2, T3]
          Actual tool calls:   [T2, T2.1, T1, T1.1, T1.2, T3, T3.1]

          T1, T2 and T3 all occur in the actual calls; the differing order is
          irrelevant.

        Example 2 (criterion NOT satisfied):

          Expected tool calls: [T1, T2, T3, T4]
          Actual tool calls:   [T1, T1.1, T2, T2.1, T2.2, T3, T3.1]

          T1, T2 and T3 are present, but T4 is missing.
        """

    # Matching strategy applied to the trajectories; EXACT unless overridden.
    match_type: MatchType = Field(
        default=MatchType.EXACT,
        description="The type of Match between actual and expected tool call trajectories.",
    )


class EvalMetric(EvalBaseModel):
"""A metric used to evaluate a particular aspect of an eval case."""

Expand Down
185 changes: 171 additions & 14 deletions src/google/adk/evaluation/trajectory_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@

from __future__ import annotations

import logging
from typing import ClassVar
from typing import Optional

from google.genai import types as genai_types
from pydantic import ValidationError
from typing_extensions import override

from .eval_case import get_all_tool_calls
Expand All @@ -26,14 +29,43 @@
from .eval_metrics import MetricInfo
from .eval_metrics import MetricValueInfo
from .eval_metrics import PrebuiltMetrics
from .eval_metrics import ToolTrajectoryCriterion
from .evaluator import EvalStatus
from .evaluator import EvaluationResult
from .evaluator import Evaluator
from .evaluator import PerInvocationResult

logger = logging.getLogger("google_adk." + __name__)


class TrajectoryEvaluator(Evaluator):
"""Evaluates tool use trajectories for accuracy."""
"""Evaluates tool use trajectories for accuracy.

This evaluator compares the sequence of tools called by the agent against a
list of expected calls and computes an average score based on one of the match
types: `EXACT`, `IN_ORDER`, or `ANY_ORDER`.

For each invocation being evaluated, this evaluator compares the list of
tool calls produced by the agent with the list of expected tool calls using
one of three match types. If the tool calls match based on the selected match
type, a score of 1.0 is awarded for that invocation, otherwise the score is
0.0. The final value is the average of these scores across all
invocations in the eval case.

The comparison can be done using one of the following match types:
- `EXACT`: Requires a perfect match between the actual and expected tool
calls, with no extra or missing tool calls.
- `IN_ORDER`: Requires all tool calls from the expected list to be present
in the actual list, in the same order, but allows for other tool calls
to appear in between.
- `ANY_ORDER`: Requires all tool calls from the expected list to be
present in the actual list, in any order, and allows for other tool
calls to appear in between.
"""

criterion_type: ClassVar[type[ToolTrajectoryCriterion]] = (
ToolTrajectoryCriterion
)

def __init__(
self,
Expand All @@ -46,10 +78,25 @@ def __init__(
" specified."
)

if eval_metric:
threshold = eval_metric.threshold

self._threshold = threshold
if eval_metric and eval_metric.criterion:
try:
criterion = TrajectoryEvaluator.criterion_type.model_validate(
eval_metric.criterion.model_dump()
)
self._threshold = criterion.threshold
self._match_type = criterion.match_type
except ValidationError as e:
expected_criterion_type_error = ValueError(
f"`{eval_metric.metric_name}` metric expects a criterion of type"
f" `{TrajectoryEvaluator.criterion_type}`."
)
raise expected_criterion_type_error from e
elif eval_metric:
self._threshold = eval_metric.threshold
self._match_type = ToolTrajectoryCriterion.MatchType.EXACT
else:
self._threshold = threshold
self._match_type = ToolTrajectoryCriterion.MatchType.EXACT

@staticmethod
def get_metric_info() -> MetricInfo:
Expand Down Expand Up @@ -82,14 +129,7 @@ def evaluate_invocations(
per_invocation_results = []

for actual, expected in zip(actual_invocations, expected_invocations):
actual_tool_uses = get_all_tool_calls(actual.intermediate_data)
expected_tool_uses = get_all_tool_calls(expected.intermediate_data)

tool_use_accuracy = (
1.0
if self._are_tool_calls_equal(actual_tool_uses, expected_tool_uses)
else 0.0
)
tool_use_accuracy = self._calculate_tool_use_accuracy(actual, expected)
per_invocation_results.append(
PerInvocationResult(
actual_invocation=actual,
Expand All @@ -111,11 +151,128 @@ def evaluate_invocations(

return EvaluationResult()

def _are_tool_calls_equal(
def _calculate_tool_use_accuracy(
    self,
    actual_invocation: Invocation,
    expected_invocation: Invocation,
) -> float:
    """Scores one invocation's tool usage as 1.0 (match) or 0.0 (no match).

    The tool calls are extracted from the `intermediate_data` of both
    invocations and compared with the matcher selected by `self._match_type`.

    Args:
      actual_invocation: The invocation produced by the agent.
      expected_invocation: The reference invocation to compare against.

    Returns:
      1.0 if the tool calls match under the configured match type, else 0.0.

    Raises:
      ValueError: If `self._match_type` is not EXACT, IN_ORDER or ANY_ORDER.
    """
    observed_calls = get_all_tool_calls(actual_invocation.intermediate_data)
    reference_calls = get_all_tool_calls(expected_invocation.intermediate_data)

    # Pick the comparison routine for the configured match type.
    match_types = ToolTrajectoryCriterion.MatchType
    if self._match_type == match_types.EXACT:
        matcher = self._are_tool_calls_exact_match
    elif self._match_type == match_types.IN_ORDER:
        matcher = self._are_tool_calls_in_order_match
    elif self._match_type == match_types.ANY_ORDER:
        matcher = self._are_tool_calls_any_order_match
    else:
        raise ValueError(f"Unsupported match type {self._match_type}")

    if matcher(observed_calls, reference_calls):
        return 1.0
    return 0.0

def _are_tool_calls_in_order_match(
self,
actual_tool_calls: list[genai_types.FunctionCall],
expected_tool_calls: list[genai_types.FunctionCall],
) -> bool:
"""Checks if expected tool calls appear in actual tool calls in order.

This method implements IN_ORDER match type. It allows for additional
tool calls in actual_tool_calls, as long as all expected tool calls are
present in the same order.

Args:
actual_tool_calls: A list of tool calls that actually happened.
expected_tool_calls: A list of tool calls that were expected to happen.

Returns:
True if actual tool calls match expected tool calls in order,
False otherwise.
"""
if not expected_tool_calls:
return True
if not actual_tool_calls and expected_tool_calls:
return False

expected_it = iter(expected_tool_calls)
try:
current_expected = next(expected_it)
for actual in actual_tool_calls:
if (
actual.name == current_expected.name
and actual.args == current_expected.args
):
current_expected = next(expected_it)
except StopIteration:
return True

return False

def _are_tool_calls_any_order_match(
self,
actual_tool_calls: list[genai_types.FunctionCall],
expected_tool_calls: list[genai_types.FunctionCall],
) -> bool:
"""Checks if expected tool calls appear in actual tool calls in any order.

This method implements ANY_ORDER match type. It allows for additional
tool calls in actual_tool_calls, as long as all expected tool calls are
present.

Args:
actual_tool_calls: A list of tool calls that actually happened.
expected_tool_calls: A list of tool calls that were expected to happen.

Returns:
True if actual tool calls contain all expected tool calls,
False otherwise.
"""
if not expected_tool_calls:
return True
if not actual_tool_calls and expected_tool_calls:
return False

actual_tool_calls_copy = list(actual_tool_calls)
for expected in expected_tool_calls:
found = False
for i, actual in enumerate(actual_tool_calls_copy):
if actual.name == expected.name and actual.args == expected.args:
actual_tool_calls_copy.pop(i)
found = True
break
if not found:
return False
return True

def _are_tool_calls_exact_match(
self,
actual_tool_calls: list[genai_types.FunctionCall],
expected_tool_calls: list[genai_types.FunctionCall],
) -> bool:
"""Checks if actual tool calls exactly match expected tool calls.

This method implements EXACT match type. It requires that
actual_tool_calls and expected_tool_calls have the same tool calls in
the same order, with no extra or missing tool calls.

Args:
actual_tool_calls: A list of tool calls that actually happened.
expected_tool_calls: A list of tool calls that were expected to happen.

Returns:
True if actual tool calls exactly match expected tool calls,
False otherwise.
"""
if len(actual_tool_calls) != len(expected_tool_calls):
return False

Expand Down
Loading
Loading