Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/google/adk/evaluation/eval_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pydantic import model_validator
from typing_extensions import TypeAlias

from ..events.event import Event
from .app_details import AppDetails
from .common import EvalBaseModel
from .conversation_scenarios import ConversationScenario
Expand Down Expand Up @@ -124,6 +125,14 @@ class SessionInput(EvalBaseModel):
state: SessionState = Field(default_factory=dict)
"""The state of the session."""

conversation_history: Optional[list[Event]] = None
"""Optional list of events to seed the session with prior conversation history.

When provided, these events are appended to the session after creation but
before inference begins. This enables eval cases to test agent behavior that
depends on prior context without scoring the context turns.
"""


# Alias for a list of `Invocation` objects, used to represent a conversation.
StaticConversation: TypeAlias = list[Invocation]
"""A conversation where the user's queries for each invocation are already specified."""
Expand Down
8 changes: 7 additions & 1 deletion src/google/adk/evaluation/evaluation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,19 @@ async def _generate_inferences_from_root_agent(
user_id = initial_session.user_id if initial_session else "test_user_id"
session_id = session_id if session_id else str(uuid.uuid4())

_ = await session_service.create_session(
session = await session_service.create_session(
app_name=app_name,
user_id=user_id,
state=initial_session.state if initial_session else {},
session_id=session_id,
)

# Seed session with prior conversation history so that eval turns
# run with realistic prior context without being scored.
if initial_session and initial_session.conversation_history:
for event in initial_session.conversation_history:
await session_service.append_event(session=session, event=event)

if not artifact_service:
artifact_service = InMemoryArtifactService()

Expand Down
109 changes: 109 additions & 0 deletions tests/unittests/evaluation/test_evaluation_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,3 +457,112 @@ async def mock_generate_inferences_side_effect(
mock_generate_inferences.assert_called_once()
called_with_content = mock_generate_inferences.call_args.args[3]
assert called_with_content.parts[0].text == "message 1"

@pytest.mark.asyncio
async def test_seed_events_are_appended_to_session(self, mocker, mock_runner):
  """Tests that SessionInput.conversation_history seeds the session in order.

  Each seed event must be passed to `append_event` exactly once, together
  with the session object returned by `create_session`, and in the same order
  in which the events appear in `conversation_history`.
  """
  from google.adk.evaluation.eval_case import SessionInput

  mock_agent = mocker.MagicMock()
  mock_user_sim = mocker.MagicMock(spec=UserSimulator)
  # The simulator reports a stop signal on its first call; presumably this
  # keeps the generator from running any inference turns, so only the
  # seeding step is exercised.
  mock_user_sim.get_next_user_message = mocker.AsyncMock(
      return_value=NextUserMessage(
          status=UserSimulatorStatus.STOP_SIGNAL_DETECTED
      )
  )

  # Use a mocked session service (not a real one) so that the exact
  # `append_event` awaits can be inspected.
  mock_session_service = mocker.MagicMock()
  mock_session = mocker.MagicMock()
  mock_session_service.create_session = mocker.AsyncMock(
      return_value=mock_session
  )
  mock_session_service.append_event = mocker.AsyncMock()

  # These helpers post-process the produced events; they are patched out
  # because their results are irrelevant to this test.
  mocker.patch(
      "google.adk.evaluation.evaluation_generator.EvaluationGenerator._get_app_details_by_invocation_id"
  )
  mocker.patch(
      "google.adk.evaluation.evaluation_generator.EvaluationGenerator.convert_events_to_eval_invocations"
  )

  seed_event_1 = Event(
      content=types.Content(
          parts=[types.Part(text="How do I pay my water bill?")],
          role="user",
      ),
      author="user",
      invocation_id="seed-0",
  )
  seed_event_2 = Event(
      content=types.Content(
          parts=[types.Part(text="You can pay by calling ABC.")],
          role="model",
      ),
      author="root_agent",
      invocation_id="seed-0",
  )

  initial_session = SessionInput(
      app_name="test_app",
      user_id="test_user",
      conversation_history=[seed_event_1, seed_event_2],
  )

  await EvaluationGenerator._generate_inferences_from_root_agent(
      root_agent=mock_agent,
      user_simulator=mock_user_sim,
      initial_session=initial_session,
      session_service=mock_session_service,
  )

  # Verify append_event was awaited once per seed event, against the created
  # session, and in history order (order matters for conversation context).
  assert mock_session_service.append_event.call_count == 2
  assert mock_session_service.append_event.await_args_list == [
      mocker.call(session=mock_session, event=seed_event_1),
      mocker.call(session=mock_session, event=seed_event_2),
  ]

@pytest.mark.asyncio
async def test_no_seed_events_when_events_is_none(self, mocker, mock_runner):
  """Checks that nothing is appended when conversation_history is left unset."""
  from google.adk.evaluation.eval_case import SessionInput

  # No conversation_history is passed, so the field keeps its default.
  session_without_history = SessionInput(
      app_name="test_app",
      user_id="test_user",
  )

  # User simulator whose first reply already carries the stop-signal status.
  stop_message = NextUserMessage(
      status=UserSimulatorStatus.STOP_SIGNAL_DETECTED
  )
  user_simulator = mocker.MagicMock(spec=UserSimulator)
  user_simulator.get_next_user_message = mocker.AsyncMock(
      return_value=stop_message
  )

  # Mocked session service; `append_event` is the call under observation.
  created_session = mocker.MagicMock()
  session_service = mocker.MagicMock()
  session_service.append_event = mocker.AsyncMock()
  session_service.create_session = mocker.AsyncMock(
      return_value=created_session
  )

  # Replace the generator's post-processing helpers with mocks.
  for patched_target in (
      "google.adk.evaluation.evaluation_generator.EvaluationGenerator._get_app_details_by_invocation_id",
      "google.adk.evaluation.evaluation_generator.EvaluationGenerator.convert_events_to_eval_invocations",
  ):
    mocker.patch(patched_target)

  await EvaluationGenerator._generate_inferences_from_root_agent(
      root_agent=mocker.MagicMock(),
      user_simulator=user_simulator,
      initial_session=session_without_history,
      session_service=session_service,
  )

  # With no history to seed, append_event must never have been invoked.
  session_service.append_event.assert_not_called()