diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py index 8560762483..09f1dba151 100644 --- a/src/google/adk/evaluation/eval_case.py +++ b/src/google/adk/evaluation/eval_case.py @@ -23,6 +23,7 @@ from pydantic import model_validator from typing_extensions import TypeAlias +from ..events.event import Event from .app_details import AppDetails from .common import EvalBaseModel from .conversation_scenarios import ConversationScenario @@ -124,6 +125,14 @@ class SessionInput(EvalBaseModel): state: SessionState = Field(default_factory=dict) """The state of the session.""" + conversation_history: Optional[list[Event]] = None + """Optional list of events to seed the session with prior conversation history. + + When provided, these events are appended to the session after creation but + before inference begins. This enables eval cases to test agent behavior that + depends on prior context without scoring the context turns. + """ + StaticConversation: TypeAlias = list[Invocation] """A conversation where the user's queries for each invocation are already specified.""" diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 725bddc11c..8ecf0f8aca 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -212,13 +212,19 @@ async def _generate_inferences_from_root_agent( user_id = initial_session.user_id if initial_session else "test_user_id" session_id = session_id if session_id else str(uuid.uuid4()) - _ = await session_service.create_session( + session = await session_service.create_session( app_name=app_name, user_id=user_id, state=initial_session.state if initial_session else {}, session_id=session_id, ) + # Seed session with prior conversation history so that eval turns + # run with realistic prior context without being scored. 
+ if initial_session and initial_session.conversation_history: + for event in initial_session.conversation_history: + await session_service.append_event(session=session, event=event) + if not artifact_service: artifact_service = InMemoryArtifactService() diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index 29ac75ffb5..1720dc07b5 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -457,3 +457,112 @@ async def mock_generate_inferences_side_effect( mock_generate_inferences.assert_called_once() called_with_content = mock_generate_inferences.call_args.args[3] assert called_with_content.parts[0].text == "message 1" + + @pytest.mark.asyncio + async def test_seed_events_are_appended_to_session(self, mocker, mock_runner): + """Tests that seed events from SessionInput.conversation_history are appended after create_session.""" + from google.adk.evaluation.eval_case import SessionInput + + mock_agent = mocker.MagicMock() + mock_user_sim = mocker.MagicMock(spec=UserSimulator) + mock_user_sim.get_next_user_message = mocker.AsyncMock( + return_value=NextUserMessage( + status=UserSimulatorStatus.STOP_SIGNAL_DETECTED + ) + ) + + # Use a mocked session service so we can verify that append_event is called + mock_session_service = mocker.MagicMock() + mock_session = mocker.MagicMock() + mock_session_service.create_session = mocker.AsyncMock( + return_value=mock_session + ) + mock_session_service.append_event = mocker.AsyncMock() + + mocker.patch( + "google.adk.evaluation.evaluation_generator.EvaluationGenerator._get_app_details_by_invocation_id" + ) + mocker.patch( + "google.adk.evaluation.evaluation_generator.EvaluationGenerator.convert_events_to_eval_invocations" + ) + + seed_event_1 = Event( + content=types.Content( + parts=[types.Part(text="How do I pay my water bill?")], + role="user", + ), + author="user", 
invocation_id="seed-0", + ) + seed_event_2 = Event( + content=types.Content( + parts=[types.Part(text="You can pay by calling ABC.")], + role="model", + ), + author="root_agent", + invocation_id="seed-0", + ) + + initial_session = SessionInput( + app_name="test_app", + user_id="test_user", + conversation_history=[seed_event_1, seed_event_2], + ) + + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=mock_agent, + user_simulator=mock_user_sim, + initial_session=initial_session, + session_service=mock_session_service, + ) + + # Verify append_event was called for each seed event + assert mock_session_service.append_event.call_count == 2 + mock_session_service.append_event.assert_any_call( + session=mock_session, event=seed_event_1 + ) + mock_session_service.append_event.assert_any_call( + session=mock_session, event=seed_event_2 + ) + + @pytest.mark.asyncio + async def test_no_seed_events_when_events_is_none(self, mocker, mock_runner): + """Tests that append_event is not called when SessionInput.conversation_history is None.""" + from google.adk.evaluation.eval_case import SessionInput + + mock_agent = mocker.MagicMock() + mock_user_sim = mocker.MagicMock(spec=UserSimulator) + mock_user_sim.get_next_user_message = mocker.AsyncMock( + return_value=NextUserMessage( + status=UserSimulatorStatus.STOP_SIGNAL_DETECTED + ) + ) + + mock_session_service = mocker.MagicMock() + mock_session = mocker.MagicMock() + mock_session_service.create_session = mocker.AsyncMock( + return_value=mock_session + ) + mock_session_service.append_event = mocker.AsyncMock() + + mocker.patch( + "google.adk.evaluation.evaluation_generator.EvaluationGenerator._get_app_details_by_invocation_id" + ) + mocker.patch( + "google.adk.evaluation.evaluation_generator.EvaluationGenerator.convert_events_to_eval_invocations" + ) + + initial_session = SessionInput( + app_name="test_app", + user_id="test_user", + ) + + await EvaluationGenerator._generate_inferences_from_root_agent( + 
root_agent=mock_agent, + user_simulator=mock_user_sim, + initial_session=initial_session, + session_service=mock_session_service, + ) + + # Verify append_event was NOT called + mock_session_service.append_event.assert_not_called()