diff --git a/python/packages/azure-ai/agent_framework_azure_ai/__init__.py b/python/packages/azure-ai/agent_framework_azure_ai/__init__.py index 46b1ed5b3b..b583414685 100644 --- a/python/packages/azure-ai/agent_framework_azure_ai/__init__.py +++ b/python/packages/azure-ai/agent_framework_azure_ai/__init__.py @@ -11,6 +11,11 @@ AzureAIInferenceEmbeddingSettings, RawAzureAIInferenceEmbeddingClient, ) +from ._foundry_evals import ( + FoundryEvals, + evaluate_foundry_target, + evaluate_traces, +) from ._foundry_memory_provider import FoundryMemoryProvider from ._project_provider import AzureAIProjectAgentProvider from ._shared import AzureAISettings @@ -31,8 +36,11 @@ "AzureAIProjectAgentOptions", "AzureAIProjectAgentProvider", "AzureAISettings", + "FoundryEvals", "FoundryMemoryProvider", "RawAzureAIClient", "RawAzureAIInferenceEmbeddingClient", "__version__", + "evaluate_foundry_target", + "evaluate_traces", ] diff --git a/python/packages/azure-ai/agent_framework_azure_ai/_foundry_evals.py b/python/packages/azure-ai/agent_framework_azure_ai/_foundry_evals.py new file mode 100644 index 0000000000..bcf9dcdef5 --- /dev/null +++ b/python/packages/azure-ai/agent_framework_azure_ai/_foundry_evals.py @@ -0,0 +1,845 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Microsoft Foundry Evals integration for Microsoft Agent Framework. + +Provides ``FoundryEvals``, an ``Evaluator`` implementation backed by Azure AI +Foundry's built-in evaluators. See docs/decisions/0018-foundry-evals-integration.md +for the design rationale. 
+ +Typical usage:: + + from agent_framework import evaluate_agent + from agent_framework_azure_ai import FoundryEvals + + evals = FoundryEvals(project_client=project_client, model_deployment="gpt-4o") + results = await evaluate_agent( + agent=my_agent, + queries=["What's the weather in Seattle?"], + evaluators=evals, + ) + assert results.all_passed + print(results.report_url) +""" + +from __future__ import annotations + +import asyncio +import logging +from collections.abc import Sequence +from typing import TYPE_CHECKING, Any, cast + +from agent_framework._evaluation import ( + ConversationSplit, + ConversationSplitter, + EvalItem, + EvalItemResult, + EvalResults, + EvalScoreResult, +) + +if TYPE_CHECKING: + from azure.ai.projects.aio import AIProjectClient + from openai import AsyncOpenAI + +logger = logging.getLogger(__name__) + +# Agent evaluators that accept query/response as conversation arrays. +# Maintained manually — check https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/evaluate-sdk +# for the latest evaluator list. These are the evaluators that need conversation-format input. +_AGENT_EVALUATORS: set[str] = { + "builtin.intent_resolution", + "builtin.task_adherence", + "builtin.task_completion", + "builtin.task_navigation_efficiency", + "builtin.tool_call_accuracy", + "builtin.tool_selection", + "builtin.tool_input_accuracy", + "builtin.tool_output_utilization", + "builtin.tool_call_success", +} + +# Evaluators that additionally require tool_definitions. 
+_TOOL_EVALUATORS: set[str] = { + "builtin.tool_call_accuracy", + "builtin.tool_selection", + "builtin.tool_input_accuracy", + "builtin.tool_output_utilization", + "builtin.tool_call_success", +} + +_BUILTIN_EVALUATORS: dict[str, str] = { + # Agent behavior + "intent_resolution": "builtin.intent_resolution", + "task_adherence": "builtin.task_adherence", + "task_completion": "builtin.task_completion", + "task_navigation_efficiency": "builtin.task_navigation_efficiency", + # Tool usage + "tool_call_accuracy": "builtin.tool_call_accuracy", + "tool_selection": "builtin.tool_selection", + "tool_input_accuracy": "builtin.tool_input_accuracy", + "tool_output_utilization": "builtin.tool_output_utilization", + "tool_call_success": "builtin.tool_call_success", + # Quality + "coherence": "builtin.coherence", + "fluency": "builtin.fluency", + "relevance": "builtin.relevance", + "groundedness": "builtin.groundedness", + "response_completeness": "builtin.response_completeness", + "similarity": "builtin.similarity", + # Safety + "violence": "builtin.violence", + "sexual": "builtin.sexual", + "self_harm": "builtin.self_harm", + "hate_unfairness": "builtin.hate_unfairness", +} + +# Default evaluator sets used when evaluators=None +_DEFAULT_EVALUATORS: list[str] = [ + "relevance", + "coherence", + "task_adherence", +] + +_DEFAULT_TOOL_EVALUATORS: list[str] = [ + "tool_call_accuracy", +] + + +def _resolve_evaluator(name: str) -> str: + """Resolve a short evaluator name to its fully-qualified ``builtin.*`` form. + + Args: + name: Short name (e.g. ``"relevance"``) or fully-qualified name + (e.g. ``"builtin.relevance"``). + + Returns: + The fully-qualified evaluator name. + + Raises: + ValueError: If the name is not recognized. + """ + if name.startswith("builtin."): + return name + resolved = _BUILTIN_EVALUATORS.get(name) + if resolved is None: + raise ValueError(f"Unknown evaluator '{name}'. 
Available: {sorted(_BUILTIN_EVALUATORS)}") + return resolved + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _build_testing_criteria( + evaluators: Sequence[str], + model_deployment: str, + *, + include_data_mapping: bool = False, +) -> list[dict[str, Any]]: + """Build ``testing_criteria`` for ``evals.create()``. + + Args: + evaluators: Evaluator names. + model_deployment: Model deployment for the LLM judge. + include_data_mapping: Whether to include field-level data mapping + (required for the JSONL data source, not needed for response-based). + """ + criteria: list[dict[str, Any]] = [] + for name in evaluators: + qualified = _resolve_evaluator(name) + short = name if not name.startswith("builtin.") else name.split(".")[-1] + + entry: dict[str, Any] = { + "type": "azure_ai_evaluator", + "name": short, + "evaluator_name": qualified, + "initialization_parameters": {"deployment_name": model_deployment}, + } + + if include_data_mapping: + if qualified in _AGENT_EVALUATORS: + # Agent evaluators: query/response as conversation arrays + mapping: dict[str, str] = { + "query": "{{item.query_messages}}", + "response": "{{item.response_messages}}", + } + else: + # Quality evaluators: query/response as strings + mapping = { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + if qualified == "builtin.groundedness": + mapping["context"] = "{{item.context}}" + if qualified in _TOOL_EVALUATORS: + mapping["tool_definitions"] = "{{item.tool_definitions}}" + entry["data_mapping"] = mapping + + criteria.append(entry) + return criteria + + +def _build_item_schema(*, has_context: bool = False, has_tools: bool = False) -> dict[str, Any]: + """Build the ``item_schema`` for custom JSONL eval definitions.""" + properties: dict[str, Any] = { + "query": {"type": "string"}, + "response": {"type": "string"}, + "query_messages": 
{"type": "array"}, + "response_messages": {"type": "array"}, + } + if has_context: + properties["context"] = {"type": "string"} + if has_tools: + properties["tool_definitions"] = {"type": "array"} + return { + "type": "object", + "properties": properties, + "required": ["query", "response"], + } + + +def _resolve_default_evaluators( + evaluators: Sequence[str] | None, + items: Sequence[EvalItem | dict[str, Any]] | None = None, +) -> list[str]: + """Resolve evaluators, applying defaults when ``None``. + + Defaults to relevance + coherence + task_adherence. Automatically adds + tool_call_accuracy when items contain tools. + """ + if evaluators is not None: + return list(evaluators) + + result = list(_DEFAULT_EVALUATORS) + if items is not None: + has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) + if has_tools: + result.extend(_DEFAULT_TOOL_EVALUATORS) + return result + + +def _filter_tool_evaluators( + evaluators: list[str], + items: Sequence[EvalItem | dict[str, Any]], +) -> list[str]: + """Remove tool evaluators if no items have tool definitions.""" + has_tools = any((item.tools if isinstance(item, EvalItem) else item.get("tool_definitions")) for item in items) + if has_tools: + return evaluators + filtered = [e for e in evaluators if _resolve_evaluator(e) not in _TOOL_EVALUATORS] + if not filtered: + logger.warning( + "All requested evaluators (%s) require tool definitions, but no items have tools. " + "Falling back to default evaluators: %s", + evaluators, + list(_DEFAULT_EVALUATORS), + ) + return list(_DEFAULT_EVALUATORS) + if len(filtered) < len(evaluators): + removed = [e for e in evaluators if _resolve_evaluator(e) in _TOOL_EVALUATORS] + logger.info("Removed tool evaluators %s (no items have tools)", removed) + return filtered + + +async def _ensure_async_result(func: Any, *args: Any, **kwargs: Any) -> Any: + """Invoke an async client method and await the result. 
+ + Only async clients (``AsyncOpenAI``) are supported. The function call is + awaited directly. + """ + return await func(*args, **kwargs) + + +async def _poll_eval_run( + client: AsyncOpenAI, + eval_id: str, + run_id: str, + poll_interval: float = 5.0, + timeout: float = 600.0, + provider: str = "Microsoft Foundry", + *, + fetch_output_items: bool = True, +) -> EvalResults: + """Poll an eval run until completion or timeout.""" + loop = asyncio.get_running_loop() + deadline = loop.time() + timeout + while True: + run = await _ensure_async_result(client.evals.runs.retrieve, run_id=run_id, eval_id=eval_id) + if run.status in ("completed", "failed", "canceled"): + error_msg = None + if run.status == "failed": + error_msg = ( + getattr(run, "error", None) + or getattr(run, "error_message", None) + or getattr(run, "failure_reason", None) + ) + if error_msg and not isinstance(error_msg, str): + error_msg = str(error_msg) + + items: list[EvalItemResult] = [] + if fetch_output_items and run.status == "completed": + items = await _fetch_output_items(client, eval_id, run_id) + + return EvalResults( + provider=provider, + eval_id=eval_id, + run_id=run_id, + status=run.status, + result_counts=_extract_result_counts(run), + report_url=getattr(run, "report_url", None), + error=error_msg, + per_evaluator=_extract_per_evaluator(run), + items=items, + ) + remaining = deadline - loop.time() + if remaining <= 0: + return EvalResults(provider=provider, eval_id=eval_id, run_id=run_id, status="timeout") + logger.debug("Eval run %s status: %s (%.0fs remaining)", run_id, run.status, remaining) + await asyncio.sleep(min(poll_interval, remaining)) + + +def _extract_result_counts(run: Any) -> dict[str, int] | None: + """Safely extract result_counts from an eval run object.""" + counts = getattr(run, "result_counts", None) + if counts is None: + return None + if isinstance(counts, dict): + return cast(dict[str, int], counts) + try: + attrs = cast(dict[str, Any], vars(counts)) + return 
{str(k): v for k, v in attrs.items() if isinstance(v, int)} + except TypeError: + return None + + +def _extract_per_evaluator(run: Any) -> dict[str, dict[str, int]]: + """Safely extract per-evaluator result breakdowns from an eval run.""" + per_eval: dict[str, dict[str, int]] = {} + per_testing_criteria = getattr(run, "per_testing_criteria_results", None) + if per_testing_criteria is None: + return per_eval + try: + items = cast(list[Any], per_testing_criteria) if isinstance(per_testing_criteria, list) else [] # type: ignore[redundant-cast] + for item in items: + name: str = str(getattr(item, "name", None) or getattr(item, "testing_criteria", "unknown")) + counts = _extract_result_counts(item) + if name and counts: + per_eval[name] = counts + except (TypeError, AttributeError): + pass + return per_eval + + +async def _fetch_output_items( + client: AsyncOpenAI, + eval_id: str, + run_id: str, +) -> list[EvalItemResult]: + """Fetch per-item results from the output_items API. + + Converts the provider-specific ``OutputItemListResponse`` objects into + provider-agnostic ``EvalItemResult`` instances with per-evaluator scores, + error categorization, and token usage. 
+ """ + items: list[EvalItemResult] = [] + try: + output_items_page = await _ensure_async_result( + client.evals.runs.output_items.list, + run_id=run_id, + eval_id=eval_id, + ) + + for oi in output_items_page: + item_id = getattr(oi, "id", "") or "" + status = getattr(oi, "status", "unknown") or "unknown" + + # Extract per-evaluator scores + scores: list[EvalScoreResult] = [] + for r in getattr(oi, "results", []) or []: + scores.append( + EvalScoreResult( + name=getattr(r, "name", "unknown"), + score=getattr(r, "score", 0.0), + passed=getattr(r, "passed", None), + sample=getattr(r, "sample", None), + ) + ) + + # Extract error info from sample + error_code: str | None = None + error_message: str | None = None + token_usage: dict[str, int] | None = None + input_text: str | None = None + output_text: str | None = None + response_id: str | None = None + + sample = getattr(oi, "sample", None) + if sample is not None: + error = getattr(sample, "error", None) + if error is not None: + code = getattr(error, "code", None) + msg = getattr(error, "message", None) + if code or msg: + error_code = code or None + error_message = msg or None + + usage = getattr(sample, "usage", None) + if usage is not None: + total = getattr(usage, "total_tokens", 0) + if total: + token_usage = { + "prompt_tokens": getattr(usage, "prompt_tokens", 0), + "completion_tokens": getattr(usage, "completion_tokens", 0), + "total_tokens": total, + "cached_tokens": getattr(usage, "cached_tokens", 0), + } + + # Extract input/output text + sample_input = getattr(sample, "input", None) + if sample_input: + parts = [getattr(si, "content", "") for si in sample_input if getattr(si, "role", "") == "user"] + if parts: + input_text = " ".join(parts) + + sample_output = getattr(sample, "output", None) + if sample_output: + parts = [ + getattr(so, "content", "") or "" + for so in sample_output + if getattr(so, "role", "") == "assistant" + ] + if parts: + output_text = " ".join(parts) + + # Extract response_id from 
datasource_item + ds_item = getattr(oi, "datasource_item", None) + if ds_item and isinstance(ds_item, dict): + ds_dict = cast(dict[str, Any], ds_item) + resp_id_val = ds_dict.get("resp_id") or ds_dict.get("response_id") + response_id = str(resp_id_val) if resp_id_val else None + + items.append( + EvalItemResult( + item_id=item_id, + status=status, + scores=scores, + error_code=error_code, + error_message=error_message, + response_id=response_id, + input_text=input_text, + output_text=output_text, + token_usage=token_usage, + ) + ) + except (AttributeError, KeyError, TypeError) as exc: + logger.warning("Could not fetch output_items for run %s: %s", run_id, exc) + + return items + + +def _resolve_openai_client( + openai_client: AsyncOpenAI | None = None, + project_client: AIProjectClient | None = None, +) -> AsyncOpenAI: + """Resolve an OpenAI client from explicit client or project_client.""" + if openai_client is not None: + return openai_client + if project_client is not None: + return project_client.get_openai_client() + raise ValueError("Provide either 'openai_client' or 'project_client'.") + + +# --------------------------------------------------------------------------- +# FoundryEvals — Evaluator implementation for Microsoft Foundry +# --------------------------------------------------------------------------- + + +class FoundryEvals: + """Evaluation provider backed by Microsoft Foundry. + + Implements the ``Evaluator`` protocol so it can be passed to the + provider-agnostic ``evaluate_agent()`` and + ``evaluate_workflow()`` functions from ``agent_framework``. 
+ + Also provides constants for built-in evaluator names for IDE + autocomplete and typo prevention:: + + from agent_framework_azure_ai import FoundryEvals + + evaluators = [FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY] + + The simplest usage:: + + from agent_framework import evaluate_agent + from agent_framework_azure_ai import FoundryEvals + + evals = FoundryEvals(project_client=client, model_deployment="gpt-4o") + results = await evaluate_agent(agent=agent, queries=queries, evaluators=evals) + + **Evaluator selection:** + + By default, runs ``relevance``, ``coherence``, and ``task_adherence``. + Automatically adds ``tool_call_accuracy`` when items contain tool + definitions. Override with ``evaluators=``. + + **Responses API optimization:** + + When all items have a ``response_id`` and no tool evaluators are needed, + uses Foundry's server-side response retrieval path (no data upload). + + Args: + project_client: An ``AIProjectClient`` instance (sync or async). + Provide this or *openai_client*. + openai_client: An ``AsyncOpenAI`` client with evals API. + model_deployment: Model deployment name for the evaluator LLM judge. + evaluators: Evaluator names (e.g. ``["relevance", "tool_call_accuracy"]``). + When ``None`` (default), uses smart defaults based on item data. + conversation_split: How to split multi-turn conversations into + query/response halves. Defaults to ``LAST_TURN``. Pass a + ``ConversationSplit`` enum value or a custom callable — see + ``ConversationSplitter``. + poll_interval: Seconds between status polls (default 5.0). + timeout: Maximum seconds to wait for completion (default 600.0). 
+ """ + + # --------------------------------------------------------------------------- + # Built-in evaluator name constants + # --------------------------------------------------------------------------- + + # Agent behavior + INTENT_RESOLUTION: str = "intent_resolution" + TASK_ADHERENCE: str = "task_adherence" + TASK_COMPLETION: str = "task_completion" + TASK_NAVIGATION_EFFICIENCY: str = "task_navigation_efficiency" + + # Tool usage + TOOL_CALL_ACCURACY: str = "tool_call_accuracy" + TOOL_SELECTION: str = "tool_selection" + TOOL_INPUT_ACCURACY: str = "tool_input_accuracy" + TOOL_OUTPUT_UTILIZATION: str = "tool_output_utilization" + TOOL_CALL_SUCCESS: str = "tool_call_success" + + # Quality + COHERENCE: str = "coherence" + FLUENCY: str = "fluency" + RELEVANCE: str = "relevance" + GROUNDEDNESS: str = "groundedness" + RESPONSE_COMPLETENESS: str = "response_completeness" + SIMILARITY: str = "similarity" + + # Safety + VIOLENCE: str = "violence" + SEXUAL: str = "sexual" + SELF_HARM: str = "self_harm" + HATE_UNFAIRNESS: str = "hate_unfairness" + + def __init__( + self, + *, + project_client: AIProjectClient | None = None, + openai_client: AsyncOpenAI | None = None, + model_deployment: str, + evaluators: Sequence[str] | None = None, + conversation_split: ConversationSplitter = ConversationSplit.LAST_TURN, + poll_interval: float = 5.0, + timeout: float = 600.0, + ): + self.name = "Microsoft Foundry" + self._client = _resolve_openai_client(openai_client, project_client) + self._model_deployment = model_deployment + self._evaluators = list(evaluators) if evaluators is not None else None + self._conversation_split = conversation_split + self._poll_interval = poll_interval + self._timeout = timeout + + async def evaluate( + self, + items: Sequence[EvalItem], + *, + eval_name: str = "Agent Framework Eval", + ) -> EvalResults: + """Evaluate items using Foundry evaluators. + + Implements the ``Evaluator`` protocol. 
Automatically selects the + optimal data path (Responses API vs JSONL dataset) and filters + tool evaluators for items without tool definitions. + + Args: + items: Eval data items from ``AgentEvalConverter.to_eval_item()``. + eval_name: Display name for the evaluation run. + + Returns: + ``EvalResults`` with status, counts, and portal link. + """ + # Resolve evaluators with auto-detection + resolved = _resolve_default_evaluators(self._evaluators, items=items) + # Filter tool evaluators if items don't have tools + resolved = _filter_tool_evaluators(resolved, items) + + # Standard JSONL dataset path + return await self._evaluate_via_dataset(items, resolved, eval_name) + + # -- Internal evaluation paths -- + + async def _evaluate_via_responses( + self, + response_ids: Sequence[str], + evaluators: list[str], + eval_name: str, + ) -> EvalResults: + """Evaluate using Foundry's Responses API retrieval path.""" + eval_obj = await _ensure_async_result( + self._client.evals.create, + name=eval_name, + data_source_config={"type": "azure_ai_source", "scenario": "responses"}, + testing_criteria=_build_testing_criteria(evaluators, self._model_deployment), + ) + + data_source = { + "type": "azure_ai_responses", + "item_generation_params": { + "type": "response_retrieval", + "data_mapping": {"response_id": "{{item.resp_id}}"}, + "source": { + "type": "file_content", + "content": [{"item": {"resp_id": rid}} for rid in response_ids], + }, + }, + } + + run = await _ensure_async_result( + self._client.evals.runs.create, + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, + ) + + return await _poll_eval_run( + self._client, + eval_obj.id, + run.id, + self._poll_interval, + self._timeout, + provider=self.name, + ) + + async def _evaluate_via_dataset( + self, + items: Sequence[EvalItem], + evaluators: list[str], + eval_name: str, + ) -> EvalResults: + """Evaluate using JSONL dataset upload path.""" + dicts = [item.to_eval_data(split=item.split_strategy or 
self._conversation_split) for item in items] + has_context = any("context" in d for d in dicts) + has_tools = any("tool_definitions" in d for d in dicts) + + eval_obj = await _ensure_async_result( + self._client.evals.create, + name=eval_name, + data_source_config={ + "type": "custom", + "item_schema": _build_item_schema(has_context=has_context, has_tools=has_tools), + "include_sample_schema": True, + }, + testing_criteria=_build_testing_criteria( + evaluators, + self._model_deployment, + include_data_mapping=True, + ), + ) + + data_source = { + "type": "jsonl", + "source": { + "type": "file_content", + "content": [{"item": d} for d in dicts], + }, + } + + run = await _ensure_async_result( + self._client.evals.runs.create, + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, + ) + + return await _poll_eval_run( + self._client, + eval_obj.id, + run.id, + self._poll_interval, + self._timeout, + provider=self.name, + ) + + +# --------------------------------------------------------------------------- +# Foundry-specific functions (not part of the Evaluator protocol) +# --------------------------------------------------------------------------- + + +async def evaluate_traces( + *, + evaluators: Sequence[str] | None = None, + openai_client: AsyncOpenAI | None = None, + project_client: AIProjectClient | None = None, + model_deployment: str, + response_ids: Sequence[str] | None = None, + trace_ids: Sequence[str] | None = None, + agent_id: str | None = None, + lookback_hours: int = 24, + eval_name: str = "Agent Framework Trace Eval", + poll_interval: float = 5.0, + timeout: float = 600.0, +) -> EvalResults: + """Evaluate agent behavior from OTel traces or response IDs. + + Foundry-specific function — works with any agent that emits OTel traces + to App Insights. Provide *response_ids* for specific responses, + *trace_ids* for specific traces, or *agent_id* with *lookback_hours* + to evaluate recent activity. 
+ + Args: + evaluators: Evaluator names (e.g. ``[FoundryEvals.RELEVANCE]``). + Defaults to relevance, coherence, and task_adherence. + openai_client: ``AsyncOpenAI`` client. Provide this or *project_client*. + project_client: An ``AIProjectClient`` instance. + model_deployment: Model deployment name for the evaluator LLM judge. + response_ids: Evaluate specific Responses API responses. + trace_ids: Evaluate specific OTel trace IDs from App Insights. + agent_id: Filter traces by agent ID (used with *lookback_hours*). + lookback_hours: Hours of trace history to evaluate (default 24). + eval_name: Display name for the evaluation. + poll_interval: Seconds between status polls. + timeout: Maximum seconds to wait for completion. + + Returns: + ``EvalResults`` with status, result counts, and portal link. + + Example:: + + results = await evaluate_traces( + response_ids=[response.response_id], + evaluators=[FoundryEvals.RELEVANCE], + project_client=project_client, + model_deployment="gpt-4o", + ) + """ + client = _resolve_openai_client(openai_client, project_client) + resolved_evaluators = _resolve_default_evaluators(evaluators) + + if response_ids: + foundry = FoundryEvals( + openai_client=client, + model_deployment=model_deployment, + evaluators=resolved_evaluators, + poll_interval=poll_interval, + timeout=timeout, + ) + return await foundry._evaluate_via_responses( # pyright: ignore[reportPrivateUsage] + response_ids, + resolved_evaluators, + eval_name, + ) + + if not trace_ids and not agent_id: + raise ValueError("Provide at least one of: response_ids, trace_ids, or agent_id") + + trace_source: dict[str, Any] = { + "type": "azure_ai_traces", + "lookback_hours": lookback_hours, + } + if trace_ids: + trace_source["trace_ids"] = list(trace_ids) + if agent_id: + trace_source["agent_id"] = agent_id + + eval_obj = await _ensure_async_result( + client.evals.create, + name=eval_name, + data_source_config={"type": "azure_ai_source", "scenario": "traces"}, + 
testing_criteria=_build_testing_criteria(resolved_evaluators, model_deployment), + ) + + run = await _ensure_async_result( + client.evals.runs.create, + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=trace_source, + ) + + return await _poll_eval_run(client, eval_obj.id, run.id, poll_interval, timeout) + + +async def evaluate_foundry_target( + *, + target: dict[str, Any], + test_queries: Sequence[str], + evaluators: Sequence[str] | None = None, + openai_client: AsyncOpenAI | None = None, + project_client: AIProjectClient | None = None, + model_deployment: str, + eval_name: str = "Agent Framework Target Eval", + poll_interval: float = 5.0, + timeout: float = 600.0, +) -> EvalResults: + """Evaluate a Foundry-registered agent or model deployment. + + Foundry invokes the target, captures the output, and evaluates it. Use + this for scheduled evals, red teaming, and CI/CD quality gates. + + Args: + target: Target configuration dict. + test_queries: Queries for Foundry to send to the target. + evaluators: Evaluator names. + openai_client: ``AsyncOpenAI`` client. Provide this or *project_client*. + project_client: An ``AIProjectClient`` instance. + model_deployment: Model deployment name for the evaluator LLM judge. + eval_name: Display name for the evaluation. + poll_interval: Seconds between status polls. + timeout: Maximum seconds to wait for completion. + + Returns: + ``EvalResults`` with status, result counts, and portal link. 
+ + Example:: + + results = await evaluate_foundry_target( + target={"type": "azure_ai_agent", "name": "my-agent"}, + test_queries=["Book a flight to Paris"], + project_client=project_client, + model_deployment="gpt-4o", + ) + """ + client = _resolve_openai_client(openai_client, project_client) + resolved_evaluators = _resolve_default_evaluators(evaluators) + + eval_obj = await _ensure_async_result( + client.evals.create, + name=eval_name, + data_source_config={ + "type": "azure_ai_source", + "scenario": "target_completions", + }, + testing_criteria=_build_testing_criteria(resolved_evaluators, model_deployment), + ) + + data_source: dict[str, Any] = { + "type": "azure_ai_target_completions", + "target": target, + "source": { + "type": "file_content", + "content": [{"item": {"query": q}} for q in test_queries], + }, + } + + run = await _ensure_async_result( + client.evals.runs.create, + eval_id=eval_obj.id, + name=f"{eval_name} Run", + data_source=data_source, + ) + + return await _poll_eval_run(client, eval_obj.id, run.id, poll_interval, timeout) diff --git a/python/packages/azure-ai/tests/test_foundry_evals.py b/python/packages/azure-ai/tests/test_foundry_evals.py new file mode 100644 index 0000000000..07f071459a --- /dev/null +++ b/python/packages/azure-ai/tests/test_foundry_evals.py @@ -0,0 +1,2248 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +"""Tests for the AgentEvalConverter, FoundryEvals, and eval helper functions.""" + +from __future__ import annotations + +import json +from unittest.mock import AsyncMock, MagicMock + +import pytest +from agent_framework import AgentExecutorResponse, AgentResponse, Content, FunctionTool, Message, WorkflowEvent +from agent_framework._evaluation import ( + AgentEvalConverter, + ConversationSplit, + EvalItem, + EvalResults, + _extract_agent_eval_data, + _extract_overall_query, + evaluate_agent, + evaluate_workflow, +) +from agent_framework._workflows._workflow import WorkflowRunResult + +from agent_framework_azure_ai._foundry_evals import ( + FoundryEvals, + _build_item_schema, + _build_testing_criteria, + _filter_tool_evaluators, + _resolve_default_evaluators, + _resolve_evaluator, + _resolve_openai_client, +) + + +def _make_tool(name: str) -> MagicMock: + """Create a mock FunctionTool for use in tests.""" + t = MagicMock() + t.name = name + t.description = f"{name} tool" + t.parameters = MagicMock(return_value={"type": "object"}) + return t + + +# --------------------------------------------------------------------------- +# _resolve_evaluator +# --------------------------------------------------------------------------- + + +class TestResolveEvaluator: + def test_short_name(self) -> None: + assert _resolve_evaluator("relevance") == "builtin.relevance" + assert _resolve_evaluator("tool_call_accuracy") == "builtin.tool_call_accuracy" + assert _resolve_evaluator("violence") == "builtin.violence" + + def test_already_qualified(self) -> None: + assert _resolve_evaluator("builtin.relevance") == "builtin.relevance" + assert _resolve_evaluator("builtin.custom") == "builtin.custom" + + def test_unknown_raises(self) -> None: + with pytest.raises(ValueError, match="Unknown evaluator 'bogus'"): + _resolve_evaluator("bogus") + + +# --------------------------------------------------------------------------- +# AgentEvalConverter.convert_message +# 
# ---------------------------------------------------------------------------
# AgentEvalConverter.convert_message
# ---------------------------------------------------------------------------


class TestConvertMessage:
    """convert_message turns a single framework Message into eval-format dicts."""

    def test_user_text_message(self) -> None:
        converted = AgentEvalConverter.convert_message(Message("user", ["Hello, world!"]))
        assert converted == [{"role": "user", "content": [{"type": "text", "text": "Hello, world!"}]}]

    def test_system_message(self) -> None:
        converted = AgentEvalConverter.convert_message(Message("system", ["You are helpful."]))
        assert converted[0] == {"role": "system", "content": [{"type": "text", "text": "You are helpful."}]}

    def test_assistant_text_message(self) -> None:
        converted = AgentEvalConverter.convert_message(Message("assistant", ["Here is the answer."]))
        assert len(converted) == 1
        entry = converted[0]
        assert entry["role"] == "assistant"
        assert entry["content"] == [{"type": "text", "text": "Here is the answer."}]
        assert len(entry["content"]) == 1

    def test_assistant_with_tool_call(self) -> None:
        call = Content.from_function_call(
            call_id="call_1",
            name="get_weather",
            arguments=json.dumps({"location": "Seattle"}),
        )
        converted = AgentEvalConverter.convert_message(Message("assistant", [call]))
        assert len(converted) == 1
        assert converted[0]["role"] == "assistant"
        tool_call = converted[0]["content"][0]
        assert tool_call["type"] == "tool_call"
        assert tool_call["tool_call_id"] == "call_1"
        assert tool_call["name"] == "get_weather"
        # JSON-string arguments are parsed back into a dict.
        assert tool_call["arguments"] == {"location": "Seattle"}

    def test_assistant_text_and_tool_call(self) -> None:
        message = Message(
            "assistant",
            [
                Content.from_text("Let me check that."),
                Content.from_function_call(call_id="call_2", name="search", arguments={"query": "flights"}),
            ],
        )
        converted = AgentEvalConverter.convert_message(message)
        assert len(converted) == 1
        assert converted[0]["content"][0] == {"type": "text", "text": "Let me check that."}
        tool_part = converted[0]["content"][1]
        assert tool_part["type"] == "tool_call"
        assert tool_part["arguments"] == {"query": "flights"}

    def test_tool_result_message(self) -> None:
        message = Message("tool", [Content.from_function_result(call_id="call_1", result="72°F, sunny")])
        converted = AgentEvalConverter.convert_message(message)
        assert len(converted) == 1
        entry = converted[0]
        assert entry["role"] == "tool"
        assert entry["tool_call_id"] == "call_1"
        assert entry["content"] == [{"type": "tool_result", "tool_result": "72°F, sunny"}]

    def test_multiple_tool_results(self) -> None:
        message = Message(
            "tool",
            [
                Content.from_function_result(call_id="call_1", result="r1"),
                Content.from_function_result(call_id="call_2", result="r2"),
            ],
        )
        converted = AgentEvalConverter.convert_message(message)
        # Each tool result becomes its own tool-role entry, in order.
        assert [entry["tool_call_id"] for entry in converted] == ["call_1", "call_2"]

    def test_non_string_result_kept_as_object(self) -> None:
        message = Message(
            "tool",
            [Content.from_function_result(call_id="call_1", result={"temp": 72, "unit": "F"})],
        )
        tool_result = AgentEvalConverter.convert_message(message)[0]["content"][0]
        assert tool_result["type"] == "tool_result"
        assert tool_result["tool_result"] == {"temp": 72, "unit": "F"}

    def test_empty_message(self) -> None:
        converted = AgentEvalConverter.convert_message(Message("user", []))
        assert converted[0] == {"role": "user", "content": [{"type": "text", "text": ""}]}


# ---------------------------------------------------------------------------
# AgentEvalConverter.convert_messages
# ---------------------------------------------------------------------------


class TestConvertMessages:
    """convert_messages maps a whole conversation, preserving order and roles."""

    def test_full_conversation(self) -> None:
        conversation = [
            Message("user", ["What's the weather?"]),
            Message(
                "assistant",
                [Content.from_function_call(call_id="c1", name="get_weather", arguments='{"loc": "SEA"}')],
            ),
            Message("tool", [Content.from_function_result(call_id="c1", result="Sunny")]),
            Message("assistant", ["It's sunny in Seattle!"]),
        ]
        converted = AgentEvalConverter.convert_messages(conversation)
        assert [entry["role"] for entry in converted] == ["user", "assistant", "tool", "assistant"]
        assert converted[1]["content"][0]["type"] == "tool_call"
        assert converted[1]["content"][0]["name"] == "get_weather"
        assert converted[2]["content"][0]["type"] == "tool_result"
        assert converted[3]["content"] == [{"type": "text", "text": "It's sunny in Seattle!"}]


# ---------------------------------------------------------------------------
# AgentEvalConverter.extract_tools
# ---------------------------------------------------------------------------


class TestExtractTools:
    """extract_tools pulls FunctionTool definitions off an agent's default options."""

    def test_extracts_function_tools(self) -> None:
        weather_tool = FunctionTool(
            name="get_weather",
            description="Get weather for a location",
            func=lambda location: f"Sunny in {location}",
        )
        agent = MagicMock()
        agent.default_options = {"tools": [weather_tool]}

        extracted = AgentEvalConverter.extract_tools(agent)
        assert len(extracted) == 1
        assert extracted[0]["name"] == "get_weather"
        assert extracted[0]["description"] == "Get weather for a location"
        assert "parameters" in extracted[0]

    def test_skips_non_function_tools(self) -> None:
        agent = MagicMock()
        # Hosted-tool dicts and arbitrary strings are not FunctionTools.
        agent.default_options = {"tools": [{"type": "web_search"}, "some_string"]}

        assert len(AgentEvalConverter.extract_tools(agent)) == 0

    def test_no_tools(self) -> None:
        agent = MagicMock()
        agent.default_options = {}
        assert AgentEvalConverter.extract_tools(agent) == []

    def test_no_default_options(self) -> None:
        agent = MagicMock(spec=[])  # mock exposing no attributes at all
        assert AgentEvalConverter.extract_tools(agent) == []
# ---------------------------------------------------------------------------
# AgentEvalConverter.to_eval_item (now returns EvalItem)
# ---------------------------------------------------------------------------


class TestToEvalItem:
    """to_eval_item / EvalItem: conversation capture, splitting, and per-turn expansion."""

    def test_string_query(self) -> None:
        """A plain-string query becomes a user message ahead of the response."""
        response = AgentResponse(messages=[Message("assistant", ["The weather is sunny."])])
        item = AgentEvalConverter.to_eval_item(query="What's the weather?", response=response)

        assert isinstance(item, EvalItem)
        assert item.query == "What's the weather?"
        assert item.response == "The weather is sunny."
        assert len(item.conversation) == 2
        assert item.conversation[0].role == "user"
        assert item.conversation[1].role == "assistant"

    def test_message_query(self) -> None:
        """Message-list queries are kept in full; item.query is user text only."""
        input_msgs = [
            Message("system", ["Be helpful."]),
            Message("user", ["Hello"]),
        ]
        response = AgentResponse(messages=[Message("assistant", ["Hi there!"])])
        item = AgentEvalConverter.to_eval_item(query=input_msgs, response=response)

        assert item.query == "Hello"  # Only user messages
        assert len(item.conversation) == 3  # system + user + assistant

    def test_with_context(self) -> None:
        """Optional grounding context is carried through on the item."""
        response = AgentResponse(messages=[Message("assistant", ["Answer."])])
        item = AgentEvalConverter.to_eval_item(
            query="Question?",
            response=response,
            context="Some reference document.",
        )
        assert item.context == "Some reference document."

    def test_with_explicit_tools(self) -> None:
        """Tools passed explicitly land on item.tools."""
        tool = FunctionTool(
            name="search",
            description="Search the web",
            func=lambda q: f"Results for {q}",
        )
        response = AgentResponse(messages=[Message("assistant", ["Found it."])])
        item = AgentEvalConverter.to_eval_item(
            query="Find info",
            response=response,
            tools=[tool],
        )
        assert item.tools is not None
        assert len(item.tools) == 1
        assert item.tools[0].name == "search"

    def test_with_agent_tools(self) -> None:
        """When no tools are given, they are extracted from the agent's default options."""
        tool = FunctionTool(name="calc", description="Calculate", func=lambda x: str(x))
        agent = MagicMock()
        agent.default_options = {"tools": [tool]}

        response = AgentResponse(messages=[Message("assistant", ["42"])])
        item = AgentEvalConverter.to_eval_item(
            query="What is 6*7?",
            response=response,
            agent=agent,
        )
        assert item.tools is not None
        assert item.tools[0].name == "calc"

    def test_explicit_tools_override_agent(self) -> None:
        """Explicit tools= takes precedence over the agent's own tools."""
        agent_tool = FunctionTool(name="agent_tool", description="from agent", func=lambda: "")
        explicit_tool = FunctionTool(name="explicit_tool", description="explicit", func=lambda: "")

        agent = MagicMock()
        agent.default_options = {"tools": [agent_tool]}

        response = AgentResponse(messages=[Message("assistant", ["Done"])])
        item = AgentEvalConverter.to_eval_item(
            query="Test",
            response=response,
            agent=agent,
            tools=[explicit_tool],
        )
        assert item.tools is not None
        assert len(item.tools) == 1
        assert item.tools[0].name == "explicit_tool"

    def test_to_dict_format(self) -> None:
        """EvalItem.to_eval_data() should split conversation at last user message."""
        response = AgentResponse(messages=[Message("assistant", ["Answer"])])
        item = AgentEvalConverter.to_eval_item(
            query="Q",
            response=response,
            tools=[FunctionTool(name="t", description="d", func=lambda: "")],
        )
        d = item.to_eval_data()
        assert isinstance(d["query_messages"], list)
        assert isinstance(d["response_messages"], list)
        # Single-turn: query_messages has just the user msg, response_messages has the assistant msg
        assert len(d["query_messages"]) == 1
        assert d["query_messages"][0]["role"] == "user"
        assert len(d["response_messages"]) == 1
        assert d["response_messages"][0]["role"] == "assistant"
        assert isinstance(d["tool_definitions"], list)
        assert len(d["tool_definitions"]) == 1
        assert d["tool_definitions"][0]["name"] == "t"
        assert "conversation" not in d

    def test_to_dict_multiturn_preserves_interleaving(self) -> None:
        """Multi-turn to_dict() splits at last user message, preserving interleaving."""
        conversation = [
            Message("user", ["What's the weather?"]),
            Message("assistant", ["It's sunny in Seattle."]),
            Message("user", ["And tomorrow?"]),
            Message("assistant", [Content(type="function_call", name="get_forecast")]),
            Message("tool", [Content(type="function_result", result="Rain expected")]),
            Message("assistant", ["Rain is expected tomorrow."]),
        ]
        item = EvalItem(conversation=conversation)
        d = item.to_eval_data()
        # query_messages: everything up to and including the last user message
        assert len(d["query_messages"]) == 3  # user, assistant, user
        assert d["query_messages"][0]["role"] == "user"
        assert d["query_messages"][1]["role"] == "assistant"  # interleaved!
        assert d["query_messages"][2]["role"] == "user"
        # response_messages: everything after the last user message
        assert len(d["response_messages"]) == 3  # assistant(tool_call), tool, assistant
        assert d["response_messages"][0]["role"] == "assistant"
        assert d["response_messages"][1]["role"] == "tool"
        assert d["response_messages"][2]["role"] == "assistant"

    def test_to_dict_full_split(self) -> None:
        """ConversationSplit.FULL splits after the first user message."""
        conversation = [
            Message("user", ["What's the weather?"]),
            Message("assistant", ["It's 62°F in Seattle."]),
            Message("user", ["And tomorrow?"]),
            Message("assistant", ["Rain is expected tomorrow."]),
        ]
        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=ConversationSplit.FULL)
        # query_messages: just the first user message
        assert len(d["query_messages"]) == 1
        assert d["query_messages"][0]["role"] == "user"
        assert d["query_messages"][0]["content"] == [{"type": "text", "text": "What's the weather?"}]
        # response_messages: everything after the first user message
        assert len(d["response_messages"]) == 3
        assert d["response_messages"][0]["role"] == "assistant"
        assert d["response_messages"][1]["role"] == "user"
        assert d["response_messages"][2]["role"] == "assistant"

    def test_to_dict_full_split_with_system(self) -> None:
        """FULL split includes system messages before the first user message in query."""
        conversation = [
            Message("system", ["You are a weather assistant."]),
            Message("user", ["What's the weather?"]),
            Message("assistant", ["It's sunny."]),
        ]
        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=ConversationSplit.FULL)
        # query includes system + first user
        assert len(d["query_messages"]) == 2
        assert d["query_messages"][0]["role"] == "system"
        assert d["query_messages"][1]["role"] == "user"
        assert len(d["response_messages"]) == 1

    def test_to_dict_full_split_with_tools(self) -> None:
        """FULL split puts all tool interactions in response_messages."""
        conversation = [
            Message("user", ["What's the weather?"]),
            Message("assistant", [Content(type="function_call", name="get_weather")]),
            Message("tool", [Content(type="function_result", result="62°F")]),
            Message("assistant", ["It's 62°F."]),
            Message("user", ["Thanks!"]),
            Message("assistant", ["You're welcome!"]),
        ]
        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=ConversationSplit.FULL)
        assert len(d["query_messages"]) == 1
        assert len(d["response_messages"]) == 5

    def test_to_dict_last_turn_is_default(self) -> None:
        """Default to_dict() uses LAST_TURN split."""
        conversation = [
            Message("user", ["Hello"]),
            Message("assistant", ["Hi there"]),
            Message("user", ["Bye"]),
            Message("assistant", ["Goodbye"]),
        ]
        item = EvalItem(conversation=conversation)
        d_default = item.to_eval_data()
        d_explicit = item.to_eval_data(split=ConversationSplit.LAST_TURN)
        assert d_default["query_messages"] == d_explicit["query_messages"]
        assert d_default["response_messages"] == d_explicit["response_messages"]

    def test_per_turn_items_simple(self) -> None:
        """per_turn_items produces one EvalItem per user message."""
        conversation = [
            Message("user", ["What's the weather?"]),
            Message("assistant", ["It's 62°F."]),
            Message("user", ["And tomorrow?"]),
            Message("assistant", ["Rain expected."]),
        ]
        items = EvalItem.per_turn_items(conversation)
        assert len(items) == 2

        # Turn 1
        assert items[0].query == "What's the weather?"
        assert items[0].response == "It's 62°F."
        assert len(items[0].conversation) == 2

        # Turn 2 — includes cumulative context; query joins all user texts in query split
        assert items[1].query == "What's the weather? And tomorrow?"
        assert items[1].response == "Rain expected."
        assert len(items[1].conversation) == 4

    def test_per_turn_items_with_tools(self) -> None:
        """per_turn_items handles tool calls within a turn."""
        conversation = [
            Message("user", ["Check weather"]),
            Message("assistant", [Content(type="function_call", name="get_weather")]),
            Message("tool", [Content(type="function_result", result="sunny")]),
            Message("assistant", ["It's sunny."]),
            Message("user", ["Thanks"]),
            Message("assistant", ["You're welcome!"]),
        ]
        tool_objs = [_make_tool("get_weather")]
        items = EvalItem.per_turn_items(conversation, tools=tool_objs)
        assert len(items) == 2

        # Turn 1: response includes tool_call, tool_result, and final assistant
        assert items[0].response == "It's sunny."
        assert items[0].tools == tool_objs
        assert len(items[0].conversation) == 4  # user, assistant(tool), tool, assistant

        # Turn 2
        assert items[1].response == "You're welcome!"
        assert len(items[1].conversation) == 6  # full conversation

    def test_per_turn_items_empty(self) -> None:
        """per_turn_items returns empty list when no user messages."""
        items = EvalItem.per_turn_items([Message("assistant", ["Hello"])])
        assert items == []

    def test_per_turn_items_single_turn(self) -> None:
        """per_turn_items with single turn produces one item."""
        conversation = [
            Message("user", ["Hi"]),
            Message("assistant", ["Hello!"]),
        ]
        items = EvalItem.per_turn_items(conversation)
        assert len(items) == 1
        assert items[0].query == "Hi"
        assert items[0].response == "Hello!"

    def test_custom_splitter_callable(self) -> None:
        """Custom callable splitter is used by to_dict()."""
        conversation = [
            Message("user", ["Remember my name is Alice"]),
            Message("assistant", ["Got it, Alice!"]),
            Message("user", ["What's the capital of France?"]),
            Message("assistant", [Content(type="function_call", name="retrieve_memory", call_id="m1")]),
            Message("tool", [Content(type="function_result", call_id="m1", result="User name: Alice")]),
            Message("assistant", ["The capital of France is Paris, Alice!"]),
        ]

        def split_before_memory(conv):
            """Split just before the memory retrieval tool call."""
            for i, msg in enumerate(conv):
                for c in msg.contents:
                    if c.name == "retrieve_memory":
                        return conv[:i], conv[i:]
            return EvalItem._split_last_turn_static(conv)

        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=split_before_memory)

        # split_before_memory finds "retrieve_memory" at conv[3] (assistant tool_call msg)
        # query = conv[:3] = [user, assistant, user]
        # response = conv[3:] = [assistant(tool_call), tool, assistant]
        assert len(d["query_messages"]) == 3
        assert d["query_messages"][-1]["role"] == "user"
        assert len(d["response_messages"]) == 3
        assert d["response_messages"][0]["role"] == "assistant"  # the tool_call msg

    def test_custom_splitter_with_fallback(self) -> None:
        """Custom splitter falls back to _split_last_turn_static when pattern not found."""
        conversation = [
            Message("user", ["Hello"]),
            Message("assistant", ["Hi there!"]),
        ]

        def split_before_memory(conv):
            for i, msg in enumerate(conv):
                for c in msg.contents:
                    if c.name == "retrieve_memory":
                        return conv[:i], conv[i:]
            return EvalItem._split_last_turn_static(conv)

        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=split_before_memory)
        # Falls back to last-turn split
        assert len(d["query_messages"]) == 1
        assert d["query_messages"][0]["role"] == "user"
        assert len(d["response_messages"]) == 1
        assert d["response_messages"][0]["role"] == "assistant"

    def test_custom_splitter_lambda(self) -> None:
        """A lambda works as a custom splitter."""
        conversation = [
            Message("user", ["A"]),
            Message("assistant", ["B"]),
            Message("user", ["C"]),
            Message("assistant", ["D"]),
        ]
        # Split at index 2 (arbitrary)
        item = EvalItem(conversation=conversation)
        d = item.to_eval_data(split=lambda conv: (conv[:2], conv[2:]))
        assert len(d["query_messages"]) == 2
        assert len(d["response_messages"]) == 2

    def test_split_strategy_on_item_used_by_to_dict(self) -> None:
        """split_strategy field on EvalItem is used as default by to_dict()."""
        conversation = [
            Message("user", ["First"]),
            Message("assistant", ["Response 1"]),
            Message("user", ["Second"]),
            Message("assistant", ["Response 2"]),
        ]
        item = EvalItem(
            conversation=conversation,
            split_strategy=ConversationSplit.FULL,
        )
        # to_dict() with no split arg should use item.split_strategy
        d = item.to_eval_data()
        assert len(d["query_messages"]) == 1  # FULL: just first user msg
        assert d["query_messages"][0]["content"] == [{"type": "text", "text": "First"}]
        assert len(d["response_messages"]) == 3

    def test_explicit_split_overrides_item_split_strategy(self) -> None:
        """Explicit split= arg to to_dict() overrides item.split_strategy."""
        conversation = [
            Message("user", ["First"]),
            Message("assistant", ["Response 1"]),
            Message("user", ["Second"]),
            Message("assistant", ["Response 2"]),
        ]
        item = EvalItem(
            conversation=conversation,
            split_strategy=ConversationSplit.FULL,
        )
        # Explicit split= should override split_strategy
        d = item.to_eval_data(split=ConversationSplit.LAST_TURN)
        assert len(d["query_messages"]) == 3  # LAST_TURN: up to last user
        assert d["query_messages"][-1]["content"] == [{"type": "text", "text": "Second"}]
        assert len(d["response_messages"]) == 1

    def test_no_split_defaults_to_last_turn(self) -> None:
        """When neither split= nor split_strategy is set, defaults to LAST_TURN."""
        conversation = [
            Message("user", ["Hello"]),
            Message("assistant", ["Hi"]),
        ]
        item = EvalItem(conversation=conversation)
        assert item.split_strategy is None
        d = item.to_eval_data()
        assert len(d["query_messages"]) == 1
        assert d["query_messages"][0]["role"] == "user"
"""When neither split= nor split_strategy is set, defaults to LAST_TURN.""" + conversation = [ + Message("user", ["Hello"]), + Message("assistant", ["Hi"]), + ] + item = EvalItem(conversation=conversation) + assert item.split_strategy is None + d = item.to_eval_data() + assert len(d["query_messages"]) == 1 + assert d["query_messages"][0]["role"] == "user" + + +# --------------------------------------------------------------------------- +# _build_testing_criteria +# --------------------------------------------------------------------------- + + +class TestBuildTestingCriteria: + def test_without_data_mapping(self) -> None: + criteria = _build_testing_criteria(["relevance", "coherence"], "gpt-4o") + assert len(criteria) == 2 + assert criteria[0]["evaluator_name"] == "builtin.relevance" + assert criteria[0]["initialization_parameters"] == {"deployment_name": "gpt-4o"} + assert "data_mapping" not in criteria[0] + + def test_with_data_mapping(self) -> None: + criteria = _build_testing_criteria(["relevance", "groundedness"], "gpt-4o", include_data_mapping=True) + assert "data_mapping" in criteria[0] + # Quality evaluators should NOT have conversation + assert criteria[0]["data_mapping"] == { + "query": "{{item.query}}", + "response": "{{item.response}}", + } + # Groundedness has an extra context mapping + assert "context" in criteria[1]["data_mapping"] + assert "conversation" not in criteria[1]["data_mapping"] + + def test_tool_evaluator_includes_tool_definitions(self) -> None: + criteria = _build_testing_criteria(["relevance", "tool_call_accuracy"], "gpt-4o", include_data_mapping=True) + # relevance: string query/response + assert criteria[0]["data_mapping"]["query"] == "{{item.query}}" + assert criteria[0]["data_mapping"]["response"] == "{{item.response}}" + assert "tool_definitions" not in criteria[0]["data_mapping"] + # tool_call_accuracy: array query/response + tool_definitions + assert criteria[1]["data_mapping"]["query"] == "{{item.query_messages}}" + assert 
criteria[1]["data_mapping"]["response"] == "{{item.response_messages}}" + assert criteria[1]["data_mapping"]["tool_definitions"] == "{{item.tool_definitions}}" + + def test_agent_evaluators_use_message_arrays(self) -> None: + agent_evals = ["task_adherence", "intent_resolution", "task_completion"] + criteria = _build_testing_criteria(agent_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert c["data_mapping"]["query"] == "{{item.query_messages}}", f"{c['name']}" + assert c["data_mapping"]["response"] == "{{item.response_messages}}", f"{c['name']}" + + def test_quality_evaluators_use_strings(self) -> None: + quality_evals = ["coherence", "relevance", "fluency"] + criteria = _build_testing_criteria(quality_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert c["data_mapping"]["query"] == "{{item.query}}", f"{c['name']}" + assert c["data_mapping"]["response"] == "{{item.response}}", f"{c['name']}" + + def test_all_tool_evaluators_include_tool_definitions(self) -> None: + tool_evals = [ + "tool_call_accuracy", + "tool_selection", + "tool_input_accuracy", + "tool_output_utilization", + "tool_call_success", + ] + criteria = _build_testing_criteria(tool_evals, "gpt-4o", include_data_mapping=True) + for c in criteria: + assert "tool_definitions" in c["data_mapping"], f"{c['name']} missing tool_definitions" + + +# --------------------------------------------------------------------------- +# _build_item_schema +# --------------------------------------------------------------------------- + + +class TestBuildItemSchema: + def test_without_context(self) -> None: + schema = _build_item_schema(has_context=False) + assert "context" not in schema["properties"] + assert schema["required"] == ["query", "response"] + + def test_with_context(self) -> None: + schema = _build_item_schema(has_context=True) + assert "context" in schema["properties"] + + def test_with_tools(self) -> None: + schema = _build_item_schema(has_tools=True) + assert 
"tool_definitions" in schema["properties"] + + def test_with_context_and_tools(self) -> None: + schema = _build_item_schema(has_context=True, has_tools=True) + assert "context" in schema["properties"] + assert "tool_definitions" in schema["properties"] + + +# --------------------------------------------------------------------------- +# FoundryEvals (constructor, name, select, evaluate via dataset) +# --------------------------------------------------------------------------- + + +class TestFoundryEvals: + def test_constructor_with_openai_client(self) -> None: + mock_client = MagicMock() + fe = FoundryEvals(openai_client=mock_client, model_deployment="gpt-4o") + assert fe.name == "Microsoft Foundry" + + def test_constructor_with_project_client(self) -> None: + mock_oai = MagicMock() + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + fe = FoundryEvals(project_client=mock_project, model_deployment="gpt-4o") + assert fe.name == "Microsoft Foundry" + mock_project.get_openai_client.assert_called_once() + + def test_constructor_no_client_raises(self) -> None: + with pytest.raises(ValueError, match="Provide either"): + FoundryEvals(model_deployment="gpt-4o") + + def test_name_property(self) -> None: + fe = FoundryEvals(openai_client=MagicMock(), model_deployment="gpt-4o") + assert fe.name == "Microsoft Foundry" + + def test_evaluators_passed_in_constructor(self) -> None: + fe = FoundryEvals( + openai_client=MagicMock(), + model_deployment="gpt-4o", + evaluators=["relevance", "coherence"], + ) + assert fe._evaluators == ["relevance", "coherence"] + + @pytest.mark.asyncio + async def test_evaluate_calls_evals_api(self) -> None: + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_123" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_456" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + 
mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval/run_456" + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem(conversation=[Message("user", ["Hello"]), Message("assistant", ["Hi there!"])]), + EvalItem(conversation=[Message("user", ["Weather?"]), Message("assistant", ["Sunny."])]), + ] + + fe = FoundryEvals( + openai_client=mock_client, + model_deployment="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE], + ) + results = await fe.evaluate(items) + + assert isinstance(results, EvalResults) + assert results.status == "completed" + assert results.eval_id == "eval_123" + assert results.run_id == "run_456" + assert results.report_url == "https://portal.azure.com/eval/run_456" + assert results.all_passed + assert results.passed == 2 + assert results.failed == 0 + + # Verify evals.create was called with correct structure + create_call = mock_client.evals.create.call_args + assert create_call.kwargs["name"] == "Agent Framework Eval" + assert create_call.kwargs["data_source_config"]["type"] == "custom" + + # Verify evals.runs.create was called with JSONL data source + run_call = mock_client.evals.runs.create.call_args + assert run_call.kwargs["data_source"]["type"] == "jsonl" + content = run_call.kwargs["data_source"]["source"]["content"] + assert len(content) == 2 + + @pytest.mark.asyncio + async def test_evaluate_uses_default_evaluators(self) -> None: + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_1" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_1" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = 
None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + fe = FoundryEvals(openai_client=mock_client, model_deployment="gpt-4o") + await fe.evaluate([EvalItem(conversation=[Message("user", ["Hi"]), Message("assistant", ["Hello"])])]) + + # Verify default evaluators were used + create_call = mock_client.evals.create.call_args + criteria = create_call.kwargs["testing_criteria"] + names = {c["name"] for c in criteria} + assert "relevance" in names + assert "coherence" in names + assert "task_adherence" in names + + @pytest.mark.asyncio + async def test_evaluate_uses_dataset_path(self) -> None: + """Items use the JSONL dataset path.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_ds" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_ds" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["What's the weather?"]), Message("assistant", ["Sunny"])], + ), + ] + + fe = FoundryEvals(openai_client=mock_client, model_deployment="gpt-4o") + await fe.evaluate(items) + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + content = ds["source"]["content"] + assert content[0]["item"]["query"] == "What's the weather?" 
+ + @pytest.mark.asyncio + async def test_evaluate_with_tool_items_uses_dataset_path(self) -> None: + """Items with tool_definitions use the dataset path.""" + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tool" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tool" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + items = [ + EvalItem( + conversation=[Message("user", ["Do the thing"]), Message("assistant", ["Done"])], + tools=[_make_tool("my_tool")], + ), + ] + + fe = FoundryEvals( + openai_client=mock_client, + model_deployment="gpt-4o", + evaluators=[FoundryEvals.TOOL_CALL_ACCURACY], + ) + await fe.evaluate(items) + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + assert "tool_definitions" in ds["source"]["content"][0]["item"] + + @pytest.mark.asyncio + async def test_evaluate_with_project_client(self) -> None: + mock_oai = MagicMock() + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + + mock_eval = MagicMock() + mock_eval.id = "eval_pc" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_pc" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + fe = FoundryEvals(project_client=mock_project, 
model_deployment="gpt-4o") + results = await fe.evaluate([EvalItem(conversation=[Message("user", ["Hi"]), Message("assistant", ["Hello"])])]) + + assert results.status == "completed" + mock_project.get_openai_client.assert_called_once() + + +# --------------------------------------------------------------------------- +# FoundryEvals constants +# --------------------------------------------------------------------------- + + +class TestEvaluators: + def test_constants_resolve(self) -> None: + assert _resolve_evaluator(FoundryEvals.RELEVANCE) == "builtin.relevance" + assert _resolve_evaluator(FoundryEvals.TOOL_CALL_ACCURACY) == "builtin.tool_call_accuracy" + assert _resolve_evaluator(FoundryEvals.VIOLENCE) == "builtin.violence" + assert _resolve_evaluator(FoundryEvals.INTENT_RESOLUTION) == "builtin.intent_resolution" + + def test_all_constants_are_valid(self) -> None: + for attr in dir(FoundryEvals): + if attr.startswith("_"): + continue + value = getattr(FoundryEvals, attr) + if isinstance(value, str): + _resolve_evaluator(value) # should not raise + + +# --------------------------------------------------------------------------- +# _resolve_default_evaluators +# --------------------------------------------------------------------------- + + +class TestResolveDefaultEvaluators: + def test_explicit_evaluators_passthrough(self) -> None: + result = _resolve_default_evaluators([FoundryEvals.VIOLENCE]) + assert result == [FoundryEvals.VIOLENCE] + + def test_none_gives_defaults(self) -> None: + result = _resolve_default_evaluators(None) + assert FoundryEvals.RELEVANCE in result + assert FoundryEvals.COHERENCE in result + assert FoundryEvals.TASK_ADHERENCE in result + assert FoundryEvals.TOOL_CALL_ACCURACY not in result + + def test_none_with_tool_items_adds_tool_eval(self) -> None: + items = [ + EvalItem( + conversation=[Message("user", ["search for stuff"]), Message("assistant", ["found it"])], + tools=[_make_tool("search")], + ), + ] + result = 
# ---------------------------------------------------------------------------
# _resolve_default_evaluators
# ---------------------------------------------------------------------------


class TestResolveDefaultEvaluators:
    """_resolve_default_evaluators: explicit lists pass through; None yields defaults."""

    def test_explicit_evaluators_passthrough(self) -> None:
        result = _resolve_default_evaluators([FoundryEvals.VIOLENCE])
        assert result == [FoundryEvals.VIOLENCE]

    def test_none_gives_defaults(self) -> None:
        result = _resolve_default_evaluators(None)
        assert FoundryEvals.RELEVANCE in result
        assert FoundryEvals.COHERENCE in result
        assert FoundryEvals.TASK_ADHERENCE in result
        # Tool evaluators are opt-in: absent when no items carry tools.
        assert FoundryEvals.TOOL_CALL_ACCURACY not in result

    def test_none_with_tool_items_adds_tool_eval(self) -> None:
        items = [
            EvalItem(
                conversation=[Message("user", ["search for stuff"]), Message("assistant", ["found it"])],
                tools=[_make_tool("search")],
            ),
        ]
        result = _resolve_default_evaluators(None, items=items)
        assert FoundryEvals.TOOL_CALL_ACCURACY in result

    def test_explicit_evaluators_ignore_tool_items(self) -> None:
        items = [
            EvalItem(
                conversation=[Message("user", ["search"]), Message("assistant", ["found"])],
                tools=[_make_tool("search")],
            ),
        ]
        result = _resolve_default_evaluators([FoundryEvals.RELEVANCE], items=items)
        assert result == [FoundryEvals.RELEVANCE]


# ---------------------------------------------------------------------------
# _filter_tool_evaluators
# ---------------------------------------------------------------------------


class TestFilterToolEvaluators:
    """_filter_tool_evaluators drops tool evaluators when no item has tools."""

    def test_keeps_tool_evaluators_when_items_have_tools(self) -> None:
        items = [
            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])], tools=[_make_tool("t")]),
        ]
        result = _filter_tool_evaluators(
            ["relevance", "tool_call_accuracy"],
            items,
        )
        assert "relevance" in result
        assert "tool_call_accuracy" in result

    def test_removes_tool_evaluators_when_no_tools(self) -> None:
        items = [
            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
        ]
        result = _filter_tool_evaluators(
            ["relevance", "tool_call_accuracy"],
            items,
        )
        assert "relevance" in result
        assert "tool_call_accuracy" not in result

    def test_falls_back_to_defaults_when_all_filtered(self) -> None:
        items = [
            EvalItem(conversation=[Message("user", ["q"]), Message("assistant", ["r"])]),
        ]
        result = _filter_tool_evaluators(
            ["tool_call_accuracy", "tool_selection"],
            items,
        )
        # Should fall back to defaults since all evaluators were tool evaluators
        assert FoundryEvals.RELEVANCE in result


# ---------------------------------------------------------------------------
# EvalResults
# ---------------------------------------------------------------------------


class TestEvalResults:
    """EvalResults pass/fail accounting and assert_passed behavior."""

    def test_all_passed_true(self) -> None:
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 3, "failed": 0, "errored": 0},
        )
        assert r.all_passed
        assert r.passed == 3
        assert r.failed == 0
        assert r.errored == 0
        assert r.total == 3

    def test_all_passed_false_on_failure(self) -> None:
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 2, "failed": 1, "errored": 0},
        )
        assert not r.all_passed
        assert r.failed == 1

    def test_all_passed_false_on_error(self) -> None:
        # Errored items count against all_passed even with zero failures.
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 2, "failed": 0, "errored": 1},
        )
        assert not r.all_passed

    def test_all_passed_false_on_non_completed(self) -> None:
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="timeout",
            result_counts={"passed": 2, "failed": 0, "errored": 0},
        )
        assert not r.all_passed

    def test_all_passed_false_on_empty(self) -> None:
        # Zero evaluated items is not a pass.
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 0, "failed": 0, "errored": 0},
        )
        assert not r.all_passed

    def test_assert_passed_succeeds(self) -> None:
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 1, "failed": 0, "errored": 0},
        )
        r.assert_passed()  # should not raise

    def test_assert_passed_raises(self) -> None:
        r = EvalResults(
            provider="test",
            eval_id="e",
            run_id="r",
            status="completed",
            result_counts={"passed": 1, "failed": 1, "errored": 0},
        )
        with pytest.raises(AssertionError, match="1 passed, 1 failed"):
            r.assert_passed()

    def test_assert_passed_custom_message(self) -> None:
        r = EvalResults(provider="test", eval_id="e", run_id="r", status="failed")
        with pytest.raises(AssertionError, match="custom error"):
            r.assert_passed("custom error")

    def test_none_result_counts(self) -> None:
        # NOTE: statement continues on the following source line (result_counts omitted).
        r = 
EvalResults(provider="test", eval_id="e", run_id="r", status="completed") + assert r.passed == 0 + assert r.failed == 0 + assert r.total == 0 + assert not r.all_passed + + +# --------------------------------------------------------------------------- +# _resolve_openai_client +# --------------------------------------------------------------------------- + + +class TestResolveOpenAIClient: + def test_explicit_client(self) -> None: + mock_client = MagicMock() + assert _resolve_openai_client(openai_client=mock_client) is mock_client + + def test_project_client(self) -> None: + mock_oai = MagicMock() + mock_project = MagicMock() + mock_project.get_openai_client.return_value = mock_oai + + result = _resolve_openai_client(project_client=mock_project) + assert result is mock_oai + mock_project.get_openai_client.assert_called_once() + + def test_explicit_takes_precedence(self) -> None: + mock_client = MagicMock() + mock_project = MagicMock() + + result = _resolve_openai_client(openai_client=mock_client, project_client=mock_project) + assert result is mock_client + mock_project.get_openai_client.assert_not_called() + + def test_neither_raises(self) -> None: + with pytest.raises(ValueError, match="Provide either"): + _resolve_openai_client() + + +# --------------------------------------------------------------------------- +# evaluate_agent with responses= (core function, uses FoundryEvals as evaluator) +# --------------------------------------------------------------------------- + + +class TestEvaluateAgentWithResponses: + @pytest.mark.asyncio + async def test_responses_without_queries_raises(self) -> None: + mock_oai = MagicMock() + response = AgentResponse(messages=[Message("assistant", ["Hello"])]) + + with pytest.raises(ValueError, match="Provide 'queries' alongside 'responses'"): + await evaluate_agent( + responses=response, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + @pytest.mark.asyncio + async def 
test_fallback_to_dataset_with_query(self) -> None: + """Non-Responses-API: falls back to dataset path when query is provided.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_fb" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_fb" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval" + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + response = AgentResponse(messages=[Message("assistant", ["It's sunny."])]) + + results = await evaluate_agent( + responses=response, + queries=["What's the weather?"], + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + assert results[0].status == "completed" + assert results[0].all_passed + + # Should use jsonl data source (dataset path), not azure_ai_responses + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + content = ds["source"]["content"] + assert len(content) == 1 + assert content[0]["item"]["query"] == "What's the weather?" + assert content[0]["item"]["response"] == "It's sunny." 
+ + @pytest.mark.asyncio + async def test_fallback_with_agent_extracts_tools(self) -> None: + """Non-Responses-API with agent: tool definitions are included in the eval item.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tools" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tools" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + mock_agent = MagicMock() + mock_agent.default_options = { + "tools": [FunctionTool(name="my_tool", description="A test tool", func=lambda x: x)] + } + + response = AgentResponse(messages=[Message("assistant", ["Result."])]) + + results = await evaluate_agent( + responses=response, + queries=["Do the thing"], + agent=mock_agent, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + assert results[0].status == "completed" + + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + content = ds["source"]["content"] + item = content[0]["item"] + assert "tool_definitions" in item + tool_defs = item["tool_definitions"] + assert any(t["name"] == "my_tool" for t in tool_defs) + + @pytest.mark.asyncio + async def test_fallback_multiple_responses_with_queries(self) -> None: + """Non-Responses-API with multiple responses requires matching queries.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_multi_fb" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_multi_fb" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + 
mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + responses = [ + AgentResponse(messages=[Message("assistant", ["Answer 1"])]), + AgentResponse(messages=[Message("assistant", ["Answer 2"])]), + ] + + results = await evaluate_agent( + responses=responses, + queries=["Question 1", "Question 2"], + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + assert results[0].passed == 2 + run_call = mock_oai.evals.runs.create.call_args + content = run_call.kwargs["data_source"]["source"]["content"] + assert len(content) == 2 + assert content[0]["item"]["query"] == "Question 1" + assert content[1]["item"]["query"] == "Question 2" + + @pytest.mark.asyncio + async def test_query_response_count_mismatch_raises(self) -> None: + """Mismatched query and response counts should raise.""" + mock_oai = MagicMock() + + responses = [ + AgentResponse(messages=[Message("assistant", ["A1"])]), + AgentResponse(messages=[Message("assistant", ["A2"])]), + ] + + with pytest.raises(ValueError, match="queries but"): + await evaluate_agent( + responses=responses, + queries=["Q1", "Q2", "Q3"], + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + @pytest.mark.asyncio + async def test_tool_evaluators_with_query_and_agent_uses_dataset_path(self) -> None: + """Tool evaluators with query+agent uses dataset path.""" + mock_oai = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tool" + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tool" + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + 
mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + response = AgentResponse( + messages=[Message("assistant", ["It's sunny"])], + ) + + agent = MagicMock() + agent.default_options = { + "tools": [ + FunctionTool(name="get_weather", description="Get weather", func=lambda: None), + ] + } + + fe = FoundryEvals( + openai_client=mock_oai, + model_deployment="gpt-4o", + evaluators=[FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_agent( + responses=response, + queries=["What's the weather?"], + agent=agent, + evaluators=fe, + ) + + # Verify it used the dataset path (jsonl), not Responses API path + run_call = mock_oai.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "jsonl" + + # Verify tool_definitions are in the data items + items = ds["source"]["content"] + assert "tool_definitions" in items[0]["item"] + + +# --------------------------------------------------------------------------- +# EvalResults.sub_results +# --------------------------------------------------------------------------- + + +class TestEvalResultsSubResults: + def test_sub_results_default_empty(self) -> None: + r = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ) + assert r.sub_results == {} + assert r.all_passed + + def test_all_passed_checks_sub_results(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "agent-a": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + "agent-b": EvalResults( + provider="test", + eval_id="e3", + run_id="r3", + status="completed", + result_counts={"passed": 1, "failed": 1}, + ), + }, + ) + assert not parent.all_passed # agent-b has a failure + + def 
test_all_passed_with_all_sub_passing(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "agent-a": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + }, + ) + assert parent.all_passed + + def test_assert_passed_includes_failed_agents(self) -> None: + parent = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 0}, + sub_results={ + "good-agent": EvalResults( + provider="test", + eval_id="e2", + run_id="r2", + status="completed", + result_counts={"passed": 1, "failed": 0}, + ), + "bad-agent": EvalResults( + provider="test", + eval_id="e3", + run_id="r3", + status="completed", + result_counts={"passed": 0, "failed": 1}, + ), + }, + ) + with pytest.raises(AssertionError, match="bad-agent"): + parent.assert_passed() + + +# --------------------------------------------------------------------------- +# _extract_agent_eval_data +# --------------------------------------------------------------------------- + + +def _make_agent_exec_response( + executor_id: str, + response_text: str, + user_messages: list[str] | None = None, +) -> AgentExecutorResponse: + """Helper to build an AgentExecutorResponse for testing.""" + agent_response = AgentResponse(messages=[Message("assistant", [response_text])]) + full_conv: list[Message] = [] + if user_messages: + for m in user_messages: + full_conv.append(Message("user", [m])) + full_conv.extend(agent_response.messages) + return AgentExecutorResponse( + executor_id=executor_id, + agent_response=agent_response, + full_conversation=full_conv, + ) + + +class TestExtractAgentEvalData: + def test_extracts_single_agent(self) -> None: + aer = _make_agent_exec_response("planner", "Plan is ready", ["Plan a trip"]) + + events = [ + WorkflowEvent.executor_invoked("planner", 
"Plan a trip"), + WorkflowEvent.executor_completed("planner", [aer]), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 1 + assert data[0]["executor_id"] == "planner" + assert data[0]["response"].text == "Plan is ready" + + def test_extracts_multiple_agents(self) -> None: + aer1 = _make_agent_exec_response("planner", "Plan done", ["Plan a trip"]) + aer2 = _make_agent_exec_response("booker", "Booked!", ["Book flight"]) + + events = [ + WorkflowEvent.executor_invoked("planner", "Plan a trip"), + WorkflowEvent.executor_completed("planner", [aer1]), + WorkflowEvent.executor_invoked("booker", "Book flight"), + WorkflowEvent.executor_completed("booker", [aer2]), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 2 + assert data[0]["executor_id"] == "planner" + assert data[1]["executor_id"] == "booker" + + def test_skips_internal_executors(self) -> None: + aer = _make_agent_exec_response("planner", "Done", ["Go"]) + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "hello"), + WorkflowEvent.executor_completed("input-conversation", ["hello"]), + WorkflowEvent.executor_invoked("planner", "Go"), + WorkflowEvent.executor_completed("planner", [aer]), + WorkflowEvent.executor_invoked("end", []), + WorkflowEvent.executor_completed("end", None), + ] + result = WorkflowRunResult(events, []) + + data = _extract_agent_eval_data(result) + assert len(data) == 1 + assert data[0]["executor_id"] == "planner" + + def test_resolves_agent_from_workflow(self) -> None: + aer = _make_agent_exec_response("my-agent", "Done", ["Do it"]) + + events = [ + WorkflowEvent.executor_invoked("my-agent", "Do it"), + WorkflowEvent.executor_completed("my-agent", [aer]), + ] + result = WorkflowRunResult(events, []) + + # Build a mock workflow with AgentExecutor + from agent_framework import AgentExecutor + + mock_agent = MagicMock() + mock_agent.default_options = 
{"tools": []} + mock_executor = MagicMock(spec=AgentExecutor) + mock_executor.agent = mock_agent + + mock_workflow = MagicMock() + mock_workflow.executors = {"my-agent": mock_executor} + + data = _extract_agent_eval_data(result, mock_workflow) + assert len(data) == 1 + assert data[0]["agent"] is mock_agent + + +class TestExtractOverallQuery: + def test_extracts_string_query(self) -> None: + events = [WorkflowEvent.executor_invoked("input", "Plan a trip")] + result = WorkflowRunResult(events, []) + assert _extract_overall_query(result) == "Plan a trip" + + def test_extracts_message_query(self) -> None: + msgs = [Message("user", ["What's the weather?"])] + events = [WorkflowEvent.executor_invoked("input", msgs)] + result = WorkflowRunResult(events, []) + assert "What's the weather?" in (_extract_overall_query(result) or "") + + def test_returns_none_for_empty(self) -> None: + result = WorkflowRunResult([], []) + assert _extract_overall_query(result) is None + + +# --------------------------------------------------------------------------- +# evaluate_workflow (core function, uses FoundryEvals as evaluator) +# --------------------------------------------------------------------------- + + +class TestEvaluateWorkflow: + def _mock_oai_client(self, eval_id: str = "eval_wf", run_id: str = "run_wf") -> MagicMock: + mock_oai = MagicMock() + mock_eval = MagicMock() + mock_eval.id = eval_id + mock_oai.evals.create = AsyncMock(return_value=mock_eval) + mock_run = MagicMock() + mock_run.id = run_id + mock_oai.evals.runs.create = AsyncMock(return_value=mock_run) + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval" + mock_completed.per_testing_criteria_results = None + mock_oai.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + return mock_oai + + @pytest.mark.asyncio + async def test_post_hoc_with_workflow_result(self) -> None: + 
"""Evaluate a workflow result that was already produced.""" + mock_oai = self._mock_oai_client() + + aer1 = _make_agent_exec_response("writer", "Draft written", ["Write about Paris"]) + aer2 = _make_agent_exec_response("reviewer", "Looks good!", ["Review: Draft written"]) + + final_output = [Message("assistant", ["Final reviewed output"])] + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "Write about Paris"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("writer", "Write about Paris"), + WorkflowEvent.executor_completed("writer", [aer1]), + WorkflowEvent.executor_invoked("reviewer", [aer1]), + WorkflowEvent.executor_completed("reviewer", [aer2]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + include_overall=False, + ) + + assert results[0].status == "completed" + assert "writer" in results[0].sub_results + assert "reviewer" in results[0].sub_results + assert len(results[0].sub_results) == 2 + + @pytest.mark.asyncio + async def test_with_queries_runs_workflow(self) -> None: + """Passing queries= runs the workflow and evaluates.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent", "Response", ["Query"]) + final_output = [Message("assistant", ["Final"])] + + events = [ + WorkflowEvent.executor_invoked("agent", "Test query"), + WorkflowEvent.executor_completed("agent", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + mock_workflow.run = AsyncMock(return_value=wf_result) + + results = await evaluate_workflow( + workflow=mock_workflow, + queries=["Test query"], + 
evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + include_overall=False, + ) + + mock_workflow.run.assert_called_once_with("Test query") + assert "agent" in results[0].sub_results + + @pytest.mark.asyncio + async def test_overall_plus_per_agent(self) -> None: + """Both overall and per-agent evals run by default.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("planner", "Plan done", ["Plan trip"]) + final_output = [Message("assistant", ["Trip planned!"])] + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "Plan trip"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("planner", "Plan trip"), + WorkflowEvent.executor_completed("planner", [aer]), + WorkflowEvent.output("end", final_output), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + # Should have per-agent sub_results AND overall + assert "planner" in results[0].sub_results + assert results[0].status == "completed" + # FoundryEvals.evaluate called twice: once for planner, once for overall + assert mock_oai.evals.create.call_count == 2 + + @pytest.mark.asyncio + async def test_no_result_or_queries_raises(self) -> None: + mock_oai = MagicMock() + mock_workflow = MagicMock() + + with pytest.raises(ValueError, match="Provide either"): + await evaluate_workflow( + workflow=mock_workflow, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + ) + + @pytest.mark.asyncio + async def test_per_agent_only(self) -> None: + """include_overall=False skips the overall eval.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("agent-a", "Done", ["Do stuff"]) + + events = [ + WorkflowEvent.executor_invoked("agent-a", "Do 
stuff"), + WorkflowEvent.executor_completed("agent-a", [aer]), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + results = await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=FoundryEvals(openai_client=mock_oai, model_deployment="gpt-4o"), + include_overall=False, + ) + + assert "agent-a" in results[0].sub_results + # Only one eval call (per-agent), no overall + assert mock_oai.evals.create.call_count == 1 + + @pytest.mark.asyncio + async def test_overall_eval_excludes_tool_evaluators(self) -> None: + """Tool evaluators should not be passed to the overall workflow eval.""" + mock_oai = self._mock_oai_client() + + aer = _make_agent_exec_response("researcher", "Weather is sunny", ["What's the weather?"]) + + events = [ + WorkflowEvent.executor_invoked("input-conversation", "What's the weather?"), + WorkflowEvent.executor_completed("input-conversation", None), + WorkflowEvent.executor_invoked("researcher", "What's the weather?"), + WorkflowEvent.executor_completed("researcher", [aer]), + WorkflowEvent.output("end", [Message("assistant", ["Weather is sunny"])]), + ] + wf_result = WorkflowRunResult(events, []) + + mock_workflow = MagicMock() + mock_workflow.executors = {} + + fe = FoundryEvals( + openai_client=mock_oai, + model_deployment="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=fe, + ) + + # Should have 2 evals: one per-agent, one overall + assert mock_oai.evals.create.call_count == 2 + + # Check the overall eval's testing_criteria doesn't include tool_call_accuracy + overall_call = mock_oai.evals.create.call_args_list[-1] + overall_criteria = overall_call.kwargs["testing_criteria"] + evaluator_names = [c["evaluator_name"] for c in overall_criteria] + assert "builtin.tool_call_accuracy" not in evaluator_names + assert 
"builtin.relevance" in evaluator_names + + @pytest.mark.asyncio + async def test_per_agent_excludes_tool_evaluators_when_no_tools(self) -> None: + """Sub-agents without tools should not get tool evaluators.""" + mock_oai = self._mock_oai_client() + + # researcher has tools, planner does not + aer1 = _make_agent_exec_response("researcher", "Weather is sunny", ["Check weather"]) + aer2 = _make_agent_exec_response("planner", "Trip planned", ["Plan based on: sunny"]) + + events = [ + WorkflowEvent.executor_invoked("researcher", "Check weather"), + WorkflowEvent.executor_completed("researcher", [aer1]), + WorkflowEvent.executor_invoked("planner", "Plan based on: sunny"), + WorkflowEvent.executor_completed("planner", [aer2]), + ] + wf_result = WorkflowRunResult(events, []) + + from agent_framework import AgentExecutor + + # researcher has tools + mock_researcher = MagicMock() + mock_researcher.default_options = { + "tools": [ + FunctionTool(name="get_weather", description="Get weather", func=lambda: None), + ] + } + mock_researcher_executor = MagicMock(spec=AgentExecutor) + mock_researcher_executor.agent = mock_researcher + + # planner has NO tools + mock_planner = MagicMock() + mock_planner.default_options = {"tools": []} + mock_planner_executor = MagicMock(spec=AgentExecutor) + mock_planner_executor.agent = mock_planner + + mock_workflow = MagicMock() + mock_workflow.executors = { + "researcher": mock_researcher_executor, + "planner": mock_planner_executor, + } + + fe = FoundryEvals( + openai_client=mock_oai, + model_deployment="gpt-4o", + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ) + + await evaluate_workflow( + workflow=mock_workflow, + workflow_result=wf_result, + evaluators=fe, + include_overall=False, + ) + + # Two sub-agent evals + assert mock_oai.evals.create.call_count == 2 + + # Find which call is for researcher vs planner by eval name + for call in mock_oai.evals.create.call_args_list: + criteria = call.kwargs["testing_criteria"] 
+ eval_names = [c["evaluator_name"] for c in criteria] + name = call.kwargs["name"] + if "planner" in name: + assert "builtin.tool_call_accuracy" not in eval_names, ( + "planner has no tools — should not get tool_call_accuracy" + ) + elif "researcher" in name: + assert "builtin.tool_call_accuracy" in eval_names, ( + "researcher has tools — should get tool_call_accuracy" + ) + + +# --------------------------------------------------------------------------- +# EvalItemResult and EvalScoreResult +# --------------------------------------------------------------------------- + + +class TestEvalItemResult: + def test_status_properties(self) -> None: + from agent_framework._evaluation import EvalItemResult + + passed = EvalItemResult(item_id="1", status="pass") + assert passed.is_passed + assert not passed.is_failed + assert not passed.is_error + + failed = EvalItemResult(item_id="2", status="fail") + assert not failed.is_passed + assert failed.is_failed + assert not failed.is_error + + errored = EvalItemResult(item_id="3", status="error") + assert not errored.is_passed + assert not errored.is_failed + assert errored.is_error + + errored2 = EvalItemResult(item_id="4", status="errored") + assert errored2.is_error + + def test_with_scores(self) -> None: + from agent_framework._evaluation import EvalItemResult, EvalScoreResult + + scores = [ + EvalScoreResult(name="relevance", score=0.9, passed=True), + EvalScoreResult(name="coherence", score=0.3, passed=False), + ] + item = EvalItemResult(item_id="1", status="fail", scores=scores) + assert len(item.scores) == 2 + assert item.scores[0].passed is True + assert item.scores[1].passed is False + + def test_with_error(self) -> None: + from agent_framework._evaluation import EvalItemResult + + item = EvalItemResult( + item_id="1", + status="error", + error_code="QueryExtractionError", + error_message="Query list cannot be empty", + ) + assert item.is_error + assert item.error_code == "QueryExtractionError" + + def 
test_with_token_usage(self) -> None: + from agent_framework._evaluation import EvalItemResult + + item = EvalItemResult( + item_id="1", + status="pass", + token_usage={"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}, + ) + assert item.token_usage is not None + assert item.token_usage["total_tokens"] == 150 + + +class TestEvalResultsWithItems: + def test_item_status_properties(self) -> None: + from agent_framework._evaluation import EvalItemResult + + results = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 2, "failed": 1, "errored": 1}, + items=[ + EvalItemResult(item_id="1", status="pass"), + EvalItemResult(item_id="2", status="pass"), + EvalItemResult(item_id="3", status="fail"), + EvalItemResult(item_id="4", status="error", error_code="QueryExtractionError"), + ], + ) + assert sum(1 for i in results.items if i.is_passed) == 2 + assert sum(1 for i in results.items if i.is_failed) == 1 + assert sum(1 for i in results.items if i.is_error) == 1 + + def test_assert_passed_includes_errored_items(self) -> None: + from agent_framework._evaluation import EvalItemResult + + results = EvalResults( + provider="test", + eval_id="e1", + run_id="r1", + status="completed", + result_counts={"passed": 0, "failed": 0, "errored": 2}, + items=[ + EvalItemResult(item_id="i1", status="error", error_code="QueryExtractionError"), + EvalItemResult(item_id="i2", status="error", error_code="TimeoutError"), + ], + ) + with pytest.raises(AssertionError, match="Errored items: i1: QueryExtractionError"): + results.assert_passed() + + +# --------------------------------------------------------------------------- +# _fetch_output_items +# --------------------------------------------------------------------------- + + +class TestFetchOutputItems: + @pytest.mark.asyncio + async def test_fetches_and_converts_output_items(self) -> None: + from agent_framework_azure_ai._foundry_evals import _fetch_output_items + + # 
Build mock output items matching the OpenAI SDK schema + mock_result = MagicMock() + mock_result.name = "relevance" + mock_result.score = 0.85 + mock_result.passed = True + mock_result.sample = None + + mock_usage = MagicMock() + mock_usage.prompt_tokens = 100 + mock_usage.completion_tokens = 50 + mock_usage.total_tokens = 150 + mock_usage.cached_tokens = 0 + + mock_input = MagicMock() + mock_input.role = "user" + mock_input.content = "What is the weather?" + + mock_output = MagicMock() + mock_output.role = "assistant" + mock_output.content = "It is sunny." + + mock_error = MagicMock() + mock_error.code = "" + mock_error.message = "" + + mock_sample = MagicMock() + mock_sample.error = mock_error + mock_sample.usage = mock_usage + mock_sample.input = [mock_input] + mock_sample.output = [mock_output] + + mock_oi = MagicMock() + mock_oi.id = "oi_abc123" + mock_oi.status = "pass" + mock_oi.results = [mock_result] + mock_oi.sample = mock_sample + mock_oi.datasource_item = {"resp_id": "resp_xyz"} + + mock_client = MagicMock() + mock_page = MagicMock() + mock_page.__iter__ = MagicMock(return_value=iter([mock_oi])) + mock_client.evals.runs.output_items.list = AsyncMock(return_value=mock_page) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + item = items[0] + assert item.item_id == "oi_abc123" + assert item.status == "pass" + assert item.is_passed + assert len(item.scores) == 1 + assert item.scores[0].name == "relevance" + assert item.scores[0].score == 0.85 + assert item.scores[0].passed is True + assert item.response_id == "resp_xyz" + assert item.input_text == "What is the weather?" + assert item.output_text == "It is sunny." 
+ assert item.token_usage is not None + assert item.token_usage["total_tokens"] == 150 + assert item.error_code is None + + @pytest.mark.asyncio + async def test_handles_errored_item(self) -> None: + from agent_framework_azure_ai._foundry_evals import _fetch_output_items + + mock_error = MagicMock() + mock_error.code = "QueryExtractionError" + mock_error.message = "Query list cannot be empty" + + mock_sample = MagicMock() + mock_sample.error = mock_error + mock_sample.usage = None + mock_sample.input = [] + mock_sample.output = [] + + mock_oi = MagicMock() + mock_oi.id = "oi_err1" + mock_oi.status = "error" + mock_oi.results = [] + mock_oi.sample = mock_sample + mock_oi.datasource_item = {} + + mock_client = MagicMock() + mock_page = MagicMock() + mock_page.__iter__ = MagicMock(return_value=iter([mock_oi])) + mock_client.evals.runs.output_items.list = AsyncMock(return_value=mock_page) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + + assert len(items) == 1 + item = items[0] + assert item.is_error + assert item.error_code == "QueryExtractionError" + assert item.error_message == "Query list cannot be empty" + assert len(item.scores) == 0 + + @pytest.mark.asyncio + async def test_handles_api_failure_gracefully(self) -> None: + from agent_framework_azure_ai._foundry_evals import _fetch_output_items + + mock_client = MagicMock() + mock_client.evals.runs.output_items.list = AsyncMock(side_effect=TypeError("API error")) + + items = await _fetch_output_items(mock_client, "eval_1", "run_1") + assert items == [] + + +# --------------------------------------------------------------------------- +# _poll_eval_run — timeout / failed / canceled paths +# --------------------------------------------------------------------------- + + +class TestPollEvalRun: + @pytest.mark.asyncio + async def test_timeout_returns_timeout_status(self) -> None: + """Poll timeout returns EvalResults with status='timeout'.""" + from agent_framework_azure_ai._foundry_evals import 
_poll_eval_run + + mock_client = MagicMock() + mock_pending = MagicMock() + mock_pending.status = "queued" + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_pending) + + results = await _poll_eval_run( + mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=0.05 + ) + assert results.status == "timeout" + assert results.eval_id == "eval_1" + assert results.run_id == "run_1" + + @pytest.mark.asyncio + async def test_failed_run_returns_error(self) -> None: + """Failed run returns EvalResults with error message.""" + from agent_framework_azure_ai._foundry_evals import _poll_eval_run + + mock_client = MagicMock() + mock_failed = MagicMock() + mock_failed.status = "failed" + mock_failed.error = "Model deployment unavailable" + mock_failed.result_counts = None + mock_failed.report_url = None + mock_failed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_failed) + + results = await _poll_eval_run( + mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=5.0 + ) + assert results.status == "failed" + assert results.error == "Model deployment unavailable" + + @pytest.mark.asyncio + async def test_canceled_run_returns_canceled_status(self) -> None: + """Canceled run returns EvalResults with status='canceled'.""" + from agent_framework_azure_ai._foundry_evals import _poll_eval_run + + mock_client = MagicMock() + mock_canceled = MagicMock() + mock_canceled.status = "canceled" + mock_canceled.error = None + mock_canceled.result_counts = None + mock_canceled.report_url = None + mock_canceled.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_canceled) + + results = await _poll_eval_run( + mock_client, "eval_1", "run_1", poll_interval=0.01, timeout=5.0 + ) + assert results.status == "canceled" + assert results.error is None + + +# --------------------------------------------------------------------------- +# evaluate_traces +# 
---------------------------------------------------------------------------
+
+
+class TestEvaluateTraces:
+    """Unit tests for ``evaluate_traces`` argument validation and data-source wiring."""
+
+    @pytest.mark.asyncio
+    async def test_raises_without_required_args(self) -> None:
+        """Raises ValueError when no response_ids, trace_ids, or agent_id given."""
+        from agent_framework_azure_ai._foundry_evals import evaluate_traces
+
+        mock_client = MagicMock()
+        with pytest.raises(ValueError, match="Provide at least one of"):
+            await evaluate_traces(
+                openai_client=mock_client,
+                model_deployment="gpt-4o",
+            )
+
+    @pytest.mark.asyncio
+    async def test_response_ids_path(self) -> None:
+        """evaluate_traces with response_ids delegates to _evaluate_via_responses."""
+        from agent_framework_azure_ai._foundry_evals import evaluate_traces
+
+        mock_client = MagicMock()
+
+        mock_eval = MagicMock()
+        mock_eval.id = "eval_tr"
+        mock_client.evals.create = AsyncMock(return_value=mock_eval)
+
+        mock_run = MagicMock()
+        mock_run.id = "run_tr"
+        mock_client.evals.runs.create = AsyncMock(return_value=mock_run)
+
+        mock_completed = MagicMock()
+        mock_completed.status = "completed"
+        mock_completed.result_counts = {"passed": 1, "failed": 0}
+        mock_completed.report_url = "https://portal.azure.com/eval/run_tr"
+        mock_completed.per_testing_criteria_results = None
+        mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed)
+
+        results = await evaluate_traces(
+            response_ids=["resp_abc", "resp_def"],
+            openai_client=mock_client,
+            model_deployment="gpt-4o",
+        )
+        assert results.status == "completed"
+        assert results.eval_id == "eval_tr"
+
+        # Verify the response IDs are in the data source
+        run_call = mock_client.evals.runs.create.call_args
+        ds = run_call.kwargs["data_source"]
+        assert ds["type"] == "azure_ai_responses"
+        content = ds["item_generation_params"]["source"]["content"]
+        assert len(content) == 2
+        assert content[0]["item"]["resp_id"] == "resp_abc"
+
+    @pytest.mark.asyncio
+    async def test_trace_ids_path(self) -> None:
+        """evaluate_traces with trace_ids 
builds azure_ai_traces data source.""" + from agent_framework_azure_ai._foundry_evals import evaluate_traces + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tid" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tid" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 1, "failed": 0} + mock_completed.report_url = None + mock_completed.per_testing_criteria_results = None + mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + results = await evaluate_traces( + trace_ids=["trace_1"], + openai_client=mock_client, + model_deployment="gpt-4o", + ) + assert results.status == "completed" + + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_traces" + assert ds["trace_ids"] == ["trace_1"] + + +# --------------------------------------------------------------------------- +# evaluate_foundry_target +# --------------------------------------------------------------------------- + + +class TestEvaluateFoundryTarget: + @pytest.mark.asyncio + async def test_happy_path(self) -> None: + """evaluate_foundry_target creates eval + run and polls to completion.""" + from agent_framework_azure_ai._foundry_evals import evaluate_foundry_target + + mock_client = MagicMock() + + mock_eval = MagicMock() + mock_eval.id = "eval_tgt" + mock_client.evals.create = AsyncMock(return_value=mock_eval) + + mock_run = MagicMock() + mock_run.id = "run_tgt" + mock_client.evals.runs.create = AsyncMock(return_value=mock_run) + + mock_completed = MagicMock() + mock_completed.status = "completed" + mock_completed.result_counts = {"passed": 2, "failed": 0} + mock_completed.report_url = "https://portal.azure.com/eval/run_tgt" + mock_completed.per_testing_criteria_results = None + 
mock_client.evals.runs.retrieve = AsyncMock(return_value=mock_completed) + + results = await evaluate_foundry_target( + target={"type": "azure_ai_agent", "name": "my-agent"}, + test_queries=["Query 1", "Query 2"], + openai_client=mock_client, + model_deployment="gpt-4o", + ) + assert results.status == "completed" + assert results.eval_id == "eval_tgt" + assert results.all_passed + + # Verify the target and queries in data source + run_call = mock_client.evals.runs.create.call_args + ds = run_call.kwargs["data_source"] + assert ds["type"] == "azure_ai_target_completions" + assert ds["target"]["type"] == "azure_ai_agent" + content = ds["source"]["content"] + assert len(content) == 2 + assert content[0]["item"]["query"] == "Query 1" diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index 0f652f23bd..49b74458a2 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -57,6 +57,27 @@ included_messages, included_token_count, ) +from ._evaluation import ( + AgentEvalConverter, + CheckResult, + ConversationSplit, + ConversationSplitter, + EvalItem, + EvalItemResult, + EvalResults, + EvalScoreResult, + Evaluator, + ExpectedToolCall, + LocalEvaluator, + evaluate_agent, + evaluate_response, + evaluate_workflow, + evaluator, + keyword_check, + tool_call_args_match, + tool_called_check, + tool_calls_present, +) from ._mcp import MCPStdioTool, MCPStreamableHTTPTool, MCPWebsocketTool from ._middleware import ( AgentContext, @@ -242,6 +263,7 @@ "USER_AGENT_TELEMETRY_DISABLED_ENV_VAR", "Agent", "AgentContext", + "AgentEvalConverter", "AgentExecutor", "AgentExecutorRequest", "AgentExecutorResponse", @@ -268,11 +290,14 @@ "ChatOptions", "ChatResponse", "ChatResponseUpdate", + "CheckResult", "CheckpointStorage", "CompactionProvider", "CompactionStrategy", "Content", "ContinuationToken", + "ConversationSplit", + "ConversationSplitter", "Default", "Edge", 
"EdgeCondition", @@ -281,7 +306,13 @@ "EmbeddingGenerationOptions", "EmbeddingInputT", "EmbeddingT", + "EvalItem", + "EvalItemResult", + "EvalResults", + "EvalScoreResult", + "Evaluator", "Executor", + "ExpectedToolCall", "FanInEdgeGroup", "FanOutEdgeGroup", "FileCheckpointStorage", @@ -300,6 +331,7 @@ "InMemoryCheckpointStorage", "InMemoryHistoryProvider", "InProcRunnerContext", + "LocalEvaluator", "MCPStdioTool", "MCPStreamableHTTPTool", "MCPWebsocketTool", @@ -379,11 +411,16 @@ "chat_middleware", "create_edge_runner", "detect_media_type_from_base64", + "evaluate_agent", + "evaluate_response", + "evaluate_workflow", + "evaluator", "executor", "function_middleware", "handler", "included_messages", "included_token_count", + "keyword_check", "load_settings", "map_chat_to_agent_update", "merge_chat_options", @@ -396,6 +433,9 @@ "resolve_agent_id", "response_handler", "tool", + "tool_call_args_match", + "tool_called_check", + "tool_calls_present", "validate_chat_options", "validate_tool_mode", "validate_tools", diff --git a/python/packages/core/agent_framework/_evaluation.py b/python/packages/core/agent_framework/_evaluation.py new file mode 100644 index 0000000000..5257049ed7 --- /dev/null +++ b/python/packages/core/agent_framework/_evaluation.py @@ -0,0 +1,1861 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Provider-agnostic evaluation framework for Microsoft Agent Framework. + +Defines the core evaluation types and orchestration functions that work with +any evaluation provider (Azure AI Foundry, local evaluators, third-party +libraries, etc.). Also includes ``LocalEvaluator`` and built-in check +functions for fast, API-free evaluation during inner-loop development and +CI smoke tests. 
+ +Typical usage — cloud evaluator:: + + from agent_framework import evaluate_agent, EvalResults + from agent_framework_azure_ai import FoundryEvals + + evals = FoundryEvals(project_client=client, model_deployment="gpt-4o") + results = await evaluate_agent(agent=agent, queries=["Hello"], evaluators=evals) + results.assert_passed() + +Typical usage — local evaluator:: + + from agent_framework import LocalEvaluator, keyword_check, evaluate_agent + + local = LocalEvaluator( + keyword_check("weather", "temperature"), + tool_called_check("get_weather"), + ) + results = await evaluate_agent(agent=agent, queries=queries, evaluators=local) +""" + +from __future__ import annotations + +import asyncio +import contextlib +import inspect +import json +import logging +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Literal, + Protocol, + TypedDict, + Union, + cast, + overload, + runtime_checkable, +) + +from ._tools import FunctionTool +from ._types import AgentResponse, Message + +if TYPE_CHECKING: + from ._workflows._agent_executor import AgentExecutorResponse + from ._workflows._workflow import Workflow, WorkflowRunResult + +logger = logging.getLogger(__name__) + + +# region Core types + + +class ConversationSplit(str, Enum): + """Built-in strategies for splitting a conversation into query/response halves. + + Different splits evaluate different aspects of agent behavior: + + - ``LAST_TURN``: Split at the last user message. Everything up to and + including that message is the query; everything after is the response. + Evaluates whether the agent answered the *latest* question well. + + - ``FULL``: The first user message (and any preceding system messages) is + the query; the entire remainder of the conversation is the response. + Evaluates whether the *whole conversation trajectory* served the + original request. + + For custom splits (e.g. 
split before a memory-retrieval tool call), + pass a callable instead — see ``ConversationSplitter``. + """ + + LAST_TURN = "last_turn" + FULL = "full" + + +ConversationSplitter = Union[ + ConversationSplit, + Callable[[list[Message]], tuple[list[Message], list[Message]]], +] +"""Type accepted by ``EvalItem.to_eval_data(split=...)``. + +Either a built-in ``ConversationSplit`` enum value **or** a callable with +signature:: + + def my_splitter(conversation: list[Message]) -> tuple[list[Message], list[Message]]: + '''Return (query_messages, response_messages).''' + +Custom splitters let you evaluate domain-specific boundaries — for example, +splitting just before a memory-retrieval tool call to evaluate recall quality:: + + def split_before_memory(conversation): + for i, msg in enumerate(conversation): + for c in msg.contents or []: + if c.type == "function_call" and c.name == "retrieve_memory": + return conversation[:i], conversation[i:] + # Fallback: split at last user message + return EvalItem._split_last_turn_static(conversation) + + item.to_eval_data(split=split_before_memory) +""" + + +@dataclass +class ExpectedToolCall: + """A tool call that an agent is expected to make. + + Used with :func:`evaluate_agent` to assert that the agent called the + correct tools. The *evaluator* decides the matching semantics (order, + extras, argument checking); this type is pure data. + + Attributes: + name: The tool/function name (e.g. ``"get_weather"``). + arguments: Expected arguments. ``None`` means "don't check arguments". + """ + + name: str + arguments: dict[str, Any] | None = None + + +class EvalItem: + """A single item to be evaluated. + + Represents one query/response interaction in a provider-agnostic format. + ``conversation`` is the single source of truth — ``query`` and ``response`` + are derived from it via the split strategy. + + Attributes: + conversation: Full conversation as ``Message`` objects. + tools: Typed tool objects (e.g. 
``FunctionTool``) for evaluator logic. + context: Optional grounding context document. + expected_output: Optional expected output for ground-truth comparison. + expected_tool_calls: Expected tool calls for tool-correctness + evaluation. See :class:`ExpectedToolCall`. + split_strategy: Split strategy controlling how ``query`` and + ``response`` are derived from the conversation. Defaults to + ``ConversationSplit.LAST_TURN``. + """ + + def __init__( + self, + conversation: list[Message], + tools: list[FunctionTool] | None = None, + context: str | None = None, + expected_output: str | None = None, + expected_tool_calls: list[ExpectedToolCall] | None = None, + split_strategy: ConversationSplitter | None = None, + ) -> None: + self.conversation = conversation + self.tools = tools + self.context = context + self.expected_output = expected_output + self.expected_tool_calls = expected_tool_calls + self.split_strategy = split_strategy + + @property + def query(self) -> str: + """User query text, derived from the query side of the conversation split.""" + query_msgs, _ = self._split_conversation(self.split_strategy or ConversationSplit.LAST_TURN) + user_texts = [m.text for m in query_msgs if m.role == "user" and m.text] + return " ".join(user_texts).strip() + + @property + def response(self) -> str: + """Agent response text, derived from the response side of the conversation split.""" + _, response_msgs = self._split_conversation(self.split_strategy or ConversationSplit.LAST_TURN) + assistant_texts = [m.text for m in response_msgs if m.role == "assistant" and m.text] + return " ".join(assistant_texts).strip() + + def to_eval_data( + self, + *, + split: ConversationSplitter | None = None, + ) -> dict[str, Any]: + """Convert to a flat dict for serialization. + + Produces ``query``, ``response``, ``query_messages`` and + ``response_messages`` by splitting the conversation according to + *split*: + + - ``LAST_TURN`` (default): split at the last user message. 
+ - ``FULL``: split after the first user message. + - A callable: your function receives the conversation list and + returns ``(query_messages, response_messages)``. + + When *split* is ``None`` (the default), uses ``self.split_strategy`` + if set, otherwise ``ConversationSplit.LAST_TURN``. + """ + effective_split = split or self.split_strategy or ConversationSplit.LAST_TURN + query_msgs, response_msgs = self._split_conversation(effective_split) + + query_text = " ".join(m.text for m in query_msgs if m.role == "user" and m.text).strip() + response_text = " ".join(m.text for m in response_msgs if m.role == "assistant" and m.text).strip() + + item: dict[str, Any] = { + "query": query_text, + "response": response_text, + "query_messages": AgentEvalConverter.convert_messages(query_msgs), + "response_messages": AgentEvalConverter.convert_messages(response_msgs), + } + if self.tools: + item["tool_definitions"] = [ + {"name": t.name, "description": t.description, "parameters": t.parameters()} for t in self.tools + ] + if self.context: + item["context"] = self.context + return item + + def _split_conversation(self, split: ConversationSplitter) -> tuple[list[Message], list[Message]]: + """Split ``self.conversation`` into (query_messages, response_messages).""" + if callable(split) and not isinstance(split, ConversationSplit): + return split(self.conversation) + if split == ConversationSplit.FULL: + return self._split_full() + return self._split_last_turn() + + def _split_last_turn(self) -> tuple[list[Message], list[Message]]: + """Split at the last user message (default strategy).""" + return self._split_last_turn_static(self.conversation) + + @staticmethod + def _split_last_turn_static( + conversation: list[Message], + ) -> tuple[list[Message], list[Message]]: + """Split at the last user message. 
Usable as a fallback in custom splitters.""" + last_user_idx = -1 + for i, msg in enumerate(conversation): + if msg.role == "user": + last_user_idx = i + + if last_user_idx >= 0: + return ( + conversation[: last_user_idx + 1], + conversation[last_user_idx + 1 :], + ) + return [], list(conversation) + + def _split_full(self) -> tuple[list[Message], list[Message]]: + """Split after the first user message (evaluates whole trajectory).""" + first_user_idx = -1 + for i, msg in enumerate(self.conversation): + if msg.role == "user": + first_user_idx = i + break + + if first_user_idx >= 0: + return ( + self.conversation[: first_user_idx + 1], + self.conversation[first_user_idx + 1 :], + ) + return [], list(self.conversation) + + @classmethod + def per_turn_items( + cls, + conversation: list[Message], + *, + tools: list[FunctionTool] | None = None, + context: str | None = None, + ) -> list[EvalItem]: + """Split a multi-turn conversation into one ``EvalItem`` per turn. + + Each user message starts a new turn. The resulting ``EvalItem`` + has cumulative context: ``query_messages`` contains the full + conversation up to and including that user message, and + ``response_messages`` contains the agent's actions up to the next + user message. This lets you evaluate each response independently + with its full preceding context. + + Args: + conversation: Full conversation as ``Message`` objects. + tools: Tool objects shared across all items. + context: Optional grounding context shared across all items. + + Returns: + A list of ``EvalItem`` instances, one per user turn. + """ + user_indices = [i for i, m in enumerate(conversation) if m.role == "user"] + if not user_indices: + return [] + + items: list[EvalItem] = [] + for turn_idx, _ui in enumerate(user_indices): + # Response runs from after the user message to the next user + # message (or end of conversation). 
+            next_ui = user_indices[turn_idx + 1] if turn_idx + 1 < len(user_indices) else len(conversation)
+
+            items.append(
+                cls(
+                    conversation=conversation[:next_ui],
+                    tools=tools,
+                    context=context,
+                )
+            )
+
+        return items
+
+
+# endregion
+
+# region Score and result types
+
+
+@dataclass
+class EvalScoreResult:
+    """Result from a single evaluator on a single item.
+
+    Attributes:
+        name: Evaluator name (e.g. ``"relevance"``).
+        score: Numeric score from the evaluator.
+        passed: Whether the item passed this evaluator's threshold.
+        sample: Optional raw evaluator output (rationale, metadata).
+    """
+
+    name: str
+    score: float
+    passed: bool | None = None  # NOTE(review): presumably None = evaluator reported no pass/fail verdict — confirm provider semantics
+    sample: dict[str, Any] | None = None
+
+
+@dataclass
+class EvalItemResult:
+    """Per-item result from an evaluation run.
+
+    Attributes:
+        item_id: Provider-assigned item identifier.
+        status: ``"pass"``, ``"fail"``, or ``"error"``.
+        scores: Per-evaluator results for this item.
+        error_code: Error category when ``status == "error"``
+            (e.g. ``"QueryExtractionError"``).
+        error_message: Human-readable error detail.
+        response_id: Responses API response ID, if applicable.
+        input_text: The query/input that was evaluated.
+        output_text: The response/output that was evaluated.
+        token_usage: Token counts (``prompt_tokens``,
+            ``completion_tokens``, ``total_tokens``).
+        metadata: Additional provider-specific data. 
+ """ + + item_id: str + status: str + scores: list[EvalScoreResult] = field(default_factory=lambda: list[EvalScoreResult]()) + error_code: str | None = None + error_message: str | None = None + response_id: str | None = None + input_text: str | None = None + output_text: str | None = None + token_usage: dict[str, int] | None = None + metadata: dict[str, Any] | None = None + + @property + def is_error(self) -> bool: + """Whether this item errored (infrastructure failure, not quality).""" + return self.status in ("error", "errored") + + @property + def is_passed(self) -> bool: + """Whether this item passed all evaluators.""" + return self.status == "pass" + + @property + def is_failed(self) -> bool: + """Whether this item failed at least one evaluator.""" + return self.status == "fail" + + +@dataclass +class EvalResults: + """Results from an evaluation run by a single provider. + + Attributes: + provider: Name of the evaluation provider that produced these results. + eval_id: The evaluation definition ID (provider-specific). + run_id: The evaluation run ID (provider-specific). + status: Run status - ``"completed"``, ``"failed"``, ``"canceled"``, + or ``"timeout"`` if polling exceeded the deadline. + result_counts: Pass/fail/error counts, populated when completed. + report_url: URL to view results in the provider's portal. + error: Error details when the run failed. + per_evaluator: Per-evaluator result counts, keyed by evaluator name. + items: Per-item results with individual pass/fail/error status, + evaluator scores, error details, and token usage. Populated + when the provider supports per-item retrieval (e.g. Foundry + ``output_items`` API). + sub_results: Per-agent breakdown for workflow evaluations, keyed by + agent/executor name. 
+ + Example:: + + results = await evaluate_agent(agent=my_agent, queries=["Hello"], evaluators=evals) + for r in results: + print(f"{r.provider}: {r.passed}/{r.total}") + + # Per-item detail + for item in r.items: + print(f" {item.item_id}: {item.status}") + for score in item.scores: + print(f" {score.name}: {score.score} ({'pass' if score.passed else 'fail'})") + if item.is_error: + print(f" Error: {item.error_code} - {item.error_message}") + + # Workflow eval - per-agent breakdown + for r in results: + for name, sub in r.sub_results.items(): + print(f" {name}: {sub.passed}/{sub.total}") + """ + + provider: str + eval_id: str + run_id: str + status: str + result_counts: dict[str, int] | None = None + report_url: str | None = None + error: str | None = None + per_evaluator: dict[str, dict[str, int]] = field(default_factory=lambda: dict[str, dict[str, int]]()) + items: list[EvalItemResult] = field(default_factory=lambda: list[EvalItemResult]()) + sub_results: dict[str, EvalResults] = field(default_factory=lambda: dict[str, EvalResults]()) + + @property + def passed(self) -> int: + """Number of passing results.""" + return (self.result_counts or {}).get("passed", 0) + + @property + def failed(self) -> int: + """Number of failing results.""" + return (self.result_counts or {}).get("failed", 0) + + @property + def errored(self) -> int: + """Number of errored results.""" + return (self.result_counts or {}).get("errored", 0) + + @property + def total(self) -> int: + """Total number of results (passed + failed + errored).""" + return self.passed + self.failed + self.errored + + @property + def all_passed(self) -> bool: + """Whether all results passed with no failures or errors. + + For workflow evals with sub-agents, checks that all sub-results passed. + Returns ``False`` if the run did not complete successfully. 
+        """
+        # Anything other than a completed run (failed/canceled/timeout) cannot pass.
+        if self.status not in ("completed",):
+            return False
+        if self.sub_results:
+            return all(sub.all_passed for sub in self.sub_results.values())
+        # Leaf result - check own counts
+        return self.failed == 0 and self.errored == 0 and self.total > 0
+
+    def assert_passed(self, msg: str | None = None) -> None:
+        """Assert all results passed. Raises ``AssertionError`` for CI use.
+
+        Args:
+            msg: Optional custom failure message.
+        """
+        if not self.all_passed:
+            detail = msg or (
+                f"Eval run {self.run_id} {self.status}: "
+                f"{self.passed} passed, {self.failed} failed, {self.errored} errored."
+            )
+            # NOTE(review): the report URL, error text, and item details below
+            # are appended even when a caller-supplied ``msg`` is used —
+            # confirm this augmentation is intended.
+            if self.report_url:
+                detail += f" See {self.report_url} for details."
+            if self.error:
+                detail += f" Error: {self.error}"
+            errored = [i for i in self.items if i.is_error]
+            if errored:
+                errors = [f"{i.item_id}: {i.error_code or 'unknown'}" for i in errored[:3]]
+                detail += f" Errored items: {'; '.join(errors)}."
+            if self.sub_results:
+                failed = [name for name, sub in self.sub_results.items() if not sub.all_passed]
+                if failed:
+                    detail += f" Failed: {', '.join(failed)}."
+            raise AssertionError(detail)
+
+
+# endregion
+
+# region Evaluator protocol
+
+
+@runtime_checkable
+class Evaluator(Protocol):
+    """Protocol for evaluation providers.
+
+    Any evaluation backend (Azure AI Foundry, local LLM-as-judge, custom
+    scorers, etc.) implements this protocol. The provider encapsulates all
+    connection details, evaluator selection, and execution logic.
+
+    Example implementation::
+
+        class MyEvaluator:
+            def __init__(self, name: str = "my-evaluator"):
+                self.name = name
+
+            async def evaluate(self, items: Sequence[EvalItem], *, eval_name: str = "Eval") -> EvalResults:
+                # Score each item and return results
+                ...
+    """
+
+    name: str
+
+    async def evaluate(
+        self,
+        items: Sequence[EvalItem],
+        *,
+        eval_name: str = "Agent Framework Eval",
+    ) -> EvalResults:
+        """Evaluate a batch of items and return results.
+
+        The evaluator determines which metrics to run. 
It may auto-detect + capabilities from the items (e.g., run tool evaluators only when + ``tools`` is present). + + Args: + items: Eval data items to score. + eval_name: Display name for the evaluation run. + + Returns: + ``EvalResults`` with status, counts, and optional portal link. + """ + ... + + +# endregion + +# region Converter + + +class AgentEvalConverter: + """Converts agent-framework types to evaluation format. + + Handles the type gap between agent-framework's ``Message`` / ``Content`` / + ``FunctionTool`` types and the OpenAI-style agent message schema used by + evaluation providers. All methods are static — no instantiation needed. + """ + + @staticmethod + def convert_message(message: Message) -> list[dict[str, Any]]: + """Convert a single ``Message`` to Foundry agent evaluator format. + + Uses typed content lists as required by Foundry evaluators:: + + {"role": "assistant", "content": [{"type": "tool_call", ...}]} + + A single agent-framework ``Message`` with multiple ``function_result`` + contents produces multiple output messages (one per tool result). + + Args: + message: An agent-framework ``Message``. + + Returns: + A list of Foundry-format message dicts. 
+        """
+        role = message.role
+        contents = message.contents or []
+
+        content_items: list[dict[str, Any]] = []
+        tool_results: list[dict[str, Any]] = []
+
+        for c in contents:
+            if c.type == "text" and c.text:
+                content_items.append({"type": "text", "text": c.text})
+            elif c.type == "function_call":
+                args = c.arguments
+                if isinstance(args, str):
+                    try:
+                        args = json.loads(args)
+                    except (json.JSONDecodeError, TypeError):
+                        # Keep unparseable argument strings visible instead of dropping them.
+                        args = {"_raw_arguments": args}
+                tc: dict[str, Any] = {
+                    "type": "tool_call",
+                    "tool_call_id": c.call_id or "",
+                    "name": c.name or "",
+                }
+                if args:
+                    tc["arguments"] = args
+                content_items.append(tc)
+            elif c.type == "function_result":
+                result_val = c.result
+                if isinstance(result_val, str):
+                    with contextlib.suppress(json.JSONDecodeError, TypeError):
+                        result_val = json.loads(result_val)
+                tool_results.append({
+                    "call_id": c.call_id or "",
+                    "result": result_val,
+                })
+
+        # NOTE(review): a message carrying both tool results and other content
+        # emits only the tool-result messages — the elif below drops any
+        # text/tool-call items in that case; confirm this is intended.
+        output: list[dict[str, Any]] = []
+
+        if tool_results:
+            for tr in tool_results:
+                output.append({
+                    "role": "tool",
+                    "tool_call_id": tr["call_id"],
+                    "content": [{"type": "tool_result", "tool_result": tr["result"]}],
+                })
+        elif content_items:
+            output.append({"role": role, "content": content_items})
+        else:
+            output.append({
+                "role": role,
+                "content": [{"type": "text", "text": ""}],
+            })
+
+        return output
+
+    @staticmethod
+    def convert_messages(messages: Sequence[Message]) -> list[dict[str, Any]]:
+        """Convert a sequence of ``Message`` objects to Foundry evaluator format.
+
+        Args:
+            messages: Agent-framework messages.
+
+        Returns:
+            A list of Foundry-format message dicts with typed content lists.
+        """
+        result: list[dict[str, Any]] = []
+        for msg in messages:
+            result.extend(AgentEvalConverter.convert_message(msg))
+        return result
+
+    @staticmethod
+    def extract_tools(agent: Any) -> list[dict[str, Any]]:
+        """Extract tool definitions from an agent instance. 
+ + Reads ``agent.default_options["tools"]`` and ``agent.mcp_tools`` + and converts each ``FunctionTool`` to ``{name, description, parameters}``. + + Args: + agent: An agent-framework agent instance. + + Returns: + A list of tool definition dicts. + """ + tools: list[dict[str, Any]] = [] + seen: set[str] = set() + raw_tools = getattr(agent, "default_options", {}).get("tools", []) + for t in raw_tools: + if isinstance(t, FunctionTool) and t.name not in seen: + tools.append({ + "name": t.name, + "description": t.description, + "parameters": t.parameters(), + }) + seen.add(t.name) + # Include tools from connected MCP servers + for mcp in getattr(agent, "mcp_tools", []): + for t in getattr(mcp, "functions", []): + if isinstance(t, FunctionTool) and t.name not in seen: + tools.append({ + "name": t.name, + "description": t.description, + "parameters": t.parameters(), + }) + seen.add(t.name) + return tools + + @staticmethod + def to_eval_item( + *, + query: str | Sequence[Message], + response: AgentResponse[Any], + agent: Any | None = None, + tools: Sequence[FunctionTool] | None = None, + context: str | None = None, + ) -> EvalItem: + """Convert a complete agent interaction to an ``EvalItem``. + + Args: + query: The user query string, or input messages. + response: The agent's response. + agent: Optional agent instance to auto-extract tool definitions. + tools: Explicit tool list (takes precedence over *agent*). + context: Optional context document for groundedness evaluation. + + Returns: + An ``EvalItem`` suitable for passing to any ``Evaluator``. 
+ """ + input_msgs = [Message("user", [query])] if isinstance(query, str) else list(query) + + all_msgs = list(input_msgs) + list(response.messages or []) + + typed_tools: list[FunctionTool] = [] + if tools: + typed_tools = list(tools) + elif agent: + raw_tools = getattr(agent, "default_options", {}).get("tools", []) + typed_tools = [t for t in raw_tools if isinstance(t, FunctionTool)] + # Include tools from connected MCP servers + seen = {t.name for t in typed_tools} + for mcp in getattr(agent, "mcp_tools", []): + for t in getattr(mcp, "functions", []): + if isinstance(t, FunctionTool) and t.name not in seen: + typed_tools.append(t) + seen.add(t.name) + + return EvalItem( + conversation=all_msgs, + tools=typed_tools or None, + context=context, + ) + + +# endregion + +# region Workflow extraction helpers + + +class _AgentEvalData(TypedDict): + executor_id: str + query: str | Sequence[Message] + response: AgentResponse[Any] + agent: Any | None + + +def _extract_agent_eval_data( + workflow_result: WorkflowRunResult, + workflow: Workflow | None = None, +) -> list[_AgentEvalData]: + """Walk a WorkflowRunResult and extract per-agent query/response pairs. + + Pairs ``executor_invoked`` with ``executor_completed`` events for each + ``AgentExecutor``. Skips internal framework executors. 
+ """ + from ._workflows._agent_executor import AgentExecutor as AE + from ._workflows._agent_executor import AgentExecutorResponse + + invoked_data: dict[str, Any] = {} + results: list[_AgentEvalData] = [] + + for event in workflow_result: + if event.type == "executor_invoked" and event.executor_id: + invoked_data[event.executor_id] = event.data + + elif event.type == "executor_completed" and event.executor_id: + executor_id = event.executor_id + + # Skip internal framework executors + if executor_id.startswith("_") or any( + kw in executor_id.lower() for kw in ("input-conversation", "end-conversation", "end") + ): + continue + + completion_data: Any = event.data + agent_exec_response: AgentExecutorResponse | None = None + + if isinstance(completion_data, list): + for cdata_item in cast(list[Any], completion_data): # type: ignore[redundant-cast] + if isinstance(cdata_item, AgentExecutorResponse): + agent_exec_response = cdata_item + break + elif isinstance(completion_data, AgentExecutorResponse): + agent_exec_response = completion_data + + if agent_exec_response is None: + continue + + query: str | list[Message] + if agent_exec_response.full_conversation: + user_msgs = [m for m in agent_exec_response.full_conversation if m.role == "user"] + query = user_msgs or agent_exec_response.full_conversation # type: ignore[assignment] + elif executor_id in invoked_data: + input_data: Any = invoked_data[executor_id] + query = ( # type: ignore[assignment] + input_data if isinstance(input_data, (str, list)) else str(input_data) + ) + else: + continue + + agent_ref = None + if workflow is not None: + executor = workflow.executors.get(executor_id) + if executor is not None and isinstance(executor, AE): + agent_ref = executor.agent + + results.append( + _AgentEvalData( + executor_id=executor_id, + query=query, + response=agent_exec_response.agent_response, + agent=agent_ref, + ) + ) + + return results + + +def _extract_overall_query(workflow_result: WorkflowRunResult) -> str | 
None: + """Extract the original user query from a workflow result.""" + for event in workflow_result: + if event.type == "executor_invoked" and event.data is not None: + data: Any = event.data + if isinstance(data, str): + return data + if isinstance(data, list) and data: + items_list = cast(list[Any], data) # type: ignore[redundant-cast] + first = items_list[0] + if isinstance(first, Message): + msgs: list[Message] = [m for m in items_list if isinstance(m, Message)] + return " ".join(str(m.text) for m in msgs if hasattr(m, "text") and m.role == "user") + if isinstance(first, str): + return " ".join(str(s) for s in items_list) + return str(data) # type: ignore[reportUnknownArgumentType] + return None + + +# endregion + +# region Local evaluation checks + + +@dataclass +class CheckResult: + """Result of a single check on a single evaluation item. + + Attributes: + passed: Whether the check passed. + reason: Human-readable explanation. + check_name: Name of the check that produced this result. + """ + + passed: bool + reason: str + check_name: str + + +EvalCheck = Callable[[EvalItem], CheckResult | Any] +"""A check function that takes an ``EvalItem`` and returns a ``CheckResult``. + +Both sync and async functions are supported. Async checks should return +an awaitable ``CheckResult``; they will be awaited automatically by +``LocalEvaluator``. +""" + + +def keyword_check(*keywords: str, case_sensitive: bool = False) -> EvalCheck: + """Check that the response contains all specified keywords. + + Args: + *keywords: Required keywords that must appear in the response. + case_sensitive: Whether matching is case-sensitive (default ``False``). + + Returns: + A check function for use with ``LocalEvaluator``. 
+ + Example:: + + check = keyword_check("weather", "temperature") + """ + + def _check(item: EvalItem) -> CheckResult: + text = item.response if case_sensitive else item.response.lower() + missing = [k for k in keywords if (k if case_sensitive else k.lower()) not in text] + if missing: + return CheckResult(passed=False, reason=f"Missing keywords: {missing}", check_name="keyword_check") + return CheckResult(passed=True, reason="All keywords found", check_name="keyword_check") + + return _check + + +def tool_called_check(*tool_names: str, mode: Literal["all", "any"] = "all") -> EvalCheck: + """Check that specific tools were called during the conversation. + + Inspects the conversation history for ``tool_calls`` entries matching + the expected tool names. + + Args: + *tool_names: Names of tools that should have been called. + mode: ``"all"`` requires every tool to be called; ``"any"`` requires + at least one. Defaults to ``"all"``. + + Returns: + A check function for use with ``LocalEvaluator``. 
+ + Example:: + + check = tool_called_check("get_weather", "get_flight_price") + """ + + def _check(item: EvalItem) -> CheckResult: + expected = set(tool_names) + called: set[str] = set() + for msg in item.conversation: + for c in msg.contents or []: + if c.type == "function_call" and c.name: + called.add(c.name) + if mode == "all" and expected.issubset(called): + return CheckResult( + passed=True, + reason=f"All expected tools called: {sorted(called)}", + check_name="tool_called", + ) + if mode == "any" and expected & called: + return CheckResult( + passed=True, + reason=f"Expected tool found: {sorted(expected & called)}", + check_name="tool_called", + ) + if mode == "all": + missing = [t for t in tool_names if t not in called] + if missing: + return CheckResult( + passed=False, + reason=f"Expected tools not called: {missing} (called: {sorted(called)})", + check_name="tool_called", + ) + return CheckResult( + passed=True, + reason=f"All expected tools called: {sorted(called)}", + check_name="tool_called", + ) + return CheckResult( + passed=False, + reason=f"None of expected tools called: {list(tool_names)} (called: {sorted(called)})", + check_name="tool_called", + ) + + return _check + + +def _extract_tool_calls(item: EvalItem) -> list[tuple[str, dict[str, Any] | None]]: + """Extract (name, arguments) pairs from the conversation's function calls.""" + calls: list[tuple[str, dict[str, Any] | None]] = [] + for msg in item.conversation: + for c in msg.contents or []: + if c.type == "function_call" and c.name: + args = c.arguments if isinstance(c.arguments, dict) else None + calls.append((c.name, args)) + return calls + + +def tool_calls_present(item: EvalItem) -> CheckResult: + """Check that all expected tool calls were made (unordered, extras OK). + + Uses ``item.expected_tool_calls`` — checks that every expected tool name + appears at least once in the conversation. Does not check arguments or + ordering. Extra (unexpected) tool calls are not penalized. 
+ + Example:: + + local = LocalEvaluator(tool_calls_present) + results = await evaluate_agent( + agent=agent, + queries=["What's the weather?"], + expected_tool_calls=[[ExpectedToolCall("get_weather")]], + evaluators=local, + ) + """ + expected = item.expected_tool_calls or [] + if not expected: + return CheckResult(passed=True, reason="No expected tool calls specified.", check_name="tool_calls_present") + + actual_names = {name for name, _ in _extract_tool_calls(item)} + expected_names = [e.name for e in expected] + found = [n for n in expected_names if n in actual_names] + missing = [n for n in expected_names if n not in actual_names] + + if missing: + return CheckResult( + passed=False, + reason=f"Missing tool calls: {missing} (called: {sorted(actual_names)})", + check_name="tool_calls_present", + ) + return CheckResult( + passed=True, + reason=f"All expected tools called: {found} (called: {sorted(actual_names)})", + check_name="tool_calls_present", + ) + + +def tool_call_args_match(item: EvalItem) -> CheckResult: + """Check that expected tool calls match on name and arguments. + + For each expected tool call, finds matching calls in the conversation + by name. If ``ExpectedToolCall.arguments`` is provided, checks that + the actual arguments contain all expected key-value pairs (subset + match — extra actual arguments are OK). 
+ + Example:: + + local = LocalEvaluator(tool_call_args_match) + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in NYC?"], + expected_tool_calls=[ + [ExpectedToolCall("get_weather", {"location": "NYC"})], + ], + evaluators=local, + ) + """ + expected = item.expected_tool_calls or [] + if not expected: + return CheckResult(passed=True, reason="No expected tool calls specified.", check_name="tool_call_args_match") + + actual_calls = _extract_tool_calls(item) + matched = 0 + details: list[str] = [] + + for exp in expected: + matching = [(n, a) for n, a in actual_calls if n == exp.name] + if not matching: + details.append(f" {exp.name}: not called") + continue + + if exp.arguments is None: + matched += 1 + details.append(f" {exp.name}: called (args not checked)") + continue + + # Subset match — all expected keys present with expected values + found = False + for _, actual_args in matching: + if actual_args is None: + continue + if all(actual_args.get(k) == v for k, v in exp.arguments.items()): + found = True + break + + if found: + matched += 1 + details.append(f" {exp.name}: args match") + else: + actual_args_list = [a for _, a in matching] + details.append(f" {exp.name}: args mismatch (actual: {actual_args_list})") + + passed = matched == len(expected) + score_str = f"{matched}/{len(expected)}" + detail_str = "\n".join(details) + reason = f"Tool call args match: {score_str}\n{detail_str}" + + return CheckResult(passed=passed, reason=reason, check_name="tool_call_args_match") + + +# endregion + +# region Function evaluator — wrap plain functions as EvalChecks + +# Parameters recognized by the function evaluator wrapper +_KNOWN_PARAMS = frozenset({ + "query", + "response", + "expected_output", + "expected_tool_calls", + "conversation", + "tools", + "context", +}) + + +def _resolve_function_args(fn: Callable[..., Any], item: EvalItem) -> dict[str, Any]: + """Build a kwargs dict for *fn* based on its signature and the EvalItem. 
+ + Supported parameter names: + + ====================== ==================================================== + Name Value from EvalItem + ====================== ==================================================== + query ``item.query`` + response ``item.response`` + expected_output ``item.expected_output`` (empty string if not set) + expected_tool_calls ``item.expected_tool_calls`` (empty list if not set) + conversation ``item.conversation`` (list[Message]) + tools ``item.tools`` (typed ``FunctionTool`` objects) + context ``item.context`` + ====================== ==================================================== + + Parameters with default values are only supplied when their name is + recognised. Unknown required parameters raise ``TypeError``. + """ + sig = inspect.signature(fn) + kwargs: dict[str, Any] = {} + + field_map: dict[str, Any] = { + "query": item.query, + "response": item.response, + "expected_output": item.expected_output or "", + "expected_tool_calls": item.expected_tool_calls or [], + "conversation": item.conversation, + "tools": item.tools, + "context": item.context, + } + + for name, param in sig.parameters.items(): + if name in field_map: + kwargs[name] = field_map[name] + elif param.default is inspect.Parameter.empty: + raise TypeError( + f"Function evaluator '{fn.__name__}' has unknown required parameter " + f"'{name}'. Supported: {sorted(_KNOWN_PARAMS)}" + ) + # else: has a default — leave it to Python + + return kwargs + + +def _coerce_result(value: Any, check_name: str) -> CheckResult: + """Convert a function evaluator return value to a ``CheckResult``. + + Accepted return types: + + * ``bool`` — True/False maps directly to pass/fail. + * ``int | float`` — ≥ 0.5 is pass (score is included in reason). + * ``CheckResult`` — returned as-is. + * ``dict`` with ``score`` or ``passed`` key — converted to CheckResult. 
+ """ + if isinstance(value, CheckResult): + return value + + if isinstance(value, bool): + return CheckResult(passed=value, reason="passed" if value else "failed", check_name=check_name) + + if isinstance(value, (int, float)): + passed = value >= 0.5 + return CheckResult(passed=passed, reason=f"score={value:.3f}", check_name=check_name) + + if isinstance(value, dict): + d = cast(dict[str, Any], value) + if "score" in d: + score = float(d["score"]) + passed = score >= float(d.get("threshold", 0.5)) + reason = str(d.get("reason", f"score={score:.3f}")) + return CheckResult(passed=passed, reason=reason, check_name=check_name) + if "passed" in d: + passed_val = d["passed"] + if not isinstance(passed_val, (bool, int)): + raise TypeError( + f"Function evaluator '{check_name}' returned dict with non-boolean 'passed' value: {passed_val!r}" + ) + return CheckResult( + passed=bool(passed_val), + reason=str(d.get("reason", "passed" if passed_val else "failed")), + check_name=check_name, + ) + + value_type_name = type(value).__name__ # type: ignore[reportUnknownMemberType] + msg = ( + f"Function evaluator '{check_name}' returned unsupported type " + f"{value_type_name}. Expected bool, float, dict, or CheckResult." + ) + raise TypeError(msg) + + +@overload +def evaluator(fn: Callable[..., Any], /) -> EvalCheck: ... + + +@overload +def evaluator(*, name: str | None = None) -> Callable[[Callable[..., Any]], EvalCheck]: ... + + +def evaluator( + fn: Callable[..., Any] | None = None, + *, + name: str | None = None, +) -> EvalCheck | Callable[[Callable[..., Any]], EvalCheck]: + """Wrap a plain function as an ``EvalCheck`` for use with ``LocalEvaluator``. + + Works with both sync and async functions. The function's parameter names + determine what data it receives from the ``EvalItem``. 
Any combination of + the following parameter names is valid: + + * ``query`` — the user query (str) + * ``response`` — the agent response (str) + * ``expected_output`` — expected output for ground-truth comparison (str) + * ``conversation`` — full conversation history (list[Message]) + * ``tools`` — typed tool objects (list[FunctionTool]) + * ``context`` — grounding context (str | None) + + Return ``bool``, ``float`` (≥0.5 = pass), ``dict`` with ``score`` or + ``passed`` key, or ``CheckResult``. + + Can be used as a decorator (with or without arguments) or called directly:: + + # Decorator — no args + @evaluator + def mentions_weather(query: str, response: str) -> bool: + return "weather" in response.lower() + + + # Decorator — with name + @evaluator(name="length_check") + def is_not_too_long(response: str) -> bool: + return len(response) < 2000 + + + # Direct wrapping + check = evaluator(my_scorer, name="my_scorer") + + + # Async function — handled automatically + @evaluator + async def llm_judge(query: str, response: str) -> float: + result = await my_llm_client.score(query, response) + return result.score + + + # Use with LocalEvaluator + local = LocalEvaluator(mentions_weather, is_not_too_long, check, llm_judge) + + Args: + fn: The function to wrap. If omitted, returns a decorator. + name: Display name for the check (defaults to ``fn.__name__``). 
+ """ + + def _wrap(func: Callable[..., Any]) -> EvalCheck: + check_name: str = name or getattr(func, "__name__", None) or "evaluator" + + async def _check(item: EvalItem) -> CheckResult: + kwargs = _resolve_function_args(func, item) + result = func(**kwargs) + if inspect.isawaitable(result): + result = await result + return _coerce_result(result, check_name) + + _check.__name__ = check_name # type: ignore[attr-defined,assignment] + _check.__doc__ = func.__doc__ + return _check + + # Support @evaluator (no parens) and @evaluator(name="x") + if fn is not None: + return _wrap(fn) + return _wrap + + +# endregion + +# region LocalEvaluator + + +async def _run_check(check_fn: EvalCheck, item: EvalItem) -> CheckResult: + """Run a single check, awaiting the result if it is a coroutine.""" + result = check_fn(item) + if inspect.isawaitable(result): + result = await result + return result + + +class LocalEvaluator: + """Evaluation provider that runs checks locally without API calls. + + Implements the ``Evaluator`` protocol. Each check function is applied + to every item. An item passes only if all checks pass. + + Example:: + + from agent_framework import LocalEvaluator, keyword_check, evaluate_agent + + local = LocalEvaluator( + keyword_check("weather"), + tool_called_check("get_weather"), + ) + results = await evaluate_agent(agent=agent, queries=queries, evaluators=local) + + To mix with cloud evaluators:: + + from agent_framework_azure_ai import FoundryEvals + + results = await evaluate_agent( + agent=agent, + queries=queries, + evaluators=[local, FoundryEvals(project_client=client, model_deployment="gpt-4o")], + ) + """ + + def __init__(self, *checks: EvalCheck): + self.name = "Local" + self._checks = checks + + async def evaluate( + self, + items: Sequence[EvalItem], + *, + eval_name: str = "Local Eval", + ) -> EvalResults: + """Run all checks on each item and return aggregated results. + + An item passes only if every check passes for that item. 
Per-check + breakdowns are available in ``per_evaluator``. + + Supports both sync and async check functions (from + :func:`evaluator`). + """ + passed = 0 + failed = 0 + per_check: dict[str, dict[str, int]] = {} + failure_reasons: list[str] = [] + result_items: list[EvalItemResult] = [] + + for item_idx, item in enumerate(items): + check_results = await asyncio.gather(*[_run_check(fn, item) for fn in self._checks]) + item_passed = True + item_scores: list[EvalScoreResult] = [] + for result in check_results: + counts = per_check.setdefault(result.check_name, {"passed": 0, "failed": 0, "errored": 0}) + if result.passed: + counts["passed"] += 1 + else: + counts["failed"] += 1 + item_passed = False + failure_reasons.append(f"{result.check_name}: {result.reason}") + item_scores.append( + EvalScoreResult( + name=result.check_name, + score=1.0 if result.passed else 0.0, + passed=result.passed, + sample={"reason": result.reason} if result.reason else None, + ) + ) + + if item_passed: + passed += 1 + else: + failed += 1 + + result_items.append( + EvalItemResult( + item_id=str(item_idx), + status="pass" if item_passed else "fail", + scores=item_scores, + input_text=item.query, + output_text=item.response, + ) + ) + + return EvalResults( + provider=self.name, + eval_id="local", + run_id=eval_name, + status="completed", + result_counts={"passed": passed, "failed": failed, "errored": 0}, + per_evaluator=per_check, + items=result_items, + error="; ".join(failure_reasons) if failure_reasons else None, + ) + + +# endregion + +# region Public orchestration functions + + +async def evaluate_agent( + *, + agent: Any | None = None, + queries: str | Sequence[str] | None = None, + expected_output: str | Sequence[str] | None = None, + expected_tool_calls: Sequence[ExpectedToolCall] | Sequence[Sequence[ExpectedToolCall]] | None = None, + responses: AgentResponse[Any] | Sequence[AgentResponse[Any]] | None = None, + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | 
Callable[..., Any]], + eval_name: str | None = None, + context: str | None = None, + conversation_split: ConversationSplitter | None = None, + num_repetitions: int = 1, +) -> list[EvalResults]: + """Run an agent against test queries and evaluate the results. + + The simplest path for evaluating an agent during development. For each + query, runs the agent, converts the interaction to eval format, and + submits to the evaluator(s). + + All sequence parameters (``queries``, ``expected_output``, + ``expected_tool_calls``, ``responses``) accept either a single value + or a list for convenience. + + If ``responses`` is provided, skips running the agent and evaluates those + responses directly — but still extracts tool definitions from the agent. + In this mode ``queries`` is required to construct the conversation. + + Args: + agent: An agent-framework agent instance. + queries: Test query or queries to run the agent against. A single + string is wrapped into a one-element list. Required when + ``responses`` is not provided. + expected_output: Ground-truth expected output(s), one per query. A + single string is wrapped into a one-element list. When provided, + must be the same length as ``queries``. Each value is stamped on + the corresponding ``EvalItem.expected_output`` for evaluators + that compare against a reference answer. + expected_tool_calls: Expected tool call(s), one list per query. A + single flat list of ``ExpectedToolCall`` is wrapped into a + one-element nested list. When provided, must be the same length + as ``queries``. + responses: Pre-existing ``AgentResponse``(s) to evaluate without + running the agent. A single response is wrapped into a one-element + list. When provided, ``queries`` must also be provided to + construct the conversation for evaluation. + evaluators: One or more ``Evaluator`` instances. + eval_name: Display name (defaults to agent name). + context: Optional context for groundedness evaluation. 
+ conversation_split: Split strategy applied to all items, overriding + each evaluator's default. See ``ConversationSplitter``. + num_repetitions: Number of times to run each query (default 1). + When > 1, each query is invoked independently N times to measure + consistency. Results contain all N x len(queries) items. + Ignored when ``responses`` is provided (pre-existing responses + are evaluated as-is). + + Returns: + A list of ``EvalResults``, one per evaluator provider. + + Raises: + ValueError: If neither ``queries`` nor ``responses`` is provided. + + Example — run and evaluate:: + + results = await evaluate_agent( + agent=my_agent, + queries="What's the weather?", + evaluators=evals, + ) + + Example — evaluate existing responses:: + + response = await agent.run([Message("user", ["What's the weather?"])]) + results = await evaluate_agent( + agent=agent, + responses=response, + queries="What's the weather?", + evaluators=evals, + ) + + Example — with ground-truth expected answers:: + + results = await evaluate_agent( + agent=my_agent, + queries=["What's 2+2?", "Capital of France?"], + expected_output=["4", "Paris"], + evaluators=evals, + ) + + Example — with expected tool calls:: + + results = await evaluate_agent( + agent=my_agent, + queries="What's the weather in NYC?", + expected_tool_calls=[ExpectedToolCall("get_weather", {"location": "NYC"})], + evaluators=evals, + ) + """ + # Normalize singular values to lists + if isinstance(queries, str): + queries = [queries] + if isinstance(expected_output, str): + expected_output = [expected_output] + if isinstance(responses, AgentResponse): + responses = [responses] + if ( + expected_tool_calls is not None + and len(expected_tool_calls) > 0 + and isinstance(expected_tool_calls[0], ExpectedToolCall) + ): + expected_tool_calls = [list(cast(Sequence[ExpectedToolCall], expected_tool_calls))] + + items: list[EvalItem] = [] + + # Validate num_repetitions + if num_repetitions < 1: + raise ValueError(f"num_repetitions must 
be >= 1, got {num_repetitions}.") + + # Validate expected_output length against queries + if expected_output is not None and queries is not None and len(expected_output) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_output)} expected_output values.") + + # Validate expected_tool_calls length against queries + if expected_tool_calls is not None and queries is not None and len(expected_tool_calls) != len(queries): + raise ValueError(f"Got {len(queries)} queries but {len(expected_tool_calls)} expected_tool_calls lists.") + + if responses is not None: + # Evaluate pre-existing responses (don't run the agent) + resp_list = list(responses) + + if queries is not None: + query_list = list(queries) + if len(query_list) != len(resp_list): + raise ValueError(f"Got {len(query_list)} queries but {len(resp_list)} responses.") + for q, r in zip(query_list, resp_list): + items.append( + AgentEvalConverter.to_eval_item( + query=q, + response=r, + agent=agent, + context=context, + ) + ) + else: + raise ValueError( + "Provide 'queries' alongside 'responses' so the conversation " + "can be constructed for evaluation. For Responses API " + "evaluation by response ID, use evaluate_responses() from " + "the Foundry package." 
+ ) + elif queries is not None and agent is not None: + # Run the agent against test queries, with repetitions + for _rep in range(num_repetitions): + for query in queries: + response = await agent.run([Message("user", [query])]) + items.append( + AgentEvalConverter.to_eval_item( + query=query, + response=response, + agent=agent, + context=context, + ) + ) + else: + raise ValueError("Provide either 'queries' or 'responses' (or both).") + + # Stamp expected output values on items (repeated across all repetitions) + if expected_output is not None: + query_count = len(expected_output) + for i, item in enumerate(items): + item.expected_output = expected_output[i % query_count] + + # Stamp expected tool calls on items (repeated across all repetitions) + if expected_tool_calls is not None: + # After normalization, expected_tool_calls is Sequence[Sequence[ExpectedToolCall]] + tc_list = cast(Sequence[Sequence[ExpectedToolCall]], expected_tool_calls) + query_count = len(tc_list) + for i, item in enumerate(items): + item.expected_tool_calls = list(tc_list[i % query_count]) + + # Stamp split strategy on items so evaluators respect it + if conversation_split is not None: + for item in items: + item.split_strategy = conversation_split + + name = eval_name or f"Eval: {getattr(agent, 'name', None) or getattr(agent, 'id', 'agent') if agent else 'agent'}" + return await _run_evaluators(evaluators, items, eval_name=name) + + +async def evaluate_response( + *, + response: AgentResponse[Any] | Sequence[AgentResponse[Any]], + query: str | Message | Sequence[str | Message] | None = None, + agent: Any | None = None, + evaluators: Evaluator | Sequence[Evaluator], + eval_name: str = "Agent Framework Response Eval", +) -> list[EvalResults]: + """Deprecated: use ``evaluate_agent(responses=...)`` instead. + + Evaluate one or more agent responses that have already been produced. + This is a thin wrapper that delegates to ``evaluate_agent``. 
+ """ + import warnings + + warnings.warn( + "evaluate_response() is deprecated; use evaluate_agent(responses=...) instead.", + DeprecationWarning, + stacklevel=2, + ) + # Normalize queries for evaluate_agent (it expects Sequence[str] | None) + queries_norm: list[str] | None = None + if query is not None: + responses_list = [response] if isinstance(response, AgentResponse) else list(response) + queries_norm = [str(q) for q in _normalize_queries(query, len(responses_list))] + + return await evaluate_agent( + agent=agent, + responses=response, + queries=queries_norm, + evaluators=evaluators, + eval_name=eval_name, + ) + + +async def evaluate_workflow( + *, + workflow: Workflow, + workflow_result: WorkflowRunResult | None = None, + queries: str | Sequence[str] | None = None, + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], + eval_name: str | None = None, + include_overall: bool = True, + include_per_agent: bool = True, + conversation_split: ConversationSplitter | None = None, + num_repetitions: int = 1, +) -> list[EvalResults]: + """Evaluate a multi-agent workflow with per-agent breakdown. + + Evaluates each sub-agent individually and (optionally) the workflow's + overall output. Returns one ``EvalResults`` per evaluator provider, each + with per-agent breakdowns in ``sub_results``. + + **Two modes:** + + - **Post-hoc**: Pass ``workflow_result`` from a previous + ``workflow.run()`` call. + - **Run + evaluate**: Pass ``queries`` and the workflow will be run + against each query, then evaluated. + + Args: + workflow: The workflow instance. + workflow_result: A completed ``WorkflowRunResult``. + queries: Test queries to run through the workflow. + evaluators: One or more ``Evaluator`` instances. + eval_name: Display name for the evaluation. + include_overall: Whether to evaluate the workflow's final output. + include_per_agent: Whether to evaluate each sub-agent individually. 
+ conversation_split: Split strategy applied to all items, overriding + each evaluator's default. See ``ConversationSplitter``. + num_repetitions: Number of times to run each query (default 1). + When > 1, each query is run independently N times. + Ignored when ``workflow_result`` is provided. + + Returns: + Example:: + + from agent_framework_azure_ai import FoundryEvals + + evals = FoundryEvals(project_client=client, model_deployment="gpt-4o") + result = await workflow.run("Plan a trip to Paris") + + eval_results = await evaluate_workflow( + workflow=workflow, + workflow_result=result, + evaluators=evals, + ) + for r in eval_results: + print(f"{r.provider}:") + for name, sub in r.sub_results.items(): + print(f" {name}: {sub.passed}/{sub.total}") + """ + from ._workflows._workflow import WorkflowRunResult as WRR + + # Normalize singular query to list + if isinstance(queries, str): + queries = [queries] + + if workflow_result is None and queries is None: + raise ValueError("Provide either 'workflow_result' or 'queries'.") + + if num_repetitions < 1: + raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions}.") + + wf_name = eval_name or f"Workflow Eval: {workflow.__class__.__name__}" + evaluator_list = _resolve_evaluators(evaluators) + + # Collect per-agent data and overall items + all_agent_data: list[_AgentEvalData] = [] + overall_items: list[EvalItem] = [] + + if queries is not None: + results_list: list[WRR] = [] + for _rep in range(num_repetitions): + for q in queries: + result = await workflow.run(q) + if not isinstance(result, WRR): + raise TypeError(f"Expected WorkflowRunResult from workflow.run(), got {type(result).__name__}.") + results_list.append(result) + all_agent_data.extend(_extract_agent_eval_data(result, workflow)) + if include_overall: + overall_item = _build_overall_item(q, result) + if overall_item: + overall_items.append(overall_item) + else: + assert workflow_result is not None # noqa: S101 # nosec B101 + all_agent_data = 
_extract_agent_eval_data(workflow_result, workflow) + if include_overall: + original_query = _extract_overall_query(workflow_result) + if original_query: + overall_item = _build_overall_item(original_query, workflow_result) + if overall_item: + overall_items.append(overall_item) + + # Group agent data by executor ID + agents_by_id: dict[str, list[_AgentEvalData]] = {} + if include_per_agent and all_agent_data: + for ad in all_agent_data: + agents_by_id.setdefault(ad["executor_id"], []).append(ad) + + # Build per-agent items once (shared across providers). + agent_items_by_id: dict[str, list[EvalItem]] = {} + for executor_id, agent_data_list in agents_by_id.items(): + agent_items_by_id[executor_id] = [ + AgentEvalConverter.to_eval_item( + query=ad["query"], + response=ad["response"], + agent=ad["agent"], + ) + for ad in agent_data_list + ] + + if not agent_items_by_id and not overall_items: + raise ValueError( + "No agent executor data found in the workflow result. Ensure the workflow uses AgentExecutor-based agents." 
+ ) + + # Stamp split strategy on all items so evaluators respect it + if conversation_split is not None: + for items in agent_items_by_id.values(): + for item in items: + item.split_strategy = conversation_split + for item in overall_items: + item.split_strategy = conversation_split + + # Run each provider, building per-agent sub_results for each + all_results: list[EvalResults] = [] + for ev in evaluator_list: + suffix = f" ({ev.name})" if len(evaluator_list) > 1 else "" + sub_results: dict[str, EvalResults] = {} + + # Per-agent evals + for executor_id, items in agent_items_by_id.items(): + agent_result = await ev.evaluate(items, eval_name=f"{wf_name} — {executor_id}{suffix}") + sub_results[executor_id] = agent_result + + # Overall eval + if include_overall and overall_items: + overall_result = await ev.evaluate(overall_items, eval_name=f"{wf_name} — overall{suffix}") + elif sub_results: + # Aggregate from sub-results + total_passed = sum(s.passed for s in sub_results.values()) + total_failed = sum(s.failed for s in sub_results.values()) + total_errored = sum(s.errored for s in sub_results.values()) + all_completed = all(s.status == "completed" for s in sub_results.values()) + overall_result = EvalResults( + provider=ev.name, + eval_id="aggregate", + run_id="aggregate", + status="completed" if all_completed else "partial", + result_counts={ + "passed": total_passed, + "failed": total_failed, + "errored": total_errored, + }, + ) + else: + raise ValueError( + "No agent executor data found in the workflow result. " + "Ensure the workflow uses AgentExecutor-based agents." 
+ ) + + overall_result.sub_results = sub_results + all_results.append(overall_result) + + return all_results + + +# endregion + +# region Internal helpers + + +def _normalize_queries( + query: str | Message | Sequence[str | Message], + expected_count: int, +) -> list[str | Message | Sequence[Message]]: + """Normalize query input to a list matching the expected count.""" + if isinstance(query, (str, Message)): + queries: list[str | Message | Sequence[Message]] = [query] * expected_count if expected_count == 1 else [query] # type: ignore[list-item] + elif isinstance(query, list) and len(query) > 0 and isinstance(query[0], Message): + queries = [query] * expected_count if expected_count == 1 else [query] # type: ignore[list-item] + else: + queries = list(query) # type: ignore[arg-type] + + if len(queries) != expected_count: + raise ValueError(f"Number of queries ({len(queries)}) does not match number of responses ({expected_count}).") + return queries + + +def _build_overall_item( + query: str, + workflow_result: WorkflowRunResult, +) -> EvalItem | None: + """Build an EvalItem for the overall workflow output.""" + outputs = workflow_result.get_outputs() + if not outputs: + return None + + final_output: Any = outputs[-1] + overall_response: AgentResponse[None] + if isinstance(final_output, list) and final_output and isinstance(final_output[0], Message): + msgs: list[Message] = [m for m in cast(list[Any], final_output) if isinstance(m, Message)] # type: ignore[redundant-cast] + response_text = " ".join(str(m.text) for m in msgs if m.role == "assistant") + overall_response = AgentResponse(messages=[Message("assistant", [response_text])]) + elif isinstance(final_output, AgentResponse): + overall_response = cast(AgentResponse[None], final_output) + else: + overall_response = AgentResponse( + messages=[Message("assistant", [str(final_output)])] # type: ignore[reportUnknownArgumentType] + ) + + return AgentEvalConverter.to_eval_item(query=query, response=overall_response) + 
+ +def _resolve_evaluators( + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], +) -> list[Evaluator]: + """Normalize evaluators into a list of concrete ``Evaluator`` instances. + + Bare callables (``EvalCheck`` functions, ``@evaluator`` decorated) are + collected and wrapped in a single ``LocalEvaluator``. + """ + raw_list: list[Any] = ( + [evaluators] if isinstance(evaluators, Evaluator) or callable(evaluators) else list(evaluators) + ) + + resolved: list[Evaluator] = [] + pending_checks: list[Callable[..., Any]] = [] + + for item in raw_list: + if isinstance(item, Evaluator): + if pending_checks: + resolved.append(LocalEvaluator(*pending_checks)) + pending_checks = [] + resolved.append(item) + elif callable(item): + pending_checks.append(item) + else: + raise TypeError(f"Expected an Evaluator or callable, got {type(item).__name__}") + + if pending_checks: + resolved.append(LocalEvaluator(*pending_checks)) + + return resolved + + +async def _run_evaluators( + evaluators: Evaluator | Callable[..., Any] | Sequence[Evaluator | Callable[..., Any]], + items: Sequence[EvalItem], + *, + eval_name: str, +) -> list[EvalResults]: + """Run one or more evaluators and return a result per provider. + + Bare ``EvalCheck`` callables (including ``@evaluator`` decorated + functions and helpers like ``keyword_check``) are auto-wrapped in a + ``LocalEvaluator`` so they can be passed directly in the evaluators list. 
+ """ + evaluator_list = _resolve_evaluators(evaluators) + + async def _run_single_evaluator( + ev: Evaluator, + eval_items: Sequence[EvalItem], + name: str, + suffix: str, + ) -> EvalResults: + return await ev.evaluate(eval_items, eval_name=f"{name}{suffix}") + + results = await asyncio.gather(*[ + _run_single_evaluator(ev, items, eval_name, f" ({ev.name})" if len(evaluator_list) > 1 else "") + for ev in evaluator_list + ]) + return list(results) + + +# endregion diff --git a/python/packages/core/tests/core/test_local_eval.py b/python/packages/core/tests/core/test_local_eval.py new file mode 100644 index 0000000000..812b0a1c84 --- /dev/null +++ b/python/packages/core/tests/core/test_local_eval.py @@ -0,0 +1,774 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Tests for evaluator checks and LocalEvaluator.""" + +from __future__ import annotations + +import inspect + +import pytest + +from agent_framework._evaluation import ( + CheckResult, + EvalItem, + ExpectedToolCall, + LocalEvaluator, + evaluator, + keyword_check, + tool_call_args_match, + tool_calls_present, +) +from agent_framework._types import Content, Message + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_item( + query: str = "What's the weather in Paris?", + response: str = "It's sunny and 75°F", + expected_output: str | None = None, + conversation: list | None = None, + tools: list | None = None, + context: str | None = None, +) -> EvalItem: + if conversation is None: + conversation = [Message("user", [query]), Message("assistant", [response])] + return EvalItem( + conversation=conversation, + expected_output=expected_output, + tools=tools, + context=context, + ) + + +# --------------------------------------------------------------------------- +# Tier 1: (query, response) -> result +# 
--------------------------------------------------------------------------- + + +class TestTier1SimpleChecks: + @pytest.mark.asyncio + async def test_bool_return_true(self): + @evaluator + def has_temperature(query: str, response: str) -> bool: + return "°F" in response + + result = await has_temperature(_make_item()) + assert result.passed is True + assert result.check_name == "has_temperature" + + @pytest.mark.asyncio + async def test_bool_return_false(self): + @evaluator + def has_celsius(query: str, response: str) -> bool: + return "°C" in response + + result = await has_celsius(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_float_return_passing(self): + @evaluator + def length_score(response: str) -> float: + return min(len(response) / 10, 1.0) + + result = await length_score(_make_item()) + assert result.passed is True + assert "score=" in result.reason + + @pytest.mark.asyncio + async def test_float_return_failing(self): + @evaluator + def always_low(response: str) -> float: + return 0.1 + + result = await always_low(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_response_only(self): + """Function with only 'response' param should work.""" + + @evaluator + def is_short(response: str) -> bool: + return len(response) < 1000 + + result = await is_short(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_query_only(self): + """Function with only 'query' param should work.""" + + @evaluator + def is_question(query: str) -> bool: + return "?" 
in query + + result = await is_question(_make_item()) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Tier 2: (query, response, expected_output) -> result +# --------------------------------------------------------------------------- + + +class TestTier2GroundTruth: + @pytest.mark.asyncio + async def test_exact_match(self): + @evaluator + def exact_match(response: str, expected_output: str) -> bool: + return response.strip() == expected_output.strip() + + item = _make_item(response="42", expected_output="42") + assert (await exact_match(item)).passed is True + + item2 = _make_item(response="43", expected_output="42") + assert (await exact_match(item2)).passed is False + + @pytest.mark.asyncio + async def test_expected_output_defaults_to_empty(self): + """When expected_output is None on the item, it should be passed as ''.""" + + @evaluator + def check_expected(expected_output: str) -> bool: + return expected_output == "" + + result = await check_expected(_make_item(expected_output=None)) + assert result.passed is True + + @pytest.mark.asyncio + async def test_similarity_score(self): + @evaluator + def word_overlap(response: str, expected_output: str) -> float: + r_words = set(response.lower().split()) + e_words = set(expected_output.lower().split()) + if not e_words: + return 1.0 + return len(r_words & e_words) / len(e_words) + + item = _make_item(response="sunny warm day", expected_output="warm sunny afternoon") + result = await word_overlap(item) + assert result.passed is True # 2/3 overlap ≥ 0.5 + + +# --------------------------------------------------------------------------- +# Tier 3: full context (conversation, tools, context) +# --------------------------------------------------------------------------- + + +class TestTier3FullContext: + @pytest.mark.asyncio + async def test_conversation_access(self): + @evaluator + def multi_turn(query: str, response: str, *, conversation: list) -> bool: 
+ return len(conversation) >= 2 + + item = _make_item(conversation=[Message("user", []), Message("assistant", [])]) + assert (await multi_turn(item)).passed is True + + item2 = _make_item(conversation=[Message("user", [])]) + assert (await multi_turn(item2)).passed is False + + @pytest.mark.asyncio + async def test_tools_access(self): + @evaluator + def has_tools(tools: list) -> bool: + return len(tools) > 0 + + mock_tool = type( + "MockTool", + (), + {"name": "get_weather", "description": "Get weather", "parameters": lambda self: {}}, + )() + item = _make_item(tools=[mock_tool]) + assert (await has_tools(item)).passed is True + + @pytest.mark.asyncio + async def test_context_access(self): + @evaluator + def grounded(response: str, context: str) -> bool: + if not context: + return True + return any(word in response.lower() for word in context.lower().split()) + + item = _make_item(response="It's sunny", context="sunny warm") + assert (await grounded(item)).passed is True + + @pytest.mark.asyncio + async def test_all_params(self): + @evaluator + def full_check( + query: str, + response: str, + expected_output: str, + conversation: list, + tools: list, + context: str, + ) -> bool: + return all([query, response, expected_output is not None, isinstance(conversation, list)]) + + item = _make_item(expected_output="foo", context="bar") + assert (await full_check(item)).passed is True + + +# --------------------------------------------------------------------------- +# Return type coercion +# --------------------------------------------------------------------------- + + +class TestReturnTypeCoercion: + @pytest.mark.asyncio + async def test_dict_with_score(self): + @evaluator + def scored(response: str) -> dict: + return {"score": 0.9, "reason": "good answer"} + + result = await scored(_make_item()) + assert result.passed is True + assert result.reason == "good answer" + + @pytest.mark.asyncio + async def test_dict_with_score_below_threshold(self): + @evaluator + def 
low_scored(response: str) -> dict: + return {"score": 0.3} + + result = await low_scored(_make_item()) + assert result.passed is False + + @pytest.mark.asyncio + async def test_dict_with_custom_threshold(self): + @evaluator + def custom_threshold(response: str) -> dict: + return {"score": 0.3, "threshold": 0.2} + + result = await custom_threshold(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_dict_with_passed(self): + @evaluator + def explicit_pass(response: str) -> dict: + return {"passed": True, "reason": "all good"} + + result = await explicit_pass(_make_item()) + assert result.passed is True + assert result.reason == "all good" + + @pytest.mark.asyncio + async def test_check_result_passthrough(self): + @evaluator + def returns_check_result(response: str) -> CheckResult: + return CheckResult(True, "direct result", "custom") + + result = await returns_check_result(_make_item()) + assert result.passed is True + assert result.reason == "direct result" + assert result.check_name == "custom" + + @pytest.mark.asyncio + async def test_unsupported_return_type(self): + @evaluator + def bad_return(response: str) -> str: + return "oops" + + with pytest.raises(TypeError, match="unsupported type"): + await bad_return(_make_item()) + + @pytest.mark.asyncio + async def test_int_return(self): + @evaluator + def int_score(response: str) -> int: + return 1 + + result = await int_score(_make_item()) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Decorator variants +# --------------------------------------------------------------------------- + + +class TestDecoratorVariants: + @pytest.mark.asyncio + async def test_decorator_no_parens(self): + @evaluator + def my_check(response: str) -> bool: + return True + + assert (await my_check(_make_item())).passed is True + + @pytest.mark.asyncio + async def test_decorator_with_name(self): + @evaluator(name="custom_name") + def 
my_check(response: str) -> bool: + return True + + assert my_check.__name__ == "custom_name" + result = await my_check(_make_item()) + assert result.check_name == "custom_name" + + @pytest.mark.asyncio + async def test_direct_call(self): + def raw_fn(query: str, response: str) -> bool: + return len(response) > 0 + + check = evaluator(raw_fn, name="direct") + result = await check(_make_item()) + assert result.passed is True + assert result.check_name == "direct" + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +class TestErrorHandling: + @pytest.mark.asyncio + async def test_unknown_required_param_raises(self): + @evaluator + def bad_params(query: str, unknown_param: str) -> bool: + return True + + with pytest.raises(TypeError, match="unknown required parameter"): + await bad_params(_make_item()) + + @pytest.mark.asyncio + async def test_unknown_optional_param_ok(self): + @evaluator + def optional_unknown(query: str, foo: str = "default") -> bool: + return foo == "default" + + result = await optional_unknown(_make_item()) + assert result.passed is True + + @pytest.mark.asyncio + async def test_async_function_works_with_evaluator(self): + """Using an async function with @evaluator should work.""" + + @evaluator + async def async_fn(response: str) -> bool: + return True + + result = async_fn(_make_item()) + # Should return an awaitable + assert inspect.isawaitable(result) + check_result = await result + assert check_result.passed is True + + +# --------------------------------------------------------------------------- +# Integration with LocalEvaluator +# --------------------------------------------------------------------------- + + +class TestLocalEvaluatorIntegration: + @pytest.mark.asyncio + async def test_mixed_checks(self): + """Function evaluators mix with built-in checks in LocalEvaluator.""" + + @evaluator + def 
length_ok(response: str) -> bool: + return len(response) > 5 + + local = LocalEvaluator( + keyword_check("sunny"), + length_ok, + ) + items = [_make_item()] + results = await local.evaluate(items, eval_name="mixed test") + + assert results.status == "completed" + assert results.result_counts["passed"] == 1 + assert results.result_counts["failed"] == 0 + + @pytest.mark.asyncio + async def test_evaluator_failure_counted(self): + @evaluator + def always_fail(response: str) -> bool: + return False + + local = LocalEvaluator(always_fail) + results = await local.evaluate([_make_item()]) + + assert results.result_counts["failed"] == 1 + + @pytest.mark.asyncio + async def test_multiple_evaluators(self): + @evaluator + def check_a(response: str) -> float: + return 0.9 + + @evaluator + def check_b(query: str, response: str, expected_output: str) -> bool: + return True + + @evaluator(name="check_c") + def check_c(response: str, conversation: list) -> dict: + return {"score": 0.8, "reason": "looks good"} + + local = LocalEvaluator(check_a, check_b, check_c) + results = await local.evaluate([_make_item(expected_output="test")]) + + assert results.result_counts["passed"] == 1 + assert "check_a" in results.per_evaluator + assert "check_b" in results.per_evaluator + assert "check_c" in results.per_evaluator + + +# --------------------------------------------------------------------------- +# Async evaluator (via @evaluator which handles async automatically) +# --------------------------------------------------------------------------- + + +class TestAsyncFunctionEvaluator: + @pytest.mark.asyncio + async def test_async_evaluator_in_local(self): + @evaluator + async def async_check(query: str, response: str) -> bool: + return len(response) > 0 + + local = LocalEvaluator(async_check) + results = await local.evaluate([_make_item()]) + assert results.result_counts["passed"] == 1 + + @pytest.mark.asyncio + async def test_async_with_name(self): + @evaluator(name="named_async") + async 
def my_async(response: str) -> float: + return 0.75 + + result = await my_async(_make_item()) + assert result.passed is True + assert result.check_name == "named_async" + + +# --------------------------------------------------------------------------- +# Auto-wrapping bare checks in evaluate_agent +# --------------------------------------------------------------------------- + + +class TestAutoWrapEvalChecks: + @pytest.mark.asyncio + async def test_bare_check_in_evaluators_list(self): + """Bare EvalCheck callables are auto-wrapped in LocalEvaluator.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def is_long(response: str) -> bool: + return len(response.split()) > 2 + + items = [_make_item(response="It is sunny and warm today")] + results = await _run_evaluators(is_long, items, eval_name="test") + assert len(results) == 1 + assert results[0].result_counts["passed"] == 1 + + @pytest.mark.asyncio + async def test_mixed_evaluators_and_checks(self): + """Mix of Evaluator instances and bare checks works.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def has_words(response: str) -> bool: + return len(response.split()) > 0 + + local = LocalEvaluator(keyword_check("sunny")) + + items = [_make_item(response="It is sunny")] + results = await _run_evaluators([local, has_words], items, eval_name="test") + assert len(results) == 2 + assert all(r.result_counts["passed"] == 1 for r in results) + + @pytest.mark.asyncio + async def test_adjacent_checks_grouped(self): + """Adjacent bare checks are grouped into a single LocalEvaluator.""" + from agent_framework._evaluation import _run_evaluators + + @evaluator + def check_a(response: str) -> bool: + return True + + @evaluator + def check_b(response: str) -> bool: + return True + + items = [_make_item()] + results = await _run_evaluators([check_a, check_b], items, eval_name="test") + # Two adjacent checks → one LocalEvaluator → one result + assert len(results) == 1 + assert 
results[0].result_counts["passed"] == 1 + + +# --------------------------------------------------------------------------- +# Expected Tool Calls +# --------------------------------------------------------------------------- + + +def _make_tool_call_item( + calls: list[tuple[str, dict | None]], + expected: list[ExpectedToolCall] | None = None, +) -> EvalItem: + """Build an EvalItem with tool calls in the conversation.""" + msgs: list[Message] = [Message("user", ["Do something"])] + for name, args in calls: + msgs.append(Message("assistant", [Content.from_function_call("call_" + name, name, arguments=args)])) + msgs.append(Message("assistant", ["Done"])) + return EvalItem(conversation=msgs, expected_tool_calls=expected) + + +class TestExpectedToolCallType: + def test_name_only(self): + tc = ExpectedToolCall("get_weather") + assert tc.name == "get_weather" + assert tc.arguments is None + + def test_name_and_args(self): + tc = ExpectedToolCall("get_weather", {"location": "NYC"}) + assert tc.name == "get_weather" + assert tc.arguments == {"location": "NYC"} + + +class TestToolCallsPresent: + def test_all_present(self): + item = _make_tool_call_item( + calls=[("get_weather", None), ("get_news", None)], + expected=[ExpectedToolCall("get_weather"), ExpectedToolCall("get_news")], + ) + result = tool_calls_present(item) + assert result.passed is True + assert result.check_name == "tool_calls_present" + + def test_missing_tool(self): + item = _make_tool_call_item( + calls=[("get_weather", None)], + expected=[ExpectedToolCall("get_weather"), ExpectedToolCall("get_news")], + ) + result = tool_calls_present(item) + assert result.passed is False + assert "get_news" in result.reason + + def test_extras_ok(self): + item = _make_tool_call_item( + calls=[("get_weather", None), ("get_news", None), ("get_stock", None)], + expected=[ExpectedToolCall("get_weather")], + ) + result = tool_calls_present(item) + assert result.passed is True + + def test_no_expected(self): + item = 
_make_tool_call_item(calls=[("get_weather", None)]) + result = tool_calls_present(item) + assert result.passed is True + assert "No expected" in result.reason + + +class TestToolCallArgsMatch: + def test_name_only_match(self): + item = _make_tool_call_item( + calls=[("get_weather", {"location": "NYC"})], + expected=[ExpectedToolCall("get_weather")], + ) + result = tool_call_args_match(item) + assert result.passed is True + + def test_args_exact_match(self): + item = _make_tool_call_item( + calls=[("get_weather", {"location": "NYC", "units": "fahrenheit"})], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + # Subset match — extra "units" key is OK + result = tool_call_args_match(item) + assert result.passed is True + + def test_args_mismatch(self): + item = _make_tool_call_item( + calls=[("get_weather", {"location": "LA"})], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + result = tool_call_args_match(item) + assert result.passed is False + assert "args mismatch" in result.reason + + def test_tool_not_called(self): + item = _make_tool_call_item( + calls=[("get_news", None)], + expected=[ExpectedToolCall("get_weather", {"location": "NYC"})], + ) + result = tool_call_args_match(item) + assert result.passed is False + assert "not called" in result.reason + + def test_multiple_expected(self): + item = _make_tool_call_item( + calls=[ + ("get_weather", {"location": "NYC"}), + ("book_flight", {"destination": "LA", "date": "tomorrow"}), + ], + expected=[ + ExpectedToolCall("get_weather", {"location": "NYC"}), + ExpectedToolCall("book_flight", {"destination": "LA"}), + ], + ) + result = tool_call_args_match(item) + assert result.passed is True + + def test_no_expected(self): + item = _make_tool_call_item(calls=[("get_weather", None)]) + result = tool_call_args_match(item) + assert result.passed is True + + +class TestExpectedToolCallsFieldInjection: + """Test that @evaluator can receive expected_tool_calls via parameter 
injection.""" + + @pytest.mark.asyncio + async def test_injection(self): + @evaluator + def check_tools(expected_tool_calls: list) -> bool: + return len(expected_tool_calls) == 2 + + item = _make_tool_call_item( + calls=[], + expected=[ExpectedToolCall("a"), ExpectedToolCall("b")], + ) + result = await check_tools(item) + assert result.passed is True + + @pytest.mark.asyncio + async def test_injection_empty_default(self): + @evaluator + def check_tools(expected_tool_calls: list) -> bool: + return len(expected_tool_calls) == 0 + + item = _make_tool_call_item(calls=[]) + result = await check_tools(item) + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Per-item results (auditing) +# --------------------------------------------------------------------------- + + +class TestPerItemResults: + """LocalEvaluator should produce per-item EvalItemResult with query/response.""" + + @pytest.mark.asyncio + async def test_items_populated_with_query_and_response(self): + @evaluator + def is_sunny(response: str) -> bool: + return "sunny" in response.lower() + + item = _make_item(query="Weather?", response="It's sunny!") + local = LocalEvaluator(is_sunny) + results = await local.evaluate([item]) + + assert len(results.items) == 1 + ri = results.items[0] + assert ri.item_id == "0" + assert ri.status == "pass" + assert ri.input_text == "Weather?" + assert ri.output_text == "It's sunny!" 
+ assert len(ri.scores) == 1 + assert ri.scores[0].name == "is_sunny" + assert ri.scores[0].passed is True + + @pytest.mark.asyncio + async def test_items_populated_on_failure(self): + @evaluator + def always_fail(response: str) -> bool: + return False + + item = _make_item(query="Hello", response="World") + local = LocalEvaluator(always_fail) + results = await local.evaluate([item]) + + assert len(results.items) == 1 + ri = results.items[0] + assert ri.status == "fail" + assert ri.input_text == "Hello" + assert ri.output_text == "World" + assert ri.scores[0].passed is False + assert ri.scores[0].score == 0.0 + + @pytest.mark.asyncio + async def test_multiple_items_indexed(self): + @evaluator + def pass_all(response: str) -> bool: + return True + + items = [ + _make_item(query="Q1", response="R1"), + _make_item(query="Q2", response="R2"), + ] + local = LocalEvaluator(pass_all) + results = await local.evaluate(items) + + assert len(results.items) == 2 + assert results.items[0].item_id == "0" + assert results.items[0].input_text == "Q1" + assert results.items[0].output_text == "R1" + assert results.items[1].item_id == "1" + assert results.items[1].input_text == "Q2" + assert results.items[1].output_text == "R2" + + +# --------------------------------------------------------------------------- +# num_repetitions validation +# --------------------------------------------------------------------------- + + +class TestNumRepetitions: + """Tests for the num_repetitions parameter on evaluate_agent.""" + + @pytest.mark.asyncio + async def test_num_repetitions_validation_rejects_zero(self): + from agent_framework._evaluation import evaluate_agent + + with pytest.raises(ValueError, match="num_repetitions must be >= 1"): + await evaluate_agent( + queries=["Hello"], + evaluators=LocalEvaluator(keyword_check("hello")), + num_repetitions=0, + ) + + @pytest.mark.asyncio + async def test_num_repetitions_validation_rejects_negative(self): + from agent_framework._evaluation import 
evaluate_agent

        with pytest.raises(ValueError, match="num_repetitions must be >= 1"):
            await evaluate_agent(
                queries=["Hello"],
                evaluators=LocalEvaluator(keyword_check("hello")),
                num_repetitions=-1,
            )

    @pytest.mark.asyncio
    async def test_num_repetitions_multiplies_items(self):
        """num_repetitions=2 produces 2× the eval items."""
        from unittest.mock import AsyncMock, MagicMock

        from agent_framework._evaluation import evaluate_agent
        from agent_framework._types import AgentResponse, Message

        mock_agent = MagicMock()
        mock_agent.name = "test"
        mock_agent.default_options = {}
        mock_agent.run = AsyncMock(
            return_value=AgentResponse(messages=[Message("assistant", ["reply"])])
        )

        results = await evaluate_agent(
            agent=mock_agent,
            queries=["Q1", "Q2"],
            evaluators=LocalEvaluator(keyword_check("reply")),
            num_repetitions=2,
        )
        # 2 queries × 2 reps = 4 items
        assert results[0].total == 4
        assert mock_agent.run.call_count == 4
diff --git a/python/packages/core/tests/core/test_observability.py b/python/packages/core/tests/core/test_observability.py
index 5152712b8c..456367774d 100644
--- a/python/packages/core/tests/core/test_observability.py
+++ b/python/packages/core/tests/core/test_observability.py
@@ -3059,6 +3059,14 @@ def test_configure_otel_providers_with_env_file_path(monkeypatch, tmp_path):
     assert observability.OBSERVABILITY_SETTINGS.enable_sensitive_data is True
 
 
+# Skip only when the optional OTLP exporter dependency is actually missing,
+# rather than unconditionally via skipif(True, ...), which would never run
+# the test even in environments where the dependency is installed.
+import importlib.util
+
+@pytest.mark.skipif(
+    importlib.util.find_spec("opentelemetry.exporter.otlp") is None,
+    reason="OTLP exporter optional dependency not installed",
+)
 def test_configure_otel_providers_with_env_file_and_vs_code_port(monkeypatch, tmp_path):
     """Test configure_otel_providers with env_file_path and vs_code_extension_port."""
     import importlib
diff --git a/python/packages/core/tests/workflow/test_full_conversation.py b/python/packages/core/tests/workflow/test_full_conversation.py
index b6b5260d83..d4f9466254 100644
--- a/python/packages/core/tests/workflow/test_full_conversation.py
+++
b/python/packages/core/tests/workflow/test_full_conversation.py @@ -460,10 +460,10 @@ async def test_run_request_with_full_history_clears_service_session_id() -> None assert spy_agent._captured_service_session_id is None # pyright: ignore[reportPrivateUsage] -async def test_from_response_preserves_service_session_id() -> None: - """from_response hands off a prior agent's full conversation to the next executor. - The receiving executor's service_session_id is preserved so the API can continue - the conversation using previous_response_id.""" +async def test_from_response_clears_service_session_id_on_new_run() -> None: + """service_session_id set before a workflow run is cleared by the executor reset + that happens at the start of each run, preventing stale previous_response_id + from leaking between runs.""" tool_agent = _ToolHistoryAgent(id="tool_agent2", name="ToolAgent", summary_text="Done.") tool_exec = AgentExecutor(tool_agent, id="tool_agent2") @@ -477,4 +477,6 @@ async def test_from_response_preserves_service_session_id() -> None: result = await wf.run("start") assert result.get_outputs() is not None - assert spy_agent._captured_service_session_id == "resp_PREVIOUS_RUN" # pyright: ignore[reportPrivateUsage] + # service_session_id is cleared at the start of run() to prevent stale + # previous_response_id from causing "No tool output found" errors on re-runs. + assert spy_agent._captured_service_session_id is None # pyright: ignore[reportPrivateUsage] diff --git a/python/samples/01-get-started/02_add_tools.py b/python/samples/01-get-started/02_add_tools.py index 06108bb388..bffdfe518f 100644 --- a/python/samples/01-get-started/02_add_tools.py +++ b/python/samples/01-get-started/02_add_tools.py @@ -36,6 +36,8 @@ def get_weather( """Get the weather for a given location.""" conditions = ["sunny", "cloudy", "rainy", "stormy"] return f"The weather in {location} is {conditions[randint(0, 3)]} with a high of {randint(10, 30)}°C." 
+ + # diff --git a/python/samples/01-get-started/04_memory.py b/python/samples/01-get-started/04_memory.py index c554be7337..73045458dd 100644 --- a/python/samples/01-get-started/04_memory.py +++ b/python/samples/01-get-started/04_memory.py @@ -68,6 +68,8 @@ async def after_run( text = msg.text if hasattr(msg, "text") else "" if isinstance(text, str) and "my name is" in text.lower(): state["user_name"] = text.lower().split("my name is")[-1].strip().split()[0].capitalize() + + # diff --git a/python/samples/01-get-started/05_first_workflow.py b/python/samples/01-get-started/05_first_workflow.py index 89b4f608b2..74720e529f 100644 --- a/python/samples/01-get-started/05_first_workflow.py +++ b/python/samples/01-get-started/05_first_workflow.py @@ -45,6 +45,8 @@ def create_workflow(): """Build the workflow: UpperCase → reverse_text.""" upper = UpperCase(id="upper_case") return WorkflowBuilder(start_executor=upper).add_edge(upper, reverse_text).build() + + # diff --git a/python/samples/02-agents/evaluation/evaluate_agent.py b/python/samples/02-agents/evaluation/evaluate_agent.py new file mode 100644 index 0000000000..ac37599c18 --- /dev/null +++ b/python/samples/02-agents/evaluation/evaluate_agent.py @@ -0,0 +1,68 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent with local checks — no API keys needed. + +Demonstrates the simplest evaluation workflow: +1. Define checks using the @evaluator decorator +2. Run evaluate_agent() which calls agent.run() under the covers +3. 
Assert results in CI or inspect interactively + +Usage: + uv run python samples/02-agents/evaluation/evaluate_agent.py +""" + +import asyncio + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + evaluator, + keyword_check, +) + + +# A custom check — parameter names determine what data you receive +@evaluator +def is_helpful(response: str) -> bool: + """Check the response isn't empty or a refusal.""" + refusals = ["i can't", "i'm not able", "i don't know"] + return len(response) > 10 and not any(r in response.lower() for r in refusals) + + +async def main() -> None: + agent = Agent( + model="gpt-4o-mini", + instructions="You are a helpful weather assistant.", + ) + + # Combine built-in and custom checks + local = LocalEvaluator( + keyword_check("weather"), # response must mention "weather" + is_helpful, # custom check + ) + + # evaluate_agent() calls agent.run() for each query, then evaluates + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "Will it rain in London tomorrow?", + "What should I wear for 30°C weather?", + ], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed") + for item in r.items: + print(f" [{item.status}] Q: {item.input_text[:50]} A: {item.output_text[:50]}...") + for score in item.scores: + print(f" {score.name}: {'✓' if score.passed else '✗'}") + + # Use in CI: will raise AssertionError if any check fails + # results[0].assert_passed() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/02-agents/evaluation/evaluate_with_expected.py b/python/samples/02-agents/evaluation/evaluate_with_expected.py new file mode 100644 index 0000000000..78766607fd --- /dev/null +++ b/python/samples/02-agents/evaluation/evaluate_with_expected.py @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent with expected outputs and tool call checks. 
+ +Demonstrates ground-truth comparison and tool usage evaluation: +1. Provide expected outputs alongside queries +2. Use built-in tool_calls_present for tool verification +3. Combine multiple evaluation criteria + +Usage: + uv run python samples/02-agents/evaluation/evaluate_with_expected.py +""" + +import asyncio + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + evaluator, + tool_calls_present, +) + + +@evaluator +def response_matches_expected(response: str, expected_output: str) -> float: + """Score based on word overlap with expected output.""" + if not expected_output: + return 1.0 + response_words = set(response.lower().split()) + expected_words = set(expected_output.lower().split()) + return len(response_words & expected_words) / max(len(expected_words), 1) + + +async def main() -> None: + agent = Agent( + model="gpt-4o-mini", + instructions="You are a math tutor. Answer concisely.", + ) + + local = LocalEvaluator( + response_matches_expected, + tool_calls_present, # verifies expected tools were called + ) + + results = await evaluate_agent( + agent=agent, + queries=["What is 2 + 2?", "What is the square root of 144?"], + expected_output=["4", "12"], + expected_tool_calls=[ + [], # no tools expected for simple math + [], + ], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed") + for item in r.items: + print(f" [{item.status}] {item.input_text} → {item.output_text[:80]}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/02-agents/middleware/usage_tracking_middleware.py b/python/samples/02-agents/middleware/usage_tracking_middleware.py index 877d2a8a82..bffa01f7d8 100644 --- a/python/samples/02-agents/middleware/usage_tracking_middleware.py +++ b/python/samples/02-agents/middleware/usage_tracking_middleware.py @@ -50,8 +50,7 @@ def _reset_usage_counters() -> None: STREAMING_CALL_COUNT = 0 -def _create_agent( -) -> Agent: +def _create_agent() -> Agent: 
"""Create the shared agent used by both demonstrations.""" return Agent( client=OpenAIResponsesClient(), diff --git a/python/samples/03-workflows/evaluation/evaluate_workflow.py b/python/samples/03-workflows/evaluation/evaluate_workflow.py new file mode 100644 index 0000000000..31fbdaa3a5 --- /dev/null +++ b/python/samples/03-workflows/evaluation/evaluate_workflow.py @@ -0,0 +1,55 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a multi-agent workflow with per-agent breakdown. + +Demonstrates workflow evaluation: +1. Build a simple two-agent workflow +2. Run evaluate_workflow() which runs the workflow and evaluates each agent +3. Inspect per-agent results in sub_results + +Usage: + uv run python samples/03-workflows/evaluation/evaluate_workflow.py +""" + +import asyncio + +from agent_framework import ( + Agent, + LocalEvaluator, + WorkflowBuilder, + evaluate_workflow, + evaluator, + keyword_check, +) + + +@evaluator +def is_nonempty(response: str) -> bool: + """Check the agent produced a non-trivial response.""" + return len(response.strip()) > 5 + + +async def main() -> None: + # Build a simple planner → executor workflow + planner = Agent(model="gpt-4o-mini", instructions="You plan trips. Output a bullet-point plan.") + executor_agent = Agent(model="gpt-4o-mini", instructions="You execute travel plans. 
Book the items listed.") + + workflow = WorkflowBuilder(start_executor=planner).add_edge(planner, executor_agent).build() + + # Evaluate with per-agent breakdown + local = LocalEvaluator(is_nonempty, keyword_check("plan", "trip")) + + results = await evaluate_workflow( + workflow=workflow, + queries=["Plan a weekend trip to Paris"], + evaluators=local, + ) + + for r in results: + print(f"{r.provider}: {r.passed}/{r.total} passed (overall)") + for agent_name, sub in r.sub_results.items(): + print(f" {agent_name}: {sub.passed}/{sub.total}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example new file mode 100644 index 0000000000..f1bb1f27bd --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/.env.example @@ -0,0 +1,3 @@ +AZURE_AI_PROJECT_ENDPOINT="" +AZURE_AI_MODEL_DEPLOYMENT_NAME="" + diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/README.md b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md new file mode 100644 index 0000000000..56fa48c8e6 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/README.md @@ -0,0 +1,46 @@ +# Foundry Evals Integration Samples + +These samples demonstrate evaluating agent-framework agents using Azure AI Foundry's built-in evaluators. 
+ +## Available Evaluators + +| Category | Evaluators | +|----------|-----------| +| **Agent behavior** | `intent_resolution`, `task_adherence`, `task_completion`, `task_navigation_efficiency` | +| **Tool usage** | `tool_call_accuracy`, `tool_selection`, `tool_input_accuracy`, `tool_output_utilization`, `tool_call_success` | +| **Quality** | `coherence`, `fluency`, `relevance`, `groundedness`, `response_completeness`, `similarity` | +| **Safety** | `violence`, `sexual`, `self_harm`, `hate_unfairness` | + +## Samples + +### `evaluate_agent_sample.py` — Dataset Evaluation (Path 3) + +The dev inner loop. Two patterns from simplest to most control: + +1. **`evaluate_agent()`** — One call: runs agent → converts → evaluates +2. **`evaluate_dataset()`** — Run agent yourself, convert with `AgentEvalConverter`, inspect/modify, then evaluate + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py +``` + +### `evaluate_traces_sample.py` — Trace & Response Evaluation (Path 1) + +Evaluate what already happened — zero changes to agent code: + +1. **`evaluate_responses()`** — Evaluate Responses API responses by ID +2. **`evaluate_traces()`** — Evaluate from OTel traces in App Insights + +```bash +uv run samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py +``` + +## Setup + +Create a `.env` file with configuration as in the `.env.example` file in this folder. + +## Which sample should I start with? 
+ +- **"I want to test my agent during development"** → `evaluate_agent_sample.py`, Pattern 1 +- **"I want to evaluate past agent runs"** → `evaluate_traces_sample.py` +- **"I want to inspect/modify eval data before submitting"** → `evaluate_agent_sample.py`, Pattern 2 diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py new file mode 100644 index 0000000000..776147b7ca --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_agent_sample.py @@ -0,0 +1,198 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate an agent using Azure AI Foundry's built-in evaluators. + +This sample demonstrates three patterns: +1. evaluate_agent(responses=...) — Evaluate a response you already have. +2. evaluate_agent(queries=...) — Run the agent against test queries and evaluate in one call. +3. FoundryEvals.evaluate() — Full control with direct evaluator access. 
+ +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Agent, AgentEvalConverter, ConversationSplit, evaluate_agent +from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework_azure_ai import FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Define a simple tool for the agent +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + # 2. Create an agent with tools + agent = Agent( + client=AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ), + name="travel-assistant", + instructions=( + "You are a helpful travel assistant. Use your tools to answer questions about weather and flights." + ), + tools=[get_weather, get_flight_price], + ) + + # 3. Create the evaluator — provider config goes here, once + evals = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # ========================================================================= + # Pattern 1: evaluate_agent(responses=...) 
— evaluate a response you already have + # ========================================================================= + print("=" * 60) + print("Pattern 1: evaluate_agent(responses=...) — evaluate existing response") + print("=" * 60) + + query = "How much does a flight from Seattle to Paris cost?" + response = await agent.run(query) + print(f"Agent said: {response.text[:100]}...") + + # Pass agent= so tool definitions are extracted, queries= for the eval item context + results = await evaluate_agent( + agent=agent, + responses=response, + queries=[query], + evaluators=FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ), + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed, {r.errored} errored") + + # ========================================================================= + # Pattern 2a: evaluate_agent() — batch test queries + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2a: evaluate_agent()") + print("=" * 60) + + # Calls agent.run() under the covers for each query, then evaluates + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "How much does a flight from Seattle to Paris cost?", + "What should I pack for London?", + ], + evaluators=evals, # uses smart defaults (auto-adds tool_call_accuracy) + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed, {r.errored} errored") + + # ========================================================================= + # Pattern 2b: evaluate_agent() — with conversation 
split override + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2b: evaluate_agent() with conversation_split") + print("=" * 60) + + # conversation_split forces all evaluators to use the same split strategy. + # FULL evaluates the entire conversation trajectory against the original query. + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather like in Seattle?", + "What should I pack for London?", + ], + evaluators=evals, + conversation_split=ConversationSplit.FULL, # overrides evaluator defaults + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed, {r.errored} errored") + + # ========================================================================= + # Pattern 3: FoundryEvals.evaluate() — manual control + # ========================================================================= + print() + print("=" * 60) + print("Pattern 3: FoundryEvals.evaluate() — manual control") + print("=" * 60) + + queries = [ + "What's the weather in Paris?", + "Find me a flight from London to Seattle", + ] + + items = [] + for q in queries: + response = await agent.run(q) + print(f"Query: {q}") + print(f"Response: {response.text[:100]}...") + + item = AgentEvalConverter.to_eval_item(query=q, response=response, agent=agent) + items.append(item) + + print(f" Has tools: {item.tools is not None}") + if item.tools: + print(f" Tools: {[t.name for t in item.tools]}") + + # Submit directly to the evaluator + tool_evals = FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TOOL_CALL_ACCURACY], + ) + results = await tool_evals.evaluate(items, eval_name="Travel Assistant Eval") + + print(f"\nStatus: {results.status}") + print(f"Results: 
{results.passed}/{results.total} passed") + print(f"Portal: {results.report_url}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_all_patterns_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_all_patterns_sample.py new file mode 100644 index 0000000000..ebe19c488c --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_all_patterns_sample.py @@ -0,0 +1,545 @@ +# Copyright (c) Microsoft. All rights reserved. + +""" +Agent Evaluation — Complete Guide +================================== + +This sample shows every way to evaluate agents and workflows in +Microsoft Agent Framework. Run the sections that match your needs. + + ┌──────────────────────────────────────┐ + │ Evaluation Options │ + ├──────────────────────────────────────┤ + │ │ + │ 1. Your own function (no setup) │ + │ 2. Built-in checks (no setup) │ + │ 3. Azure AI Foundry (cloud) │ + │ 4. Mix them all (recommended) │ + │ │ + └──────────────────────────────────────┘ + +Each evaluator plugs into the same two entry points: + + evaluate_agent() — run agent + evaluate, or evaluate existing responses + evaluate_workflow() — evaluate multi-agent workflows with per-agent breakdown +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + Message, + Workflow, + evaluate_agent, + evaluate_workflow, + evaluator, + keyword_check, + tool_called_check, +) +from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework_azure_ai import FoundryEvals +from agent_framework_orchestrations import GroupChatBuilder, SequentialBuilder +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + + +# ── Tools for our agents ───────────────────────────────────────────────────── + + +def get_weather(location: str) -> str: + """Get the current weather for 
a location.""" + return {"seattle": "62°F, cloudy", "london": "55°F, overcast", "paris": "68°F, sunny"}.get( + location.lower(), f"No data for {location}" + ) + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +# ── Output helpers ──────────────────────────────────────────────────────────── + + +def print_workflow_results(results) -> None: + """Print workflow eval results with clear provider → overall → per-agent hierarchy.""" + for r in results: + status = "✓" if r.all_passed else "✗" + print(f"\n {r.provider}:") + print(f" {status} overall: {r.passed}/{r.total} passed") + if r.report_url: + print(f" Portal: {r.report_url}") + for agent_name, sub in r.sub_results.items(): + agent_status = "✓" if sub.all_passed else "✗" + print(f" {agent_status} {agent_name}: {sub.passed}/{sub.total}") + if sub.report_url: + print(f" Portal: {sub.report_url}") + + +# ── Agent setup ─────────────────────────────────────────────────────────────── + + +def create_agent(project_client, deployment) -> Agent: + """Create a travel assistant agent.""" + return Agent( + client=AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ), + name="travel-assistant", + instructions="You are a helpful travel assistant. Use your tools to answer questions.", + tools=[get_weather, get_flight_price], + ) + + +def create_workflow(project_client, deployment) -> Workflow: + """Create a researcher → planner sequential workflow.""" + client = AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ) + researcher = Agent( + client=client, + name="researcher", + instructions="You are a travel researcher. 
Use tools to gather weather and flight info.", + tools=[get_weather, get_flight_price], + default_options={"store": False}, + ) + planner = Agent( + client=client, + name="planner", + instructions="You are a travel planner. Create a concise recommendation from the research.", + default_options={"store": False}, + ) + return SequentialBuilder(participants=[researcher, planner]).build() + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 1: Custom Function Evaluators +# ═════════════════════════════════════════════════════════════════════════════ +# +# Write a plain Python function. Name your parameters to get the data you need. +# Return bool, float (≥0.5 = pass), or dict. +# +# Available parameters: +# query, response, expected_output, conversation, tool_definitions, context +# + +# ── Simple check: just query + response ────────────────────────────────────── + + +@evaluator +def is_helpful(response: str) -> bool: + """Response should be more than a one-liner.""" + return len(response.split()) > 10 + + +@evaluator +def no_apologies(query: str, response: str) -> bool: + """Agent shouldn't start with 'I'm sorry' or 'I apologize'.""" + lower = response.lower().strip() + return not lower.startswith("i'm sorry") and not lower.startswith("i apologize") + + +# ── Scored check: return a float ───────────────────────────────────────────── + + +@evaluator +def relevance_keyword_overlap(query: str, response: str) -> float: + """Score based on how many query words appear in the response.""" + query_words = set(query.lower().split()) - {"the", "a", "in", "to", "is", "what", "how"} + response_lower = response.lower() + if not query_words: + return 1.0 + return sum(1 for w in query_words if w in response_lower) / len(query_words) + + +# ── Ground truth check: compare against expected output ────────────────────── + + +@evaluator +def mentions_expected_city(response: str, expected_output: str) -> bool: + """Response should mention the 
expected city.""" + return expected_output.lower() in response.lower() + + +# ── Full context check: inspect conversation and tools ─────────────────────── + + +@evaluator +def used_available_tools(conversation: list, tool_definitions: list) -> dict: + """Check that the agent actually called at least one of its tools.""" + available = {t.get("name", "") for t in (tool_definitions or [])} + called = set() + for msg in conversation: + for tc in msg.get("tool_calls", []): + name = tc.get("function", {}).get("name", "") + if name: + called.add(name) + for ci in msg.get("content", []): + if isinstance(ci, dict) and ci.get("type") == "tool_call": + called.add(ci.get("name", "")) + used = called & available + return { + "passed": len(used) > 0, + "reason": f"Used {sorted(used)}" if used else f"No tools called (available: {sorted(available)})", + } + + +async def demo_evaluators(project_client, deployment) -> None: + """Evaluate an agent with custom function evaluators.""" + print() + print("═" * 60) + print(" 1. Custom Function Evaluators") + print("═" * 60) + + agent = create_agent(project_client, deployment) + + local = LocalEvaluator( + is_helpful, + no_apologies, + relevance_keyword_overlap, + used_available_tools, + ) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?", "How much is a flight to Paris?"], + evaluators=local, + ) + + for r in results: + print(f"\n {r.provider}: {r.passed}/{r.total} passed") + for check, counts in r.per_evaluator.items(): + status = "✓" if counts["failed"] == 0 else "✗" + print(f" {status} {check}: {counts['passed']}/{counts['passed'] + counts['failed']}") + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 2: Built-in Local Checks +# ═════════════════════════════════════════════════════════════════════════════ +# +# Pre-built checks for common patterns — no function needed. 
+# + + +async def demo_builtin_checks(project_client, deployment) -> None: + """Evaluate with built-in keyword and tool checks.""" + print() + print("═" * 60) + print(" 2. Built-in Local Checks") + print("═" * 60) + + agent = create_agent(project_client, deployment) + + local = LocalEvaluator( + keyword_check("weather", "seattle"), # response must contain these words + tool_called_check("get_weather"), # agent must have called this tool + ) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=local, + ) + + for r in results: + status = "✓" if r.all_passed else "✗" + print(f"\n {status} {r.provider}: {r.passed}/{r.total} passed") + for check, counts in r.per_evaluator.items(): + print(f" {check}: {counts}") + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 3: Azure AI Foundry Evaluators +# ═════════════════════════════════════════════════════════════════════════════ +# +# Cloud-powered AI quality assessment. Evaluates relevance, coherence, +# task adherence, tool usage, and more. +# + + +async def demo_foundry_agent(project_client, deployment) -> None: + """Evaluate a single agent with Foundry.""" + print() + print("═" * 60) + print(" 3a. Foundry — Single Agent") + print("═" * 60) + + agent = create_agent(project_client, deployment) + evals = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # evaluate_agent: run + evaluate in one call + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?", "Find flights from London to Paris"], + evaluators=evals, + ) + + for r in results: + print(f"\n {r.provider}: {r.passed}/{r.total} passed") + print(f" Portal: {r.report_url}") + + +async def demo_foundry_response(project_client, deployment) -> None: + """Evaluate a response you already have.""" + print() + print("═" * 60) + print(" 3b. 
Foundry — Existing Response") + print("═" * 60) + + agent = create_agent(project_client, deployment) + + # Run the agent yourself + response = await agent.run([Message("user", ["What's the weather in Seattle?"])]) + print(f" Agent said: {response.text[:80]}...") + + # Then evaluate the response (without re-running the agent) + quality_evals = FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + ) + results = await evaluate_agent( + agent=agent, + responses=response, + queries=["What's the weather in Seattle?"], + evaluators=quality_evals, + ) + + for r in results: + print(f"\n {r.provider}: {r.passed}/{r.total} passed") + + +async def demo_foundry_workflow(project_client, deployment) -> None: + """Evaluate a multi-agent workflow with per-agent breakdown.""" + print() + print("═" * 60) + print(" 3c. Foundry — Multi-Agent Workflow") + print("═" * 60) + + workflow = create_workflow(project_client, deployment) + evals = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # Run + evaluate with multiple queries + results = await evaluate_workflow( + workflow=workflow, + queries=["Plan a trip from Seattle to Paris"], + evaluators=evals, + ) + + print_workflow_results(results) + + +async def demo_foundry_select(project_client, deployment) -> None: + """Choose specific Foundry evaluators.""" + print() + print("═" * 60) + print(" 3d. 
Foundry — Selecting Evaluators") + print("═" * 60) + + agent = create_agent(project_client, deployment) + + # Pick exactly which evaluators to run + evals = FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[ + FoundryEvals.RELEVANCE, + FoundryEvals.TASK_ADHERENCE, + FoundryEvals.TOOL_CALL_ACCURACY, + ], + ) + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=evals, + ) + + for r in results: + print(f"\n {r.provider}: {r.passed}/{r.total} passed") + for ev_name, counts in r.per_evaluator.items(): + print(f" {ev_name}: {counts}") + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 4: Mix Everything Together +# ═════════════════════════════════════════════════════════════════════════════ +# +# Pass a list of evaluators — local functions, built-in checks, and Foundry +# all run together. You get one EvalResults per provider. +# + + +async def demo_mixed(project_client, deployment) -> None: + """Combine custom functions, built-in checks, and Foundry in one call.""" + print() + print("═" * 60) + print(" 4. 
Mixed Evaluation (recommended)") + print("═" * 60) + + agent = create_agent(project_client, deployment) + + # Local: custom functions + built-in checks + local = LocalEvaluator( + is_helpful, + no_apologies, + keyword_check("weather"), + tool_called_check("get_weather"), + ) + + # Cloud: Foundry AI quality assessment + foundry = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # One call, multiple providers + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather in Seattle?", + "How much is a flight from London to Paris?", + ], + evaluators=[local, foundry], + ) + + print() + for r in results: + status = "✓" if r.all_passed else "✗" + print(f" {status} {r.provider}: {r.passed}/{r.total} passed") + for ev_name, counts in r.per_evaluator.items(): + p, f = counts["passed"], counts["failed"] + print(f" {ev_name}: {p}/{p + f}") + if r.report_url: + print(f" Portal: {r.report_url}") + + # CI assertion — fails the test if anything didn't pass + for r in results: + r.assert_passed() + print("\n ✓ All evaluations passed!") + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 5: Workflow + Mixed Evaluation +# ═════════════════════════════════════════════════════════════════════════════ + + +async def demo_workflow_mixed(project_client, deployment) -> None: + """Evaluate a workflow with both local and Foundry evaluators.""" + print() + print("═" * 60) + print(" 5. 
Workflow + Mixed Evaluation") + print("═" * 60) + + workflow = create_workflow(project_client, deployment) + + local = LocalEvaluator(is_helpful, no_apologies) + foundry = FoundryEvals(project_client=project_client, model_deployment=deployment) + + results = await evaluate_workflow( + workflow=workflow, + queries=["Plan a trip from Seattle to Paris"], + evaluators=[local, foundry], + ) + + print_workflow_results(results) + + +# ═════════════════════════════════════════════════════════════════════════════ +# Section 6: Iterative Workflows (agents run multiple times) +# ═════════════════════════════════════════════════════════════════════════════ +# +# When an agent runs multiple times in a single workflow execution (e.g., in +# a group chat or feedback loop), each invocation becomes a separate eval item. +# Results are grouped by agent, so you see e.g. "writer: 3/3 passed". +# + + +def create_iterative_workflow(project_client, deployment) -> Workflow: + """Create a group chat where a writer and reviewer iterate. + + The writer drafts a response, the reviewer critiques it, and the + writer revises — running 2 rounds so each agent is invoked twice. + """ + client = AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ) + writer = Agent( + client=client, + name="writer", + instructions=( + "You are a travel copywriter. Write or revise a short, " + "compelling travel description based on the conversation." + ), + default_options={"store": False}, + ) + reviewer = Agent( + client=client, + name="reviewer", + instructions=("You are an editor. Critique the writer's draft and suggest specific improvements. Be concise."), + default_options={"store": False}, + ) + + # Group chat with round-robin selection: writer → reviewer → writer → reviewer + # Each agent runs twice per query. 
+ def round_robin(state): + names = list(state.participants.keys()) + return names[state.current_round % len(names)] + + return GroupChatBuilder( + participants=[writer, reviewer], + termination_condition=lambda conversation: len(conversation) >= 5, + selection_func=round_robin, + ).build() + + +async def demo_iterative_workflow(project_client, deployment) -> None: + """Evaluate a workflow where agents run multiple times.""" + print() + print("═" * 60) + print(" 6. Iterative Workflow (multi-run agents)") + print("═" * 60) + + workflow = create_iterative_workflow(project_client, deployment) + + local = LocalEvaluator(is_helpful, no_apologies) + + results = await evaluate_workflow( + workflow=workflow, + queries=["Write a travel description for Kyoto in autumn"], + evaluators=local, + ) + + print_workflow_results(results) + + +# ═════════════════════════════════════════════════════════════════════════════ +# Run it +# ═════════════════════════════════════════════════════════════════════════════ + + +async def main() -> None: + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + # Run each section — comment out what you don't need + # await demo_evaluators(project_client, deployment) + # await demo_builtin_checks(project_client, deployment) + # await demo_foundry_agent(project_client, deployment) + # await demo_foundry_response(project_client, deployment) + # await demo_foundry_workflow(project_client, deployment) + # await demo_foundry_select(project_client, deployment) + # await demo_mixed(project_client, deployment) + await demo_workflow_mixed(project_client, deployment) + await demo_iterative_workflow(project_client, deployment) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py 
b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py new file mode 100644 index 0000000000..c651cea056 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_mixed_sample.py @@ -0,0 +1,165 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Mix local and cloud evaluation providers in a single evaluate_agent() call. + +This sample demonstrates three patterns: +1. Local-only: Fast, API-free checks for inner-loop development. +2. Cloud-only: Full Foundry evaluators for comprehensive quality assessment. +3. Mixed: Local + Foundry evaluators in a single evaluate_agent() call. + +Mixing lets you get instant local feedback (keyword presence, tool usage) +alongside deeper cloud-based quality evaluation (relevance, coherence) +in one call. + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import ( + Agent, + LocalEvaluator, + evaluate_agent, + keyword_check, + tool_called_check, +) +from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework_azure_ai import FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Define a simple tool for the agent +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +async def main() -> None: + # 1. 
Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + # 2. Create an agent with a tool + agent = Agent( + client=AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ), + name="weather-assistant", + instructions="You are a helpful weather assistant. Use the get_weather tool to answer questions.", + tools=[get_weather], + ) + + # ========================================================================= + # Pattern 1: Local evaluation only (no API calls, instant results) + # ========================================================================= + print("=" * 60) + print("Pattern 1: Local evaluation only") + print("=" * 60) + + local = LocalEvaluator( + keyword_check("weather", "seattle"), + tool_called_check("get_weather"), + ) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=local, + ) + + for r in results: + print(f"Status: {r.status}") + print(f"Results: {r.passed}/{r.total} passed") + for check_name, counts in r.per_evaluator.items(): + print(f" {check_name}: {counts['passed']} passed, {counts['failed']} failed") + if r.all_passed: + print("✓ All local checks passed!") + else: + print(f"✗ Failures: {r.error}") + + # ========================================================================= + # Pattern 2: Foundry evaluation only (cloud-based quality assessment) + # ========================================================================= + print() + print("=" * 60) + print("Pattern 2: Foundry evaluation only") + print("=" * 60) + + foundry = FoundryEvals(project_client=project_client, model_deployment=deployment) + + results = await evaluate_agent( + agent=agent, + queries=["What's the weather in Seattle?"], + evaluators=foundry, + ) + + for r in results: + print(f"Status: 
{r.status}") + print(f"Results: {r.passed}/{r.total} passed") + print(f"Portal: {r.report_url}") + if r.all_passed: + print("✓ All passed") + else: + print(f"✗ {r.failed} failed, {r.errored} errored") + + # ========================================================================= + # Pattern 3: Mixed — local + Foundry in one call + # ========================================================================= + print() + print("=" * 60) + print("Pattern 3: Mixed local + Foundry evaluation") + print("=" * 60) + + # Local checks: fast smoke tests + local = LocalEvaluator( + keyword_check("weather"), + tool_called_check("get_weather"), + ) + + # Foundry: deep quality assessment + foundry = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # Pass both as a list — returns one EvalResults per provider + results = await evaluate_agent( + agent=agent, + queries=[ + "What's the weather in Seattle?", + "Tell me the weather in London", + ], + evaluators=[local, foundry], + ) + + for r in results: + status = "✓" if r.all_passed else "✗" + print(f" {status} {r.provider}: {r.passed}/{r.total} passed") + for check_name, counts in r.per_evaluator.items(): + print(f" {check_name}: {counts['passed']}/{counts['passed'] + counts['failed']}") + if r.report_url: + print(f" Portal: {r.report_url}") + + if all(r.all_passed for r in results): + print("✓ All checks passed (local + Foundry)!") + else: + failed = [r.provider for r in results if not r.all_passed] + print(f"✗ Failed providers: {', '.join(failed)}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py new file mode 100644 index 0000000000..b4023dacf4 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_multiturn_sample.py @@ -0,0 +1,190 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +"""Evaluate multi-turn conversations with different split strategies. + +The same multi-turn conversation can be split different ways, each evaluating +a different aspect of agent behavior: + +1. LAST_TURN (default) — "Was the last response good given context?" +2. FULL — "Did the whole conversation serve the original request?" +3. per_turn_items — "Was each individual response appropriate?" + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Content, ConversationSplit, EvalItem, FunctionTool, Message +from agent_framework_azure_ai import FoundryEvals +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + +# A multi-turn conversation with tool calls that we'll evaluate three ways. +# Uses framework Message/Content types for type-safe conversation construction. 
+CONVERSATION: list[Message] = [ + # Turn 1: user asks about weather → agent calls tool → responds + Message("user", ["What's the weather in Seattle?"]), + Message( + "assistant", + [ + Content.from_function_call("c1", "get_weather", arguments={"location": "seattle"}), + ], + ), + Message( + "tool", + [ + Content.from_function_result("c1", result="62°F, cloudy with a chance of rain"), + ], + ), + Message("assistant", ["Seattle is 62°F, cloudy with a chance of rain."]), + # Turn 2: user asks about Paris → agent calls tool → responds + Message("user", ["And Paris?"]), + Message( + "assistant", + [ + Content.from_function_call("c2", "get_weather", arguments={"location": "paris"}), + ], + ), + Message( + "tool", + [ + Content.from_function_result("c2", result="68°F, partly sunny"), + ], + ), + Message("assistant", ["Paris is 68°F, partly sunny."]), + # Turn 3: user asks for comparison → agent synthesizes without tool + Message("user", ["Can you compare them?"]), + Message( + "assistant", + [ + ( + "Seattle is cooler at 62°F with rain likely, while Paris is warmer " + "at 68°F and partly sunny. Paris is the better choice for outdoor activities." 
+ ), + ], + ), +] + +TOOLS = [ + FunctionTool( + name="get_weather", + description="Get the current weather for a location.", + ), +] + + +def print_split(item: EvalItem, split: ConversationSplit = ConversationSplit.LAST_TURN) -> None: + """Print the query/response split for an EvalItem.""" + d = item.to_eval_data(split=split) + print(f" query_messages ({len(d['query_messages'])}):") + for m in d["query_messages"]: + content = m.get("content", "") + if isinstance(content, list): + content = content[0].get("type", str(content[0])) + print(f" {m['role']}: {str(content)[:70]}") + print(f" response_messages ({len(d['response_messages'])}):") + for m in d["response_messages"]: + content = m.get("content", "") + if isinstance(content, list): + content = content[0].get("type", str(content[0])) + print(f" {m['role']}: {str(content)[:70]}") + + +async def main() -> None: + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + # ========================================================================= + # Strategy 1: LAST_TURN (default) + # "Given all context, was the last response good?" 
+ # ========================================================================= + print("=" * 70) + print("Strategy 1: LAST_TURN — evaluate the final response") + print("=" * 70) + + # EvalItem takes conversation + tools; query/response are derived via split strategy + item = EvalItem(CONVERSATION, tools=TOOLS) + + print_split(item, ConversationSplit.LAST_TURN) + + results = await FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + # conversation_split defaults to LAST_TURN + ).evaluate([item], eval_name="Split Strategy: LAST_TURN") + + print(f"\n Result: {results.passed}/{results.total} passed") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + # ========================================================================= + # Strategy 2: FULL + # "Given the original request, did the whole conversation serve the user?" + # ========================================================================= + print("=" * 70) + print("Strategy 2: FULL — evaluate the entire conversation trajectory") + print("=" * 70) + + print_split(item, ConversationSplit.FULL) + + results = await FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + conversation_split=ConversationSplit.FULL, + ).evaluate([item], eval_name="Split Strategy: FULL") + + print(f"\n Result: {results.passed}/{results.total} passed") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + # ========================================================================= + # Strategy 3: per_turn_items + # "Was each individual response appropriate at that point?" 
+ # ========================================================================= + print("=" * 70) + print("Strategy 3: per_turn_items — evaluate each turn independently") + print("=" * 70) + + items = EvalItem.per_turn_items(CONVERSATION, tools=TOOLS) + print(f" Split into {len(items)} items from {len(CONVERSATION)} messages:\n") + for i, it in enumerate(items): + print(f" Turn {i + 1}: query={it.query!r}, response={it.response[:60]!r}...") + print() + + results = await FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + ).evaluate(items, eval_name="Split Strategy: Per-Turn") + + print(f"\n Result: {results.passed}/{results.total} passed ({len(items)} items × 2 evaluators)") + print(f" Portal: {results.report_url}") + for ir in results.items: + for s in ir.scores: + print(f" {'✓' if s.passed else '✗'} {s.name}: {s.score}") + print() + + print("=" * 70) + print("All strategies complete. Compare results in the Foundry portal.") + print("=" * 70) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py new file mode 100644 index 0000000000..ef29a428d0 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_traces_sample.py @@ -0,0 +1,120 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate agent responses that already exist in Foundry (zero-code-change). + +This sample demonstrates two patterns: +1. evaluate_traces(response_ids=...) — Evaluate specific Responses API responses by ID. +2. evaluate_traces(agent_id=...) — Evaluate agent behavior from OTel traces in App Insights. + +These are the "zero-code-change" evaluation paths — the agent has already run, +and you're evaluating what happened after the fact. 
+ +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Response IDs from prior agent runs (for Pattern 1) +- OTel traces exported to App Insights (for Pattern 2) +- Set AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework_azure_ai import FoundryEvals, evaluate_traces +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + + +async def main() -> None: + # 1. Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + # ========================================================================= + # Pattern 1: evaluate_traces(response_ids=...) — By response ID + # ========================================================================= + # If your agent uses the Responses API (e.g., AzureOpenAIResponsesClient), + # each run produces a response_id. Pass those IDs to evaluate_traces() + # and Foundry retrieves the full conversation for evaluation. + print("=" * 60) + print("Pattern 1: evaluate_traces(response_ids=...)") + print("=" * 60) + + # Replace these with actual response IDs from your agent runs + response_ids = [ + "resp_abc123", + "resp_def456", + ] + + results = await evaluate_traces( + response_ids=response_ids, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.GROUNDEDNESS, FoundryEvals.TOOL_CALL_ACCURACY], + project_client=project_client, + model_deployment=deployment, + ) + + print(f"Status: {results.status}") + print(f"Results: {results.result_counts}") + print(f"Portal: {results.report_url}") + + # ========================================================================= + # Pattern 2: evaluate_traces(agent_id=...) 
— From App Insights + # ========================================================================= + # If your agent emits OTel traces to App Insights (via configure_otel_providers), + # you can evaluate recent activity without specifying individual response IDs. + # + # NOTE: Requires OTel traces exported to the App Insights instance connected + # to your Foundry project. The exact trace-based data source API is subject + # to change as Foundry evolves. + print() + print("=" * 60) + print("Pattern 2: evaluate_traces(agent_id=...)") + print("=" * 60) + + # Evaluate by response IDs (uses response-based data source internally) + results = await evaluate_traces( + response_ids=response_ids, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.COHERENCE], + project_client=project_client, + model_deployment=deployment, + ) + + print(f"Status: {results.status}") + print(f"Portal: {results.report_url}") + + # Evaluate by agent ID + time window (when trace-based API is available) + # results = await evaluate_traces( + # agent_id="travel-bot", + # evaluators=[FoundryEvals.INTENT_RESOLUTION, FoundryEvals.TASK_ADHERENCE], + # project_client=project_client, + # model_deployment=deployment, + # lookback_hours=24, + # ) + + +if __name__ == "__main__": + asyncio.run(main()) + + +""" +Sample output (with actual Azure AI Foundry project and valid response IDs): + +============================================================ +Pattern 1: evaluate_traces(response_ids=...) +============================================================ +Status: completed +Results: {'passed': 2, 'failed': 0, 'errored': 0} +Portal: https://ai.azure.com/... + +============================================================ +Pattern 2: evaluate_traces(agent_id=...) +============================================================ +Status: completed +Portal: https://ai.azure.com/... 
+""" diff --git a/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py new file mode 100644 index 0000000000..a974813e04 --- /dev/null +++ b/python/samples/05-end-to-end/evaluation/foundry_evals/evaluate_workflow_sample.py @@ -0,0 +1,185 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Evaluate a multi-agent workflow using Azure AI Foundry evaluators. + +This sample demonstrates two patterns: +1. Post-hoc: Run the workflow, then evaluate the result you already have. +2. Run + evaluate: Pass queries and let evaluate_workflow() run the workflow for you. + +Both patterns return a list of results (one per provider), each with a per-agent +breakdown in sub_results so you can identify which agent is underperforming. + +Prerequisites: +- An Azure AI Foundry project with a deployed model +- Set AZURE_AI_PROJECT_ENDPOINT and AZURE_AI_MODEL_DEPLOYMENT_NAME in .env +""" + +import asyncio +import os + +from agent_framework import Agent, evaluate_workflow +from agent_framework.azure import AzureOpenAIResponsesClient +from agent_framework_azure_ai import FoundryEvals +from agent_framework_orchestrations import SequentialBuilder +from azure.ai.projects.aio import AIProjectClient +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + + +# Simple tools for the agents +def get_weather(location: str) -> str: + """Get the current weather for a location.""" + weather_data = { + "seattle": "62°F, cloudy with a chance of rain", + "london": "55°F, overcast", + "paris": "68°F, partly sunny", + } + return weather_data.get(location.lower(), f"Weather data not available for {location}") + + +def get_flight_price(origin: str, destination: str) -> str: + """Get the price of a flight between two cities.""" + return f"Flights from {origin} to {destination}: $450 round-trip" + + +async def main() -> None: + # 1. 
Set up the Azure AI project client + project_client = AIProjectClient( + endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], + credential=DefaultAzureCredential(), + ) + + deployment = os.environ.get("AZURE_AI_MODEL_DEPLOYMENT_NAME", "gpt-4o") + + client = AzureOpenAIResponsesClient( + project_client=project_client, + deployment_name=deployment, + ) + + # 2. Create agents for a sequential workflow + # Use store=False so agents don't chain conversation state via previous_response_id. + # This allows the workflow to be run multiple times without stale state issues. + researcher = Agent( + client=client, + name="researcher", + instructions=( + "You are a travel researcher. Use your tools to gather weather " + "and flight information for the destination the user asks about." + ), + tools=[get_weather, get_flight_price], + default_options={"store": False}, + ) + + planner = Agent( + client=client, + name="planner", + instructions=( + "You are a travel planner. Based on the research provided, " + "create a concise travel recommendation with packing tips." + ), + default_options={"store": False}, + ) + + # 3. Build a sequential workflow: researcher → planner + workflow = SequentialBuilder(participants=[researcher, planner]).build() + + # 4. 
Create the evaluator — provider config goes here, once + evals = FoundryEvals(project_client=project_client, model_deployment=deployment) + + # ========================================================================= + # Pattern 1: Post-hoc — evaluate a workflow run you already did + # ========================================================================= + print("=" * 60) + print("Pattern 1: Post-hoc workflow evaluation") + print("=" * 60) + + result = await workflow.run("Plan a trip from Seattle to Paris") + + eval_results = await evaluate_workflow( + workflow=workflow, + workflow_result=result, + evaluators=evals, + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + print(f" Portal: {r.report_url}") + + print("\nPer-agent breakdown:") + for agent_name, agent_eval in r.sub_results.items(): + print(f" {agent_name}: {agent_eval.passed}/{agent_eval.total} passed") + if agent_eval.report_url: + print(f" Portal: {agent_eval.report_url}") + + # ========================================================================= + # Pattern 2: Run + evaluate with multiple queries + # ========================================================================= + # Build a fresh workflow to avoid stale session state from Pattern 1. + # The Responses API tracks previous_response_id per session, so reusing + # a workflow after a run would reference stale tool calls. 
+ workflow2 = SequentialBuilder(participants=[researcher, planner]).build() + + print() + print("=" * 60) + print("Pattern 2: Run + evaluate with multiple queries") + print("=" * 60) + + eval_results = await evaluate_workflow( + workflow=workflow2, + queries=[ + "Plan a trip from London to Tokyo", + "Plan a trip from New York to Rome", + ], + evaluators=FoundryEvals( + project_client=project_client, + model_deployment=deployment, + evaluators=[FoundryEvals.RELEVANCE, FoundryEvals.TASK_ADHERENCE], + ), + ) + + for r in eval_results: + print(f"\nOverall: {r.status}") + print(f" Passed: {r.passed}/{r.total}") + if r.report_url: + print(f" Portal: {r.report_url}") + + print("\nPer-agent breakdown:") + for agent_name, agent_eval in r.sub_results.items(): + print(f" {agent_name}: {agent_eval.passed}/{agent_eval.total} passed") + if agent_eval.report_url: + print(f" Portal: {agent_eval.report_url}") + + +if __name__ == "__main__": + asyncio.run(main()) + + +""" +Sample output (with actual Azure AI Foundry project): + +============================================================ +Pattern 1: Post-hoc workflow evaluation +============================================================ + +Overall: completed + Passed: 2/2 + Portal: https://ai.azure.com/... 
+ +Per-agent breakdown: + researcher: 1/1 passed + planner: 1/1 passed + +============================================================ +Pattern 2: Run + evaluate with multiple queries +============================================================ + +Overall: completed + Passed: 4/4 + +Per-agent breakdown: + researcher: 2/2 passed + planner: 2/2 passed +""" diff --git a/python/samples/05-end-to-end/hosted_agents/agent_with_local_tools/main.py b/python/samples/05-end-to-end/hosted_agents/agent_with_local_tools/main.py index 4c60902dc2..a6ee1c5fd2 100644 --- a/python/samples/05-end-to-end/hosted_agents/agent_with_local_tools/main.py +++ b/python/samples/05-end-to-end/hosted_agents/agent_with_local_tools/main.py @@ -20,9 +20,7 @@ # Configure these for your Foundry project # Read the explicit variables present in the .env file -PROJECT_ENDPOINT = os.getenv( - "PROJECT_ENDPOINT" -) # e.g., "https://.services.ai.azure.com" +PROJECT_ENDPOINT = os.getenv("PROJECT_ENDPOINT") # e.g., "https://.services.ai.azure.com" MODEL_DEPLOYMENT_NAME = os.getenv( "MODEL_DEPLOYMENT_NAME", "gpt-4.1-mini" ) # Your model deployment name e.g., "gpt-4.1-mini" @@ -90,14 +88,10 @@ def get_available_hotels( nights = (check_out - check_in).days # Filter hotels by price - available_hotels = [ - hotel for hotel in SEATTLE_HOTELS if hotel["price_per_night"] <= max_price - ] + available_hotels = [hotel for hotel in SEATTLE_HOTELS if hotel["price_per_night"] <= max_price] if not available_hotels: - return ( - f"No hotels found in Seattle within your budget of ${max_price}/night." - ) + return f"No hotels found in Seattle within your budget of ${max_price}/night." 
# Build response result = f"Available hotels in Seattle from {check_in_date} to {check_out_date} ({nights} nights):\n\n" @@ -117,11 +111,7 @@ def get_available_hotels( def get_credential(): """Will use Managed Identity when running in Azure, otherwise falls back to Azure CLI Credential.""" - return ( - ManagedIdentityCredential() - if os.getenv("MSI_ENDPOINT") - else AzureCliCredential() - ) + return ManagedIdentityCredential() if os.getenv("MSI_ENDPOINT") else AzureCliCredential() async def main(): diff --git a/python/samples/05-end-to-end/hosted_agents/writer_reviewer_agents_in_workflow/main.py b/python/samples/05-end-to-end/hosted_agents/writer_reviewer_agents_in_workflow/main.py index af2c049808..5175bb7176 100644 --- a/python/samples/05-end-to-end/hosted_agents/writer_reviewer_agents_in_workflow/main.py +++ b/python/samples/05-end-to-end/hosted_agents/writer_reviewer_agents_in_workflow/main.py @@ -24,11 +24,7 @@ def get_credential(): """Will use Managed Identity when running in Azure, otherwise falls back to Azure CLI Credential.""" - return ( - ManagedIdentityCredential() - if os.getenv("MSI_ENDPOINT") - else AzureCliCredential() - ) + return ManagedIdentityCredential() if os.getenv("MSI_ENDPOINT") else AzureCliCredential() @asynccontextmanager