-
Notifications
You must be signed in to change notification settings - Fork 3.3k
feat(evaluation): unify validators with azureml-assets #47526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
m7md7sien
wants to merge
4
commits into
main
Choose a base branch
from
mohessie/update_eval_validators
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
6c81f6a
feat(evaluation): unify validators with azureml-assets
m7md7sien 2808e5a
Potential fix for pull request finding
m7md7sien 106ac42
Add unit tests for actions/expected_actions alias input normalization
Copilot aadd11c
Remove redundant assertions from test_both_aliases_normalized_and_eva…
Copilot File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
70 changes: 70 additions & 0 deletions
70
...evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT License. | ||
|
|
||
| """ | ||
| Utilities for resolving evaluation levels and reshaping query/response/messages inputs. | ||
| """ | ||
|
|
||
| from typing import List, Optional, Tuple, Union | ||
| from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget | ||
| from ._validation_constants import MessageRole, EvaluationLevel | ||
|
|
||
|
|
||
def _resolve_evaluation_level(
    evaluation_level: Optional[Union[EvaluationLevel, str]],
    error_target: ErrorTarget,
) -> Optional[EvaluationLevel]:
    """Normalize ``evaluation_level`` into an ``EvaluationLevel`` member.

    :param evaluation_level: ``None`` or ``""`` (no level requested), an
        ``EvaluationLevel`` member, or a string accepted by the
        ``EvaluationLevel`` constructor.
    :type evaluation_level: Optional[Union[EvaluationLevel, str]]
    :param error_target: Target attached to any exception raised here.
    :type error_target: ErrorTarget
    :return: The matching ``EvaluationLevel``, or ``None`` when the input is
        ``None`` or the empty string.
    :rtype: Optional[EvaluationLevel]
    :raises EvaluationException: If the input is a string the enum rejects,
        or is neither a string nor an ``EvaluationLevel``.
    """
    # "Nothing requested" maps to None so the caller can fall back to its default.
    if evaluation_level is None or evaluation_level == "":
        return None

    # Already an enum member: nothing to convert.
    if isinstance(evaluation_level, EvaluationLevel):
        return evaluation_level

    def _invalid_level_error() -> EvaluationException:
        # Single place that builds the user-facing error for both failure modes.
        allowed_values = [member.value for member in EvaluationLevel]
        return EvaluationException(
            message=f"Invalid evaluation_level '{evaluation_level}'. Must be one of: {allowed_values}.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=error_target,
        )

    # Anything that is not a string at this point has an unsupported type.
    if not isinstance(evaluation_level, str):
        raise _invalid_level_error()

    try:
        return EvaluationLevel(evaluation_level)
    except ValueError as exc:
        # Keep the enum's ValueError as the cause for easier debugging.
        raise _invalid_level_error() from exc
|
|
||
|
|
||
| def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: | ||
| """Merge query and response message lists into a single conversation.""" | ||
| return [*query, *response] | ||
|
|
||
|
|
||
| def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: | ||
| """Split messages into query/response slices at the latest user turn.""" | ||
| latest_user_index = max( | ||
| (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value), | ||
| default=-1, | ||
| ) | ||
| if latest_user_index == -1: | ||
| raise ValueError("messages must contain at least one message with role 'user'.") | ||
| return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] | ||
|
|
||
|
|
||
| def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: | ||
| """Wrap string query/response into separate message lists.""" | ||
| return ( | ||
| [{"role": "user", "content": [{"type": "text", "text": query}]}], | ||
| [{"role": "assistant", "content": [{"type": "text", "text": response}]}], | ||
| ) | ||
158 changes: 158 additions & 0 deletions
158
...re/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT License. | ||
|
|
||
| """ | ||
| Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. | ||
| """ | ||
|
|
||
| from typing import Any, Dict | ||
| from typing_extensions import override | ||
| from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget | ||
| from ._validation_constants import MessageRole, ContentType | ||
| from ._conversation_validator import ConversationValidator | ||
| from ._tool_definitions_validator import ToolDefinitionsValidator | ||
|
|
||
|
|
||
class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
    """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.

    A single implementation serves all evaluators via two behavior flags:

    - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the
      messages path and the query/response path. Set False for evaluators that do not accept
      tool definitions (parity with a plain ``ConversationValidator``).
    - ``deep_validate_messages`` (default False): additionally run full per-message
      ``_validate_message_dict`` checks in the messages path.
    """

    # Class-level defaults; ``__init__`` overrides both per instance.
    enforce_tool_definitions: bool = True
    deep_validate_messages: bool = False

    def __init__(
        self,
        error_target: ErrorTarget,
        requires_query: bool = True,
        optional_tool_definitions: bool = True,
        check_for_unsupported_tools: bool = False,
        *,
        enforce_tool_definitions: bool = True,
        deep_validate_messages: bool = False,
    ):
        """Initialize MessagesOrQueryResponseInputValidator.

        :param error_target: Forwarded to the parent initializer; later read back as
            ``self.error_target`` for raised exceptions (presumably stored by the parent — confirm).
        :type error_target: ErrorTarget
        :param requires_query: Forwarded unchanged to the parent initializer.
        :type requires_query: bool
        :param optional_tool_definitions: Forwarded unchanged to the parent initializer.
        :type optional_tool_definitions: bool
        :param check_for_unsupported_tools: Forwarded unchanged to the parent initializer.
        :type check_for_unsupported_tools: bool
        :keyword enforce_tool_definitions: Whether ``tool_definitions`` are validated.
        :paramtype enforce_tool_definitions: bool
        :keyword deep_validate_messages: Whether each message is also passed to
            ``_validate_message_dict`` in the messages path.
        :paramtype deep_validate_messages: bool
        """
        super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools)
        self.enforce_tool_definitions = enforce_tool_definitions
        self.deep_validate_messages = deep_validate_messages

    @override
    def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
        """Validate evaluation input, supporting messages as an alternative to query/response.

        When ``eval_input["messages"]`` is present (not ``None``) the multi-turn checks run and
        the method returns ``True``. Otherwise validation is delegated to the parent
        implementation (with tool-definition checks) or directly to
        ``ConversationValidator.validate_eval_input`` (without them), depending on
        ``enforce_tool_definitions``.

        :param eval_input: The evaluator input mapping.
        :type eval_input: Dict[str, Any]
        :return: ``True`` in the messages path; otherwise whatever the delegated validator returns.
        :rtype: bool
        :raises EvaluationException: If ``messages`` is not a non-empty list of dictionaries, a
            message has a missing or invalid ``role``, no user or no assistant message is present,
            or the last message has list content without any text item. Errors returned by
            ``_validate_message_dict`` / ``_validate_tool_definitions`` are raised as-is.
        """

        def invalid_value(message: str) -> EvaluationException:
            # Every failure in this method is a user error about an invalid value.
            return EvaluationException(
                message=message,
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.INVALID_VALUE,
                target=self.error_target,
            )

        # Multi-turn path (messages list)
        messages = eval_input.get("messages")
        if messages is not None:
            if not isinstance(messages, list):
                raise invalid_value("messages must be provided as a list of message dictionaries.")
            if len(messages) == 0:
                raise invalid_value("messages list must not be empty.")

            # Per-message structural checks
            valid_roles = {role.value for role in MessageRole}
            roles_present = set()
            for index, message in enumerate(messages):
                if not isinstance(message, dict):
                    raise invalid_value(
                        f"Each item in 'messages' must be a dictionary, "
                        f"but item at index {index} is {type(message).__name__}."
                    )
                role = message.get("role")
                if role is None:
                    raise invalid_value(
                        f"Each message must contain a 'role' key, but message at index {index} is missing it."
                    )
                # Check the type before the set lookup: an unhashable role (e.g. a list or
                # dict) would otherwise escape as a raw TypeError instead of a user-facing error.
                if not isinstance(role, str) or role not in valid_roles:
                    raise invalid_value(
                        f"Invalid role '{role}' at message index {index}. " f"Must be one of: {sorted(valid_roles)}."
                    )
                roles_present.add(role)

            # Conversation-level checks
            if MessageRole.USER.value not in roles_present:
                raise invalid_value("messages must contain at least one message with role 'user'.")
            if MessageRole.ASSISTANT.value not in roles_present:
                raise invalid_value("messages must contain at least one message with role 'assistant'.")

            # The last message (whatever its role) must carry text when its content is a list;
            # non-list content (e.g. a plain string) is accepted without inspection.
            last_content = messages[-1].get("content", "")
            if isinstance(last_content, list):
                text_types = (
                    ContentType.TEXT,
                    ContentType.INPUT_TEXT,
                    ContentType.OUTPUT_TEXT,
                )
                has_text = any(
                    (isinstance(content_item, dict) and content_item.get("type") in text_types)
                    or isinstance(content_item, str)
                    for content_item in last_content
                )
                if not has_text:
                    raise invalid_value(
                        "The last message must contain text content, "
                        "not only tool calls. The conversation appears to be "
                        "mid-execution \u2014 provide the agent's final text response."
                    )

            if self.deep_validate_messages:
                for message in messages:
                    # The helper returns an exception object (or a falsy value) rather than raising.
                    error = self._validate_message_dict(message)
                    if error:
                        raise error

            if self.enforce_tool_definitions:
                tool_definitions = eval_input.get("tool_definitions")
                tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
                if tool_definitions_validation_exception:
                    raise tool_definitions_validation_exception
            return True

        # Single-turn path (query/response)
        if self.enforce_tool_definitions:
            return super().validate_eval_input(eval_input)
        # Call ConversationValidator's implementation directly, bypassing the
        # ToolDefinitionsValidator override, when tool definitions are not enforced.
        return ConversationValidator.validate_eval_input(self, eval_input)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.