diff --git a/docs/built_in_graders/general.md b/docs/built_in_graders/general.md
index 9d05d0cf..0dee6740 100644
--- a/docs/built_in_graders/general.md
+++ b/docs/built_in_graders/general.md
@@ -56,7 +56,7 @@ Evaluates how well a response addresses the user's query. Measures whether the a
 | `query` | str | Yes | The user's question or request |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context (e.g., conversation history) |
-| `ground_truth` | str | No | Reference answer for comparison |
+| `reference_response` | str | No | Reference answer for comparison |

 **Grading Criteria:**
 - **5**: Comprehensive response with helpful insights
@@ -107,7 +107,7 @@ Detects fabricated information not supported by the provided context or common k
 | `query` | str | Yes | The user's question |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Source documents to verify against |
-| `ground_truth` | str | No | Reference answer |
+| `reference_response` | str | No | Reference answer |

 !!! note
     If no context is provided, evaluation is based on common knowledge and factual consistency.
@@ -175,7 +175,7 @@ Identifies harmful, offensive, or inappropriate content in responses. Critical f
 | `query` | str | Yes | The user's input |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context |
-| `ground_truth` | str | No | Reference response |
+| `reference_response` | str | No | Reference response |

 **What it detects:**
 - Violence, hatred, discrimination
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
index ce83a825..1c8be5d2 100644
--- a/openjudge/graders/common/correctness.py
+++ b/openjudge/graders/common/correctness.py
@@ -244,7 +244,7 @@ class CorrectnessGrader(LLMGrader):
         >>> from openjudge.graders.common.correctness import CorrectnessGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = CorrectnessGrader(model=model, threshold=3)
         >>>
         >>> # Good match
@@ -268,7 +268,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -279,13 +279,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="correctness",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response matches the provided reference response",
             model=model,
-            template=template,
+            template=template or DEFAULT_CORRECTNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -330,11 +336,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating correctness: {e}")
+            logger.exception(f"Error evaluating correctness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
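A brief sketch of how the new constructor guard behaves. This is not part of the patch; the config dict values are placeholders, and only the imports and error message shown above are taken from the diff.

```python
# Hypothetical usage sketch of the threshold guard added above.
from openjudge.graders.common.correctness import CorrectnessGrader

model_config = {"api_key": "sk-...", "model": "qwen3-32b"}  # illustrative config only

# Valid: threshold is interpreted on the 1-5 rubric scale.
grader = CorrectnessGrader(model=model_config, threshold=3)

# Invalid: values outside [1, 5] now fail fast at construction time
# instead of silently producing a grader whose pass check is meaningless.
try:
    CorrectnessGrader(model=model_config, threshold=0.7)
except ValueError as e:
    print(e)  # threshold must be in range [1, 5], got 0.7
```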
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index b2736ac0..1486741a 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -216,7 +216,7 @@ class HallucinationGrader(LLMGrader):
         >>> # Initialize model
         >>> model = OpenAIChatModel(
         ...     api_key="sk-...",
-        ...     model="qwen3-max",
+        ...     model="qwen3-32b",
         ...     temperature=0.1
         ... )
         >>>
@@ -253,7 +253,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -264,7 +264,13 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="hallucination",
             mode=GraderMode.POINTWISE,
@@ -322,11 +328,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating hallucination: {e}")
+            logger.exception(f"Error evaluating hallucination: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
index bcea3ccd..a83a0490 100644
--- a/openjudge/graders/common/harmfulness.py
+++ b/openjudge/graders/common/harmfulness.py
@@ -223,7 +223,7 @@ class HarmfulnessGrader(LLMGrader):
         >>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = HarmfulnessGrader(model=model, threshold=3)
         >>>
         >>> # Safe output
@@ -246,7 +246,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -257,13 +257,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="harmfulness",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains harmful or inappropriate content",
             model=model,
-            template=template,
+            template=template or DEFAULT_HARMFULNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -307,11 +313,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating harmfulness: {e}")
+            logger.exception(f"Error evaluating harmfulness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
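The metadata change above merges the judge's own metadata with the grader threshold instead of discarding it. A small worked example of the dict-unpacking order (the field names besides `threshold` are illustrative):

```python
# Keys from result.metadata are spread first, so the explicit "threshold"
# entry wins on a key collision and nothing else is lost.
result_metadata = {"model": "qwen3-32b", "threshold": "raw-judge-value"}
threshold = 3

merged = {**result_metadata, "threshold": threshold}
print(merged)
# {'model': 'qwen3-32b', 'threshold': 3}
```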
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
index 5fcc5a75..8c647d44 100644
--- a/openjudge/graders/common/instruction_following.py
+++ b/openjudge/graders/common/instruction_following.py
@@ -238,7 +238,7 @@ class InstructionFollowingGrader(LLMGrader):
         >>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = InstructionFollowingGrader(model=model, threshold=3)
         >>>
         >>> # Good adherence
@@ -262,7 +262,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -273,13 +273,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="instruction_following",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response follows the given instructions",
             model=model,
-            template=template,
+            template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -318,11 +324,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating instruction following: {e}")
+            logger.exception(f"Error evaluating instruction following: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
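Assuming scores and thresholds now share the same 1-5 rubric scale (the point of the relevance change that follows), a caller-side pass check could look like the sketch below. The helper name is hypothetical; only `aevaluate`, `score`, and `threshold` come from the patch.

```python
# Sketch only: `grader` is any pointwise grader above, constructed with threshold=3.
async def is_pass(grader, **sample) -> bool:
    result = await grader.aevaluate(**sample)
    # With the old 0.7 default, a 1-5 score compared against the threshold
    # effectively always "passed"; on a shared 1-5 scale the comparison is meaningful.
    return getattr(result, "score", 0) >= grader.threshold
```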
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
index 76e0de2f..293ceb4a 100644
--- a/openjudge/graders/common/relevance.py
+++ b/openjudge/graders/common/relevance.py
@@ -217,7 +217,7 @@ class RelevanceGrader(LLMGrader):

     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)

@@ -234,7 +234,7 @@ class RelevanceGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
-        >>> grader = RelevanceGrader(model=model, threshold=0.7)
+        >>> grader = RelevanceGrader(model=model, threshold=3)
         >>>
         >>> # Relevant response
         >>> result = asyncio.run(grader.aevaluate(
@@ -262,8 +262,8 @@ class RelevanceGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
-        template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
+        threshold: float = 3,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -271,16 +271,22 @@
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts
                 (default: DEFAULT_RELEVANCE_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="relevance",
             mode=GraderMode.POINTWISE,
             description="Evaluate relevance of response to user query",
             model=model,
-            template=template,
+            template=template or DEFAULT_RELEVANCE_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -323,11 +329,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating relevance: {e}")
+            logger.exception(f"Error evaluating relevance: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
diff --git a/openjudge/graders/multimodal/_internal/criteria_utils.py b/openjudge/graders/multimodal/_internal/criteria_utils.py
index ef71e74e..370258f3 100644
--- a/openjudge/graders/multimodal/_internal/criteria_utils.py
+++ b/openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -87,13 +87,13 @@ def validate_and_sort_rubrics(
     # Sort rubrics by start of range
     sorted_rubrics = sorted(rubrics, key=lambda r: r.score_range[0])

-    # Full overlap check
+    # Full overlap check (adjacent ranges like (0,5) and (5,7) are allowed)
     for i in range(len(sorted_rubrics)):
         a_start, a_end = sorted_rubrics[i].score_range
         for j in range(i + 1, len(sorted_rubrics)):
             b_start, b_end = sorted_rubrics[j].score_range
-            # Check if ranges overlap
-            if a_end >= b_start:
+            # Check if ranges overlap (> allows adjacent ranges to touch)
+            if a_end > b_start:
                 raise ValueError(
                     f"Overlapping score ranges: {sorted_rubrics[i].score_range} and {sorted_rubrics[j].score_range}",
                 )
@@ -147,7 +147,7 @@ def construct_params_string(
         >>> construct_params_string(params)
         'Input and Actual Output'
     """
-    params = [PARAM_DISPLAY_NAMES[param] for param in evaluation_params]
+    params = [PARAM_DISPLAY_NAMES.get(param, param.replace("_", " ").title()) for param in evaluation_params]

     if len(params) == 1:
         params_str = params[0]
@@ -164,7 +164,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     Get the overall score range from rubrics

     Args:
-        rubric: List of rubric definitions
+        rubric: List of rubric definitions (does not need to be sorted)

     Returns:
         Tuple of (min_score, max_score)
@@ -180,7 +180,9 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     if not rubric:
         return (0, 10)

-    return rubric[0].score_range[0], rubric[-1].score_range[1]
+    min_score = min(r.score_range[0] for r in rubric)
+    max_score = max(r.score_range[1] for r in rubric)
+    return (min_score, max_score)


 __all__ = [
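A worked example of the two criteria_utils changes above, using a minimal stand-in for Rubric (the real class may carry more fields than `score_range`):

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class FakeRubric:  # stand-in for the real Rubric, illustration only
    score_range: Tuple[int, int]


rubrics = [FakeRubric((5, 7)), FakeRubric((0, 5)), FakeRubric((7, 10))]

# Old check (a_end >= b_start) rejected (0, 5) sitting next to (5, 7);
# the strict comparison only rejects true overlaps such as (0, 6) vs (5, 7).

# get_score_range no longer assumes the list is sorted or contiguous:
min_score = min(r.score_range[0] for r in rubrics)  # 0
max_score = max(r.score_range[1] for r in rubrics)  # 10
print((min_score, max_score))  # (0, 10)
```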
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
index faedd0d3..d6b4dea3 100644
--- a/openjudge/graders/multimodal/image_coherence.py
+++ b/openjudge/graders/multimodal/image_coherence.py
@@ -23,6 +23,7 @@
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -217,30 +218,27 @@ async def _aevaluate_single_image(
             context_below=context_below or "",
         )

-        try:
-            # Format image content for OpenAI API
-            content = [{"type": "text", "text": prompt}]
-
-            if image.url:
-                content.append({"type": "image_url", "image_url": {"url": image.url}})
-            elif image.base64:
-                # Format base64 image with data URL scheme
-                image_format = image.format or "jpeg"
-                data_url = f"data:image/{image_format};base64,{image.base64}"
-                content.append({"type": "image_url", "image_url": {"url": data_url}})
-
-            # Call model without structured output
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-            score = chat_response.parsed["score"]
-            reason = chat_response.parsed["reason"]
-            return score, reason
+        # Format image content for OpenAI API
+        content = [{"type": "text", "text": prompt}]

-        except Exception as e:
-            logger.error(f"Error evaluating image coherence: {e}")
-            return 0.0, f"Evaluation error: {str(e)}"
+        if image.url:
+            content.append({"type": "image_url", "image_url": {"url": image.url}})
+        elif image.base64:
+            # Format base64 image with data URL scheme
+            image_format = image.format or "jpeg"
+            data_url = f"data:image/{image_format};base64,{image.base64}"
+            content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )
+
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _acompute(
         self,
@@ -326,7 +324,16 @@ async def aevaluate(
             ...     ]
             ... )
         """
-        score, details = await self._acompute(response, **kwargs)
+        try:
+            score, details = await self._acompute(response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating image coherence: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         if "error" in details:
             return GraderScore(
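With exceptions now surfacing as a `GraderError` result instead of a misleading 0.0 score, callers need a type check. A hedged caller-side sketch; the helper name is hypothetical, while `GraderError`, `error`, `name`, and `score` follow the patch and its tests:

```python
# Sketch of caller-side handling once evaluation exceptions map to GraderError.
from openjudge.graders.base_grader import GraderError


async def run_grader(grader, **sample):
    result = await grader.aevaluate(**sample)
    if isinstance(result, GraderError):
        # Failure is explicit (result.error), not encoded as a low score.
        print(f"{result.name} failed: {result.error}")
        return None
    return result.score
```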
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
index 33d4d33e..c8aa3350 100644
--- a/openjudge/graders/multimodal/image_helpfulness.py
+++ b/openjudge/graders/multimodal/image_helpfulness.py
@@ -24,6 +24,7 @@
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -213,7 +214,7 @@ async def _aevaluate_single_image(
         context_below: Optional[str],
     ) -> Tuple[float, str]:
         """Async evaluation of single image helpfulness"""
-        messages = self.template.to_messages()
+        messages = self.template.to_messages(self.language)
         prompt = (
             messages[0]
             .format(
@@ -223,36 +224,17 @@ async def _aevaluate_single_image(
             .content
         )

-        try:
-            content = format_image_content(prompt, [image])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-
-            # Handle both streaming and non-streaming responses
-            if hasattr(chat_response, "__aiter__"):
-                # This is a streaming response, we need to collect it first
-                collected_content = []
-                parsed = {}
-                async for chunk in chat_response:
-                    if chunk.content:
-                        collected_content.extend(chunk.content)
-                    if chunk.parsed:
-                        parsed.update(chunk.parsed)
-
-                # Extract score and reason from metadata
-                score = parsed.get("score", 0.0)
-                reason = parsed.get("reason", "")
-            else:
-                # Non-streaming response
-                score = chat_response.parsed["score"]
-                reason = chat_response.parsed["reason"]
-            return score, reason
+        content = format_image_content(prompt, [image])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-        except Exception as e:
-            logger.error(f"Error evaluating image helpfulness: {e}")
-            return 0.0, f"Evaluation error: {str(e)}"
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _acompute(
         self,
@@ -329,7 +311,16 @@ async def aevaluate(
             ...     ]
             ... )
         """
-        score, details = await self._acompute(response, **kwargs)
+        try:
+            score, details = await self._acompute(response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating image helpfulness: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         if "error" in details:
             return GraderScore(
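The one-line `to_messages(self.language)` fix above matters because the template holds per-language prompt variants. A minimal sketch of the intent using stand-in classes (the real `PromptTemplate`/`LanguageEnum` APIs may differ):

```python
from enum import Enum


class Lang(str, Enum):  # stand-in for LanguageEnum, illustration only
    EN = "en"
    ZH = "zh"


class TwoLanguageTemplate:  # stand-in for PromptTemplate, illustration only
    def __init__(self, prompts):  # e.g. {"en": "...", "zh": "..."}
        self.prompts = prompts

    def to_messages(self, language: Lang = Lang.EN):
        # Without the explicit argument the grader always rendered the default
        # language, even when constructed with language=Lang.ZH.
        return [{"role": "user", "content": self.prompts[language.value]}]


tmpl = TwoLanguageTemplate({"en": "Rate the image.", "zh": "请评估图片。"})
print(tmpl.to_messages(Lang.ZH)[0]["content"])  # 请评估图片。
```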
diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py
index ca37dd64..4cb645f4 100644
--- a/openjudge/graders/multimodal/text_to_image.py
+++ b/openjudge/graders/multimodal/text_to_image.py
@@ -20,6 +20,7 @@
 from openjudge.models.openai_chat_model import OpenAIChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -259,37 +260,18 @@ async def _aevaluate_semantic_consistency(
         messages = self.semantic_template.to_messages(self.language)
         prompt = messages[0].format(query=query).content

-        try:
-            content = format_image_content(prompt, [response])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
+        content = format_image_content(prompt, [response])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-            # Handle both streaming and non-streaming responses
-            if hasattr(chat_response, "__aiter__"):
-                # This is a streaming response, we need to collect it first
-                collected_content = []
-                parsed = {}
-                async for chunk in chat_response:
-                    if chunk.content:
-                        collected_content.extend(chunk.content)
-                    if chunk.parsed:
-                        parsed.update(chunk.parsed)
-
-                # Extract score and reason from metadata
-                score = parsed.get("score", 0.0)
-                reason = parsed.get("reason", "")
-            else:
-                # Non-streaming response
-                score = chat_response.parsed["score"]
-                score = score if isinstance(score, list) else [score]
-                reason = chat_response.parsed["reason"]
-            return score, reason
-
-        except Exception as e:
-            logger.error(f"Error evaluating semantic consistency: {e}")
-            return [5.0], f"Error during evaluation: {str(e)}"
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        score = score if isinstance(score, list) else [score]
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _aevaluate_perceptual_quality(
         self,
@@ -299,20 +281,18 @@ async def _aevaluate_perceptual_quality(
         messages = self.perceptual_template.to_messages(self.language)
         prompt = messages[0].content

-        try:
-            content = format_image_content(prompt, [response])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-            score = chat_response.parsed["score"]
-            score = score[:2] if isinstance(score, list) else [score, score]
-            reason = chat_response.parsed["reason"]
-            return score, reason
+        content = format_image_content(prompt, [response])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-        except Exception as e:
-            logger.error(f"Error evaluating perceptual quality: {e}")
-            return [5.0, 5.0], f"Error during evaluation: {str(e)}"
+        # Default to [5.0, 5.0] (neutral scores on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", [5.0, 5.0])
+        score = score[:2] if isinstance(score, list) else [score, score]
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _a_compute(
         self,
@@ -406,7 +386,16 @@ async def aevaluate(
                 metadata={"error": "response must be MLLMImage"},
             )

-        score, details = await self._a_compute(query, response, **kwargs)
+        try:
+            score, details = await self._a_compute(query, response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating text-to-image: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         # Generate comprehensive reason
         reason = f"""Text-to-Image Quality Score: {score:.4f}
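The perceptual-quality path works with a two-element score list, so the patch normalizes whatever the judge returns. A small worked example of that normalization and the neutral fallback (the two-element interpretation is inferred from the `[5.0, 5.0]` default in the diff):

```python
def normalize_pair(parsed: dict) -> list:
    # Mirrors the normalization above: missing -> neutral pair, scalar -> duplicated,
    # longer list -> truncated to the first two entries.
    score = parsed.get("score", [5.0, 5.0])
    return score[:2] if isinstance(score, list) else [score, score]


print(normalize_pair({}))                        # [5.0, 5.0]
print(normalize_pair({"score": 7.0}))            # [7.0, 7.0]
print(normalize_pair({"score": [8.0, 6.0, 9.0]}))  # [8.0, 6.0]
```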
diff --git a/openjudge/utils/utils.py b/openjudge/utils/utils.py
index 92c971c1..b8967a16 100644
--- a/openjudge/utils/utils.py
+++ b/openjudge/utils/utils.py
@@ -6,7 +6,7 @@
 """

 import json
-from typing import Any, Dict, Type
+from typing import Any, Dict, Optional, Type

 from json_repair import repair_json
 from loguru import logger
@@ -203,3 +203,42 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
         metric_name = getattr(metric, "name", "unknown_metric")
         logger.error(f"{metric_name}: {error_msg}")
         raise ValueError(error_msg) from e
+
+
+async def parse_structured_chat_response(
+    chat_response: Any,
+    default: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """Parse structured response from streaming or non-streaming chat response.
+
+    For streaming responses, returns the last chunk's parsed result (complete).
+    For non-streaming responses, returns the parsed result directly.
+
+    Args:
+        chat_response: Chat response object from model.achat() with structured_model.
+            Can be either streaming (async iterator) or non-streaming.
+        default: Default dict to return if parsing fails. Defaults to empty dict.
+
+    Returns:
+        Dict[str, Any]: The parsed structured response containing fields like
+            'score' and 'reason'.
+
+    Example:
+        >>> response = await model.achat(messages, structured_model=GraderScoreCallback)
+        >>> parsed = await parse_structured_chat_response(response)
+        >>> score = parsed.get("score", 5.0)
+        >>> reason = parsed.get("reason", "")
+    """
+    if default is None:
+        default = {}
+
+    if hasattr(chat_response, "__aiter__"):
+        # Streaming response - only the last chunk contains complete result
+        parsed = None
+        async for chunk in chat_response:
+            if chunk.parsed:
+                parsed = chunk.parsed
+        return parsed if parsed is not None else default
+
+    # Non-streaming response
+    return chat_response.parsed if chat_response.parsed else default
diff --git a/tests/graders/common/test_correctness.py b/tests/graders/common/test_correctness.py
index ca0dc5f9..72c12224 100644
--- a/tests/graders/common/test_correctness.py
+++ b/tests/graders/common/test_correctness.py
@@ -73,7 +73,7 @@ async def test_correctness_grader_with_reference_response(self):
         mock_model = AsyncMock()
         grader = CorrectnessGrader(
             model=mock_model,
-            threshold=0.7,
+            threshold=3,
             language=LanguageEnum.EN,
         )

@@ -93,7 +93,7 @@ async def test_correctness_grader_with_reference_response(self):
         assert isinstance(result.score, (int, float))
         assert result.score >= 1 and result.score <= 5
         assert "Correctness score" in result.reason
-        assert result.metadata["threshold"] == 0.7
+        assert result.metadata["threshold"] == 3

         # Verify model was called correctly
         mock_achat.assert_called_once()
@@ -116,7 +116,7 @@ async def test_correctness_grader_without_reference_response(self):
         mock_model = AsyncMock()
         grader = CorrectnessGrader(
             model=mock_model,
-            threshold=0.7,
+            threshold=3,
             language=LanguageEnum.EN,
         )

@@ -132,7 +132,7 @@ async def test_correctness_grader_without_reference_response(self):
         # Verify result structure
         assert result.name == "correctness"
         assert isinstance(result.score, (int, float))
-        assert result.metadata["threshold"] == 0.7
+        assert result.metadata["threshold"] == 3

         # Verify model was called correctly
         mock_achat.assert_called_once()
diff --git a/tests/graders/common/test_harmfulness.py b/tests/graders/common/test_harmfulness.py
index 63713956..937e6286 100644
--- a/tests/graders/common/test_harmfulness.py
+++ b/tests/graders/common/test_harmfulness.py
@@ -52,10 +52,10 @@ def test_initialization(self):
         mock_model = AsyncMock()
         grader = HarmfulnessGrader(
             model=mock_model,
-            threshold=0.8,
+            threshold=3,
         )

         assert grader.name == "harmfulness"
-        assert grader.threshold == 0.8
+        assert grader.threshold == 3
         assert grader.model == mock_model

     @pytest.mark.asyncio
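The multimodal tests below mock only the non-streaming `.parsed` path. A hedged sketch of a test that would also exercise the streaming branch of `parse_structured_chat_response`; the chunk stand-in is assumed from the fact that the helper only reads `.parsed`:

```python
import asyncio

from openjudge.utils.utils import parse_structured_chat_response


class FakeChunk:  # minimal stand-in; only .parsed is read by the helper
    def __init__(self, parsed):
        self.parsed = parsed


async def fake_stream():
    yield FakeChunk(None)                               # ignored: no parsed payload
    yield FakeChunk({"score": 3.0, "reason": "partial"})
    yield FakeChunk({"score": 8.0, "reason": "final"})  # last parsed chunk wins


async def test_streaming_branch():
    parsed = await parse_structured_chat_response(fake_stream())
    assert parsed == {"score": 8.0, "reason": "final"}


asyncio.run(test_streaming_branch())
```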
mock_image, "Text after"], ) - # Assertions - assert result.score == 0.0 - assert "Evaluation error: API Error" in result.reason + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ==================== diff --git a/tests/graders/multimodal/test_image_helpfulness.py b/tests/graders/multimodal/test_image_helpfulness.py index 17c3a08f..0b3f0c34 100644 --- a/tests/graders/multimodal/test_image_helpfulness.py +++ b/tests/graders/multimodal/test_image_helpfulness.py @@ -62,7 +62,7 @@ async def test_successful_evaluation(self): # Create a simple mock response object (not AsyncMock to avoid __aiter__ check) class MockResponse: def __init__(self): - self.metadata = { + self.parsed = { "score": 8.0, # Will be normalized to 0.8 (divided by 10) "reason": "Image is very helpful for understanding the text", } @@ -92,6 +92,8 @@ def __init__(self): @pytest.mark.asyncio async def test_error_handling(self): """Test graceful error handling""" + from openjudge.graders.base_grader import GraderError + # Create mock model that raises exception mock_model = AsyncMock() mock_model.achat = AsyncMock(side_effect=Exception("API Error")) @@ -105,9 +107,9 @@ async def test_error_handling(self): response=["Text before", mock_image, "Text after"], ) - # Assertions - assert result.score == 0.0 - assert "Evaluation error: API Error" in result.reason + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ==================== diff --git a/tests/graders/multimodal/test_text_to_image.py b/tests/graders/multimodal/test_text_to_image.py index d695eb9a..4890825a 100644 --- a/tests/graders/multimodal/test_text_to_image.py +++ b/tests/graders/multimodal/test_text_to_image.py @@ -64,7 +64,7 @@ async def test_successful_evaluation(self): # Create simple mock response objects (not AsyncMock to avoid __aiter__ check) class MockResponse: def __init__(self, score, reason): - self.metadata = {"score": score, "reason": reason} + self.parsed = {"score": score, "reason": reason} # TextToImageGrader calls model twice (semantic + perceptual) mock_semantic = MockResponse(8.0, "Good semantic consistency") @@ -95,6 +95,8 @@ def __init__(self, score, reason): @pytest.mark.asyncio async def test_error_handling(self): """Test graceful error handling""" + from openjudge.graders.base_grader import GraderError + # Create mock model that raises exception mock_model = AsyncMock(spec=BaseChatModel) mock_model.achat = AsyncMock(side_effect=Exception("API Error")) @@ -109,10 +111,9 @@ async def test_error_handling(self): response=mock_image, ) - # Assertions - # TextToImageGrader returns 0.5 (default) on error, not 0.0 - assert result.score == 0.5 - assert "error" in result.reason.lower() + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ====================