diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
index 8b87979f..8bac3933 100644
--- a/openjudge/graders/common/correctness.py
+++ b/openjudge/graders/common/correctness.py
@@ -11,7 +11,7 @@

 from loguru import logger

-from openjudge.graders.base_grader import GraderMode, GraderScore
+from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
@@ -240,8 +240,8 @@ class CorrectnessGrader(LLMGrader):

     Example:
         >>> import asyncio
-        >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.llm_judge import CorrectnessGrader
+        >>> from openjudge.models.openai_chat_model import OpenAIChatModel
+        >>> from openjudge.graders.common.correctness import CorrectnessGrader
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -326,25 +326,19 @@ async def aevaluate(
                 context=context,
                 reference_response=reference_response,
             )
-            score = result.score
-            reason = result.reason
+            return GraderScore(
+                name=self.name,
+                score=result.score,
+                reason=result.reason,
+                metadata={"threshold": self.threshold},
+            )
         except Exception as e:
             logger.error(f"Error evaluating correctness: {e}")
-            score = 0.0
-            reason = f"Evaluation error: {str(e)}"
-
-        # Prepare metadata
-        metadata = {
-            "threshold": self.threshold,
-        }
-
-        return GraderScore(
-            name=self.name,
-            score=score,
-            reason=reason,
-            metadata=metadata,
-        )
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )


 __all__ = ["CorrectnessGrader", "DEFAULT_CORRECTNESS_TEMPLATE"]
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index 3644ed4a..30ca8745 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -11,7 +11,7 @@

 from loguru import logger

-from openjudge.graders.base_grader import GraderMode, GraderScore
+from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
@@ -210,8 +210,8 @@ class HallucinationGrader(LLMGrader):

     Example:
         >>> import asyncio
-        >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.llm_judge import HallucinationGrader
+        >>> from openjudge.models.openai_chat_model import OpenAIChatModel
+        >>> from openjudge.graders.common.hallucination import HallucinationGrader
         >>>
         >>> # Initialize model
         >>> model = OpenAIChatModel(
@@ -318,25 +318,19 @@ async def aevaluate(
                 context=context,
                 reference_response=reference_response,
             )
-            score = result.score
-            reason = result.reason
+            return GraderScore(
+                name=self.name,
+                score=result.score,
+                reason=result.reason,
+                metadata={"threshold": self.threshold},
+            )
         except Exception as e:
             logger.error(f"Error evaluating hallucination: {e}")
-            score = 0.0
-            reason = f"Evaluation error: {str(e)}"
-
-        # Prepare metadata
-        metadata = {
-            "threshold": self.threshold,
-        }
-
-        return GraderScore(
-            name=self.name,
-            score=score,
-            reason=reason,
-            metadata=metadata,
-        )
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

     @staticmethod
     def get_metadata() -> Dict[str, Any]:
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
index be6a9ee8..5ec35799 100644
--- a/openjudge/graders/common/harmfulness.py
+++ b/openjudge/graders/common/harmfulness.py
@@ -10,7 +10,7 @@

 from loguru import logger

-from openjudge.graders.base_grader import GraderMode, GraderScore
+from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
@@ -219,8 +219,8 @@ class HarmfulnessGrader(LLMGrader):

     Example:
         >>> import asyncio
-        >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.llm_judge import HarmfulnessGrader
+        >>> from openjudge.models.openai_chat_model import OpenAIChatModel
+        >>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -303,25 +303,19 @@ async def aevaluate(
                 context=context,
                 reference_response=reference_response,
             )
-            score = result.score
-            reason = result.reason
+            return GraderScore(
+                name=self.name,
+                score=result.score,
+                reason=result.reason,
+                metadata={"threshold": self.threshold},
+            )
         except Exception as e:
             logger.error(f"Error evaluating harmfulness: {e}")
-            score = 0.0
-            reason = f"Evaluation error: {str(e)}"
-
-        # Prepare metadata
-        metadata = {
-            "threshold": self.threshold,
-        }
-
-        return GraderScore(
-            name=self.name,
-            score=score,
-            reason=reason,
-            metadata=metadata,
-        )
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )


 __all__ = ["HarmfulnessGrader", "DEFAULT_HARMFULNESS_TEMPLATE"]
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
index 8e341024..7b61860b 100644
--- a/openjudge/graders/common/instruction_following.py
+++ b/openjudge/graders/common/instruction_following.py
@@ -11,7 +11,7 @@

 from loguru import logger

-from openjudge.graders.base_grader import GraderMode, GraderScore
+from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
@@ -234,8 +234,8 @@ class InstructionFollowingGrader(LLMGrader):
         - metadata: Threshold and evaluation details

     Example:
-        >>> from openjudge.model.openai_llm import OpenAIChatModel
-        >>> from openjudge.llm_judge import InstructionFollowingGrader
+        >>> from openjudge.models.openai_chat_model import OpenAIChatModel
+        >>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -244,8 +244,8 @@ class InstructionFollowingGrader(LLMGrader):
         >>> # Good adherence
         >>> result = asyncio.run(grader.aevaluate(
         ...     instruction="Write exactly 3 sentences in formal academic tone.",
-        ...     output="Climate change poses serious risks. Research shows rising temperatures."
-        ...     "Action is urgently needed."
+        ...     response="Climate change poses serious risks. Research shows rising temperatures. "
+        ...     "Action is urgently needed."
         ... ))
         >>> print(result.score)  # 5 - follows all requirements
         >>>
@@ -314,25 +314,19 @@ async def aevaluate(
                 response=response,
                 query=query,
             )
-            score = result.score
-            reason = result.reason
+            return GraderScore(
+                name=self.name,
+                score=result.score,
+                reason=result.reason,
+                metadata={"threshold": self.threshold},
+            )
         except Exception as e:
             logger.error(f"Error evaluating instruction following: {e}")
-            score = 0.0
-            reason = f"Evaluation error: {str(e)}"
-
-        # Prepare metadata
-        metadata = {
-            "threshold": self.threshold,
-        }
-
-        return GraderScore(
-            name=self.name,
-            score=score,
-            reason=reason,
-            metadata=metadata,
-        )
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )


 __all__ = ["InstructionFollowingGrader", "DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE"]
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
index a934b3fc..76e0de2f 100644
--- a/openjudge/graders/common/relevance.py
+++ b/openjudge/graders/common/relevance.py
@@ -217,6 +217,7 @@ class RelevanceGrader(LLMGrader):

     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
+        threshold: Minimum score [0, 1] to pass (default: 0.7)
         template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)

@@ -224,7 +225,7 @@ class RelevanceGrader(LLMGrader):
         GraderScore object with:
         - score: Score [1, 5] where 5 = highly relevant, 1 = irrelevant
         - reason: Explanation of relevance assessment
-        - metadata: Evaluation details
+        - metadata: Threshold and evaluation details

     Example:
         >>> import asyncio
@@ -233,7 +234,7 @@ class RelevanceGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
-        >>> grader = RelevanceGrader(model=model)
+        >>> grader = RelevanceGrader(model=model, threshold=0.7)
         >>>
         >>> # Relevant response
        >>> result = asyncio.run(grader.aevaluate(
@@ -261,6 +262,7 @@ class RelevanceGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
+        threshold: float = 0.7,
         template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
         language: LanguageEnum = LanguageEnum.EN,
     ):
@@ -269,6 +271,7 @@ def __init__(

         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
+            threshold: Success threshold [0, 1] (default: 0.7)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_RELEVANCE_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
         """
@@ -280,6 +283,7 @@ def __init__(
             template=template,
             language=language,
         )
+        self.threshold = threshold

     async def aevaluate(
         self,
@@ -315,8 +319,12 @@ async def aevaluate(
                 context=context,
                 reference_response=reference_response,
             )
-            score = result.score
-            reason = result.reason
+            return GraderScore(
+                name=self.name,
+                score=result.score,
+                reason=result.reason,
+                metadata={"threshold": self.threshold},
+            )
         except Exception as e:
             logger.error(f"Error evaluating relevance: {e}")

@@ -325,11 +333,5 @@ async def aevaluate(
                 error=f"Evaluation error: {str(e)}",
             )

-        return GraderScore(
-            name=self.name,
-            score=score,
-            reason=reason,
-        )
-

 __all__ = ["RelevanceGrader", "DEFAULT_RELEVANCE_TEMPLATE"]
diff --git a/openjudge/graders/format/json/json_match.py b/openjudge/graders/format/json/json_match.py
index a30939a8..e816dde6 100644
--- a/openjudge/graders/format/json/json_match.py
+++ b/openjudge/graders/format/json/json_match.py
@@ -42,7 +42,7 @@ def __init__(
     ):
         super().__init__(
             name=name,
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description=description,
         )
         self.strict_order = strict_order
diff --git a/openjudge/graders/format/json/json_validator.py b/openjudge/graders/format/json/json_validator.py
index 21c60177..9621c9e6 100644
--- a/openjudge/graders/format/json/json_validator.py
+++ b/openjudge/graders/format/json/json_validator.py
@@ -38,7 +38,7 @@ def __init__(
     ):
         super().__init__(
             name=name,
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description=description,
         )

@@ -49,6 +49,14 @@ def _compute(self, response: str) -> tuple[bool, dict]:
         Returns:
             tuple[bool, dict]: (is_valid, details)
         """
+        # Input validation
+        if not isinstance(response, str):
+            return False, {
+                "is_valid": False,
+                "error_message": f"Invalid input type: expected str, got {type(response).__name__}",
+                "response_length": 0,
+            }
+
         try:
             json.loads(response)
             return True, {"is_valid": True, "response_length": len(response)}
@@ -58,12 +66,6 @@ def _compute(self, response: str) -> tuple[bool, dict]:
                 "is_valid": False,
                 "error_message": f"JSON decode error: {str(e)}",
                 "response_length": len(response),
             }
-        except TypeError as e:
-            return False, {
-                "is_valid": False,
-                "error_message": f"Type error: {str(e)}",
-                "response_length": len(response),
-            }
         except Exception as e:
             return False, {
                 "is_valid": False,
diff --git a/openjudge/graders/format/length_penalty.py b/openjudge/graders/format/length_penalty.py
index eb8d4e89..328874c2 100644
--- a/openjudge/graders/format/length_penalty.py
+++ b/openjudge/graders/format/length_penalty.py
@@ -29,7 +29,6 @@ def __init__(
         """
         super().__init__(
             name="length_penalty",
-            grader_mode="content",
             mode=GraderMode.POINTWISE,
             description="Text length based penalty for content that is too short or too long.",
         )
@@ -77,7 +76,6 @@ async def aevaluate(self, response: str) -> GraderScore:
             >>> print(result.score < 0)
             True
         """
-
         length = len(response)
         penalty = 0.0
diff --git a/openjudge/graders/format/ngram_repetition_penalty.py b/openjudge/graders/format/ngram_repetition_penalty.py
index 371f8fa5..7352745d 100644
--- a/openjudge/graders/format/ngram_repetition_penalty.py
+++ b/openjudge/graders/format/ngram_repetition_penalty.py
@@ -45,7 +45,7 @@ def __init__(
         """
         super().__init__(
             name="ngram_repetition_penalty",
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description="Calculate N-gram repetition penalty supporting Chinese processing "
             "and multiple penalty strategies.",
         )
@@ -67,7 +67,7 @@ def __init__(
             chinese_only=chinese_only,
         )

-        self._think_pattern = re.compile(r"(.*?)", flags=re.DOTALL)
+        self._think_pattern = re.compile(r"(.*?)", flags=re.DOTALL)

     def _extract_thought_process(self, content: str) -> str:
         """Extract thought process"""
@@ -146,7 +146,6 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
             >>> print(result.score)
             0.0
         """
-
         # Select text based on analysis scope
         if self.analyze_scope == "thought":
             text_to_analyze = self._extract_thought_process(response)
diff --git a/openjudge/graders/format/reasoning_format.py b/openjudge/graders/format/reasoning_format.py
index d0ec45e7..a2280dd1 100644
--- a/openjudge/graders/format/reasoning_format.py
+++ b/openjudge/graders/format/reasoning_format.py
@@ -74,7 +74,6 @@ async def aevaluate(self, response: str, *args: Any, **kwargs: Any) -> GraderSco
             >>> print(result.score)
             1.0
         """
-
         # Check thinking format tags
         has_think_tag = bool(self.think_pattern.search(response))
diff --git a/openjudge/graders/format/reasoning_tool_format.py b/openjudge/graders/format/reasoning_tool_format.py
index 4bf56ab8..b249d0bc 100644
--- a/openjudge/graders/format/reasoning_tool_format.py
+++ b/openjudge/graders/format/reasoning_tool_format.py
@@ -80,7 +80,6 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
             >>> print(result.score)
             1.0
         """
-
         # Extract tag contents
         think_matches = self._think_pattern.search(response)
         answer_matches = self._answer_pattern.search(response)
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
index ae95a495..faedd0d3 100644
--- a/openjudge/graders/multimodal/image_coherence.py
+++ b/openjudge/graders/multimodal/image_coherence.py
@@ -195,7 +195,7 @@ def __init__(
         """
         super().__init__(
             name="image_coherence",
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description="Evaluate image-text coherence",
             model=model,
             template=template,
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
index effd61c5..33d4d33e 100644
--- a/openjudge/graders/multimodal/image_helpfulness.py
+++ b/openjudge/graders/multimodal/image_helpfulness.py
@@ -197,7 +197,7 @@ def __init__(
         """
         super().__init__(
             name="image_helpfulness",
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description="Evaluate image helpfulness for understanding text",
             model=model,
             template=template,
diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py
index 39662b7c..ca37dd64 100644
--- a/openjudge/graders/multimodal/text_to_image.py
+++ b/openjudge/graders/multimodal/text_to_image.py
@@ -241,7 +241,7 @@ def __init__(
         """
         super().__init__(
             name="text_to_image",
-            grader_mode=GraderMode.POINTWISE,
+            mode=GraderMode.POINTWISE,
             description="Evaluate text-to-image generation quality",
         )
         self.model = model if isinstance(model, BaseChatModel) else OpenAIChatModel(**model)
diff --git a/tests/graders/common/test_correctness.py b/tests/graders/common/test_correctness.py
index 21f2c77b..ca0dc5f9 100644
--- a/tests/graders/common/test_correctness.py
+++ b/tests/graders/common/test_correctness.py
@@ -156,9 +156,8 @@ async def test_error_handling(self):
             reference_response="The capital of France is Paris.",
         )

-        # Assertions
-        assert result.score == 0.0
-        assert "Evaluation error: API Error" in result.reason
+        # Assertions - error is returned as GraderError with error field
+        assert "Evaluation error: API Error" in result.error


 # ==================== QUALITY TESTS ====================
diff --git a/tests/graders/common/test_hallucination.py b/tests/graders/common/test_hallucination.py
index 99e1b5ac..7d99a6cf 100644
--- a/tests/graders/common/test_hallucination.py
+++ b/tests/graders/common/test_hallucination.py
@@ -162,9 +162,8 @@ async def test_error_handling(self):
             context="Programming",
         )

-        # Assertions
-        assert result.score == 0.0
-        assert "Evaluation error: API Error" in result.reason
+        # Assertions - error is returned as GraderError with error field
+        assert "Evaluation error: API Error" in result.error


 # ==================== QUALITY TESTS ====================
diff --git a/tests/graders/common/test_harmfulness.py b/tests/graders/common/test_harmfulness.py
index 19052560..63713956 100644
--- a/tests/graders/common/test_harmfulness.py
+++ b/tests/graders/common/test_harmfulness.py
@@ -144,9 +144,8 @@ async def test_error_handling(self):
             response="Python is a high-level programming language.",
         )

-        # Assertions
-        assert result.score == 0.0
-        assert "Evaluation error: API Error" in result.reason
+        # Assertions - error is returned as GraderError with error field
+        assert "Evaluation error: API Error" in result.error


 # ==================== QUALITY TESTS ====================
diff --git a/tests/graders/common/test_instruction_following.py b/tests/graders/common/test_instruction_following.py
index a0be4ddc..70965e1f 100644
--- a/tests/graders/common/test_instruction_following.py
+++ b/tests/graders/common/test_instruction_following.py
@@ -139,9 +139,8 @@ async def test_error_handling(self):
             response="Python is a programming language.",
         )

-        # Assertions
-        assert result.score == 0.0
-        assert "Evaluation error: API Error" in result.reason
+        # Assertions - error is returned as GraderError with error field
+        assert "Evaluation error: API Error" in result.error


 # ==================== QUALITY TESTS ====================
diff --git a/tests/graders/common/test_relevance.py b/tests/graders/common/test_relevance.py
index 4828b014..14a4f313 100644
--- a/tests/graders/common/test_relevance.py
+++ b/tests/graders/common/test_relevance.py
@@ -139,7 +139,7 @@ async def test_error_handling(self):
             response="Python is a programming language.",
        )

-        # Assertions
+        # Assertions - error is returned as GraderError with error field
         assert "Evaluation error: API Error" in result.error
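
Usage sketch (illustrative, not part of the patch): this change alters the failure contract of the common graders. Evaluation errors now come back as a GraderError whose error field carries the message, instead of a GraderScore pinned to 0.0, and successful evaluations return a GraderScore whose metadata includes the grader's threshold. The snippet below shows how a caller might branch on the two result types; the import paths and the OpenAIChatModel constructor arguments are taken from the updated docstrings, while the exact aevaluate keyword names and the CorrectnessGrader constructor signature are assumptions based on the tests.

# Minimal consumer sketch for the new return contract (assumed names noted above).
import asyncio

from openjudge.graders.base_grader import GraderError, GraderScore
from openjudge.graders.common.correctness import CorrectnessGrader
from openjudge.models.openai_chat_model import OpenAIChatModel


async def main() -> None:
    model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
    grader = CorrectnessGrader(model=model)

    result = await grader.aevaluate(
        query="What is the capital of France?",
        response="The capital of France is Paris.",
        reference_response="The capital of France is Paris.",
    )

    if isinstance(result, GraderError):
        # API failures and timeouts no longer masquerade as a 0.0 score.
        print(f"Evaluation failed: {result.error}")
    elif isinstance(result, GraderScore):
        # Successful evaluations now carry the threshold in metadata.
        print(result.score, result.reason, result.metadata["threshold"])


if __name__ == "__main__":
    asyncio.run(main())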