diff --git a/openjudge/graders/agent/memory/memory_accuracy.py b/openjudge/graders/agent/memory/memory_accuracy.py index 53e7b46..76e11f9 100644 --- a/openjudge/graders/agent/memory/memory_accuracy.py +++ b/openjudge/graders/agent/memory/memory_accuracy.py @@ -180,7 +180,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate memory accuracy", model=model, - template=template, + template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE, language=language, ) diff --git a/openjudge/graders/agent/tool/tool_call_success.py b/openjudge/graders/agent/tool/tool_call_success.py index 4cb350d..65fe8ec 100644 --- a/openjudge/graders/agent/tool/tool_call_success.py +++ b/openjudge/graders/agent/tool/tool_call_success.py @@ -238,10 +238,9 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluates whether tool calls done by an AI agent includes failures or not", model=model, - template=template, + template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE, language=language, ) - self.template = template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE async def aevaluate( self, diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py index 4508e71..012754c 100644 --- a/openjudge/graders/agent/tool/tool_selection.py +++ b/openjudge/graders/agent/tool/tool_selection.py @@ -205,7 +205,7 @@ def __init__( super().__init__( name="tool_selection", mode=GraderMode.POINTWISE, - description="Evaluate tool selection ", + description="Evaluate tool selection", model=model, template=template or DEFAULT_TOOL_SELECTION_TEMPLATE, language=language, diff --git a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py index 85a3546..c2713c6 100644 --- a/openjudge/graders/agent/trajectory/trajectory_comprehensive.py +++ b/openjudge/graders/agent/trajectory/trajectory_comprehensive.py @@ -427,7 +427,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment", model=model, - template=template, + template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE, language=language, structured_model=TrajectoryEvaluationOutput, callback=self._create_trajectory_callback(language=language), diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py index 8b87979..a099f9a 100644 --- a/openjudge/graders/common/correctness.py +++ b/openjudge/graders/common/correctness.py @@ -285,7 +285,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response matches the provided reference response", model=model, - template=template, + template=template or DEFAULT_CORRECTNESS_TEMPLATE, language=language, ) self.threshold = threshold diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py index be6a9ee..03c36fa 100644 --- a/openjudge/graders/common/harmfulness.py +++ b/openjudge/graders/common/harmfulness.py @@ -263,7 +263,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response contains harmful or inappropriate content", model=model, - template=template, + template=template or DEFAULT_HARMFULNESS_TEMPLATE, language=language, ) self.threshold = threshold diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py index 8e34102..5f8f21c 100644 --- a/openjudge/graders/common/instruction_following.py +++ b/openjudge/graders/common/instruction_following.py @@ -279,7 +279,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate whether response follows the given instructions", model=model, - template=template, + template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE, language=language, ) self.threshold = threshold diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py index a934b3f..aa83a9f 100644 --- a/openjudge/graders/common/relevance.py +++ b/openjudge/graders/common/relevance.py @@ -277,7 +277,7 @@ def __init__( mode=GraderMode.POINTWISE, description="Evaluate relevance of response to user query", model=model, - template=template, + template=template or DEFAULT_RELEVANCE_TEMPLATE, language=language, ) diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py index ae95a49..dccae1a 100644 --- a/openjudge/graders/multimodal/image_coherence.py +++ b/openjudge/graders/multimodal/image_coherence.py @@ -27,7 +27,8 @@ # pylint: disable=line-too-long # English Prompt -IMAGE_COHERENCE_PROMPT_EN = """ +IMAGE_COHERENCE_PROMPT_EN = textwrap.dedent( + """ # Task Description You are a multi-modal document evaluation assistant. You will receive an image and its textual context. Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies. @@ -62,9 +63,11 @@ # Image [Insert Image Here] """ +).strip() # Chinese Prompt -IMAGE_COHERENCE_PROMPT_ZH = """ +IMAGE_COHERENCE_PROMPT_ZH = textwrap.dedent( + """ # 任务描述 你是一名多模态文档评估助手。你将收到一张图片及其文本背景。 你的任务是评估图片与其伴随文本(上下文)之间的连贯性。 @@ -99,6 +102,7 @@ # 图片 [在此插入图片] """ +).strip() # Build default template from prompts DEFAULT_IMAGE_COHERENCE_TEMPLATE = PromptTemplate( @@ -106,13 +110,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_EN), + content=IMAGE_COHERENCE_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_ZH), + content=IMAGE_COHERENCE_PROMPT_ZH, ), ], }, @@ -159,19 +163,20 @@ class ImageCoherenceGrader(LLMGrader): GraderScore with normalized coherence score [0, 1] Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel >>> from openjudge.multimodal import ImageCoherenceGrader, MLLMImage >>> >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") >>> grader = ImageCoherenceGrader(model=model) >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... response=[ ... "Q3 sales increased 25%.", ... MLLMImage(url="https://example.com/sales_chart.jpg"), ... "Growth driven by new products." ... ] - ... ) + ... )) >>> print(result.score) # 0.95 - image coherent with sales context """ @@ -195,10 +200,10 @@ def __init__( """ super().__init__( name="image_coherence", - grader_mode=GraderMode.POINTWISE, + mode=GraderMode.POINTWISE, description="Evaluate image-text coherence", model=model, - template=template, + template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE, language=language, ) self.max_context_size = max_context_size diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py index effd61c..9abd6c4 100644 --- a/openjudge/graders/multimodal/image_helpfulness.py +++ b/openjudge/graders/multimodal/image_helpfulness.py @@ -28,7 +28,8 @@ # pylint: disable=line-too-long # English Prompt -IMAGE_HELPFULNESS_PROMPT_EN = """ +IMAGE_HELPFULNESS_PROMPT_EN = textwrap.dedent( + """ # Task Description You are a multi-modal document evaluation assistant. You will receive an image and its textual context. Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies. @@ -63,9 +64,11 @@ # Image [Insert Image Here] """ +).strip() # Chinese Prompt -IMAGE_HELPFULNESS_PROMPT_ZH = """ +IMAGE_HELPFULNESS_PROMPT_ZH = textwrap.dedent( + """ # 任务描述 你是一名多模态文档评估助手。你将收到一张图片及其文本背景。 你的任务是评估图片对于帮助人类读者理解其伴随文本(上下文)的有用性。 @@ -100,6 +103,7 @@ # 图片 [在此插入图片] """ +).strip() # Build default template from prompts DEFAULT_IMAGE_HELPFULNESS_TEMPLATE = PromptTemplate( @@ -107,13 +111,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_EN), + content=IMAGE_HELPFULNESS_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_ZH), + content=IMAGE_HELPFULNESS_PROMPT_ZH, ), ], }, @@ -161,13 +165,14 @@ class ImageHelpfulnessGrader(LLMGrader): GraderScore with normalized helpfulness score [0, 1] Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel >>> from openjudge.multimodal import ImageHelpfulnessGrader, MLLMImage >>> >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") >>> grader = ImageHelpfulnessGrader(model=model) >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... response=[ ... "The system architecture has three layers.", ... MLLMImage(url="https://example.com/arch_diagram.jpg"), @@ -197,10 +202,10 @@ def __init__( """ super().__init__( name="image_helpfulness", - grader_mode=GraderMode.POINTWISE, + mode=GraderMode.POINTWISE, description="Evaluate image helpfulness for understanding text", model=model, - template=template, + template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE, language=language, ) self.max_context_size = max_context_size @@ -362,4 +367,4 @@ async def aevaluate( ) -__all__ = ["ImageHelpfulnessGrader"] +__all__ = ["ImageHelpfulnessGrader", "DEFAULT_IMAGE_HELPFULNESS_TEMPLATE"] diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py index 39662b7..609f17e 100644 --- a/openjudge/graders/multimodal/text_to_image.py +++ b/openjudge/graders/multimodal/text_to_image.py @@ -24,7 +24,8 @@ # pylint: disable=line-too-long # English Prompts -TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = """ +TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = textwrap.dedent( + """ You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. @@ -47,8 +48,10 @@ Text Prompt: {query} """ +).strip() -TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = """ +TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = textwrap.dedent( + """ You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. @@ -76,9 +79,11 @@ ) Put the score in a list such that output score = [naturalness, artifacts] """ +).strip() # Chinese Prompts -TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = """ +TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = textwrap.dedent( + """ 你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。 所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的,因此你无需担心隐私机密问题。 @@ -101,8 +106,10 @@ 文本提示:{query} """ +).strip() -TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = """ +TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = textwrap.dedent( + """ 你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。 所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的,因此你无需担心隐私机密问题。 @@ -130,6 +137,7 @@ ) 将分数放在列表中,输出分数 = [自然度, 伪影] """ +).strip() # Build default templates DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE = PromptTemplate( @@ -137,13 +145,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN), + content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH), + content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH, ), ], }, @@ -154,13 +162,13 @@ LanguageEnum.EN: [ ChatMessage( role="user", - content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN), + content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN, ), ], LanguageEnum.ZH: [ ChatMessage( role="user", - content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH), + content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH, ), ], }, @@ -208,16 +216,17 @@ class TextToImageGrader(BaseGrader): GraderScore with combined quality score [0, 1] Example: + >>> import asyncio >>> from openjudge.model.openai_llm import OpenAIChatModel >>> from openjudge.multimodal import TextToImageGrader, MLLMImage >>> >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") >>> grader = TextToImageGrader(model=model) >>> - >>> result = await grader.aevaluate( + >>> result = asyncio.run(grader.aevaluate( ... query="A fluffy orange cat sitting on a blue sofa", ... response=MLLMImage(url="https://example.com/generated.jpg") - ... ) + ... )) >>> print(result.score) # 0.92 - excellent prompt following and quality """ @@ -241,13 +250,13 @@ def __init__( """ super().__init__( name="text_to_image", - grader_mode=GraderMode.POINTWISE, + mode=GraderMode.POINTWISE, description="Evaluate text-to-image generation quality", ) self.model = model if isinstance(model, BaseChatModel) else OpenAIChatModel(**model) self.threshold = threshold - self.semantic_template = semantic_template - self.perceptual_template = perceptual_template + self.semantic_template = semantic_template or DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE + self.perceptual_template = perceptual_template or DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE self.language = language async def _aevaluate_semantic_consistency( @@ -430,4 +439,4 @@ async def aevaluate( ) -__all__ = ["TextToImageGrader"] +__all__ = ["TextToImageGrader", "DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE", "DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE"]