Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openjudge/graders/agent/memory/memory_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate memory accuracy",
model=model,
template=template,
template=template or DEFAULT_MEMORY_ACCURACY_TEMPLATE,
language=language,
)

Expand Down
3 changes: 1 addition & 2 deletions openjudge/graders/agent/tool/tool_call_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,10 +238,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluates whether tool calls done by an AI agent includes failures or not",
model=model,
template=template,
template=template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE,
language=language,
)
self.template = template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE

async def aevaluate(
self,
Expand Down
2 changes: 1 addition & 1 deletion openjudge/graders/agent/tool/tool_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(
super().__init__(
name="tool_selection",
mode=GraderMode.POINTWISE,
description="Evaluate tool selection ",
description="Evaluate tool selection",
model=model,
template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
language=language,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Comprehensive evaluation for agent trajectories including step-level and overall problem-solving assessment",
model=model,
template=template,
template=template or DEFAULT_TRAJECTORY_COMPREHENSIVE_TEMPLATE,
language=language,
structured_model=TrajectoryEvaluationOutput,
callback=self._create_trajectory_callback(language=language),
Expand Down
2 changes: 1 addition & 1 deletion openjudge/graders/common/correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate whether response matches the provided reference response",
model=model,
template=template,
template=template or DEFAULT_CORRECTNESS_TEMPLATE,
language=language,
)
self.threshold = threshold
Expand Down
2 changes: 1 addition & 1 deletion openjudge/graders/common/harmfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate whether response contains harmful or inappropriate content",
model=model,
template=template,
template=template or DEFAULT_HARMFULNESS_TEMPLATE,
language=language,
)
self.threshold = threshold
Expand Down
2 changes: 1 addition & 1 deletion openjudge/graders/common/instruction_following.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate whether response follows the given instructions",
model=model,
template=template,
template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
language=language,
)
self.threshold = threshold
Expand Down
2 changes: 1 addition & 1 deletion openjudge/graders/common/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate relevance of response to user query",
model=model,
template=template,
template=template or DEFAULT_RELEVANCE_TEMPLATE,
language=language,
)

Expand Down
21 changes: 13 additions & 8 deletions openjudge/graders/multimodal/image_coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
# pylint: disable=line-too-long

# English Prompt
IMAGE_COHERENCE_PROMPT_EN = """
IMAGE_COHERENCE_PROMPT_EN = textwrap.dedent(
"""
# Task Description
You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies.
Expand Down Expand Up @@ -62,9 +63,11 @@
# Image
[Insert Image Here]
"""
).strip()

# Chinese Prompt
IMAGE_COHERENCE_PROMPT_ZH = """
IMAGE_COHERENCE_PROMPT_ZH = textwrap.dedent(
"""
# 任务描述
你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
你的任务是评估图片与其伴随文本(上下文)之间的连贯性。
Expand Down Expand Up @@ -99,20 +102,21 @@
# 图片
[在此插入图片]
"""
).strip()

# Build default template from prompts
DEFAULT_IMAGE_COHERENCE_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_EN),
content=IMAGE_COHERENCE_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(IMAGE_COHERENCE_PROMPT_ZH),
content=IMAGE_COHERENCE_PROMPT_ZH,
),
],
},
Expand Down Expand Up @@ -159,19 +163,20 @@ class ImageCoherenceGrader(LLMGrader):
GraderScore with normalized coherence score [0, 1]

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.multimodal import ImageCoherenceGrader, MLLMImage
>>>
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> grader = ImageCoherenceGrader(model=model)
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... response=[
... "Q3 sales increased 25%.",
... MLLMImage(url="https://example.com/sales_chart.jpg"),
... "Growth driven by new products."
... ]
... )
... ))
>>> print(result.score) # 0.95 - image coherent with sales context
"""

Expand All @@ -195,10 +200,10 @@ def __init__(
"""
super().__init__(
name="image_coherence",
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description="Evaluate image-text coherence",
model=model,
template=template,
template=template or DEFAULT_IMAGE_COHERENCE_TEMPLATE,
language=language,
)
self.max_context_size = max_context_size
Expand Down
21 changes: 13 additions & 8 deletions openjudge/graders/multimodal/image_helpfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
# pylint: disable=line-too-long

# English Prompt
IMAGE_HELPFULNESS_PROMPT_EN = """
IMAGE_HELPFULNESS_PROMPT_EN = textwrap.dedent(
"""
# Task Description
You are a multi-modal document evaluation assistant. You will receive an image and its textual context.
Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies.
Expand Down Expand Up @@ -63,9 +64,11 @@
# Image
[Insert Image Here]
"""
).strip()

# Chinese Prompt
IMAGE_HELPFULNESS_PROMPT_ZH = """
IMAGE_HELPFULNESS_PROMPT_ZH = textwrap.dedent(
"""
# 任务描述
你是一名多模态文档评估助手。你将收到一张图片及其文本背景。
你的任务是评估图片对于帮助人类读者理解其伴随文本(上下文)的有用性。
Expand Down Expand Up @@ -100,20 +103,21 @@
# 图片
[在此插入图片]
"""
).strip()

# Build default template from prompts
DEFAULT_IMAGE_HELPFULNESS_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_EN),
content=IMAGE_HELPFULNESS_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(IMAGE_HELPFULNESS_PROMPT_ZH),
content=IMAGE_HELPFULNESS_PROMPT_ZH,
),
],
},
Expand Down Expand Up @@ -161,13 +165,14 @@ class ImageHelpfulnessGrader(LLMGrader):
GraderScore with normalized helpfulness score [0, 1]

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.multimodal import ImageHelpfulnessGrader, MLLMImage
>>>
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> grader = ImageHelpfulnessGrader(model=model)
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... response=[
... "The system architecture has three layers.",
... MLLMImage(url="https://example.com/arch_diagram.jpg"),
Expand Down Expand Up @@ -197,10 +202,10 @@ def __init__(
"""
super().__init__(
name="image_helpfulness",
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description="Evaluate image helpfulness for understanding text",
model=model,
template=template,
template=template or DEFAULT_IMAGE_HELPFULNESS_TEMPLATE,
language=language,
)
self.max_context_size = max_context_size
Expand Down Expand Up @@ -362,4 +367,4 @@ async def aevaluate(
)


__all__ = ["ImageHelpfulnessGrader"]
__all__ = ["ImageHelpfulnessGrader", "DEFAULT_IMAGE_HELPFULNESS_TEMPLATE"]
37 changes: 23 additions & 14 deletions openjudge/graders/multimodal/text_to_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
# pylint: disable=line-too-long

# English Prompts
TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = """
TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN = textwrap.dedent(
"""
You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

Expand All @@ -47,8 +48,10 @@

Text Prompt: {query}
"""
).strip()

TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = """
TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN = textwrap.dedent(
"""
You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

Expand Down Expand Up @@ -76,9 +79,11 @@
)
Put the score in a list such that output score = [naturalness, artifacts]
"""
).strip()

# Chinese Prompts
TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = """
TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH = textwrap.dedent(
"""
你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。
所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的,因此你无需担心隐私机密问题。

Expand All @@ -101,8 +106,10 @@

文本提示:{query}
"""
).strip()

TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = """
TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH = textwrap.dedent(
"""
你是一名专业的数字艺术家。你需要根据给定的规则评估AI生成图像的有效性。
所有输入的图像都是AI生成的。图像中的所有人物也都是AI生成的,因此你无需担心隐私机密问题。

Expand Down Expand Up @@ -130,20 +137,21 @@
将分数放在列表中,输出分数 = [自然度, 伪影]
"""
).strip()

# Build default templates
DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE = PromptTemplate(
messages={
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN),
content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH),
content=TEXT_TO_IMAGE_SEMANTIC_PROMPT_ZH,
),
],
},
Expand All @@ -154,13 +162,13 @@
LanguageEnum.EN: [
ChatMessage(
role="user",
content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN),
content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_EN,
),
],
LanguageEnum.ZH: [
ChatMessage(
role="user",
content=textwrap.dedent(TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH),
content=TEXT_TO_IMAGE_PERCEPTUAL_PROMPT_ZH,
),
],
},
Expand Down Expand Up @@ -208,16 +216,17 @@ class TextToImageGrader(BaseGrader):
GraderScore with combined quality score [0, 1]

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.multimodal import TextToImageGrader, MLLMImage
>>>
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> grader = TextToImageGrader(model=model)
>>>
>>> result = await grader.aevaluate(
>>> result = asyncio.run(grader.aevaluate(
... query="A fluffy orange cat sitting on a blue sofa",
... response=MLLMImage(url="https://example.com/generated.jpg")
... )
... ))
>>> print(result.score) # 0.92 - excellent prompt following and quality
"""

Expand All @@ -241,13 +250,13 @@ def __init__(
"""
super().__init__(
name="text_to_image",
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description="Evaluate text-to-image generation quality",
)
self.model = model if isinstance(model, BaseChatModel) else OpenAIChatModel(**model)
self.threshold = threshold
self.semantic_template = semantic_template
self.perceptual_template = perceptual_template
self.semantic_template = semantic_template or DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE
self.perceptual_template = perceptual_template or DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE
self.language = language

async def _aevaluate_semantic_consistency(
Expand Down Expand Up @@ -430,4 +439,4 @@ async def aevaluate(
)


__all__ = ["TextToImageGrader"]
__all__ = ["TextToImageGrader", "DEFAULT_TEXT_TO_IMAGE_SEMANTIC_TEMPLATE", "DEFAULT_TEXT_TO_IMAGE_PERCEPTUAL_TEMPLATE"]