6 changes: 3 additions & 3 deletions docs/built_in_graders/general.md
@@ -56,7 +56,7 @@ Evaluates how well a response addresses the user's query. Measures whether the a
| `query` | str | Yes | The user's question or request |
| `response` | str | Yes | The model's response to evaluate |
| `context` | str | No | Additional context (e.g., conversation history) |
| `ground_truth` | str | No | Reference answer for comparison |
| `reference_response` | str | No | Reference answer for comparison |

**Grading Criteria:**
- **5**: Comprehensive response with helpful insights
@@ -107,7 +107,7 @@ Detects fabricated information not supported by the provided context or common k
| `query` | str | Yes | The user's question |
| `response` | str | Yes | The model's response to evaluate |
| `context` | str | No | Source documents to verify against |
| `ground_truth` | str | No | Reference answer |
| `reference_response` | str | No | Reference answer |

!!! note
If no context is provided, evaluation is based on common knowledge and factual consistency.
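
The parameter tables above rename `ground_truth` to `reference_response`. For readers of this docs change, here is a minimal sketch of the call shape those tables imply; the dict-config keys for the model and the exact `aevaluate` keyword names are assumptions drawn from elsewhere in this diff, not verified against the library:

```python
import asyncio

from openjudge.graders.common.correctness import CorrectnessGrader

# Dict-config form of the model argument (keys are an assumption); a real run
# needs valid credentials, so treat this purely as a call-shape illustration.
grader = CorrectnessGrader(
    model={"api_key": "sk-...", "model": "qwen3-32b"},
    threshold=3,
)

result = asyncio.run(
    grader.aevaluate(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        reference_response="Paris",  # formerly `ground_truth`
    )
)
print(result.score, result.reason)
```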
@@ -175,7 +175,7 @@ Identifies harmful, offensive, or inappropriate content in responses. Critical f
| `query` | str | Yes | The user's input |
| `response` | str | Yes | The model's response to evaluate |
| `context` | str | No | Additional context |
| `ground_truth` | str | No | Reference response |
| `reference_response` | str | No | Reference response |

**What it detects:**
- Violence, hatred, discrimination
16 changes: 11 additions & 5 deletions openjudge/graders/common/correctness.py
@@ -244,7 +244,7 @@ class CorrectnessGrader(LLMGrader):
>>> from openjudge.graders.common.correctness import CorrectnessGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
>>> grader = CorrectnessGrader(model=model, threshold=3)
>>>
>>> # Good match
@@ -268,7 +268,7 @@ def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 3,
template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE,
template: Optional[PromptTemplate] = None,
language: LanguageEnum = LanguageEnum.EN,
):
"""
@@ -279,13 +279,19 @@ def __init__(
threshold: Success threshold [1, 5] (default: 3)
template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)

Raises:
ValueError: If threshold is not in range [1, 5]
"""
if not 1 <= threshold <= 5:
raise ValueError(f"threshold must be in range [1, 5], got {threshold}")

super().__init__(
name="correctness",
mode=GraderMode.POINTWISE,
description="Evaluate whether response matches the provided reference response",
model=model,
template=template,
template=template or DEFAULT_CORRECTNESS_TEMPLATE,
language=language,
)
self.threshold = threshold
@@ -330,11 +336,11 @@ async def aevaluate(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
metadata={**result.metadata, "threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating correctness: {e}")
logger.exception(f"Error evaluating correctness: {e}")
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
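
Across the grader constructors in this PR, the signature-level default (`template: Optional[PromptTemplate] = DEFAULT_*_TEMPLATE`) is replaced by a `None` sentinel resolved in the body (`template or DEFAULT_*_TEMPLATE`), plus an early range check on `threshold`. A tiny self-contained sketch of the default-resolution pattern with stand-in names (not the library's API):

```python
# Stand-in for DEFAULT_CORRECTNESS_TEMPLATE; the real default is a PromptTemplate object.
DEFAULT_TEMPLATE = "Rate the response from 1 to 5 against the reference ..."

def resolve_template(template=None):
    # None (or any other falsy value) falls back to the module-level default.
    return template or DEFAULT_TEMPLATE

assert resolve_template() == DEFAULT_TEMPLATE                # omitted -> default
assert resolve_template("custom prompt") == "custom prompt"  # explicit template wins
```

One consequence of using `or` rather than an explicit `is None` check is that an empty or otherwise falsy template also falls back to the default; that seems acceptable here on the assumption that a usable `PromptTemplate` instance is always truthy.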
14 changes: 10 additions & 4 deletions openjudge/graders/common/hallucination.py
@@ -216,7 +216,7 @@ class HallucinationGrader(LLMGrader):
>>> # Initialize model
>>> model = OpenAIChatModel(
... api_key="sk-...",
... model="qwen3-max",
... model="qwen3-32b",
... temperature=0.1
... )
>>>
@@ -253,7 +253,7 @@ def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 3,
template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE,
template: Optional[PromptTemplate] = None,
language: LanguageEnum = LanguageEnum.EN,
):
"""
@@ -264,7 +264,13 @@ def __init__(
threshold: Success threshold [1, 5] (default: 3)
template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)

Raises:
ValueError: If threshold is not in range [1, 5]
"""
if not 1 <= threshold <= 5:
raise ValueError(f"threshold must be in range [1, 5], got {threshold}")

super().__init__(
name="hallucination",
mode=GraderMode.POINTWISE,
@@ -322,11 +328,11 @@ async def aevaluate(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
metadata={**result.metadata, "threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating hallucination: {e}")
logger.exception(f"Error evaluating hallucination: {e}")
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
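
Two smaller changes repeated across the graders are visible in this hunk: the returned metadata now merges the model result's own metadata with the threshold instead of dropping it, and the error path uses `logger.exception`, which records the full traceback (`logger.error` only does so when `exc_info` is passed). A plain-dict sketch of the merge, with hypothetical metadata values:

```python
# Hypothetical metadata attached to the LLM result; the real keys depend on the model backend.
result_metadata = {"model": "qwen3-32b", "prompt_tokens": 412}
threshold = 3

merged = {**result_metadata, "threshold": threshold}
print(merged)  # {'model': 'qwen3-32b', 'prompt_tokens': 412, 'threshold': 3}
```

If the upstream metadata ever carried its own `threshold` key, the grader's value would win, because later keys in a dict literal override earlier ones.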
16 changes: 11 additions & 5 deletions openjudge/graders/common/harmfulness.py
@@ -223,7 +223,7 @@ class HarmfulnessGrader(LLMGrader):
>>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
>>> grader = HarmfulnessGrader(model=model, threshold=3)
>>>
>>> # Safe output
@@ -246,7 +246,7 @@ def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 3,
template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE,
template: Optional[PromptTemplate] = None,
language: LanguageEnum = LanguageEnum.EN,
):
"""
@@ -257,13 +257,19 @@ def __init__(
threshold: Success threshold [1, 5] (default: 3)
template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)

Raises:
ValueError: If threshold is not in range [1, 5]
"""
if not 1 <= threshold <= 5:
raise ValueError(f"threshold must be in range [1, 5], got {threshold}")

super().__init__(
name="harmfulness",
mode=GraderMode.POINTWISE,
description="Evaluate whether response contains harmful or inappropriate content",
model=model,
template=template,
template=template or DEFAULT_HARMFULNESS_TEMPLATE,
language=language,
)
self.threshold = threshold
@@ -307,11 +313,11 @@ async def aevaluate(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
metadata={**result.metadata, "threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating harmfulness: {e}")
logger.exception(f"Error evaluating harmfulness: {e}")
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
16 changes: 11 additions & 5 deletions openjudge/graders/common/instruction_following.py
@@ -238,7 +238,7 @@ class InstructionFollowingGrader(LLMGrader):
>>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
>>> grader = InstructionFollowingGrader(model=model, threshold=3)
>>>
>>> # Good adherence
@@ -262,7 +262,7 @@ def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 3,
template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
template: Optional[PromptTemplate] = None,
language: LanguageEnum = LanguageEnum.EN,
):
"""
@@ -273,13 +273,19 @@ def __init__(
threshold: Success threshold [1, 5] (default: 3)
template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)

Raises:
ValueError: If threshold is not in range [1, 5]
"""
if not 1 <= threshold <= 5:
raise ValueError(f"threshold must be in range [1, 5], got {threshold}")

super().__init__(
name="instruction_following",
mode=GraderMode.POINTWISE,
description="Evaluate whether response follows the given instructions",
model=model,
template=template,
template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
language=language,
)
self.threshold = threshold
@@ -318,11 +324,11 @@ async def aevaluate(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
metadata={**result.metadata, "threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating instruction following: {e}")
logger.exception(f"Error evaluating instruction following: {e}")
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
22 changes: 14 additions & 8 deletions openjudge/graders/common/relevance.py
@@ -217,7 +217,7 @@ class RelevanceGrader(LLMGrader):

Args:
model: BaseChatModel instance or dict config for OpenAIChatModel
threshold: Minimum score [0, 1] to pass (default: 0.7)
threshold: Minimum score [1, 5] to pass (default: 3)
template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
language: Prompt language - EN or ZH (default: LanguageEnum.EN)

@@ -234,7 +234,7 @@ class RelevanceGrader(LLMGrader):
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
>>> grader = RelevanceGrader(model=model, threshold=0.7)
>>> grader = RelevanceGrader(model=model, threshold=3)
>>>
>>> # Relevant response
>>> result = asyncio.run(grader.aevaluate(
@@ -262,25 +262,31 @@ class RelevanceGrader(LLMGrader):
def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 0.7,
template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
threshold: float = 3,
template: Optional[PromptTemplate] = None,
language: LanguageEnum = LanguageEnum.EN,
):
"""
Initialize RelevanceGrader

Args:
model: BaseChatModel instance or dict config for OpenAIChatModel
threshold: Success threshold [0, 1] (default: 0.7)
threshold: Success threshold [1, 5] (default: 3)
template: PromptTemplate for evaluation prompts (default: DEFAULT_RELEVANCE_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)

Raises:
ValueError: If threshold is not in range [1, 5]
"""
if not 1 <= threshold <= 5:
raise ValueError(f"threshold must be in range [1, 5], got {threshold}")

super().__init__(
name="relevance",
mode=GraderMode.POINTWISE,
description="Evaluate relevance of response to user query",
model=model,
template=template,
template=template or DEFAULT_RELEVANCE_TEMPLATE,
language=language,
)
self.threshold = threshold
@@ -323,11 +329,11 @@ async def aevaluate(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
metadata={**result.metadata, "threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating relevance: {e}")
logger.exception(f"Error evaluating relevance: {e}")
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
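
Beyond the shared template and metadata changes, `RelevanceGrader` moves from a 0-1 threshold (default 0.7) to the 1-5 scale used by the other graders (default 3), so a leftover 0-1 style threshold now fails fast; the range check runs before the parent constructor. A short sketch (the dict-config keys are an assumption, and the failing call never reaches the model):

```python
from openjudge.graders.common.relevance import RelevanceGrader

model_cfg = {"api_key": "sk-...", "model": "qwen3-32b"}  # assumed config keys

try:
    RelevanceGrader(model=model_cfg, threshold=0.7)  # old-style 0-1 threshold
except ValueError as err:
    print(err)  # threshold must be in range [1, 5], got 0.7

# Shown for contrast; whether the dict config is accepted as-is depends on OpenAIChatModel.
grader = RelevanceGrader(model=model_cfg, threshold=3)
```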
14 changes: 8 additions & 6 deletions openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -87,13 +87,13 @@ def validate_and_sort_rubrics(
# Sort rubrics by start of range
sorted_rubrics = sorted(rubrics, key=lambda r: r.score_range[0])

# Full overlap check
# Full overlap check (adjacent ranges like (0,5) and (5,7) are allowed)
for i in range(len(sorted_rubrics)):
a_start, a_end = sorted_rubrics[i].score_range
for j in range(i + 1, len(sorted_rubrics)):
b_start, b_end = sorted_rubrics[j].score_range
# Check if ranges overlap
if a_end >= b_start:
# Check if ranges overlap (> allows adjacent ranges to touch)
if a_end > b_start:
raise ValueError(
f"Overlapping score ranges: {sorted_rubrics[i].score_range} and {sorted_rubrics[j].score_range}",
)
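
Switching the comparison from `>=` to `>` lets ranges share a boundary value, e.g. `(0, 5)` and `(5, 7)`, while genuine overlaps are still rejected. A self-contained sketch of the check with a stand-in `Rubric` dataclass (the real class lives in the multimodal grader internals):

```python
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Rubric:                        # stand-in with the one field the check uses
    score_range: Tuple[int, int]

def check(rubrics: List[Rubric]) -> List[Rubric]:
    ordered = sorted(rubrics, key=lambda r: r.score_range[0])
    for i in range(len(ordered)):
        a_start, a_end = ordered[i].score_range
        for j in range(i + 1, len(ordered)):
            b_start, b_end = ordered[j].score_range
            if a_end > b_start:      # strict '>' allows ranges to touch at a boundary
                raise ValueError(
                    f"Overlapping score ranges: {ordered[i].score_range} and {ordered[j].score_range}"
                )
    return ordered

check([Rubric((0, 5)), Rubric((5, 7))])      # adjacent: accepted under the new check
try:
    check([Rubric((0, 5)), Rubric((4, 7))])  # genuine overlap: still rejected
except ValueError as err:
    print(err)
```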
@@ -147,7 +147,7 @@ def construct_params_string(
>>> construct_params_string(params)
'Input and Actual Output'
"""
params = [PARAM_DISPLAY_NAMES[param] for param in evaluation_params]
params = [PARAM_DISPLAY_NAMES.get(param, param.replace("_", " ").title()) for param in evaluation_params]

if len(params) == 1:
params_str = params[0]
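
The `.get()` fallback means an evaluation parameter missing from `PARAM_DISPLAY_NAMES` no longer raises `KeyError`; it is title-cased from its identifier instead. A small sketch with a stand-in mapping (the real dictionary's contents are not shown in this diff):

```python
PARAM_DISPLAY_NAMES = {"input": "Input", "actual_output": "Actual Output"}  # stand-in mapping

def display_name(param: str) -> str:
    return PARAM_DISPLAY_NAMES.get(param, param.replace("_", " ").title())

print(display_name("actual_output"))      # known key: 'Actual Output'
print(display_name("retrieval_context"))  # unknown key: falls back to 'Retrieval Context'
```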
@@ -164,7 +164,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
Get the overall score range from rubrics

Args:
rubric: List of rubric definitions
rubric: List of rubric definitions (does not need to be sorted)

Returns:
Tuple of (min_score, max_score)
@@ -180,7 +180,9 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
if not rubric:
return (0, 10)

return rubric[0].score_range[0], rubric[-1].score_range[1]
min_score = min(r.score_range[0] for r in rubric)
max_score = max(r.score_range[1] for r in rubric)
return (min_score, max_score)
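
With the previous `rubric[0]` / `rubric[-1]` indexing, an unsorted list such as `[(5, 7), (0, 5)]` would have produced `(5, 5)`; taking the min and max over all rubrics makes the result independent of ordering. A self-contained sketch (same stand-in `Rubric` as in the earlier snippet, repeated so this runs on its own):

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class Rubric:
    score_range: Tuple[int, int]

def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
    if not rubric:
        return (0, 10)  # default range kept from the original implementation
    min_score = min(r.score_range[0] for r in rubric)
    max_score = max(r.score_range[1] for r in rubric)
    return (min_score, max_score)

print(get_score_range([Rubric((5, 7)), Rubric((0, 5))]))  # (0, 7) even though unsorted
print(get_score_range(None))                              # (0, 10) default
```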


__all__ = [