32 changes: 13 additions & 19 deletions openjudge/graders/common/correctness.py
@@ -11,7 +11,7 @@

from loguru import logger

from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
from openjudge.models.schema.oai.message import ChatMessage
@@ -240,8 +240,8 @@ class CorrectnessGrader(LLMGrader):

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import CorrectnessGrader
>>> from openjudge.models.openai_chat_model import OpenAIChatModel
>>> from openjudge.graders.common.correctness import CorrectnessGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -326,25 +326,19 @@ async def aevaluate(
context=context,
reference_response=reference_response,
)
score = result.score
reason = result.reason
return GraderScore(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating correctness: {e}")
score = 0.0
reason = f"Evaluation error: {str(e)}"

# Prepare metadata
metadata = {
"threshold": self.threshold,
}

return GraderScore(
name=self.name,
score=score,
reason=reason,
metadata=metadata,
)
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
)


__all__ = ["CorrectnessGrader", "DEFAULT_CORRECTNESS_TEMPLATE"]
32 changes: 13 additions & 19 deletions openjudge/graders/common/hallucination.py
@@ -11,7 +11,7 @@

from loguru import logger

from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
from openjudge.models.schema.oai.message import ChatMessage
@@ -210,8 +210,8 @@ class HallucinationGrader(LLMGrader):

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import HallucinationGrader
>>> from openjudge.models.openai_chat_model import OpenAIChatModel
>>> from openjudge.graders.common.hallucination import HallucinationGrader
>>>
>>> # Initialize model
>>> model = OpenAIChatModel(
@@ -318,25 +318,19 @@ async def aevaluate(
context=context,
reference_response=reference_response,
)
score = result.score
reason = result.reason
return GraderScore(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating hallucination: {e}")
score = 0.0
reason = f"Evaluation error: {str(e)}"

# Prepare metadata
metadata = {
"threshold": self.threshold,
}

return GraderScore(
name=self.name,
score=score,
reason=reason,
metadata=metadata,
)
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
)

@staticmethod
def get_metadata() -> Dict[str, Any]:
32 changes: 13 additions & 19 deletions openjudge/graders/common/harmfulness.py
@@ -10,7 +10,7 @@

from loguru import logger

from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
from openjudge.models.schema.oai.message import ChatMessage
@@ -219,8 +219,8 @@ class HarmfulnessGrader(LLMGrader):

Example:
>>> import asyncio
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import HarmfulnessGrader
>>> from openjudge.models.openai_chat_model import OpenAIChatModel
>>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -303,25 +303,19 @@ async def aevaluate(
context=context,
reference_response=reference_response,
)
score = result.score
reason = result.reason
return GraderScore(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating harmfulness: {e}")
score = 0.0
reason = f"Evaluation error: {str(e)}"

# Prepare metadata
metadata = {
"threshold": self.threshold,
}

return GraderScore(
name=self.name,
score=score,
reason=reason,
metadata=metadata,
)
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
)


__all__ = ["HarmfulnessGrader", "DEFAULT_HARMFULNESS_TEMPLATE"]
36 changes: 15 additions & 21 deletions openjudge/graders/common/instruction_following.py
@@ -11,7 +11,7 @@

from loguru import logger

from openjudge.graders.base_grader import GraderMode, GraderScore
from openjudge.graders.base_grader import GraderError, GraderMode, GraderScore
from openjudge.graders.llm_grader import LLMGrader
from openjudge.models.base_chat_model import BaseChatModel
from openjudge.models.schema.oai.message import ChatMessage
@@ -234,8 +234,8 @@ class InstructionFollowingGrader(LLMGrader):
- metadata: Threshold and evaluation details

Example:
>>> from openjudge.model.openai_llm import OpenAIChatModel
>>> from openjudge.llm_judge import InstructionFollowingGrader
>>> from openjudge.models.openai_chat_model import OpenAIChatModel
>>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
@@ -244,8 +244,8 @@ class InstructionFollowingGrader(LLMGrader):
>>> # Good adherence
>>> result = asyncio.run(grader.aevaluate(
... instruction="Write exactly 3 sentences in formal academic tone.",
... output="Climate change poses serious risks. Research shows rising temperatures."
... "Action is urgently needed."
... response="Climate change poses serious risks. Research shows rising temperatures. "
... "Action is urgently needed."
... ))
>>> print(result.score) # 5 - follows all requirements
>>>
@@ -314,25 +314,19 @@ async def aevaluate(
response=response,
query=query,
)
score = result.score
reason = result.reason
return GraderScore(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating instruction following: {e}")
score = 0.0
reason = f"Evaluation error: {str(e)}"

# Prepare metadata
metadata = {
"threshold": self.threshold,
}

return GraderScore(
name=self.name,
score=score,
reason=reason,
metadata=metadata,
)
return GraderError(
name=self.name,
error=f"Evaluation error: {str(e)}",
)


__all__ = ["InstructionFollowingGrader", "DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE"]
22 changes: 12 additions & 10 deletions openjudge/graders/common/relevance.py
@@ -217,14 +217,15 @@ class RelevanceGrader(LLMGrader):

Args:
model: BaseChatModel instance or dict config for OpenAIChatModel
threshold: Minimum score [0, 1] to pass (default: 0.7)
template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
language: Prompt language - EN or ZH (default: LanguageEnum.EN)

Returns:
GraderScore object with:
- score: Score [1, 5] where 5 = highly relevant, 1 = irrelevant
- reason: Explanation of relevance assessment
- metadata: Evaluation details
- metadata: Threshold and evaluation details

Example:
>>> import asyncio
@@ -233,7 +234,7 @@
>>>
>>> # Initialize grader
>>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
>>> grader = RelevanceGrader(model=model)
>>> grader = RelevanceGrader(model=model, threshold=0.7)
>>>
>>> # Relevant response
>>> result = asyncio.run(grader.aevaluate(
@@ -261,6 +262,7 @@ class RelevanceGrader(LLMGrader):
def __init__(
self,
model: BaseChatModel | dict,
threshold: float = 0.7,
template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
@@ -269,6 +271,7 @@ def __init__(

Args:
model: BaseChatModel instance or dict config for OpenAIChatModel
threshold: Success threshold [0, 1] (default: 0.7)
template: PromptTemplate for evaluation prompts (default: DEFAULT_RELEVANCE_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)
"""
@@ -280,6 +283,7 @@ def __init__(
template=template,
language=language,
)
self.threshold = threshold

async def aevaluate(
self,
@@ -315,8 +319,12 @@ async def aevaluate(
context=context,
reference_response=reference_response,
)
score = result.score
reason = result.reason
return GraderScore(
name=self.name,
score=result.score,
reason=result.reason,
metadata={"threshold": self.threshold},
)

except Exception as e:
logger.error(f"Error evaluating relevance: {e}")
Expand All @@ -325,11 +333,5 @@ async def aevaluate(
error=f"Evaluation error: {str(e)}",
)

return GraderScore(
name=self.name,
score=score,
reason=reason,
)


__all__ = ["RelevanceGrader", "DEFAULT_RELEVANCE_TEMPLATE"]
2 changes: 1 addition & 1 deletion openjudge/graders/format/json/json_match.py
@@ -42,7 +42,7 @@ def __init__(
):
super().__init__(
name=name,
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description=description,
)
self.strict_order = strict_order
16 changes: 9 additions & 7 deletions openjudge/graders/format/json/json_validator.py
@@ -38,7 +38,7 @@ def __init__(
):
super().__init__(
name=name,
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description=description,
)

@@ -49,6 +49,14 @@ def _compute(self, response: str) -> tuple[bool, dict]:
Returns:
tuple[bool, dict]: (is_valid, details)
"""
# Input validation
if not isinstance(response, str):
return False, {
"is_valid": False,
"error_message": f"Invalid input type: expected str, got {type(response).__name__}",
"response_length": 0,
}

try:
json.loads(response)
return True, {"is_valid": True, "response_length": len(response)}
@@ -58,12 +66,6 @@
"error_message": f"JSON decode error: {str(e)}",
"response_length": len(response),
}
except TypeError as e:
return False, {
"is_valid": False,
"error_message": f"Type error: {str(e)}",
"response_length": len(response),
}
except Exception as e:
return False, {
"is_valid": False,
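
Beyond tidier reporting, the new isinstance guard fixes a latent bug in the removed TypeError branch: for a non-string input such as None, that handler's own len(response) call would itself raise TypeError. A standalone restatement of the updated _compute logic (a hypothetical extraction for illustration, not the grader class itself; the diff's final catch-all Exception branch is omitted for brevity):

import json

def compute(response):
    """Mirrors the _compute body in this diff, outside the grader class."""
    if not isinstance(response, str):
        return False, {
            "is_valid": False,
            "error_message": f"Invalid input type: expected str, got {type(response).__name__}",
            "response_length": 0,
        }
    try:
        json.loads(response)
        return True, {"is_valid": True, "response_length": len(response)}
    except json.JSONDecodeError as e:
        return False, {
            "is_valid": False,
            "error_message": f"JSON decode error: {str(e)}",
            "response_length": len(response),
        }

print(compute('{"a": 1}'))                # (True, {'is_valid': True, 'response_length': 8})
print(compute(None)[1]["error_message"])  # Invalid input type: expected str, got NoneType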
2 changes: 0 additions & 2 deletions openjudge/graders/format/length_penalty.py
@@ -29,7 +29,6 @@ def __init__(
"""
super().__init__(
name="length_penalty",
grader_mode="content",
mode=GraderMode.POINTWISE,
description="Text length based penalty for content that is too short or too long.",
)
@@ -77,7 +76,6 @@ async def aevaluate(self, response: str) -> GraderScore:
>>> print(result.score < 0)
True
"""

length = len(response)

penalty = 0.0
5 changes: 2 additions & 3 deletions openjudge/graders/format/ngram_repetition_penalty.py
@@ -45,7 +45,7 @@ def __init__(
"""
super().__init__(
name="ngram_repetition_penalty",
grader_mode=GraderMode.POINTWISE,
mode=GraderMode.POINTWISE,
description="Calculate N-gram repetition penalty supporting Chinese processing "
"and multiple penalty strategies.",
)
@@ -67,7 +67,7 @@ def __init__(
chinese_only=chinese_only,
)

self._think_pattern = re.compile(r"(.*?)", flags=re.DOTALL)
self._think_pattern = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)

def _extract_thought_process(self, content: str) -> str:
"""Extract thought process"""
@@ -146,7 +146,6 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
>>> print(result.score)
0.0
"""

# Select text based on analysis scope
if self.analyze_scope == "thought":
text_to_analyze = self._extract_thought_process(response)
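
The regex repair is the substantive fix here: the old pattern r"(.*?)" lazily matches the empty string at position 0, so the extracted thought text was always empty and the "thought" analysis scope never saw any content. A quick check of the corrected pattern, assuming it is applied with search()/group(1) as the _extract_thought_process name suggests:

import re

# The corrected pattern from this diff.
think_pattern = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)

content = "<think>Count repeated n-grams,\nthen apply the penalty.</think>Final answer: 42."
match = think_pattern.search(content)
thought = match.group(1) if match else ""
print(thought)  # Count repeated n-grams,\nthen apply the penalty.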