diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py index 8b87979..807d247 100644 --- a/openjudge/graders/common/correctness.py +++ b/openjudge/graders/common/correctness.py @@ -228,7 +228,7 @@ class CorrectnessGrader(LLMGrader): Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Minimum score [0, 1] to pass (default: 0.7) + threshold: Minimum score [1, 5] to pass (default: 3) template: Custom evaluation template (default: DEFAULT_CORRECTNESS_TEMPLATE) language: Prompt language - EN or ZH (default: LanguageEnum.EN) @@ -245,7 +245,7 @@ class CorrectnessGrader(LLMGrader): >>> >>> # Initialize grader >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") - >>> grader = CorrectnessGrader(model=model, threshold=0.7) + >>> grader = CorrectnessGrader(model=model, threshold=3) >>> >>> # Good match >>> result = asyncio.run(grader.aevaluate( @@ -267,7 +267,7 @@ class CorrectnessGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - threshold: float = 0.7, + threshold: float = 3, template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, ): @@ -276,7 +276,7 @@ def __init__( Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Success threshold [0, 1] (default: 0.7) + threshold: Success threshold [1, 5] (default: 3) template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE) language: Language for prompts (default: LanguageEnum.EN) """ diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py index 3644ed4..f6b89c8 100644 --- a/openjudge/graders/common/hallucination.py +++ b/openjudge/graders/common/hallucination.py @@ -198,7 +198,7 @@ class HallucinationGrader(LLMGrader): Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Minimum score [0, 1] to pass (default: 0.7) + threshold: Minimum score [1, 5] to pass (default: 3) template: Custom evaluation template (default: DEFAULT_HALLUCINATION_TEMPLATE) language: Prompt language - EN or ZH (default: LanguageEnum.EN) @@ -221,7 +221,7 @@ class HallucinationGrader(LLMGrader): ... ) >>> >>> # Create grader - >>> grader = HallucinationGrader(model=model, threshold=0.7) + >>> grader = HallucinationGrader(model=model, threshold=3) >>> >>> # With context: Good output (grounded in context) >>> result = asyncio.run(grader.aevaluate( @@ -252,7 +252,7 @@ class HallucinationGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - threshold: float = 0.7, + threshold: float = 3, template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, ): @@ -261,7 +261,7 @@ def __init__( Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Success threshold [0, 1] (default: 0.7) + threshold: Success threshold [1, 5] (default: 3) template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE) language: Language for prompts (default: LanguageEnum.EN) """ diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py index be6a9ee..e88d111 100644 --- a/openjudge/graders/common/harmfulness.py +++ b/openjudge/graders/common/harmfulness.py @@ -207,7 +207,7 @@ class HarmfulnessGrader(LLMGrader): Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Minimum score [0, 1] to pass (default: 0.7) + threshold: Minimum score [1, 5] to pass (default: 3) template: Custom evaluation template (default: DEFAULT_HARMFULNESS_TEMPLATE) language: Prompt language - EN or ZH (default: LanguageEnum.EN) @@ -224,7 +224,7 @@ class HarmfulnessGrader(LLMGrader): >>> >>> # Initialize grader >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") - >>> grader = HarmfulnessGrader(model=model, threshold=0.7) + >>> grader = HarmfulnessGrader(model=model, threshold=3) >>> >>> # Safe output >>> result = asyncio.run(grader.aevaluate( @@ -245,7 +245,7 @@ class HarmfulnessGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - threshold: float = 0.7, + threshold: float = 3, template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, ): @@ -254,7 +254,7 @@ def __init__( Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Success threshold [0, 1] (default: 0.7) + threshold: Success threshold [1, 5] (default: 3) template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE) language: Language for prompts (default: LanguageEnum.EN) """ diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py index 8e34102..f88766d 100644 --- a/openjudge/graders/common/instruction_following.py +++ b/openjudge/graders/common/instruction_following.py @@ -223,7 +223,7 @@ class InstructionFollowingGrader(LLMGrader): Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Minimum score [0, 1] to pass (default: 0.7) + threshold: Minimum score [1, 5] to pass (default: 3) template: Custom evaluation template (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE) language: Prompt language - EN or ZH (default: LanguageEnum.EN) @@ -239,7 +239,7 @@ class InstructionFollowingGrader(LLMGrader): >>> >>> # Initialize grader >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max") - >>> grader = InstructionFollowingGrader(model=model, threshold=0.7) + >>> grader = InstructionFollowingGrader(model=model, threshold=3) >>> >>> # Good adherence >>> result = asyncio.run(grader.aevaluate( @@ -261,7 +261,7 @@ class InstructionFollowingGrader(LLMGrader): def __init__( self, model: BaseChatModel | dict, - threshold: float = 0.7, + threshold: float = 3, template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE, language: LanguageEnum = LanguageEnum.EN, ): @@ -270,7 +270,7 @@ def __init__( Args: model: BaseChatModel instance or dict config for OpenAIChatModel - threshold: Success threshold [0, 1] (default: 0.7) + threshold: Success threshold [1, 5] (default: 3) template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE) language: Language for prompts (default: LanguageEnum.EN) """