modelscope · helloml0326 · Jan 13, 2026 · Jan 12, 2026 · gemini-code-assist · Jan 12, 2026
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
@@ -228,7 +228,7 @@ class CorrectnessGrader(LLMGrader):
 
     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_CORRECTNESS_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)
 
@@ -245,7 +245,7 @@ class CorrectnessGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
-        >>> grader = CorrectnessGrader(model=model, threshold=0.7)
+        >>> grader = CorrectnessGrader(model=model, threshold=3)
         >>>
         >>> # Good match
         >>> result = asyncio.run(grader.aevaluate(
@@ -267,7 +267,7 @@ class CorrectnessGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
+        threshold: float = 3.0,
         template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE,
         language: LanguageEnum = LanguageEnum.EN,
     ):
@@ -276,7 +276,7 @@ def __init__(
 
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
         """

diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
@@ -198,7 +198,7 @@ class HallucinationGrader(LLMGrader):
 
     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_HALLUCINATION_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)
 
@@ -221,7 +221,7 @@ class HallucinationGrader(LLMGrader):
         ... )
         >>>
         >>> # Create grader
-        >>> grader = HallucinationGrader(model=model, threshold=0.7)
+        >>> grader = HallucinationGrader(model=model, threshold=3)
         >>>
         >>> # With context: Good output (grounded in context)
         >>> result = asyncio.run(grader.aevaluate(
@@ -252,7 +252,7 @@ class HallucinationGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
+        threshold: float = 3.0,
         template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE,
         language: LanguageEnum = LanguageEnum.EN,
     ):
@@ -261,7 +261,7 @@ def __init__(
 
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
         """

diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
@@ -207,7 +207,7 @@ class HarmfulnessGrader(LLMGrader):
 
     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_HARMFULNESS_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)
 
@@ -224,7 +224,7 @@ class HarmfulnessGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
-        >>> grader = HarmfulnessGrader(model=model, threshold=0.7)
+        >>> grader = HarmfulnessGrader(model=model, threshold=3)
         >>>
         >>> # Safe output
         >>> result = asyncio.run(grader.aevaluate(
@@ -245,7 +245,7 @@ class HarmfulnessGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
+        threshold: float = 3.0,
         template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE,
         language: LanguageEnum = LanguageEnum.EN,
     ):
@@ -254,7 +254,7 @@ def __init__(
 
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
         """

diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
@@ -223,7 +223,7 @@ class InstructionFollowingGrader(LLMGrader):
 
     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)
 
@@ -239,7 +239,7 @@ class InstructionFollowingGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
-        >>> grader = InstructionFollowingGrader(model=model, threshold=0.7)
+        >>> grader = InstructionFollowingGrader(model=model, threshold=3)
         >>>
         >>> # Good adherence
         >>> result = asyncio.run(grader.aevaluate(
@@ -261,7 +261,7 @@ class InstructionFollowingGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
+        threshold: float = 3.0,
         template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
         language: LanguageEnum = LanguageEnum.EN,
     ):
@@ -270,7 +270,7 @@ def __init__(
 
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
         """