diff --git a/docs/built_in_graders/general.md b/docs/built_in_graders/general.md
index 9d05d0cf..0dee6740 100644
--- a/docs/built_in_graders/general.md
+++ b/docs/built_in_graders/general.md
@@ -56,7 +56,7 @@ Evaluates how well a response addresses the user's query. Measures whether the a
 | `query` | str | Yes | The user's question or request |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context (e.g., conversation history) |
-| `ground_truth` | str | No | Reference answer for comparison |
+| `reference_response` | str | No | Reference answer for comparison |

 **Grading Criteria:**
 - **5**: Comprehensive response with helpful insights
@@ -107,7 +107,7 @@ Detects fabricated information not supported by the provided context or common k
 | `query` | str | Yes | The user's question |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Source documents to verify against |
-| `ground_truth` | str | No | Reference answer |
+| `reference_response` | str | No | Reference answer |

 !!! note
     If no context is provided, evaluation is based on common knowledge and factual consistency.
@@ -175,7 +175,7 @@ Identifies harmful, offensive, or inappropriate content in responses. Critical f
 | `query` | str | Yes | The user's input |
 | `response` | str | Yes | The model's response to evaluate |
 | `context` | str | No | Additional context |
-| `ground_truth` | str | No | Reference response |
+| `reference_response` | str | No | Reference response |

 **What it detects:**
 - Violence, hatred, discrimination
diff --git a/openjudge/graders/common/correctness.py b/openjudge/graders/common/correctness.py
index ce83a825..1c8be5d2 100644
--- a/openjudge/graders/common/correctness.py
+++ b/openjudge/graders/common/correctness.py
@@ -244,7 +244,7 @@ class CorrectnessGrader(LLMGrader):
         >>> from openjudge.graders.common.correctness import CorrectnessGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = CorrectnessGrader(model=model, threshold=3)
         >>>
         >>> # Good match
@@ -268,7 +268,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_CORRECTNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -279,13 +279,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_CORRECTNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="correctness",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response matches the provided reference response",
             model=model,
-            template=template,
+            template=template or DEFAULT_CORRECTNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -330,11 +336,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating correctness: {e}")
+            logger.exception(f"Error evaluating correctness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
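A brief sketch of how the new constructor guard behaves. This is not part of the patch; the config dict values are placeholders, and only the imports and error message shown above are taken from the diff.

```python
# Hypothetical usage sketch of the threshold guard added above.
from openjudge.graders.common.correctness import CorrectnessGrader

model_config = {"api_key": "sk-...", "model": "qwen3-32b"}  # illustrative config only

# Valid: threshold is interpreted on the 1-5 rubric scale.
grader = CorrectnessGrader(model=model_config, threshold=3)

# Invalid: values outside [1, 5] now fail fast at construction time
# instead of silently producing a grader whose pass check is meaningless.
try:
    CorrectnessGrader(model=model_config, threshold=0.7)
except ValueError as e:
    print(e)  # threshold must be in range [1, 5], got 0.7
```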
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index b2736ac0..1486741a 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -216,7 +216,7 @@ class HallucinationGrader(LLMGrader):
         >>> # Initialize model
         >>> model = OpenAIChatModel(
         ...     api_key="sk-...",
-        ...     model="qwen3-max",
+        ...     model="qwen3-32b",
         ...     temperature=0.1
         ... )
         >>>
@@ -253,7 +253,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HALLUCINATION_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -264,7 +264,13 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="hallucination",
             mode=GraderMode.POINTWISE,
@@ -322,11 +328,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating hallucination: {e}")
+            logger.exception(f"Error evaluating hallucination: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
diff --git a/openjudge/graders/common/harmfulness.py b/openjudge/graders/common/harmfulness.py
index bcea3ccd..a83a0490 100644
--- a/openjudge/graders/common/harmfulness.py
+++ b/openjudge/graders/common/harmfulness.py
@@ -223,7 +223,7 @@ class HarmfulnessGrader(LLMGrader):
         >>> from openjudge.graders.common.harmfulness import HarmfulnessGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = HarmfulnessGrader(model=model, threshold=3)
         >>>
         >>> # Safe output
@@ -246,7 +246,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_HARMFULNESS_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -257,13 +257,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_HARMFULNESS_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="harmfulness",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains harmful or inappropriate content",
             model=model,
-            template=template,
+            template=template or DEFAULT_HARMFULNESS_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -307,11 +313,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating harmfulness: {e}")
+            logger.exception(f"Error evaluating harmfulness: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
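The metadata change above merges the judge's own metadata with the grader threshold instead of discarding it. A small worked example of the dict-unpacking order (the field names besides `threshold` are illustrative):

```python
# Keys from result.metadata are spread first, so the explicit "threshold"
# entry wins on a key collision and nothing else is lost.
result_metadata = {"model": "qwen3-32b", "threshold": "raw-judge-value"}
threshold = 3

merged = {**result_metadata, "threshold": threshold}
print(merged)
# {'model': 'qwen3-32b', 'threshold': 3}
```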
diff --git a/openjudge/graders/common/instruction_following.py b/openjudge/graders/common/instruction_following.py
index 5fcc5a75..8c647d44 100644
--- a/openjudge/graders/common/instruction_following.py
+++ b/openjudge/graders/common/instruction_following.py
@@ -238,7 +238,7 @@ class InstructionFollowingGrader(LLMGrader):
         >>> from openjudge.graders.common.instruction_following import InstructionFollowingGrader
         >>>
         >>> # Initialize grader
-        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-max")
+        >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
         >>> grader = InstructionFollowingGrader(model=model, threshold=3)
         >>>
         >>> # Good adherence
@@ -262,7 +262,7 @@ def __init__(
         self,
         model: BaseChatModel | dict,
         threshold: float = 3,
-        template: Optional[PromptTemplate] = DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -273,13 +273,19 @@
             threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts (default: DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="instruction_following",
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response follows the given instructions",
             model=model,
-            template=template,
+            template=template or DEFAULT_INSTRUCTION_FOLLOWING_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -318,11 +324,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating instruction following: {e}")
+            logger.exception(f"Error evaluating instruction following: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
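Assuming scores and thresholds now share the same 1-5 rubric scale (the point of the relevance change that follows), a caller-side pass check could look like the sketch below. The helper name is hypothetical; only `aevaluate`, `score`, and `threshold` come from the patch.

```python
# Sketch only: `grader` is any pointwise grader above, constructed with threshold=3.
async def is_pass(grader, **sample) -> bool:
    result = await grader.aevaluate(**sample)
    # With the old 0.7 default, a 1-5 score compared against the threshold
    # effectively always "passed"; on a shared 1-5 scale the comparison is meaningful.
    return getattr(result, "score", 0) >= grader.threshold
```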
diff --git a/openjudge/graders/common/relevance.py b/openjudge/graders/common/relevance.py
index 76e0de2f..293ceb4a 100644
--- a/openjudge/graders/common/relevance.py
+++ b/openjudge/graders/common/relevance.py
@@ -217,7 +217,7 @@ class RelevanceGrader(LLMGrader):

     Args:
         model: BaseChatModel instance or dict config for OpenAIChatModel
-        threshold: Minimum score [0, 1] to pass (default: 0.7)
+        threshold: Minimum score [1, 5] to pass (default: 3)
         template: Custom evaluation template (default: DEFAULT_RELEVANCE_TEMPLATE)
         language: Prompt language - EN or ZH (default: LanguageEnum.EN)

@@ -234,7 +234,7 @@ class RelevanceGrader(LLMGrader):
         >>>
         >>> # Initialize grader
         >>> model = OpenAIChatModel(api_key="sk-...", model="qwen3-32b")
-        >>> grader = RelevanceGrader(model=model, threshold=0.7)
+        >>> grader = RelevanceGrader(model=model, threshold=3)
         >>>
         >>> # Relevant response
         >>> result = asyncio.run(grader.aevaluate(
@@ -262,8 +262,8 @@ class RelevanceGrader(LLMGrader):
     def __init__(
         self,
         model: BaseChatModel | dict,
-        threshold: float = 0.7,
-        template: Optional[PromptTemplate] = DEFAULT_RELEVANCE_TEMPLATE,
+        threshold: float = 3,
+        template: Optional[PromptTemplate] = None,
         language: LanguageEnum = LanguageEnum.EN,
     ):
         """
@@ -271,16 +271,22 @@
         Args:
             model: BaseChatModel instance or dict config for OpenAIChatModel
-            threshold: Success threshold [0, 1] (default: 0.7)
+            threshold: Success threshold [1, 5] (default: 3)
             template: PromptTemplate for evaluation prompts
                 (default: DEFAULT_RELEVANCE_TEMPLATE)
             language: Language for prompts (default: LanguageEnum.EN)
+
+        Raises:
+            ValueError: If threshold is not in range [1, 5]
         """
+        if not 1 <= threshold <= 5:
+            raise ValueError(f"threshold must be in range [1, 5], got {threshold}")
+
         super().__init__(
             name="relevance",
             mode=GraderMode.POINTWISE,
             description="Evaluate relevance of response to user query",
             model=model,
-            template=template,
+            template=template or DEFAULT_RELEVANCE_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
@@ -323,11 +329,11 @@ async def aevaluate(
                 name=self.name,
                 score=result.score,
                 reason=result.reason,
-                metadata={"threshold": self.threshold},
+                metadata={**result.metadata, "threshold": self.threshold},
             )

         except Exception as e:
-            logger.error(f"Error evaluating relevance: {e}")
+            logger.exception(f"Error evaluating relevance: {e}")
             return GraderError(
                 name=self.name,
                 error=f"Evaluation error: {str(e)}",
diff --git a/openjudge/graders/multimodal/_internal/criteria_utils.py b/openjudge/graders/multimodal/_internal/criteria_utils.py
index ef71e74e..370258f3 100644
--- a/openjudge/graders/multimodal/_internal/criteria_utils.py
+++ b/openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -87,13 +87,13 @@ def validate_and_sort_rubrics(
     # Sort rubrics by start of range
     sorted_rubrics = sorted(rubrics, key=lambda r: r.score_range[0])

-    # Full overlap check
+    # Full overlap check (adjacent ranges like (0,5) and (5,7) are allowed)
     for i in range(len(sorted_rubrics)):
         a_start, a_end = sorted_rubrics[i].score_range
         for j in range(i + 1, len(sorted_rubrics)):
             b_start, b_end = sorted_rubrics[j].score_range
-            # Check if ranges overlap
-            if a_end >= b_start:
+            # Check if ranges overlap (> allows adjacent ranges to touch)
+            if a_end > b_start:
                 raise ValueError(
                     f"Overlapping score ranges: {sorted_rubrics[i].score_range} and {sorted_rubrics[j].score_range}",
                 )
@@ -147,7 +147,7 @@ def construct_params_string(
         >>> construct_params_string(params)
         'Input and Actual Output'
     """
-    params = [PARAM_DISPLAY_NAMES[param] for param in evaluation_params]
+    params = [PARAM_DISPLAY_NAMES.get(param, param.replace("_", " ").title()) for param in evaluation_params]

     if len(params) == 1:
         params_str = params[0]
@@ -164,7 +164,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     Get the overall score range from rubrics

     Args:
-        rubric: List of rubric definitions
+        rubric: List of rubric definitions (does not need to be sorted)

     Returns:
         Tuple of (min_score, max_score)
@@ -180,7 +180,9 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
     if not rubric:
         return (0, 10)

-    return rubric[0].score_range[0], rubric[-1].score_range[1]
+    min_score = min(r.score_range[0] for r in rubric)
+    max_score = max(r.score_range[1] for r in rubric)
+    return (min_score, max_score)


 __all__ = [
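A worked example of the two criteria_utils changes above, using a minimal stand-in for Rubric (the real class may carry more fields than `score_range`):

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class FakeRubric:  # stand-in for the real Rubric, illustration only
    score_range: Tuple[int, int]


rubrics = [FakeRubric((5, 7)), FakeRubric((0, 5)), FakeRubric((7, 10))]

# Old check (a_end >= b_start) rejected (0, 5) sitting next to (5, 7);
# the strict comparison only rejects true overlaps such as (0, 6) vs (5, 7).

# get_score_range no longer assumes the list is sorted or contiguous:
min_score = min(r.score_range[0] for r in rubrics)  # 0
max_score = max(r.score_range[1] for r in rubrics)  # 10
print((min_score, max_score))  # (0, 10)
```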
diff --git a/openjudge/graders/multimodal/image_coherence.py b/openjudge/graders/multimodal/image_coherence.py
index faedd0d3..d6b4dea3 100644
--- a/openjudge/graders/multimodal/image_coherence.py
+++ b/openjudge/graders/multimodal/image_coherence.py
@@ -23,6 +23,7 @@
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -217,30 +218,27 @@ async def _aevaluate_single_image(
             context_below=context_below or "",
         )

-        try:
-            # Format image content for OpenAI API
-            content = [{"type": "text", "text": prompt}]
-
-            if image.url:
-                content.append({"type": "image_url", "image_url": {"url": image.url}})
-            elif image.base64:
-                # Format base64 image with data URL scheme
-                image_format = image.format or "jpeg"
-                data_url = f"data:image/{image_format};base64,{image.base64}"
-                content.append({"type": "image_url", "image_url": {"url": data_url}})
-
-            # Call model without structured output
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-            score = chat_response.parsed["score"]
-            reason = chat_response.parsed["reason"]
-            return score, reason
+        # Format image content for OpenAI API
+        content = [{"type": "text", "text": prompt}]

-        except Exception as e:
-            logger.error(f"Error evaluating image coherence: {e}")
-            return 0.0, f"Evaluation error: {str(e)}"
+        if image.url:
+            content.append({"type": "image_url", "image_url": {"url": image.url}})
+        elif image.base64:
+            # Format base64 image with data URL scheme
+            image_format = image.format or "jpeg"
+            data_url = f"data:image/{image_format};base64,{image.base64}"
+            content.append({"type": "image_url", "image_url": {"url": data_url}})
+
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )
+
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _acompute(
         self,
@@ -326,7 +324,16 @@ async def aevaluate(
             ...     ]
             ... )
         """
-        score, details = await self._acompute(response, **kwargs)
+        try:
+            score, details = await self._acompute(response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating image coherence: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         if "error" in details:
             return GraderScore(
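With exceptions now surfacing as a `GraderError` result instead of a misleading 0.0 score, callers need a type check. A hedged caller-side sketch; the helper name is hypothetical, while `GraderError`, `error`, `name`, and `score` follow the patch and its tests:

```python
# Sketch of caller-side handling once evaluation exceptions map to GraderError.
from openjudge.graders.base_grader import GraderError


async def run_grader(grader, **sample):
    result = await grader.aevaluate(**sample)
    if isinstance(result, GraderError):
        # Failure is explicit (result.error), not encoded as a low score.
        print(f"{result.name} failed: {result.error}")
        return None
    return result.score
```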
diff --git a/openjudge/graders/multimodal/image_helpfulness.py b/openjudge/graders/multimodal/image_helpfulness.py
index 33d4d33e..c8aa3350 100644
--- a/openjudge/graders/multimodal/image_helpfulness.py
+++ b/openjudge/graders/multimodal/image_helpfulness.py
@@ -24,6 +24,7 @@
 from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -213,7 +214,7 @@ async def _aevaluate_single_image(
         context_below: Optional[str],
     ) -> Tuple[float, str]:
         """Async evaluation of single image helpfulness"""
-        messages = self.template.to_messages()
+        messages = self.template.to_messages(self.language)
         prompt = (
             messages[0]
             .format(
@@ -223,36 +224,17 @@ async def _aevaluate_single_image(
             .content
         )

-        try:
-            content = format_image_content(prompt, [image])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-
-            # Handle both streaming and non-streaming responses
-            if hasattr(chat_response, "__aiter__"):
-                # This is a streaming response, we need to collect it first
-                collected_content = []
-                parsed = {}
-                async for chunk in chat_response:
-                    if chunk.content:
-                        collected_content.extend(chunk.content)
-                    if chunk.parsed:
-                        parsed.update(chunk.parsed)
-
-                # Extract score and reason from metadata
-                score = parsed.get("score", 0.0)
-                reason = parsed.get("reason", "")
-            else:
-                # Non-streaming response
-                score = chat_response.parsed["score"]
-                reason = chat_response.parsed["reason"]
-            return score, reason
+        content = format_image_content(prompt, [image])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-        except Exception as e:
-            logger.error(f"Error evaluating image helpfulness: {e}")
-            return 0.0, f"Evaluation error: {str(e)}"
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _acompute(
         self,
@@ -329,7 +311,16 @@ async def aevaluate(
             ...     ]
             ... )
         """
-        score, details = await self._acompute(response, **kwargs)
+        try:
+            score, details = await self._acompute(response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating image helpfulness: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         if "error" in details:
             return GraderScore(
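The one-line `to_messages(self.language)` fix above matters because the template holds per-language prompt variants. A minimal sketch of the intent using stand-in classes (the real `PromptTemplate`/`LanguageEnum` APIs may differ):

```python
from enum import Enum


class Lang(str, Enum):  # stand-in for LanguageEnum, illustration only
    EN = "en"
    ZH = "zh"


class TwoLanguageTemplate:  # stand-in for PromptTemplate, illustration only
    def __init__(self, prompts):  # e.g. {"en": "...", "zh": "..."}
        self.prompts = prompts

    def to_messages(self, language: Lang = Lang.EN):
        # Without the explicit argument the grader always rendered the default
        # language, even when constructed with language=Lang.ZH.
        return [{"role": "user", "content": self.prompts[language.value]}]


tmpl = TwoLanguageTemplate({"en": "Rate the image.", "zh": "请评估图片。"})
print(tmpl.to_messages(Lang.ZH)[0]["content"])  # 请评估图片。
```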
diff --git a/openjudge/graders/multimodal/text_to_image.py b/openjudge/graders/multimodal/text_to_image.py
index ca37dd64..4cb645f4 100644
--- a/openjudge/graders/multimodal/text_to_image.py
+++ b/openjudge/graders/multimodal/text_to_image.py
@@ -20,6 +20,7 @@
 from openjudge.models.openai_chat_model import OpenAIChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
+from openjudge.utils.utils import parse_structured_chat_response


 # pylint: disable=line-too-long
@@ -259,37 +260,18 @@ async def _aevaluate_semantic_consistency(
         messages = self.semantic_template.to_messages(self.language)
         prompt = messages[0].format(query=query).content

-        try:
-            content = format_image_content(prompt, [response])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
+        content = format_image_content(prompt, [response])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-            # Handle both streaming and non-streaming responses
-            if hasattr(chat_response, "__aiter__"):
-                # This is a streaming response, we need to collect it first
-                collected_content = []
-                parsed = {}
-                async for chunk in chat_response:
-                    if chunk.content:
-                        collected_content.extend(chunk.content)
-                    if chunk.parsed:
-                        parsed.update(chunk.parsed)
-
-                # Extract score and reason from metadata
-                score = parsed.get("score", 0.0)
-                reason = parsed.get("reason", "")
-            else:
-                # Non-streaming response
-                score = chat_response.parsed["score"]
-                score = score if isinstance(score, list) else [score]
-                reason = chat_response.parsed["reason"]
-            return score, reason
-
-        except Exception as e:
-            logger.error(f"Error evaluating semantic consistency: {e}")
-            return [5.0], f"Error during evaluation: {str(e)}"
+        # Default to 5.0 (neutral score on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", 5.0)
+        score = score if isinstance(score, list) else [score]
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _aevaluate_perceptual_quality(
         self,
@@ -299,20 +281,18 @@ async def _aevaluate_perceptual_quality(
         messages = self.perceptual_template.to_messages(self.language)
         prompt = messages[0].content

-        try:
-            content = format_image_content(prompt, [response])
-            chat_response = await self.model.achat(
-                messages=[{"role": "user", "content": content}],
-                structured_model=GraderScoreCallback,
-            )
-            score = chat_response.parsed["score"]
-            score = score[:2] if isinstance(score, list) else [score, score]
-            reason = chat_response.parsed["reason"]
-            return score, reason
+        content = format_image_content(prompt, [response])
+        chat_response = await self.model.achat(
+            messages=[{"role": "user", "content": content}],
+            structured_model=GraderScoreCallback,
+        )

-        except Exception as e:
-            logger.error(f"Error evaluating perceptual quality: {e}")
-            return [5.0, 5.0], f"Error during evaluation: {str(e)}"
+        # Default to [5.0, 5.0] (neutral scores on 0-10 scale) for missing fields
+        parsed = await parse_structured_chat_response(chat_response)
+        score = parsed.get("score", [5.0, 5.0])
+        score = score[:2] if isinstance(score, list) else [score, score]
+        reason = parsed.get("reason", "")
+        return score, reason

     async def _a_compute(
         self,
@@ -406,7 +386,16 @@ async def aevaluate(
                 metadata={"error": "response must be MLLMImage"},
             )

-        score, details = await self._a_compute(query, response, **kwargs)
+        try:
+            score, details = await self._a_compute(query, response, **kwargs)
+        except Exception as e:
+            logger.exception(f"Error evaluating text-to-image: {e}")
+            from openjudge.graders.base_grader import GraderError
+
+            return GraderError(
+                name=self.name,
+                error=f"Evaluation error: {str(e)}",
+            )

         # Generate comprehensive reason
         reason = f"""Text-to-Image Quality Score: {score:.4f}
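The perceptual-quality path works with a two-element score list, so the patch normalizes whatever the judge returns. A small worked example of that normalization and the neutral fallback (the two-element interpretation is inferred from the `[5.0, 5.0]` default in the diff):

```python
def normalize_pair(parsed: dict) -> list:
    # Mirrors the normalization above: missing -> neutral pair, scalar -> duplicated,
    # longer list -> truncated to the first two entries.
    score = parsed.get("score", [5.0, 5.0])
    return score[:2] if isinstance(score, list) else [score, score]


print(normalize_pair({}))                        # [5.0, 5.0]
print(normalize_pair({"score": 7.0}))            # [7.0, 7.0]
print(normalize_pair({"score": [8.0, 6.0, 9.0]}))  # [8.0, 6.0]
```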
diff --git a/openjudge/utils/utils.py b/openjudge/utils/utils.py
index 92c971c1..b8967a16 100644
--- a/openjudge/utils/utils.py
+++ b/openjudge/utils/utils.py
@@ -6,7 +6,7 @@
 """

 import json
-from typing import Any, Dict, Type
+from typing import Any, Dict, Optional, Type

 from json_repair import repair_json
 from loguru import logger
@@ -203,3 +203,42 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
         metric_name = getattr(metric, "name", "unknown_metric")
         logger.error(f"{metric_name}: {error_msg}")
         raise ValueError(error_msg) from e
+
+
+async def parse_structured_chat_response(
+    chat_response: Any,
+    default: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """Parse structured response from streaming or non-streaming chat response.
+
+    For streaming responses, returns the last chunk's parsed result (complete).
+    For non-streaming responses, returns the parsed result directly.
+
+    Args:
+        chat_response: Chat response object from model.achat() with structured_model.
+            Can be either streaming (async iterator) or non-streaming.
+        default: Default dict to return if parsing fails. Defaults to empty dict.
+
+    Returns:
+        Dict[str, Any]: The parsed structured response containing fields like
+            'score' and 'reason'.
+
+    Example:
+        >>> response = await model.achat(messages, structured_model=GraderScoreCallback)
+        >>> parsed = await parse_structured_chat_response(response)
+        >>> score = parsed.get("score", 5.0)
+        >>> reason = parsed.get("reason", "")
+    """
+    if default is None:
+        default = {}
+
+    if hasattr(chat_response, "__aiter__"):
+        # Streaming response - only the last chunk contains complete result
+        parsed = None
+        async for chunk in chat_response:
+            if chunk.parsed:
+                parsed = chunk.parsed
+        return parsed if parsed is not None else default
+
+    # Non-streaming response
+    return chat_response.parsed if chat_response.parsed else default
diff --git a/tests/graders/common/test_correctness.py b/tests/graders/common/test_correctness.py
index ca0dc5f9..72c12224 100644
--- a/tests/graders/common/test_correctness.py
+++ b/tests/graders/common/test_correctness.py
@@ -73,7 +73,7 @@ async def test_correctness_grader_with_reference_response(self):
         mock_model = AsyncMock()
         grader = CorrectnessGrader(
             model=mock_model,
-            threshold=0.7,
+            threshold=3,
             language=LanguageEnum.EN,
         )

@@ -93,7 +93,7 @@ async def test_correctness_grader_with_reference_response(self):
         assert isinstance(result.score, (int, float))
         assert result.score >= 1 and result.score <= 5
         assert "Correctness score" in result.reason
-        assert result.metadata["threshold"] == 0.7
+        assert result.metadata["threshold"] == 3

         # Verify model was called correctly
         mock_achat.assert_called_once()
@@ -116,7 +116,7 @@ async def test_correctness_grader_without_reference_response(self):
         mock_model = AsyncMock()
         grader = CorrectnessGrader(
             model=mock_model,
-            threshold=0.7,
+            threshold=3,
             language=LanguageEnum.EN,
         )

@@ -132,7 +132,7 @@ async def test_correctness_grader_without_reference_response(self):
         # Verify result structure
         assert result.name == "correctness"
         assert isinstance(result.score, (int, float))
-        assert result.metadata["threshold"] == 0.7
+        assert result.metadata["threshold"] == 3

         # Verify model was called correctly
         mock_achat.assert_called_once()
diff --git a/tests/graders/common/test_harmfulness.py b/tests/graders/common/test_harmfulness.py
index 63713956..937e6286 100644
--- a/tests/graders/common/test_harmfulness.py
+++ b/tests/graders/common/test_harmfulness.py
@@ -52,10 +52,10 @@ def test_initialization(self):
         mock_model = AsyncMock()
         grader = HarmfulnessGrader(
             model=mock_model,
-            threshold=0.8,
+            threshold=3,
         )

         assert grader.name == "harmfulness"
-        assert grader.threshold == 0.8
+        assert grader.threshold == 3
         assert grader.model == mock_model

     @pytest.mark.asyncio
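The multimodal tests below mock only the non-streaming `.parsed` path. A hedged sketch of a test that would also exercise the streaming branch of `parse_structured_chat_response`; the chunk stand-in is assumed from the fact that the helper only reads `.parsed`:

```python
import asyncio

from openjudge.utils.utils import parse_structured_chat_response


class FakeChunk:  # minimal stand-in; only .parsed is read by the helper
    def __init__(self, parsed):
        self.parsed = parsed


async def fake_stream():
    yield FakeChunk(None)                               # ignored: no parsed payload
    yield FakeChunk({"score": 3.0, "reason": "partial"})
    yield FakeChunk({"score": 8.0, "reason": "final"})  # last parsed chunk wins


async def test_streaming_branch():
    parsed = await parse_structured_chat_response(fake_stream())
    assert parsed == {"score": 8.0, "reason": "final"}


asyncio.run(test_streaming_branch())
```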
mock_image, "Text after"], ) - # Assertions - assert result.score == 0.0 - assert "Evaluation error: API Error" in result.reason + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ==================== diff --git a/tests/graders/multimodal/test_image_helpfulness.py b/tests/graders/multimodal/test_image_helpfulness.py index 17c3a08f..0b3f0c34 100644 --- a/tests/graders/multimodal/test_image_helpfulness.py +++ b/tests/graders/multimodal/test_image_helpfulness.py @@ -62,7 +62,7 @@ async def test_successful_evaluation(self): # Create a simple mock response object (not AsyncMock to avoid __aiter__ check) class MockResponse: def __init__(self): - self.metadata = { + self.parsed = { "score": 8.0, # Will be normalized to 0.8 (divided by 10) "reason": "Image is very helpful for understanding the text", } @@ -92,6 +92,8 @@ def __init__(self): @pytest.mark.asyncio async def test_error_handling(self): """Test graceful error handling""" + from openjudge.graders.base_grader import GraderError + # Create mock model that raises exception mock_model = AsyncMock() mock_model.achat = AsyncMock(side_effect=Exception("API Error")) @@ -105,9 +107,9 @@ async def test_error_handling(self): response=["Text before", mock_image, "Text after"], ) - # Assertions - assert result.score == 0.0 - assert "Evaluation error: API Error" in result.reason + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ==================== diff --git a/tests/graders/multimodal/test_text_to_image.py b/tests/graders/multimodal/test_text_to_image.py index d695eb9a..4890825a 100644 --- a/tests/graders/multimodal/test_text_to_image.py +++ b/tests/graders/multimodal/test_text_to_image.py @@ -64,7 +64,7 @@ async def test_successful_evaluation(self): # Create simple mock response objects (not AsyncMock to avoid __aiter__ check) class MockResponse: def __init__(self, score, reason): - self.metadata = {"score": score, "reason": reason} + self.parsed = {"score": score, "reason": reason} # TextToImageGrader calls model twice (semantic + perceptual) mock_semantic = MockResponse(8.0, "Good semantic consistency") @@ -95,6 +95,8 @@ def __init__(self, score, reason): @pytest.mark.asyncio async def test_error_handling(self): """Test graceful error handling""" + from openjudge.graders.base_grader import GraderError + # Create mock model that raises exception mock_model = AsyncMock(spec=BaseChatModel) mock_model.achat = AsyncMock(side_effect=Exception("API Error")) @@ -109,10 +111,9 @@ async def test_error_handling(self): response=mock_image, ) - # Assertions - # TextToImageGrader returns 0.5 (default) on error, not 0.0 - assert result.score == 0.5 - assert "error" in result.reason.lower() + # Assertions - grader returns GraderError on exception + assert isinstance(result, GraderError) + assert "Evaluation error: API Error" in result.error # ==================== QUALITY TESTS ====================