diff --git a/cookbooks/zero_shot_evaluation/chart_generator.py b/cookbooks/zero_shot_evaluation/chart_generator.py new file mode 100644 index 00000000..ec1fcd02 --- /dev/null +++ b/cookbooks/zero_shot_evaluation/chart_generator.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +"""Chart generator for zero-shot evaluation results. + +This module provides visualization capabilities for evaluation results, +generating beautiful bar charts to display model win rates. +""" + +from pathlib import Path +from typing import List, Optional, Tuple + +from loguru import logger + +from cookbooks.zero_shot_evaluation.schema import ChartConfig + + +class WinRateChartGenerator: + """Generator for win rate comparison charts. + + Creates visually appealing bar charts showing model rankings + based on pairwise evaluation results. + + Attributes: + config: Chart configuration options + + Example: + >>> generator = WinRateChartGenerator(config) + >>> path = generator.generate( + ... rankings=[("GPT-4", 0.73), ("Claude", 0.65)], + ... output_dir="./results", + ... task_description="Translation evaluation", + ... ) + """ + + # Color palette - inspired by modern data visualization + ACCENT_COLOR = "#FF6B35" # Vibrant orange for best model + ACCENT_HATCH = "///" # Diagonal stripes pattern + BAR_COLORS = [ + "#4A4A4A", # Dark gray + "#6B6B6B", # Medium gray + "#8C8C8C", # Light gray + "#ADADAD", # Lighter gray + "#CECECE", # Very light gray + ] + + def __init__(self, config: Optional[ChartConfig] = None): + """Initialize chart generator. + + Args: + config: Chart configuration. Uses defaults if not provided. + """ + self.config = config or ChartConfig() + + def _configure_cjk_font(self, plt, font_manager) -> Optional[str]: + """Configure matplotlib to support CJK (Chinese/Japanese/Korean) characters. + + Attempts to find and use a system font that supports CJK characters. + Falls back gracefully if no suitable font is found. + + Returns: + Font name if found, None otherwise + """ + # Common CJK fonts on different platforms (simplified Chinese priority) + cjk_fonts = [ + # macOS - Simplified Chinese (verified available) + "Hiragino Sans GB", + "Songti SC", + "Kaiti SC", + "Heiti SC", + "Lantinghei SC", + "PingFang SC", + "STFangsong", + # Windows + "Microsoft YaHei", + "SimHei", + "SimSun", + # Linux + "Noto Sans CJK SC", + "WenQuanYi Micro Hei", + "Droid Sans Fallback", + # Generic + "Arial Unicode MS", + ] + + # Get available fonts + available_fonts = {f.name for f in font_manager.fontManager.ttflist} + + # Find the first available CJK font + for font_name in cjk_fonts: + if font_name in available_fonts: + plt.rcParams["font.sans-serif"] = [font_name] + plt.rcParams.get("font.sans-serif", []) + plt.rcParams["axes.unicode_minus"] = False # Fix minus sign display + logger.debug(f"Using CJK font: {font_name}") + return font_name + + # No CJK font found, log warning + logger.warning( + "No CJK font found. Chinese characters may not display correctly. " + "Consider installing a CJK font like 'Noto Sans CJK SC'." + ) + return None + + def generate( + self, + rankings: List[Tuple[str, float]], + output_dir: str, + task_description: Optional[str] = None, + total_queries: int = 0, + total_comparisons: int = 0, + ) -> Optional[Path]: + """Generate win rate bar chart. 
+ + Args: + rankings: List of (model_name, win_rate) tuples, sorted by win rate + output_dir: Directory to save the chart + task_description: Task description for subtitle + total_queries: Number of queries evaluated + total_comparisons: Number of pairwise comparisons + + Returns: + Path to saved chart file, or None if generation failed + """ + if not rankings: + logger.warning("No rankings data to visualize") + return None + + try: + import matplotlib.patches as mpatches + import matplotlib.pyplot as plt + import numpy as np + from matplotlib import font_manager + except ImportError: + logger.warning("matplotlib not installed. Install with: pip install matplotlib") + return None + + # Extract config values (defaults are centralized in ChartConfig schema) + figsize = self.config.figsize + dpi = self.config.dpi + fmt = self.config.format + show_values = self.config.show_values + highlight_best = self.config.highlight_best + custom_title = self.config.title + + # Prepare data (already sorted high to low) + model_names = [r[0] for r in rankings] + win_rates = [r[1] * 100 for r in rankings] # Convert to percentage + n_models = len(model_names) + + # Setup figure with modern styling (MUST be before font config) + plt.style.use("seaborn-v0_8-whitegrid") + + # Configure font for CJK (Chinese/Japanese/Korean) support + # This MUST be after plt.style.use() as style resets font settings + self._configure_cjk_font(plt, font_manager) + fig, ax = plt.subplots(figsize=figsize, dpi=dpi) + + # Create bar positions + x_pos = np.arange(n_models) + bar_width = 0.6 + + # Determine colors for each bar + colors = [] + edge_colors = [] + + for i in range(n_models): + if i == 0 and highlight_best: + # Best model gets accent color + colors.append(self.ACCENT_COLOR) + edge_colors.append(self.ACCENT_COLOR) + else: + # Other models get grayscale + color_idx = min(i - 1, len(self.BAR_COLORS) - 1) if highlight_best else min(i, len(self.BAR_COLORS) - 1) + colors.append(self.BAR_COLORS[color_idx]) + edge_colors.append(self.BAR_COLORS[color_idx]) + + # Draw bars + bars = ax.bar( + x_pos, + win_rates, + width=bar_width, + color=colors, + edgecolor=edge_colors, + linewidth=1.5, + zorder=3, + ) + + # Add hatch pattern to best model + if highlight_best and n_models > 0: + bars[0].set_hatch(self.ACCENT_HATCH) + bars[0].set_edgecolor("white") + + # Add value labels on top of bars + if show_values: + for i, (bar, rate) in enumerate(zip(bars, win_rates)): + height = bar.get_height() + ax.annotate( + f"{rate:.1f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 5), + textcoords="offset points", + ha="center", + va="bottom", + fontsize=12, + fontweight="bold", + color="#333333", + ) + + # Customize axes + ax.set_xticks(x_pos) + ax.set_xticklabels(model_names, fontsize=11, fontweight="medium") + ax.set_ylabel("Win Rate (%)", fontsize=12, fontweight="medium", labelpad=10) + ax.set_ylim(0, max(10, min(100, max(win_rates) * 1.15))) # Add headroom for labels + + # Remove top and right spines + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["left"].set_color("#CCCCCC") + ax.spines["bottom"].set_color("#CCCCCC") + + # Customize grid + ax.yaxis.grid(True, linestyle="--", alpha=0.5, color="#DDDDDD", zorder=0) + ax.xaxis.grid(False) + + # Title + title = custom_title or "Model Win Rate Comparison" + ax.set_title(title, fontsize=16, fontweight="bold", pad=20, color="#333333") + + # Subtitle with evaluation info + subtitle_parts = [] + if task_description: + # Truncate long descriptions 
+ desc = task_description[:60] + "..." if len(task_description) > 60 else task_description + subtitle_parts.append(f"Task: {desc}") + if total_queries > 0: + subtitle_parts.append(f"Queries: {total_queries}") + if total_comparisons > 0: + subtitle_parts.append(f"Comparisons: {total_comparisons}") + + if subtitle_parts: + subtitle = " | ".join(subtitle_parts) + ax.text( + 0.5, + 1.02, + subtitle, + transform=ax.transAxes, + ha="center", + va="bottom", + fontsize=10, + color="#666666", + style="italic", + ) + + # Create legend + legend_elements = [] + if highlight_best and n_models > 0: + best_patch = mpatches.Patch( + facecolor=self.ACCENT_COLOR, + edgecolor="white", + hatch=self.ACCENT_HATCH, + label=f"Best: {model_names[0]}", + ) + legend_elements.append(best_patch) + + if legend_elements: + ax.legend( + handles=legend_elements, + loc="upper right", + frameon=True, + framealpha=0.9, + fontsize=10, + ) + + # Tight layout + plt.tight_layout() + + # Save chart + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + chart_file = output_path / f"win_rate_chart.{fmt}" + + plt.savefig( + chart_file, + format=fmt, + dpi=dpi, + bbox_inches="tight", + facecolor="white", + edgecolor="none", + ) + plt.close(fig) + + logger.info(f"Win rate chart saved to {chart_file}") + return chart_file diff --git a/cookbooks/zero_shot_evaluation/checkpoint.py b/cookbooks/zero_shot_evaluation/checkpoint.py deleted file mode 100644 index 43a8b7b3..00000000 --- a/cookbooks/zero_shot_evaluation/checkpoint.py +++ /dev/null @@ -1,192 +0,0 @@ -# -*- coding: utf-8 -*- -"""Checkpoint management for evaluation pipeline.""" - -import json -from datetime import datetime -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Optional - -from loguru import logger -from pydantic import BaseModel, Field - -from cookbooks.zero_shot_evaluation.schema import GeneratedQuery - - -class EvaluationStage(str, Enum): - """Evaluation pipeline stages.""" - - NOT_STARTED = "not_started" - QUERIES_GENERATED = "queries_generated" - RESPONSES_COLLECTED = "responses_collected" - RUBRICS_GENERATED = "rubrics_generated" - EVALUATION_COMPLETE = "evaluation_complete" - - -class CheckpointData(BaseModel): - """Checkpoint data model.""" - - stage: EvaluationStage = Field(default=EvaluationStage.NOT_STARTED) - created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) - updated_at: str = Field(default_factory=lambda: datetime.now().isoformat()) - - # Data files - queries_file: Optional[str] = None - responses_file: Optional[str] = None - rubrics_file: Optional[str] = None - - # Progress tracking - total_queries: int = 0 - collected_responses: int = 0 - evaluated_pairs: int = 0 - total_pairs: int = 0 - - -class CheckpointManager: - """Manage evaluation checkpoints for resume capability.""" - - CHECKPOINT_FILE = "checkpoint.json" - QUERIES_FILE = "queries.json" - RESPONSES_FILE = "responses.json" - RUBRICS_FILE = "rubrics.json" - - def __init__(self, output_dir: str): - """Initialize checkpoint manager. 
- - Args: - output_dir: Directory to store checkpoint files - """ - self.output_dir = Path(output_dir) - self.output_dir.mkdir(parents=True, exist_ok=True) - self._checkpoint: Optional[CheckpointData] = None - - @property - def checkpoint_path(self) -> Path: - return self.output_dir / self.CHECKPOINT_FILE - - def load(self) -> Optional[CheckpointData]: - """Load existing checkpoint if available.""" - if not self.checkpoint_path.exists(): - logger.info("No checkpoint found, starting fresh") - return None - - try: - with open(self.checkpoint_path, "r", encoding="utf-8") as f: - data = json.load(f) - self._checkpoint = CheckpointData(**data) - logger.info(f"Loaded checkpoint: stage={self._checkpoint.stage.value}") - return self._checkpoint - except Exception as e: - logger.warning(f"Failed to load checkpoint: {e}") - return None - - def save(self, checkpoint: CheckpointData) -> None: - """Save checkpoint to file.""" - checkpoint.updated_at = datetime.now().isoformat() - self._checkpoint = checkpoint - - with open(self.checkpoint_path, "w", encoding="utf-8") as f: - json.dump(checkpoint.model_dump(), f, indent=2, ensure_ascii=False) - - logger.debug(f"Checkpoint saved: stage={checkpoint.stage.value}") - - def save_queries(self, queries: List[GeneratedQuery]) -> str: - """Save generated queries.""" - file_path = self.output_dir / self.QUERIES_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump([q.model_dump() for q in queries], f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(queries)} queries to {file_path}") - return str(file_path) - - def load_queries(self) -> List[GeneratedQuery]: - """Load saved queries.""" - file_path = self.output_dir / self.QUERIES_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - - queries = [GeneratedQuery(**item) for item in data] - logger.info(f"Loaded {len(queries)} queries from {file_path}") - return queries - - def save_responses(self, responses: List[Dict[str, Any]]) -> str: - """Save collected responses.""" - file_path = self.output_dir / self.RESPONSES_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump(responses, f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(responses)} responses to {file_path}") - return str(file_path) - - def load_responses(self) -> List[Dict[str, Any]]: - """Load saved responses.""" - file_path = self.output_dir / self.RESPONSES_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - responses = json.load(f) - - logger.info(f"Loaded {len(responses)} responses from {file_path}") - return responses - - def save_rubrics(self, rubrics: List[str]) -> str: - """Save generated rubrics.""" - file_path = self.output_dir / self.RUBRICS_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump(rubrics, f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(rubrics)} rubrics to {file_path}") - return str(file_path) - - def load_rubrics(self) -> List[str]: - """Load saved rubrics.""" - file_path = self.output_dir / self.RUBRICS_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - rubrics = json.load(f) - - logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}") - return rubrics - - def update_stage( - self, - stage: EvaluationStage, - **kwargs, - ) -> None: - """Update checkpoint stage and save.""" - if self._checkpoint is None: - self._checkpoint = CheckpointData() - - 
self._checkpoint.stage = stage - for key, value in kwargs.items(): - if hasattr(self._checkpoint, key): - setattr(self._checkpoint, key, value) - - self.save(self._checkpoint) - - def clear(self) -> None: - """Clear all checkpoint data.""" - for file_name in [ - self.CHECKPOINT_FILE, - self.QUERIES_FILE, - self.RESPONSES_FILE, - self.RUBRICS_FILE, - ]: - file_path = self.output_dir / file_name - if file_path.exists(): - file_path.unlink() - - self._checkpoint = None - logger.info("Checkpoint cleared") diff --git a/cookbooks/zero_shot_evaluation/schema.py b/cookbooks/zero_shot_evaluation/schema.py index 6fa1be3d..7437d13a 100644 --- a/cookbooks/zero_shot_evaluation/schema.py +++ b/cookbooks/zero_shot_evaluation/schema.py @@ -92,12 +92,25 @@ class OutputConfig(BaseModel): output_dir: str = Field(default="./evaluation_results", description="Output directory") +class ChartConfig(BaseModel): + """Chart generation configuration.""" + + enabled: bool = Field(default=True, description="Whether to generate win rate chart") + title: Optional[str] = Field(default=None, description="Chart title (auto-generated if not set)") + figsize: tuple = Field(default=(12, 7), description="Figure size (width, height) in inches") + dpi: int = Field(default=150, ge=72, le=300, description="Image resolution") + format: Literal["png", "svg", "pdf"] = Field(default="png", description="Output format") + show_values: bool = Field(default=True, description="Show values on top of bars") + highlight_best: bool = Field(default=True, description="Highlight the best model with accent color") + + class ReportConfig(BaseModel): """Report generation configuration.""" enabled: bool = Field(default=False, description="Whether to generate report") language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en") include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section") + chart: ChartConfig = Field(default_factory=ChartConfig, description="Chart configuration") class ZeroShotConfig(BaseModel): diff --git a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py index 66ade84c..3bbbd770 100644 --- a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py +++ b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py @@ -23,6 +23,7 @@ from loguru import logger from pydantic import BaseModel, Field +from cookbooks.zero_shot_evaluation.chart_generator import WinRateChartGenerator from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector from cookbooks.zero_shot_evaluation.schema import ( @@ -57,6 +58,34 @@ class EvaluationStage(str, Enum): RUBRICS_GENERATED = "rubrics_generated" EVALUATION_COMPLETE = "evaluation_complete" + @classmethod + def get_order(cls, stage: "EvaluationStage") -> int: + """Get numeric order of a stage for comparison.""" + order = { + cls.NOT_STARTED: 0, + cls.QUERIES_GENERATED: 1, + cls.RESPONSES_COLLECTED: 2, + cls.RUBRICS_GENERATED: 3, + cls.EVALUATION_COMPLETE: 4, + } + return order.get(stage, -1) + + def __ge__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) >= self.get_order(other) + + def __gt__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) > self.get_order(other) + + def __le__(self, other: "EvaluationStage") -> bool: + """Compare stages by 
pipeline order, not string value.""" + return self.get_order(self) <= self.get_order(other) + + def __lt__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) < self.get_order(other) + class _CheckpointData(BaseModel): """Internal checkpoint data model.""" @@ -630,7 +659,7 @@ async def evaluate( if queries: self._queries = queries logger.info(f"Using {len(queries)} provided queries") - elif checkpoint and checkpoint.stage.value >= EvaluationStage.QUERIES_GENERATED.value: + elif checkpoint and checkpoint.stage >= EvaluationStage.QUERIES_GENERATED: self._queries = self._checkpoint_mgr.load_queries() logger.info(f"Resumed {len(self._queries)} queries from checkpoint") elif not self._queries: @@ -644,7 +673,7 @@ async def evaluate( ) # Step 2: Collect or load responses - if checkpoint and checkpoint.stage.value >= EvaluationStage.RESPONSES_COLLECTED.value: + if checkpoint and checkpoint.stage >= EvaluationStage.RESPONSES_COLLECTED: self._responses = self._checkpoint_mgr.load_responses() logger.info(f"Resumed {len(self._responses)} responses from checkpoint") elif not self._responses: @@ -661,7 +690,7 @@ async def evaluate( if rubrics: self._rubrics = rubrics logger.info(f"Using {len(rubrics)} provided rubrics") - elif checkpoint and checkpoint.stage.value >= EvaluationStage.RUBRICS_GENERATED.value: + elif checkpoint and checkpoint.stage >= EvaluationStage.RUBRICS_GENERATED: self._rubrics = self._checkpoint_mgr.load_rubrics() logger.info(f"Resumed {len(self._rubrics)} rubrics from checkpoint") elif not self._rubrics: @@ -702,6 +731,10 @@ async def evaluate( if self.config.report.enabled: await self._generate_and_save_report(result) + # Step 7: Generate win rate chart if enabled (requires report.enabled) + if self.config.report.enabled and self.config.report.chart.enabled: + self._generate_win_rate_chart(result) + return result async def _generate_and_save_report(self, result: EvaluationResult) -> None: @@ -728,6 +761,21 @@ async def _generate_and_save_report(self, result: EvaluationResult) -> None: f.write(report) logger.info(f"Report saved to {report_path}") + def _generate_win_rate_chart(self, result: EvaluationResult) -> None: + """Generate and save win rate comparison chart.""" + logger.info("Step 7: Generating win rate chart...") + + chart_config = self.config.report.chart + generator = WinRateChartGenerator(config=chart_config) + + chart_path = generator.generate( + rankings=result.rankings, + output_dir=self.config.output.output_dir, + task_description=self.config.task.description, + total_queries=result.total_queries, + total_comparisons=result.total_comparisons, + ) + def _display_results(self, result: EvaluationResult) -> None: """Display evaluation results with formatted output.""" endpoint_names = list(self.config.target_endpoints.keys()) diff --git a/docs/applications/zero_shot_evaluation.md b/docs/applications/zero_shot_evaluation.md index 4fa9f2d7..3ab63257 100644 --- a/docs/applications/zero_shot_evaluation.md +++ b/docs/applications/zero_shot_evaluation.md @@ -10,7 +10,7 @@ Zero-shot evaluation is ideal for **model comparison**, **agent pipeline testing !!! tip "No Test Data Required" Unlike traditional evaluation, zero-shot evaluation generates its own test queries from the task description, eliminating the need for pre-existing test datasets. -The pipeline automates five steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → produce rankings. 
+The pipeline automates seven steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → analyze results → generate report → create visualization. | Step | Component | Description | |------|-----------|-------------| @@ -18,7 +18,9 @@ The pipeline automates five steps: generate test queries → collect responses | 2 | `ResponseCollector` | Collect responses from all target endpoints | | 3 | `TaskBasedRubricGenerator` | Generate evaluation criteria for the task | | 4 | `GradingRunner` | Run pairwise comparisons with judge model | -| 5 | `ZeroShotPipeline` | Analyze results and produce rankings | +| 5 | `PairwiseAnalyzer` | Analyze results and produce rankings | +| 6 | `ReportGenerator` | Generate detailed Markdown evaluation report | +| 7 | `WinRateChartGenerator` | Create win rate visualization chart | ## Quick Start @@ -267,6 +269,15 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons ============================================================ ``` + **Output Files:** + + | File | Description | + |------|-------------| + | `evaluation_report.md` | Detailed Markdown report with analysis | + | `win_rate_chart.png` | Visual bar chart for presentations | + | `comparison_details.json` | Traceable pairwise comparison records | + | `evaluation_results.json` | Structured result data (JSON) | + === "Query Generation Options" Fine-tune query generation behavior: @@ -312,6 +323,7 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons ``` evaluation_results/ ├── evaluation_report.md # Generated Markdown report + ├── win_rate_chart.png # Win rate visualization chart ├── comparison_details.json # All pairwise comparison details ├── evaluation_results.json # Final rankings and statistics ├── queries.json # Generated test queries @@ -322,6 +334,33 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons !!! tip "Example Report" View a real report: [Oncology Medical Translation Evaluation](sample_reports/oncology_translation_report.md) +=== "Win Rate Chart" + + Automatically generate a beautiful bar chart showing model win rates: + + ```yaml + report: + chart: + enabled: true # Enable chart generation (default: true) + title: null # Custom title (auto-generated if not set) + figsize: [12, 7] # Figure size (width, height) in inches + dpi: 150 # Image resolution (72-300) + format: "png" # Output format: png / svg / pdf + show_values: true # Show percentage values on bars + highlight_best: true # Highlight best model with accent color + ``` + + **Chart Features:** + + - 🥇 **Best model highlighted** with orange diagonal stripes + - 📊 **Gray gradient** for other models by rank + - 🔢 **Value labels** on top of each bar + - 🌏 **CJK font support** for Chinese/Japanese/Korean text + + ![Win Rate Chart Example](../images/win_rate_chart_example.png) + + *Example: Oncology medical translation evaluation with 5 models* + === "Checkpoint & Resume" Evaluations automatically save checkpoints for resumption after interruptions: @@ -348,6 +387,7 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons - Set `num_queries` to at least **20** for statistically meaningful results - Choose a **strong judge model** (at least as capable as models being evaluated) - Use `--save` flag to persist results for later analysis + - Use the generated **win rate chart** for presentations and reports !!! 
warning "Don't" - Use a judge model weaker than the models being evaluated diff --git a/docs/images/win_rate_chart_example.png b/docs/images/win_rate_chart_example.png new file mode 100644 index 00000000..1aed684a Binary files /dev/null and b/docs/images/win_rate_chart_example.png differ diff --git a/pyproject.toml b/pyproject.toml index 4a7bdbd3..2c045878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,8 @@ dev = [ "pytest-trio", "pytest-twisted", "twisted", - "python-dotenv" + "python-dotenv", + "matplotlib>=3.7.0,<4.0.0" ] verl = [ "transformers>=4.52.4,<5.0.0",
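
For reviewers who want to exercise the new chart path in isolation, here is a minimal sketch, assuming the patch above is applied and matplotlib is installed; the model names, win rates, and counts passed to `generate()` are illustrative placeholders, not real evaluation output.

```python
from cookbooks.zero_shot_evaluation.chart_generator import WinRateChartGenerator
from cookbooks.zero_shot_evaluation.schema import ChartConfig

# Field names mirror the ChartConfig added in schema.py; the values shown are its defaults.
config = ChartConfig(dpi=150, format="png", show_values=True, highlight_best=True)
generator = WinRateChartGenerator(config=config)

chart_path = generator.generate(
    # Rankings are (model_name, win_rate) tuples sorted high to low, as the pipeline provides.
    # These particular models and numbers are made up for illustration.
    rankings=[("GPT-4", 0.73), ("Claude", 0.65), ("Qwen", 0.48)],
    output_dir="./evaluation_results",
    task_description="Oncology medical translation",  # placeholder task description
    total_queries=20,
    total_comparisons=60,
)

if chart_path is None:
    # generate() degrades gracefully when matplotlib is missing or rankings are empty.
    print("Chart generation skipped")
else:
    print(f"Chart written to {chart_path}")
```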