diff --git a/cookbooks/zero_shot_evaluation/chart_generator.py b/cookbooks/zero_shot_evaluation/chart_generator.py new file mode 100644 index 00000000..ec1fcd02 --- /dev/null +++ b/cookbooks/zero_shot_evaluation/chart_generator.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +"""Chart generator for zero-shot evaluation results. + +This module provides visualization capabilities for evaluation results, +generating beautiful bar charts to display model win rates. +""" + +from pathlib import Path +from typing import List, Optional, Tuple + +from loguru import logger + +from cookbooks.zero_shot_evaluation.schema import ChartConfig + + +class WinRateChartGenerator: + """Generator for win rate comparison charts. + + Creates visually appealing bar charts showing model rankings + based on pairwise evaluation results. + + Attributes: + config: Chart configuration options + + Example: + >>> generator = WinRateChartGenerator(config) + >>> path = generator.generate( + ... rankings=[("GPT-4", 0.73), ("Claude", 0.65)], + ... output_dir="./results", + ... task_description="Translation evaluation", + ... ) + """ + + # Color palette - inspired by modern data visualization + ACCENT_COLOR = "#FF6B35" # Vibrant orange for best model + ACCENT_HATCH = "///" # Diagonal stripes pattern + BAR_COLORS = [ + "#4A4A4A", # Dark gray + "#6B6B6B", # Medium gray + "#8C8C8C", # Light gray + "#ADADAD", # Lighter gray + "#CECECE", # Very light gray + ] + + def __init__(self, config: Optional[ChartConfig] = None): + """Initialize chart generator. + + Args: + config: Chart configuration. Uses defaults if not provided. + """ + self.config = config or ChartConfig() + + def _configure_cjk_font(self, plt, font_manager) -> Optional[str]: + """Configure matplotlib to support CJK (Chinese/Japanese/Korean) characters. + + Attempts to find and use a system font that supports CJK characters. + Falls back gracefully if no suitable font is found. + + Returns: + Font name if found, None otherwise + """ + # Common CJK fonts on different platforms (simplified Chinese priority) + cjk_fonts = [ + # macOS - Simplified Chinese (verified available) + "Hiragino Sans GB", + "Songti SC", + "Kaiti SC", + "Heiti SC", + "Lantinghei SC", + "PingFang SC", + "STFangsong", + # Windows + "Microsoft YaHei", + "SimHei", + "SimSun", + # Linux + "Noto Sans CJK SC", + "WenQuanYi Micro Hei", + "Droid Sans Fallback", + # Generic + "Arial Unicode MS", + ] + + # Get available fonts + available_fonts = {f.name for f in font_manager.fontManager.ttflist} + + # Find the first available CJK font + for font_name in cjk_fonts: + if font_name in available_fonts: + plt.rcParams["font.sans-serif"] = [font_name] + plt.rcParams.get("font.sans-serif", []) + plt.rcParams["axes.unicode_minus"] = False # Fix minus sign display + logger.debug(f"Using CJK font: {font_name}") + return font_name + + # No CJK font found, log warning + logger.warning( + "No CJK font found. Chinese characters may not display correctly. " + "Consider installing a CJK font like 'Noto Sans CJK SC'." + ) + return None + + def generate( + self, + rankings: List[Tuple[str, float]], + output_dir: str, + task_description: Optional[str] = None, + total_queries: int = 0, + total_comparisons: int = 0, + ) -> Optional[Path]: + """Generate win rate bar chart. 
+ + Args: + rankings: List of (model_name, win_rate) tuples, sorted by win rate + output_dir: Directory to save the chart + task_description: Task description for subtitle + total_queries: Number of queries evaluated + total_comparisons: Number of pairwise comparisons + + Returns: + Path to saved chart file, or None if generation failed + """ + if not rankings: + logger.warning("No rankings data to visualize") + return None + + try: + import matplotlib.patches as mpatches + import matplotlib.pyplot as plt + import numpy as np + from matplotlib import font_manager + except ImportError: + logger.warning("matplotlib not installed. Install with: pip install matplotlib") + return None + + # Extract config values (defaults are centralized in ChartConfig schema) + figsize = self.config.figsize + dpi = self.config.dpi + fmt = self.config.format + show_values = self.config.show_values + highlight_best = self.config.highlight_best + custom_title = self.config.title + + # Prepare data (already sorted high to low) + model_names = [r[0] for r in rankings] + win_rates = [r[1] * 100 for r in rankings] # Convert to percentage + n_models = len(model_names) + + # Setup figure with modern styling (MUST be before font config) + plt.style.use("seaborn-v0_8-whitegrid") + + # Configure font for CJK (Chinese/Japanese/Korean) support + # This MUST be after plt.style.use() as style resets font settings + self._configure_cjk_font(plt, font_manager) + fig, ax = plt.subplots(figsize=figsize, dpi=dpi) + + # Create bar positions + x_pos = np.arange(n_models) + bar_width = 0.6 + + # Determine colors for each bar + colors = [] + edge_colors = [] + + for i in range(n_models): + if i == 0 and highlight_best: + # Best model gets accent color + colors.append(self.ACCENT_COLOR) + edge_colors.append(self.ACCENT_COLOR) + else: + # Other models get grayscale + color_idx = min(i - 1, len(self.BAR_COLORS) - 1) if highlight_best else min(i, len(self.BAR_COLORS) - 1) + colors.append(self.BAR_COLORS[color_idx]) + edge_colors.append(self.BAR_COLORS[color_idx]) + + # Draw bars + bars = ax.bar( + x_pos, + win_rates, + width=bar_width, + color=colors, + edgecolor=edge_colors, + linewidth=1.5, + zorder=3, + ) + + # Add hatch pattern to best model + if highlight_best and n_models > 0: + bars[0].set_hatch(self.ACCENT_HATCH) + bars[0].set_edgecolor("white") + + # Add value labels on top of bars + if show_values: + for i, (bar, rate) in enumerate(zip(bars, win_rates)): + height = bar.get_height() + ax.annotate( + f"{rate:.1f}", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 5), + textcoords="offset points", + ha="center", + va="bottom", + fontsize=12, + fontweight="bold", + color="#333333", + ) + + # Customize axes + ax.set_xticks(x_pos) + ax.set_xticklabels(model_names, fontsize=11, fontweight="medium") + ax.set_ylabel("Win Rate (%)", fontsize=12, fontweight="medium", labelpad=10) + ax.set_ylim(0, max(10, min(100, max(win_rates) * 1.15))) # Add headroom for labels + + # Remove top and right spines + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["left"].set_color("#CCCCCC") + ax.spines["bottom"].set_color("#CCCCCC") + + # Customize grid + ax.yaxis.grid(True, linestyle="--", alpha=0.5, color="#DDDDDD", zorder=0) + ax.xaxis.grid(False) + + # Title + title = custom_title or "Model Win Rate Comparison" + ax.set_title(title, fontsize=16, fontweight="bold", pad=20, color="#333333") + + # Subtitle with evaluation info + subtitle_parts = [] + if task_description: + # Truncate long descriptions 
+ desc = task_description[:60] + "..." if len(task_description) > 60 else task_description + subtitle_parts.append(f"Task: {desc}") + if total_queries > 0: + subtitle_parts.append(f"Queries: {total_queries}") + if total_comparisons > 0: + subtitle_parts.append(f"Comparisons: {total_comparisons}") + + if subtitle_parts: + subtitle = " | ".join(subtitle_parts) + ax.text( + 0.5, + 1.02, + subtitle, + transform=ax.transAxes, + ha="center", + va="bottom", + fontsize=10, + color="#666666", + style="italic", + ) + + # Create legend + legend_elements = [] + if highlight_best and n_models > 0: + best_patch = mpatches.Patch( + facecolor=self.ACCENT_COLOR, + edgecolor="white", + hatch=self.ACCENT_HATCH, + label=f"Best: {model_names[0]}", + ) + legend_elements.append(best_patch) + + if legend_elements: + ax.legend( + handles=legend_elements, + loc="upper right", + frameon=True, + framealpha=0.9, + fontsize=10, + ) + + # Tight layout + plt.tight_layout() + + # Save chart + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + chart_file = output_path / f"win_rate_chart.{fmt}" + + plt.savefig( + chart_file, + format=fmt, + dpi=dpi, + bbox_inches="tight", + facecolor="white", + edgecolor="none", + ) + plt.close(fig) + + logger.info(f"Win rate chart saved to {chart_file}") + return chart_file diff --git a/cookbooks/zero_shot_evaluation/checkpoint.py b/cookbooks/zero_shot_evaluation/checkpoint.py deleted file mode 100644 index 43a8b7b3..00000000 --- a/cookbooks/zero_shot_evaluation/checkpoint.py +++ /dev/null @@ -1,192 +0,0 @@ -# -*- coding: utf-8 -*- -"""Checkpoint management for evaluation pipeline.""" - -import json -from datetime import datetime -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Optional - -from loguru import logger -from pydantic import BaseModel, Field - -from cookbooks.zero_shot_evaluation.schema import GeneratedQuery - - -class EvaluationStage(str, Enum): - """Evaluation pipeline stages.""" - - NOT_STARTED = "not_started" - QUERIES_GENERATED = "queries_generated" - RESPONSES_COLLECTED = "responses_collected" - RUBRICS_GENERATED = "rubrics_generated" - EVALUATION_COMPLETE = "evaluation_complete" - - -class CheckpointData(BaseModel): - """Checkpoint data model.""" - - stage: EvaluationStage = Field(default=EvaluationStage.NOT_STARTED) - created_at: str = Field(default_factory=lambda: datetime.now().isoformat()) - updated_at: str = Field(default_factory=lambda: datetime.now().isoformat()) - - # Data files - queries_file: Optional[str] = None - responses_file: Optional[str] = None - rubrics_file: Optional[str] = None - - # Progress tracking - total_queries: int = 0 - collected_responses: int = 0 - evaluated_pairs: int = 0 - total_pairs: int = 0 - - -class CheckpointManager: - """Manage evaluation checkpoints for resume capability.""" - - CHECKPOINT_FILE = "checkpoint.json" - QUERIES_FILE = "queries.json" - RESPONSES_FILE = "responses.json" - RUBRICS_FILE = "rubrics.json" - - def __init__(self, output_dir: str): - """Initialize checkpoint manager. 
- - Args: - output_dir: Directory to store checkpoint files - """ - self.output_dir = Path(output_dir) - self.output_dir.mkdir(parents=True, exist_ok=True) - self._checkpoint: Optional[CheckpointData] = None - - @property - def checkpoint_path(self) -> Path: - return self.output_dir / self.CHECKPOINT_FILE - - def load(self) -> Optional[CheckpointData]: - """Load existing checkpoint if available.""" - if not self.checkpoint_path.exists(): - logger.info("No checkpoint found, starting fresh") - return None - - try: - with open(self.checkpoint_path, "r", encoding="utf-8") as f: - data = json.load(f) - self._checkpoint = CheckpointData(**data) - logger.info(f"Loaded checkpoint: stage={self._checkpoint.stage.value}") - return self._checkpoint - except Exception as e: - logger.warning(f"Failed to load checkpoint: {e}") - return None - - def save(self, checkpoint: CheckpointData) -> None: - """Save checkpoint to file.""" - checkpoint.updated_at = datetime.now().isoformat() - self._checkpoint = checkpoint - - with open(self.checkpoint_path, "w", encoding="utf-8") as f: - json.dump(checkpoint.model_dump(), f, indent=2, ensure_ascii=False) - - logger.debug(f"Checkpoint saved: stage={checkpoint.stage.value}") - - def save_queries(self, queries: List[GeneratedQuery]) -> str: - """Save generated queries.""" - file_path = self.output_dir / self.QUERIES_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump([q.model_dump() for q in queries], f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(queries)} queries to {file_path}") - return str(file_path) - - def load_queries(self) -> List[GeneratedQuery]: - """Load saved queries.""" - file_path = self.output_dir / self.QUERIES_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - data = json.load(f) - - queries = [GeneratedQuery(**item) for item in data] - logger.info(f"Loaded {len(queries)} queries from {file_path}") - return queries - - def save_responses(self, responses: List[Dict[str, Any]]) -> str: - """Save collected responses.""" - file_path = self.output_dir / self.RESPONSES_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump(responses, f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(responses)} responses to {file_path}") - return str(file_path) - - def load_responses(self) -> List[Dict[str, Any]]: - """Load saved responses.""" - file_path = self.output_dir / self.RESPONSES_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - responses = json.load(f) - - logger.info(f"Loaded {len(responses)} responses from {file_path}") - return responses - - def save_rubrics(self, rubrics: List[str]) -> str: - """Save generated rubrics.""" - file_path = self.output_dir / self.RUBRICS_FILE - - with open(file_path, "w", encoding="utf-8") as f: - json.dump(rubrics, f, indent=2, ensure_ascii=False) - - logger.info(f"Saved {len(rubrics)} rubrics to {file_path}") - return str(file_path) - - def load_rubrics(self) -> List[str]: - """Load saved rubrics.""" - file_path = self.output_dir / self.RUBRICS_FILE - - if not file_path.exists(): - return [] - - with open(file_path, "r", encoding="utf-8") as f: - rubrics = json.load(f) - - logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}") - return rubrics - - def update_stage( - self, - stage: EvaluationStage, - **kwargs, - ) -> None: - """Update checkpoint stage and save.""" - if self._checkpoint is None: - self._checkpoint = CheckpointData() - - 
self._checkpoint.stage = stage - for key, value in kwargs.items(): - if hasattr(self._checkpoint, key): - setattr(self._checkpoint, key, value) - - self.save(self._checkpoint) - - def clear(self) -> None: - """Clear all checkpoint data.""" - for file_name in [ - self.CHECKPOINT_FILE, - self.QUERIES_FILE, - self.RESPONSES_FILE, - self.RUBRICS_FILE, - ]: - file_path = self.output_dir / file_name - if file_path.exists(): - file_path.unlink() - - self._checkpoint = None - logger.info("Checkpoint cleared") diff --git a/cookbooks/zero_shot_evaluation/schema.py b/cookbooks/zero_shot_evaluation/schema.py index 6fa1be3d..7437d13a 100644 --- a/cookbooks/zero_shot_evaluation/schema.py +++ b/cookbooks/zero_shot_evaluation/schema.py @@ -92,12 +92,25 @@ class OutputConfig(BaseModel): output_dir: str = Field(default="./evaluation_results", description="Output directory") +class ChartConfig(BaseModel): + """Chart generation configuration.""" + + enabled: bool = Field(default=True, description="Whether to generate win rate chart") + title: Optional[str] = Field(default=None, description="Chart title (auto-generated if not set)") + figsize: tuple = Field(default=(12, 7), description="Figure size (width, height) in inches") + dpi: int = Field(default=150, ge=72, le=300, description="Image resolution") + format: Literal["png", "svg", "pdf"] = Field(default="png", description="Output format") + show_values: bool = Field(default=True, description="Show values on top of bars") + highlight_best: bool = Field(default=True, description="Highlight the best model with accent color") + + class ReportConfig(BaseModel): """Report generation configuration.""" enabled: bool = Field(default=False, description="Whether to generate report") language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en") include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section") + chart: ChartConfig = Field(default_factory=ChartConfig, description="Chart configuration") class ZeroShotConfig(BaseModel): diff --git a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py index 66ade84c..3bbbd770 100644 --- a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py +++ b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py @@ -23,6 +23,7 @@ from loguru import logger from pydantic import BaseModel, Field +from cookbooks.zero_shot_evaluation.chart_generator import WinRateChartGenerator from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector from cookbooks.zero_shot_evaluation.schema import ( @@ -57,6 +58,34 @@ class EvaluationStage(str, Enum): RUBRICS_GENERATED = "rubrics_generated" EVALUATION_COMPLETE = "evaluation_complete" + @classmethod + def get_order(cls, stage: "EvaluationStage") -> int: + """Get numeric order of a stage for comparison.""" + order = { + cls.NOT_STARTED: 0, + cls.QUERIES_GENERATED: 1, + cls.RESPONSES_COLLECTED: 2, + cls.RUBRICS_GENERATED: 3, + cls.EVALUATION_COMPLETE: 4, + } + return order.get(stage, -1) + + def __ge__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) >= self.get_order(other) + + def __gt__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) > self.get_order(other) + + def __le__(self, other: "EvaluationStage") -> bool: + """Compare stages by 
pipeline order, not string value.""" + return self.get_order(self) <= self.get_order(other) + + def __lt__(self, other: "EvaluationStage") -> bool: + """Compare stages by pipeline order, not string value.""" + return self.get_order(self) < self.get_order(other) + class _CheckpointData(BaseModel): """Internal checkpoint data model.""" @@ -630,7 +659,7 @@ async def evaluate( if queries: self._queries = queries logger.info(f"Using {len(queries)} provided queries") - elif checkpoint and checkpoint.stage.value >= EvaluationStage.QUERIES_GENERATED.value: + elif checkpoint and checkpoint.stage >= EvaluationStage.QUERIES_GENERATED: self._queries = self._checkpoint_mgr.load_queries() logger.info(f"Resumed {len(self._queries)} queries from checkpoint") elif not self._queries: @@ -644,7 +673,7 @@ async def evaluate( ) # Step 2: Collect or load responses - if checkpoint and checkpoint.stage.value >= EvaluationStage.RESPONSES_COLLECTED.value: + if checkpoint and checkpoint.stage >= EvaluationStage.RESPONSES_COLLECTED: self._responses = self._checkpoint_mgr.load_responses() logger.info(f"Resumed {len(self._responses)} responses from checkpoint") elif not self._responses: @@ -661,7 +690,7 @@ async def evaluate( if rubrics: self._rubrics = rubrics logger.info(f"Using {len(rubrics)} provided rubrics") - elif checkpoint and checkpoint.stage.value >= EvaluationStage.RUBRICS_GENERATED.value: + elif checkpoint and checkpoint.stage >= EvaluationStage.RUBRICS_GENERATED: self._rubrics = self._checkpoint_mgr.load_rubrics() logger.info(f"Resumed {len(self._rubrics)} rubrics from checkpoint") elif not self._rubrics: @@ -702,6 +731,10 @@ async def evaluate( if self.config.report.enabled: await self._generate_and_save_report(result) + # Step 7: Generate win rate chart if enabled (requires report.enabled) + if self.config.report.enabled and self.config.report.chart.enabled: + self._generate_win_rate_chart(result) + return result async def _generate_and_save_report(self, result: EvaluationResult) -> None: @@ -728,6 +761,21 @@ async def _generate_and_save_report(self, result: EvaluationResult) -> None: f.write(report) logger.info(f"Report saved to {report_path}") + def _generate_win_rate_chart(self, result: EvaluationResult) -> None: + """Generate and save win rate comparison chart.""" + logger.info("Step 7: Generating win rate chart...") + + chart_config = self.config.report.chart + generator = WinRateChartGenerator(config=chart_config) + + chart_path = generator.generate( + rankings=result.rankings, + output_dir=self.config.output.output_dir, + task_description=self.config.task.description, + total_queries=result.total_queries, + total_comparisons=result.total_comparisons, + ) + def _display_results(self, result: EvaluationResult) -> None: """Display evaluation results with formatted output.""" endpoint_names = list(self.config.target_endpoints.keys()) diff --git a/docs/applications/zero_shot_evaluation.md b/docs/applications/zero_shot_evaluation.md index 4fa9f2d7..3ab63257 100644 --- a/docs/applications/zero_shot_evaluation.md +++ b/docs/applications/zero_shot_evaluation.md @@ -10,7 +10,7 @@ Zero-shot evaluation is ideal for **model comparison**, **agent pipeline testing !!! tip "No Test Data Required" Unlike traditional evaluation, zero-shot evaluation generates its own test queries from the task description, eliminating the need for pre-existing test datasets. -The pipeline automates five steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → produce rankings. 
+The pipeline automates seven steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → analyze results → generate report → create visualization. | Step | Component | Description | |------|-----------|-------------| @@ -18,7 +18,9 @@ The pipeline automates five steps: generate test queries → collect responses | 2 | `ResponseCollector` | Collect responses from all target endpoints | | 3 | `TaskBasedRubricGenerator` | Generate evaluation criteria for the task | | 4 | `GradingRunner` | Run pairwise comparisons with judge model | -| 5 | `ZeroShotPipeline` | Analyze results and produce rankings | +| 5 | `PairwiseAnalyzer` | Analyze results and produce rankings | +| 6 | `ReportGenerator` | Generate detailed Markdown evaluation report | +| 7 | `WinRateChartGenerator` | Create win rate visualization chart | ## Quick Start @@ -267,6 +269,15 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons ============================================================ ``` + **Output Files:** + + | File | Description | + |------|-------------| + | `evaluation_report.md` | Detailed Markdown report with analysis | + | `win_rate_chart.png` | Visual bar chart for presentations | + | `comparison_details.json` | Traceable pairwise comparison records | + | `evaluation_results.json` | Structured result data (JSON) | + === "Query Generation Options" Fine-tune query generation behavior: @@ -312,6 +323,7 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons ``` evaluation_results/ ├── evaluation_report.md # Generated Markdown report + ├── win_rate_chart.png # Win rate visualization chart ├── comparison_details.json # All pairwise comparison details ├── evaluation_results.json # Final rankings and statistics ├── queries.json # Generated test queries @@ -322,6 +334,33 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons !!! tip "Example Report" View a real report: [Oncology Medical Translation Evaluation](sample_reports/oncology_translation_report.md) +=== "Win Rate Chart" + + Automatically generate a beautiful bar chart showing model win rates: + + ```yaml + report: + chart: + enabled: true # Enable chart generation (default: true) + title: null # Custom title (auto-generated if not set) + figsize: [12, 7] # Figure size (width, height) in inches + dpi: 150 # Image resolution (72-300) + format: "png" # Output format: png / svg / pdf + show_values: true # Show percentage values on bars + highlight_best: true # Highlight best model with accent color + ``` + + **Chart Features:** + + - 🥇 **Best model highlighted** with orange diagonal stripes + - 📊 **Gray gradient** for other models by rank + - 🔢 **Value labels** on top of each bar + - 🌏 **CJK font support** for Chinese/Japanese/Korean text + + ![Win Rate Chart Example](../images/win_rate_chart_example.png) + + *Example: Oncology medical translation evaluation with 5 models* + === "Checkpoint & Resume" Evaluations automatically save checkpoints for resumption after interruptions: @@ -348,6 +387,7 @@ Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all respons - Set `num_queries` to at least **20** for statistically meaningful results - Choose a **strong judge model** (at least as capable as models being evaluated) - Use `--save` flag to persist results for later analysis + - Use the generated **win rate chart** for presentations and reports !!! 
warning "Don't" - Use a judge model weaker than the models being evaluated diff --git a/docs/images/win_rate_chart_example.png b/docs/images/win_rate_chart_example.png new file mode 100644 index 00000000..1aed684a Binary files /dev/null and b/docs/images/win_rate_chart_example.png differ diff --git a/pyproject.toml b/pyproject.toml index 4a7bdbd3..2c045878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,8 @@ dev = [ "pytest-trio", "pytest-twisted", "twisted", - "python-dotenv" + "python-dotenv", + "matplotlib>=3.7.0,<4.0.0" ] verl = [ "transformers>=4.52.4,<5.0.0",
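
For reviewers who want to exercise the new chart path in isolation, here is a minimal sketch, assuming the patch above is applied and matplotlib is installed; the model names, win rates, and counts passed to `generate()` are illustrative placeholders, not real evaluation output.

```python
from cookbooks.zero_shot_evaluation.chart_generator import WinRateChartGenerator
from cookbooks.zero_shot_evaluation.schema import ChartConfig

# Field names mirror the ChartConfig added in schema.py; the values shown are its defaults.
config = ChartConfig(dpi=150, format="png", show_values=True, highlight_best=True)
generator = WinRateChartGenerator(config=config)

chart_path = generator.generate(
    # Rankings are (model_name, win_rate) tuples sorted high to low, as the pipeline provides.
    # These particular models and numbers are made up for illustration.
    rankings=[("GPT-4", 0.73), ("Claude", 0.65), ("Qwen", 0.48)],
    output_dir="./evaluation_results",
    task_description="Oncology medical translation",  # placeholder task description
    total_queries=20,
    total_comparisons=60,
)

if chart_path is None:
    # generate() degrades gracefully when matplotlib is missing or rankings are empty.
    print("Chart generation skipped")
else:
    print(f"Chart written to {chart_path}")
```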