Add math problem solver using curator.Prompter #177

Closed
wants to merge 4 commits
94 changes: 94 additions & 0 deletions examples/math_autobencher.py
@@ -0,0 +1,94 @@
"""Math problem auto-bencher using curator.Prompter."""

from typing import List, Optional
from datasets import Dataset

from math_solver import MathProblem, MathResult
from math_prompter import MathPrompter


class MathAutoBencher:
"""Automated math problem solver and benchmarker."""

def __init__(
self, model_name: str = "gpt-4o-mini", batch_size: int = 20, temperature: float = 0.2
):
"""Initialize the auto-bencher with specified parameters."""
self.prompter = MathPrompter(
model_name=model_name, batch=True, batch_size=batch_size, temperature=temperature
)

def run_benchmark(self, problems: List[dict], output_file: Optional[str] = None) -> Dataset:
"""
Run benchmark on a list of math problems.

Args:
problems: List of problem dictionaries with 'question' and optional 'expected_answer'
output_file: Optional path to save results

Returns:
Dataset containing benchmark results
"""
# Convert problems to dataset
dataset = Dataset.from_list(problems)

# Process all problems
results = self.prompter(dataset)

if output_file:
# Save results to file
results.to_json(output_file)

return results

def analyze_results(self, results: Dataset) -> dict:
"""
Analyze benchmark results.

Args:
results: Dataset containing benchmark results

Returns:
Dictionary with analysis metrics
"""
total = len(results)
correct = sum(1 for result in results if result.get("is_correct", False))
errors = sum(1 for result in results if result.get("error") is not None)

return {
"total_problems": total,
"correct_answers": correct,
"accuracy": correct / total if total > 0 else 0,
"errors": errors,
"error_rate": errors / total if total > 0 else 0,
}


def main():
"""Example usage of MathAutoBencher."""
# Example problems
problems = [
{"question": "What is 15 + 27?", "expected_answer": 42},
{"question": "If x = 5 and y = 3, what is x * y?", "expected_answer": 15},
{
"question": "Calculate the area of a rectangle with width 8 and height 6.",
"expected_answer": 48,
},
]

# Initialize and run benchmark
bencher = MathAutoBencher()
results = bencher.run_benchmark(problems, output_file="benchmark_results.json")

# Analyze results
analysis = bencher.analyze_results(results)
print("\nBenchmark Analysis:")
print(f"Total Problems: {analysis['total_problems']}")
print(f"Correct Answers: {analysis['correct_answers']}")
print(f"Accuracy: {analysis['accuracy']:.2%}")
print(f"Errors: {analysis['errors']}")
print(f"Error Rate: {analysis['error_rate']:.2%}")


if __name__ == "__main__":
main()
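
Note: both `math_autobencher.py` (above) and `math_prompter.py` (below) import `MathProblem`, `MathSolution`, `MathSolutions`, and `MathResult` from `examples/math_solver.py`, which does not appear in this view of the diff. A rough sketch of what those Pydantic models presumably look like, inferred only from how their fields are used in these files (types and defaults are assumptions):

```python
"""Hypothetical sketch of examples/math_solver.py (not shown in this diff);
field names are inferred from how the models are used in the example files."""

from typing import List, Optional, Union

from pydantic import BaseModel


class MathProblem(BaseModel):
    """A math problem, optionally with a known answer for scoring."""

    question: str
    expected_answer: Optional[Union[int, float, str]] = None


class MathSolution(BaseModel):
    """One generated solution: Python code, the printed answer, and a rationale."""

    python_code: str
    answer: Union[int, float, str] = "N/A"
    explanation: str


class MathSolutions(BaseModel):
    """Wrapper used as the Prompter's structured response_format."""

    solutions: List[MathSolution]


class MathResult(BaseModel):
    """Outcome for one problem, including correctness when an expected answer exists."""

    question: str
    solution: MathSolution
    is_correct: Optional[bool] = None
    error: Optional[str] = None
```

`MathSolutions` is the type passed as `response_format` to the Prompter in `math_prompter.py` below, which is why it only wraps a list of solutions.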
63 changes: 63 additions & 0 deletions examples/math_executor.py
@@ -0,0 +1,63 @@
"""Safe Python code execution utilities for math problem solving."""

import ast
import contextlib
import io
from typing import Tuple, Optional


def is_safe_ast(tree: ast.AST) -> bool:
"""Check if the AST contains only safe operations."""
for node in ast.walk(tree):
# Block imports
if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
return False
# Block exec/eval
if isinstance(node, ast.Call):
if isinstance(node.func, ast.Name):
if node.func.id in ["exec", "eval", "compile"]:
return False
# Block attribute access that might be dangerous
if isinstance(node, ast.Attribute):
if node.attr in ["open", "read", "write", "system"]:
return False
return True


def execute_math_code(code: str, timeout: int = 5) -> Tuple[Optional[str], Optional[str]]:
    """
    Execute Python code with AST safety checks and output capture.

    Args:
        code: Python code to execute
        timeout: Maximum execution time in seconds (accepted for API compatibility
            but not currently enforced by this function)

    Returns:
        Tuple of (result, error_message)
    """
try:
# Parse and validate AST
tree = ast.parse(code)
if not is_safe_ast(tree):
return None, "Code contains unsafe operations"

# Capture stdout
stdout = io.StringIO()
stderr = io.StringIO()

# Execute with timeout and output capture
with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
exec(compile(tree, "<string>", "exec"), {"__builtins__": {"print": print}}, {})

error = stderr.getvalue().strip()
if error:
return None, f"Execution error: {error}"

result = stdout.getvalue().strip()
return result, None

except SyntaxError as e:
return None, f"Syntax error: {str(e)}"
except Exception as e:
return None, f"Runtime error: {str(e)}"
170 changes: 170 additions & 0 deletions examples/math_prompter.py
@@ -0,0 +1,170 @@
"""Math problem solver using curator.Prompter with safe code execution."""

from typing import Dict, Any, Optional, Union

from datasets import Dataset

from bespokelabs import curator
from math_solver import MathProblem, MathSolution, MathSolutions, MathResult
from math_executor import execute_math_code


class MathPrompter(curator.Prompter):
"""Prompter specialized for math problem solving with code execution."""

def __init__(
self,
model_name: str = "gpt-4o-mini",
temperature: float = 0.2,
batch: bool = False,
batch_size: Optional[int] = None,
):
"""Initialize MathPrompter with specialized prompt and parse functions."""

def prompt_func(problem: Union[Dict[str, Any], MathProblem]) -> Dict[str, str]:
"""Format the math problem for the LLM."""
if isinstance(problem, dict):
question = problem.get("question", "")
else:
question = problem.question

return {
"role": "user",
"content": (
f"Solve this math problem by writing Python code. The code should print the final answer.\n\n"
f"Problem: {question}\n\n"
f"Requirements:\n"
f"1. Write clear, simple Python code that solves the problem\n"
f"2. The code must print only the final answer\n"
f"3. Include a brief explanation of your solution approach\n"
f"4. Do not use any imports\n"
f"5. Only use basic Python operations\n"
),
}

def parse_func(
problem: Union[Dict[str, Any], MathProblem], response: MathSolutions
) -> MathResult:
"""Execute the solution code and validate results."""
            question = ""  # default so the catch-all handler below can always build a result
            try:
                if isinstance(problem, dict):
                    question = problem.get("question", "")
                    expected = problem.get("expected_answer")
                else:
                    question = problem.question
                    expected = problem.expected_answer

# Handle empty or invalid responses
if not response or not response.solutions:
return MathResult(
question=question,
solution=MathSolution(
python_code="", answer="N/A", explanation="Failed to generate solution"
),
is_correct=False,
error="No valid solution generated",
)

solution = response.solutions[0] # We only generate one solution per problem

# Execute the code
result, error = execute_math_code(solution.python_code)

if error:
return MathResult(
question=question,
solution=MathSolution(
python_code=solution.python_code,
answer="N/A",
explanation=solution.explanation,
),
is_correct=False,
error=error,
)

# Update the solution with the executed result
try:
# Try to convert string result to number if possible
try:
if "." in result:
result = float(result)
else:
result = int(result)
except (ValueError, TypeError):
# Keep as string if conversion fails
pass

solution.answer = result
except Exception as e:
return MathResult(
question=question,
solution=MathSolution(
python_code=solution.python_code,
answer="N/A",
explanation=solution.explanation,
),
is_correct=False,
error=f"Failed to process result: {str(e)}",
)

# Check if the answer matches expected (if provided)
is_correct = None
if expected is not None:
try:
# Convert both to same type for comparison
if isinstance(expected, (int, float)):
computed = float(str(result))
is_correct = abs(computed - float(expected)) < 1e-6
else:
is_correct = str(result).strip() == str(expected).strip()
except (ValueError, TypeError):
is_correct = False

return MathResult(question=question, solution=solution, is_correct=is_correct)

except Exception as e:
# Catch-all error handler
return MathResult(
question=question,
solution=MathSolution(
python_code="", answer="N/A", explanation="Error occurred during processing"
),
is_correct=False,
error=f"Unexpected error: {str(e)}",
)

super().__init__(
model_name=model_name,
prompt_func=prompt_func,
parse_func=parse_func,
response_format=MathSolutions,
batch=batch,
batch_size=batch_size,
temperature=temperature,
)

    def solve(
        self, problems: Union[str, Dict[str, Any], MathProblem, list]
    ) -> Union[MathResult, Dataset]:
"""
Solve one or more math problems.

Args:
problems: A single problem (as string, dict, or MathProblem) or list of problems

Returns:
MathResult or Dataset containing results
"""
# Convert string to dict format
if isinstance(problems, str):
problems = {"question": problems}

# Convert single problem to list
if not isinstance(problems, list):
problems = [problems]

        # Process problems as a Hugging Face Dataset (as in math_autobencher.py)
        dataset = Dataset.from_list(problems)
results = self(dataset)

# Return single result if only one problem
if len(problems) == 1:
return results[0]

return results
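
For reference, a minimal usage sketch of the `solve()` helper above (assumes an OpenAI API key is configured in the environment, as with the other curator examples; the printed row layout follows the MathResult fields described in this PR):

```python
"""Illustrative driver for MathPrompter.solve(); not part of this PR."""

from math_prompter import MathPrompter

prompter = MathPrompter(model_name="gpt-4o-mini")

# A single problem passed as a plain string returns one result.
single = prompter.solve("What is 12 * 7?")
print(single)

# A list of problems returns the full results dataset.
batch = prompter.solve(
    [
        {"question": "What is 15 + 27?", "expected_answer": 42},
        {"question": "What is 9 squared?", "expected_answer": 81},
    ]
)
for row in batch:
    print(row)
```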