Add math problem solver using curator.Prompter #177

Closed
wants to merge 4 commits
94 changes: 94 additions & 0 deletions examples/math_autobencher.py
@@ -0,0 +1,94 @@
"""Math problem auto-bencher using curator.Prompter."""

from typing import List, Optional
from datasets import Dataset

from math_solver import MathProblem, MathResult
from math_prompter import MathPrompter


class MathAutoBencher:
"""Automated math problem solver and benchmarker."""

def __init__(
self, model_name: str = "gpt-4o-mini", batch_size: int = 20, temperature: float = 0.2
):
"""Initialize the auto-bencher with specified parameters."""
self.prompter = MathPrompter(
model_name=model_name, batch=True, batch_size=batch_size, temperature=temperature
)

def run_benchmark(self, problems: List[dict], output_file: Optional[str] = None) -> Dataset:
"""
Run benchmark on a list of math problems.

Args:
problems: List of problem dictionaries with 'question' and optional 'expected_answer'
output_file: Optional path to save results

Returns:
Dataset containing benchmark results
"""
# Convert problems to dataset
dataset = Dataset.from_list(problems)

# Process all problems
results = self.prompter(dataset)

if output_file:
# Save results to file
results.to_json(output_file)

return results

def analyze_results(self, results: Dataset) -> dict:
"""
Analyze benchmark results.

Args:
results: Dataset containing benchmark results

Returns:
Dictionary with analysis metrics
"""
total = len(results)
correct = sum(1 for result in results if result.get("is_correct", False))
errors = sum(1 for result in results if result.get("error") is not None)

return {
"total_problems": total,
"correct_answers": correct,
"accuracy": correct / total if total > 0 else 0,
"errors": errors,
"error_rate": errors / total if total > 0 else 0,
}


def main():
"""Example usage of MathAutoBencher."""
# Example problems
problems = [
{"question": "What is 15 + 27?", "expected_answer": 42},
{"question": "If x = 5 and y = 3, what is x * y?", "expected_answer": 15},
{
"question": "Calculate the area of a rectangle with width 8 and height 6.",
"expected_answer": 48,
},
]

# Initialize and run benchmark
bencher = MathAutoBencher()
results = bencher.run_benchmark(problems, output_file="benchmark_results.json")

# Analyze results
analysis = bencher.analyze_results(results)
print("\nBenchmark Analysis:")
print(f"Total Problems: {analysis['total_problems']}")
print(f"Correct Answers: {analysis['correct_answers']}")
print(f"Accuracy: {analysis['accuracy']:.2%}")
print(f"Errors: {analysis['errors']}")
print(f"Error Rate: {analysis['error_rate']:.2%}")


if __name__ == "__main__":
main()
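
Note: both `math_autobencher.py` (above) and `math_prompter.py` (below) import `MathProblem`, `MathSolution`, `MathSolutions`, and `MathResult` from `examples/math_solver.py`, which does not appear in this view of the diff. A rough sketch of what those Pydantic models presumably look like, inferred only from how their fields are used in these files (types and defaults are assumptions):

```python
"""Hypothetical sketch of examples/math_solver.py (not shown in this diff);
field names are inferred from how the models are used in the example files."""

from typing import List, Optional, Union

from pydantic import BaseModel


class MathProblem(BaseModel):
    """A math problem, optionally with a known answer for scoring."""

    question: str
    expected_answer: Optional[Union[int, float, str]] = None


class MathSolution(BaseModel):
    """One generated solution: Python code, the printed answer, and a rationale."""

    python_code: str
    answer: Union[int, float, str] = "N/A"
    explanation: str


class MathSolutions(BaseModel):
    """Wrapper used as the Prompter's structured response_format."""

    solutions: List[MathSolution]


class MathResult(BaseModel):
    """Outcome for one problem, including correctness when an expected answer exists."""

    question: str
    solution: MathSolution
    is_correct: Optional[bool] = None
    error: Optional[str] = None
```

`MathSolutions` is the type passed as `response_format` to the Prompter in `math_prompter.py` below, which is why it only wraps a list of solutions.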
63 changes: 63 additions & 0 deletions examples/math_executor.py
@@ -0,0 +1,63 @@
"""Safe Python code execution utilities for math problem solving."""

import ast
import contextlib
import io
from typing import Tuple, Optional


def is_safe_ast(tree: ast.AST) -> bool:
"""Check if the AST contains only safe operations."""
for node in ast.walk(tree):
# Block imports
if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
return False
# Block exec/eval
if isinstance(node, ast.Call):
if isinstance(node.func, ast.Name):
if node.func.id in ["exec", "eval", "compile"]:
return False
# Block attribute access that might be dangerous
if isinstance(node, ast.Attribute):
if node.attr in ["open", "read", "write", "system"]:
return False
return True


def execute_math_code(code: str, timeout: int = 5) -> Tuple[Optional[str], Optional[str]]:
    """
    Execute Python code with AST safety checks and output capture.

    Args:
        code: Python code to execute
        timeout: Maximum execution time in seconds (accepted for API compatibility
            but not currently enforced by this function)

    Returns:
        Tuple of (result, error_message)
    """
try:
# Parse and validate AST
tree = ast.parse(code)
if not is_safe_ast(tree):
return None, "Code contains unsafe operations"

# Capture stdout
stdout = io.StringIO()
stderr = io.StringIO()

# Execute with timeout and output capture
with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
exec(compile(tree, "<string>", "exec"), {"__builtins__": {"print": print}}, {})

error = stderr.getvalue().strip()
if error:
return None, f"Execution error: {error}"

result = stdout.getvalue().strip()
return result, None

except SyntaxError as e:
return None, f"Syntax error: {str(e)}"
except Exception as e:
return None, f"Runtime error: {str(e)}"
170 changes: 170 additions & 0 deletions examples/math_prompter.py
@@ -0,0 +1,170 @@
"""Math problem solver using curator.Prompter with safe code execution."""

from typing import Dict, Any, Optional, Union

from datasets import Dataset

from bespokelabs import curator
from math_solver import MathProblem, MathSolution, MathSolutions, MathResult
from math_executor import execute_math_code


class MathPrompter(curator.Prompter):
"""Prompter specialized for math problem solving with code execution."""

def __init__(
self,
model_name: str = "gpt-4o-mini",
temperature: float = 0.2,
batch: bool = False,
batch_size: Optional[int] = None,
):
"""Initialize MathPrompter with specialized prompt and parse functions."""

def prompt_func(problem: Union[Dict[str, Any], MathProblem]) -> Dict[str, str]:
"""Format the math problem for the LLM."""
if isinstance(problem, dict):
question = problem.get("question", "")
else:
question = problem.question

return {
"role": "user",
"content": (
f"Solve this math problem by writing Python code. The code should print the final answer.\n\n"
f"Problem: {question}\n\n"
f"Requirements:\n"
f"1. Write clear, simple Python code that solves the problem\n"
f"2. The code must print only the final answer\n"
f"3. Include a brief explanation of your solution approach\n"
f"4. Do not use any imports\n"
f"5. Only use basic Python operations\n"
),
}

def parse_func(
problem: Union[Dict[str, Any], MathProblem], response: MathSolutions
) -> MathResult:
"""Execute the solution code and validate results."""
            question = ""  # default so the catch-all handler below can always build a result
            try:
                if isinstance(problem, dict):
                    question = problem.get("question", "")
                    expected = problem.get("expected_answer")
                else:
                    question = problem.question
                    expected = problem.expected_answer

# Handle empty or invalid responses
if not response or not response.solutions:
return MathResult(
question=question,
solution=MathSolution(
python_code="", answer="N/A", explanation="Failed to generate solution"
),
is_correct=False,
error="No valid solution generated",
)

solution = response.solutions[0] # We only generate one solution per problem

# Execute the code
result, error = execute_math_code(solution.python_code)

if error:
return MathResult(
question=question,
solution=MathSolution(
python_code=solution.python_code,
answer="N/A",
explanation=solution.explanation,
),
is_correct=False,
error=error,
)

# Update the solution with the executed result
try:
# Try to convert string result to number if possible
try:
if "." in result:
result = float(result)
else:
result = int(result)
except (ValueError, TypeError):
# Keep as string if conversion fails
pass

solution.answer = result
except Exception as e:
return MathResult(
question=question,
solution=MathSolution(
python_code=solution.python_code,
answer="N/A",
explanation=solution.explanation,
),
is_correct=False,
error=f"Failed to process result: {str(e)}",
)

# Check if the answer matches expected (if provided)
is_correct = None
if expected is not None:
try:
# Convert both to same type for comparison
if isinstance(expected, (int, float)):
computed = float(str(result))
is_correct = abs(computed - float(expected)) < 1e-6
else:
is_correct = str(result).strip() == str(expected).strip()
except (ValueError, TypeError):
is_correct = False

return MathResult(question=question, solution=solution, is_correct=is_correct)

except Exception as e:
# Catch-all error handler
return MathResult(
question=question,
solution=MathSolution(
python_code="", answer="N/A", explanation="Error occurred during processing"
),
is_correct=False,
error=f"Unexpected error: {str(e)}",
)

super().__init__(
model_name=model_name,
prompt_func=prompt_func,
parse_func=parse_func,
response_format=MathSolutions,
batch=batch,
batch_size=batch_size,
temperature=temperature,
)

    def solve(
        self, problems: Union[str, Dict[str, Any], MathProblem, list]
    ) -> Union[MathResult, Dataset]:
"""
Solve one or more math problems.

Args:
problems: A single problem (as string, dict, or MathProblem) or list of problems

Returns:
MathResult or Dataset containing results
"""
# Convert string to dict format
if isinstance(problems, str):
problems = {"question": problems}

# Convert single problem to list
if not isinstance(problems, list):
problems = [problems]

        # Process problems as a Hugging Face Dataset (as in math_autobencher.py)
        dataset = Dataset.from_list(problems)
results = self(dataset)

# Return single result if only one problem
if len(problems) == 1:
return results[0]

return results
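
For reference, a minimal usage sketch of the `solve()` helper above (assumes an OpenAI API key is configured in the environment, as with the other curator examples; the printed row layout follows the MathResult fields described in this PR):

```python
"""Illustrative driver for MathPrompter.solve(); not part of this PR."""

from math_prompter import MathPrompter

prompter = MathPrompter(model_name="gpt-4o-mini")

# A single problem passed as a plain string returns one result.
single = prompter.solve("What is 12 * 7?")
print(single)

# A list of problems returns the full results dataset.
batch = prompter.solve(
    [
        {"question": "What is 15 + 27?", "expected_answer": 42},
        {"question": "What is 9 squared?", "expected_answer": 81},
    ]
)
for row in batch:
    print(row)
```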