diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b31b9ea --- /dev/null +++ b/.gitignore @@ -0,0 +1,210 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Cursor +# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to +# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data +# refer to https://docs.cursor.com/context/ignore-files +.cursorignore +.cursorindexingignore + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# SONiC Agent runtime data +results/ diff --git a/utils.py b/utils.py index 01ef7bd..dff38c2 100644 --- a/utils.py +++ b/utils.py @@ -33,11 +33,11 @@ def initialize_clients(api_provider): raise ValueError("OpenAI api key not found in environment variables") else: raise ValueError((f"Invalid api_provider name: {api_provider}. Must be 'sambanova', 'together', or 'openai'")) - + generator_client = openai.OpenAI(api_key=api_key, base_url=base_url) reflector_client = openai.OpenAI(api_key=api_key, base_url=base_url) curator_client = openai.OpenAI(api_key=api_key, base_url=base_url) - + print("Using Together API for all models") return generator_client, reflector_client, curator_client @@ -54,13 +54,13 @@ def get_section_slug(section_name): "others": "misc", "meta_strategies": "meta" } - + # Clean and convert to snake_case clean_name = section_name.lower().strip().replace(" ", "_").replace("&", "and") - + if clean_name in slug_map: return slug_map[clean_name] - + # Generate slug from first letters words = clean_name.split("_") if len(words) == 1: @@ -74,11 +74,11 @@ def extract_boxed_content(text): match = re.search(pattern, text) if not match: return None - + start = match.end() - 1 # Position of opening brace brace_count = 0 i = start - + while i < len(text): if text[i] == '{': brace_count += 1 @@ -95,28 +95,28 @@ def extract_answer(response): # First try JSON parsing parsed = json.loads(response) answer = str(parsed.get("final_answer", "No final answer found")) - return answer - + return answer + except (json.JSONDecodeError, KeyError, AttributeError): # JSON parsing failed, use fallback logic matches = re.findall(r"Finish\[(.*?)\]", response) if matches: answer = matches[-1] return answer - - # Try to get final answer from JSON style response with regex matching + + # Try to get final answer from JSON style response with regex matching # Try double quotes first matches = re.findall(r'"final_answer"\s*:\s*"([^"]*)"', response) if matches: answer = matches[-1] return answer - + # Try single quotes matches = re.findall(r"'final_answer'\s*:\s*'([^']*)'", response) if matches: answer = matches[-1] return answer - + # Handle JSON format without quotes (for simple expressions) matches = re.findall(r'[\'"]final_answer[\'"]\s*:\s*([^,}]+)', response) if matches: @@ -124,7 +124,7 @@ def extract_answer(response): # Clean up trailing characters answer = re.sub(r'[,}]*$', '', answer) return answer - + # Fallback for "The final answer is: X" pattern with boxed final_answer_pattern = r'[Tt]he final answer is:?\s*\$?\\boxed\{' match = re.search(final_answer_pattern, response) @@ -134,7 +134,7 @@ def extract_answer(response): boxed_content = extract_boxed_content(remaining_text) if boxed_content: return boxed_content - + # More general pattern for "final answer is X" matches = re.findall(r'[Tt]he final answer is:?\s*([^\n.]+)', response) if matches: @@ -144,9 +144,9 @@ def extract_answer(response): answer = answer.replace('$', '').strip() if answer: return answer - + return "No final answer found" - + enc = tiktoken.get_encoding("cl100k_base") def count_tokens(prompt: str) -> int: return len(enc.encode(prompt)) @@ -155,7 +155,7 @@ def count_tokens(prompt: str) -> int: def evaluate_single_test_sample(args_tuple, data_processor) -> Tuple[Dict, str]: """ Evaluate a single test sample - task-agnostic implementation. - + Args: args_tuple: Tuple of (index, task_dict, generator, playbook, max_tokens, log_dir, use_json_mode) data_processor: DataProcessor instance with answer_is_correct method @@ -192,11 +192,11 @@ def evaluate_single_test_sample(args_tuple, data_processor) -> Tuple[Dict, str]: def evaluate_test_set(data_processor, generator, playbook, test_samples, - max_tokens=4096, log_dir=None, max_workers=20, + max_tokens=4096, log_dir=None, max_workers=20, use_json_mode=False) -> Tuple[Dict, Dict]: """ Parallel evaluation of test set - task-agnostic implementation. - + Args: data_processor: DataProcessor instance with answer_is_correct and evaluate_accuracy methods generator: Generator instance @@ -206,7 +206,7 @@ def evaluate_test_set(data_processor, generator, playbook, test_samples, log_dir: Directory for logs max_workers: Number of parallel workers use_json_mode: Whether to use JSON mode - + Returns: Tuple of (results_dict, error_logs_dict) """ @@ -230,13 +230,13 @@ def eval_wrapper(args_tuple): with ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_args = { - executor.submit(eval_wrapper, args): args + executor.submit(eval_wrapper, args): args for args in args_list } for i, future in enumerate(as_completed(future_to_args), 1): result, error = future.result() - + if error: print(error) continue @@ -246,40 +246,40 @@ def eval_wrapper(args_tuple): results["total"] += 1 results["answers"].append(result["final_answer"]) results["targets"].append(result["target"]) - + if not result["is_correct"]: results["errors"].append({ "index": result["index"], "prediction": result["final_answer"], "ground_truth": result["target"] }) - + if result["final_answer"] == "No final answer found": results["no_answer"] += 1 if i % 50 == 0: curr_acc = results["correct"] / results["total"] if results["total"] > 0 else 0 print(f"Progress: {i}/{len(args_list)}, Accuracy: {curr_acc:.3f}") - + if results["answers"] and results["targets"]: accuracy = data_processor.evaluate_accuracy(results["answers"], results["targets"]) - + final_results = { "accuracy": accuracy, "correct": results["correct"], "total": results["total"], "no_answer": results["no_answer"] } - + error_logs = { "accuracy": accuracy, "errors": results["errors"] } - + print(f"\nšŸ“Š Final Accuracy: {accuracy:.3f} ({results['correct']}/{results['total']})") else: - results = {"accuracy": 0.0, "correct": 0, "total": 0} + final_results = {"accuracy": 0.0, "correct": 0, "total": 0} error_logs = {} print(f"\nšŸ“Š No valid results!") - + return final_results, error_logs \ No newline at end of file