diff --git a/python/packages/agbench/benchmarks/AssistantBench/.gitignore b/python/packages/agbench/benchmarks/AssistantBench/.gitignore deleted file mode 100644 index f6c9d117b084..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/.gitignore +++ /dev/null @@ -1 +0,0 @@ -ENV.json \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample b/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample deleted file mode 100644 index 1f2c4915e3c7..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample +++ /dev/null @@ -1,5 +0,0 @@ -{ - "BING_API_KEY": "YOUR_KEY_KEY", - "HOMEPAGE": "https://www.bing.com/", - "WEB_SURFER_DEBUG_DIR": "/autogen/debug" -} diff --git a/python/packages/agbench/benchmarks/AssistantBench/README.md b/python/packages/agbench/benchmarks/AssistantBench/README.md deleted file mode 100644 index 30bcf881fb00..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# AssistantBench Benchmark - -This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modify the evaluation code from AssistantBench in [Scripts](Scripts) and retain the license including it here [LICENSE](Scripts/evaluate_utils/LICENSE). Please find the original AssistantBench evaluation code here [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation). - -### Setup Environment Variables for AgBench - -Navigate to AssistantBench - -```bash -cd benchmarks/AssistantBench -``` - -Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) - -```json -{ - "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", - "HOMEPAGE": "https://www.bing.com/", - "WEB_SURFER_DEBUG_DIR": "/autogen/debug", - "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", - "CHAT_COMPLETION_PROVIDER": "azure" -} -``` - -You can also use the openai client by replacing the last two entries in the ENV file by: - -- `CHAT_COMPLETION_PROVIDER='openai'` -- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: - -```json -{ - "api_key": "REPLACE_WITH_YOUR_API", - "model": "gpt-4o-2024-05-13" -} -``` - -Now initialize the tasks. - -```bash -python Scripts/init_tasks.py -``` - -Note: This will attempt to download AssistantBench from Huggingface, but this requires authentication. - -After running the script, you should see the new following folders and files: - -``` -. -./Downloads -./Downloads/AssistantBench -./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl -./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl -./Tasks -./Tasks/assistant_bench_v1.0_dev.jsonl -./Tasks/assistant_bench_v1.0_dev.jsonl -``` - -Then run `Scripts/init_tasks.py` again. - -Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. 
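For reference, each line in these task files is a JSON record whose structure is produced by `Scripts/init_tasks.py`; a single record might look roughly like the following (pretty-printed here, with placeholder values for illustration):

```json
{
  "id": "example_task_id",
  "template": ["Templates/MagenticOne"],
  "substitutions": {
    "scenario.py": { "__FILE_NAME__": "" },
    "expected_answer.txt": { "__EXPECTED_ANSWER__": "42" },
    "prompt.txt": { "__PROMPT__": "The task statement goes here." }
  },
  "difficulty": "...",
  "explanation": "...",
  "metadata": "...",
  "gold_url": "...",
  "set": "dev"
}
```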
- -### Running AssistantBench - -Now to run a specific subset of AssistantBench use: - -```bash -agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl -``` - -You should see the command line print the raw logs that shows the agents in action To see a summary of the results (e.g., task completion rates), in a new terminal run the following: - -```bash -agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne -``` - -## References - -Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). https://arxiv.org/abs/2407.15711 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py deleted file mode 100644 index 56d1a04faa67..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py +++ /dev/null @@ -1,127 +0,0 @@ -# This Script is slightly modified from the creators of the AssistantBench dataset https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py -import json -from evaluate_utils.evaluate_factory import get_evaluator -import numpy as np - - -def find_isnan(samp): - try: - if np.isnan(samp): - return True - else: - return False - except: - return False - - -def fix_ans(answer): - try: - answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}') - answer = answer.replace("': ", '": ') - return answer - except: - return answer - - -def parse_answer(answer): - if len(answer) == 1: - ans, is_num = fix_number(answer[0]) - if is_num: - return ans, "number" - try: - ans = json.loads(fix_ans(answer[0])) - return [ans], "json" - except: - ans, is_num = fix_number(answer[0]) - if is_num: - return ans, "number" - else: - return answer[0], "string" - else: - try: - ans = [json.loads(fix_ans(ex)) for ex in answer] - return ans, "json" - except: - return answer, "string list" - - -def fix_number(number): - if type(number) == str: - copy_ans = number - copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip() - copy_ans = copy_ans.strip() - copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "") - try: - return float(copy_ans), True - except: - return number, False - elif type(number) == int: - return float(number), True - else: - return number, True - - -def fix_prediction(prediction, gold_answer, evaluator): - if ( - type(prediction) == list - and len(prediction) == 1 - and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric())) - ): - prediction = fix_number(prediction[0]) - - if type(prediction) != list: - prediction, is_num = fix_number(prediction) - if evaluator == "json": - try: - prediction = [json.loads(pred) for pred in prediction.split("\n")] - except: - prediction = [prediction] - - if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0): - return prediction, False - - if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float: - return prediction, False - - return prediction, True - - -def question_scorer(prediction, gold_answer): - """ - prediction: str or list of str - gold_answer: str or list of str - - returns a float between 0 and 1 - """ - try: - try: - prediction = json.loads(prediction) - except: - prediction = prediction - - answer_list = 
( - [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer - ) - gold_answer, evaluator = parse_answer(answer_list) - prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator) - - has_ans = 1.0 - if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction): - has_ans = 0.0 - - if not run_eval: - return 0.0 - - metric_eval = get_evaluator(evaluator) - accuracy = metric_eval(prediction, gold_answer) - # double check if the accuracy is a number between 0 and 1 - if 0 <= accuracy <= 1: - return accuracy - else: - # throw exception - raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}") - except Exception as e: - print( - f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}" - ) - return 0.0 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py deleted file mode 100644 index 61c40acc72f8..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py +++ /dev/null @@ -1,232 +0,0 @@ -import os -import sys -import re -from agbench.tabulate_cmd import default_tabulate -import json -import pandas as pd -import sqlite3 -import glob -import numpy as np -sys.path.append(os.path.dirname(__file__)) - -from assistantbench_evaluator import question_scorer - -EXCLUDE_DIR_NAMES = ["__pycache__"] - - -def normalize_answer(a): - # Lower case - # Trim (left and right) - # standardize comma separated values - # Replace multiple spaces with one space - # Remove trailing punctuation - norm_answer = ", ".join(a.strip().lower().split(",")) - norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer)) - return norm_answer - - -def scorer(instance_dir): - # Read the expected answer - expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") - if not os.path.isfile(expected_answer_file): - return None - - expected_answer = None - with open(expected_answer_file, "rt") as fh: - expected_answer = fh.read().strip() - - # Read the console - console_log_file = os.path.join(instance_dir, "console_log.txt") - if not os.path.isfile(console_log_file): - return None - - console_log = "" - with open(console_log_file, "rt") as fh: - console_log = fh.read() - - final_answer = None - m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL) - if m: - final_answer = m.group(1).strip() - - # Missing the final answer line - if final_answer is None: - return None - # get accuracy from assistantbench util, no normalization done for accuracy - accuracy = question_scorer(final_answer, expected_answer) - n_ex = normalize_answer(expected_answer) - n_final = normalize_answer(final_answer) - return (accuracy, n_ex, n_final) - - -def get_number_of_chat_messages(chat_messages_dir): - result = 0 - for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): - with open(file, "r") as f: - content = json.load(f) - for agent, messages in content.items(): - result += len(messages) - return result - - -def main(args): - parsed_args, all_results = default_tabulate(args, scorer=scorer) - excel_path = parsed_args.excel - - if excel_path: - excel_dir = os.path.dirname(excel_path) or "." 
- if not os.path.exists(excel_dir): - os.makedirs(excel_dir, exist_ok=True) - - if not excel_path.endswith((".xlsx", ".xls")): - excel_path += ".xlsx" - - runlogs = ( - parsed_args.runlogs - if parsed_args.runlogs.endswith("/") - else parsed_args.runlogs + "/" - ) - - if os.path.isdir(runlogs): - task_ids = sorted( - [ - task_id - for task_id in os.listdir(runlogs) - if task_id not in EXCLUDE_DIR_NAMES - ], - key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)), - ) - else: - raise ValueError("please input a valid directory to tabulate result") - - trials = ( - sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) - if len(task_ids) > 0 - else [] - ) - dbnames = [ - [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] - for trial in trials - ] - - query = """ - SELECT cost, session_id, response, start_time, end_time - FROM ( - SELECT invocation_id, cost, session_id, response, start_time, end_time, - ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn - FROM chat_completions - ) - WHERE rn = 1; - """ - - with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: - for trial_index, each_trial in enumerate(dbnames): - result_df = pd.DataFrame( - columns=[ - "id", - "status", - "expected_answer", - "final_answer", - "cost", - "latency", - "num_of_llm_requests", - "num_of_chat_messages", - "prompt_tokens", - "completion_tokens", - "total_tokens", - "model", - ] - ) - - result_df_type_mapping = { - "id": str, - "status": bool, - "expected_answer": str, - "final_answer": str, - "cost": float, - "latency": float, - "num_of_llm_requests": int, - "num_of_chat_messages": int, - "prompt_tokens": int, - "completion_tokens": int, - "total_tokens": int, - } - - for dbname, scorer_results in zip(each_trial, all_results): - task_id = scorer_results[0] - scorer_result = scorer_results[trial_index + 1] - - status, expected_answer, final_answer = ( - scorer_result if scorer_result else (False, "", "") - ) - - con = sqlite3.connect(dbname) - - # TODO: if large amount of data, add chunksize - telemetry_df = pd.read_sql_query(query, con) - - earliest_starttime = pd.to_datetime( - telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f" - ).min() - latest_endtime = pd.to_datetime( - telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f" - ).max() - - num_of_chat_messages = get_number_of_chat_messages( - chat_messages_dir=os.path.dirname(dbname) - ) - result = { - "id": task_id, - "status": status, - "expected_answer": expected_answer, - "final_answer": final_answer, - "cost": telemetry_df["cost"].sum(), - "latency": ( - latest_endtime - earliest_starttime - ).total_seconds(), - "num_of_llm_requests": len(telemetry_df), - "num_of_chat_messages": num_of_chat_messages, - "prompt_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["prompt_tokens"] - if "usage" in json.loads(x) - and "prompt_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "completion_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["completion_tokens"] - if "usage" in json.loads(x) - and "completion_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "total_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["total_tokens"] - if "usage" in json.loads(x) - and "total_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "model": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["model"] - if "model" in json.loads(x) - else "" - ) - .unique(), - } - - result_df = 
result_df.astype(result_df_type_mapping) - result_df = pd.concat( - [result_df, pd.DataFrame([result])], ignore_index=True - ) - result_df.to_excel( - writer, sheet_name=f"trial_{trial_index}", index=False - ) - - -if __name__ == "__main__" and __package__ is None: - main(sys.argv) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE deleted file mode 100644 index f49a4e16e68b..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/__init__.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py deleted file mode 100644 index 9ce61c8cea99..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py +++ /dev/null @@ -1,71 +0,0 @@ -# From AssistantBench modified slightly. 
-from typing import Dict, List -import numpy as np - -from .utils import _align_bags - - -def calculate_f1_score(precision, recall): - if precision + recall == 0: - return 0 # Handle the case to avoid division by zero - return 2 * (precision * recall) / (precision + recall) - - -def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool): - from .evaluate_factory import get_evaluator_from_gold_answer - - recall = [] - for gold_key, gold_value in gold.items(): - pred_value = pred.get(gold_key) - gold_value = fix_number(gold_value) - pred_value = fix_number(pred_value) - if gold_key not in pred: - recall.append(0) - else: - evaluator = ( - get_evaluator_from_gold_answer(type(gold_value)) - if use_gold_for_eval - else get_evaluator_from_gold_answer(type(pred_value)) - ) - if type(pred_value) != type(gold_value): - recall.append(0) - continue - recall.append(evaluator(pred_value, gold_value)) - avg_recall = np.average(recall) - return avg_recall - - -def fix_number(number): - if type(number) == str: - copy_ans = number - copy_ans = " ".join( - " ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft") - ).strip() - copy_ans = copy_ans.strip() - copy_ans = copy_ans.replace(",", ".") - try: - return float(copy_ans) - except: - return number - elif type(number) == int: - return float(number) - else: - return number - - -def evaluate_pair_of_dicts(pred: Dict, gold: Dict): - recall = calc_recall(pred, gold, True) - precision = calc_recall(gold, pred, False) - f1 = calculate_f1_score(precision, recall) - return f1 - - -def evaluate_dicts(pred: List[Dict], gold: List[Dict]): - if not ( - type(pred) == dict - or len(pred) == 0 - or (type(pred) == list and type(pred[0]) == dict) - ): - return 0 - max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts) - return np.average(max_alignment_scores) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py deleted file mode 100644 index 6a63c0a26eeb..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py +++ /dev/null @@ -1,30 +0,0 @@ -#From AssistantBench modified slightly. - -from typing import Union, Dict - -from .evaluate_dicts import evaluate_dicts -from .evaluate_numbers import evaluate_numbers -from .evaluate_strings import evaluate_strings - -EvaluatorFactory = { - "string": evaluate_strings, - "number": evaluate_numbers, - "json": evaluate_dicts, - "string list": evaluate_strings, -} - -EvaluatorFactoryFromType = { - str: evaluate_strings, - int: evaluate_numbers, - float: evaluate_numbers, - bool: evaluate_strings, - list: evaluate_strings, -} - - -def get_evaluator(evaluator: str): - return EvaluatorFactory[evaluator] - - -def get_evaluator_from_gold_answer(gold_answer: Union[str, int, float]): - return EvaluatorFactoryFromType[gold_answer] diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py deleted file mode 100644 index 74a51b512653..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py +++ /dev/null @@ -1,35 +0,0 @@ -#From AssistantBench modified slightly. 
- -from typing import Union -import numpy as np - - -# Renamed calc_z function to distance_function_log -def distance_function_log(pred: float, gold: float): - if pred == gold == 0: - return 1 - if pred == 0: - pred = 1e-4 - if gold == 0: - gold = 1e-4 - if pred > gold: - return max(0, 1 - np.log(pred / gold)) - else: - return max(0, 1 - np.log(gold / pred)) - - -def evaluate_numbers(pred: Union[float, str], gold: float): - res = None - if type(pred) != float and type(pred) != int: - try: - pred = float(pred) - except ValueError: - res = 0 - if type(gold) != float and type(gold) != int: - try: - gold = float(gold) - except ValueError: - res = 0 - if res is None: - res = distance_function_log(pred, gold) - return res diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py deleted file mode 100644 index 301eff3b7764..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -From AssistantBench modified slightly. -Evaluation for two strings or list of strings. - -Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py -""" - -from collections import defaultdict -from typing import List, Set, Tuple, Union -import string -import re -import numpy as np -from scipy.optimize import linear_sum_assignment - - -# From here through _normalize_answer was originally copied from: -# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -# Then cleaned up and modified a bit. -def _remove_articles(text: str) -> str: - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - return re.sub(regex, " ", text) - - -def _white_space_fix(text: str) -> str: - return " ".join(text.split()) - - -EXCLUDE = set(string.punctuation) - - -def _remove_punc(text: str) -> str: - if not _is_number(text): - return "".join(ch for ch in text if ch not in EXCLUDE) - else: - return text - - -def _lower(text: str) -> str: - return text.lower() - - -def _tokenize(text: str) -> List[str]: - return re.split(" |-", text) - - -def _normalize_answer(text: str) -> str: - """Lower text and remove punctuation, articles and extra whitespace.""" - - parts = [ - _white_space_fix( - _remove_articles(_normalize_number(_remove_punc(_lower(token)))) - ) - for token in _tokenize(text) - ] - parts = [part for part in parts if part.strip()] - normalized = " ".join(parts).strip() - return normalized - - -def _is_number(text: str) -> bool: - try: - float(text) - return True - except ValueError: - return False - - -def _normalize_number(text: str) -> str: - if _is_number(text): - return str(float(text)) - else: - return text - - -def _answer_to_bags( - answer: Union[str, List[str], Tuple[str, ...]], -) -> Tuple[List[str], List[Set[str]]]: - if isinstance(answer, (list, tuple)): - raw_spans = answer - else: - raw_spans = [answer] - normalized_spans: List[str] = [] - token_bags = [] - for raw_span in raw_spans: - normalized_span = _normalize_answer(raw_span) - normalized_spans.append(normalized_span) - token_bags.append(set(normalized_span.split())) - return normalized_spans, token_bags - - -def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: - """ - Takes gold and predicted answer sets and first finds the optimal 1-1 alignment - between them and gets maximum metric values over 
all the answers. - """ - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) - - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores - - -def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: - intersection = len(gold_bag.intersection(predicted_bag)) - if not predicted_bag: - precision = 1.0 - else: - precision = intersection / float(len(predicted_bag)) - if not gold_bag: - recall = 1.0 - else: - recall = intersection / float(len(gold_bag)) - f1 = ( - (2 * precision * recall) / (precision + recall) - if not (precision == 0.0 and recall == 0.0) - else 0.0 - ) - return f1 - - -def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool: - gold_numbers = set() - predicted_numbers = set() - for word in gold_bag: - if _is_number(word): - gold_numbers.add(word) - for word in predicted_bag: - if _is_number(word): - predicted_numbers.add(word) - if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): - return True - return False - - -def get_metrics( - predicted: Union[str, List[str], Tuple[str, ...]], - gold: Union[str, List[str], Tuple[str, ...]], -) -> Tuple[float, float]: - """ - Takes a predicted answer and a gold answer (that are both either a string or a list of - strings), and returns exact match and the DROP F1 metric for the prediction. If you are - writing a script for evaluating objects in memory (say, the output of predictions during - validation, or while training), this is the function you want to call, after using - :func:`answer_json_to_strings` when reading the gold answer from the released data file. - """ - predicted_bags = _answer_to_bags(predicted) - gold_bags = _answer_to_bags(gold) - - if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len( - gold_bags[0] - ): - exact_match = 1.0 - else: - exact_match = 0.0 - - f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) - f1 = np.mean(f1_per_bag) - f1 = round(f1, 2) - return exact_match, f1 - - -def evaluate_strings(prediction, gold): - if type(prediction) != list and type(prediction) != str: - prediction = str(prediction) - if type(gold) != list and type(gold) != str: - gold = str(gold) - try: - predicted_bags = _answer_to_bags(prediction) - gold_bags = _answer_to_bags(gold) - f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) - f1 = np.mean(f1_per_bag) - except Exception: - f1 = 0.0 - return f1 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md deleted file mode 100644 index 733706ff4eeb..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md +++ /dev/null @@ -1 +0,0 @@ -These files were obtained from the creators of the AssistantBench benchmark and modified slightly. 
You can find the latest version at [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py deleted file mode 100644 index ea55f392a55a..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import List, Set, Tuple, Union, Callable -import numpy as np -from scipy.optimize import linear_sum_assignment - - -def _align_bags( - predicted: List[Set[str]], - gold: List[Set[str]], - method: Callable[[object, object], float], -) -> List[float]: - """ - Takes gold and predicted answer sets and first finds the optimal 1-1 alignment - between them and gets maximum metric values over all the answers. - """ - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - scores[gold_index, pred_index] = method(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) - - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py deleted file mode 100644 index 752739cb6d9f..000000000000 --- a/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py +++ /dev/null @@ -1,93 +0,0 @@ -import json -import os -import re -import sys - -from huggingface_hub import snapshot_download - -SCRIPT_PATH = os.path.realpath(__file__) -SCRIPT_NAME = os.path.basename(SCRIPT_PATH) -SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) - -SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) -TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") -TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") -DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") -REPO_DIR = os.path.join(DOWNLOADS_DIR, "AssistantBench") - - -def download_assistantbench(): - """Download the AssistantBench benchmark from Hugging Face.""" - - if not os.path.isdir(DOWNLOADS_DIR): - os.mkdir(DOWNLOADS_DIR) - - """Download the AssistantBench dataset from Hugging Face Hub""" - snapshot_download( - repo_id="AssistantBench/AssistantBench", - repo_type="dataset", - local_dir=REPO_DIR, - local_dir_use_symlinks=True, - ) - - -def create_jsonl(data_file_path, file_name, template): - """Creates a JSONL scenario file with a given name, and template path.""" - tasks = [] - with open(data_file_path) as fh: - for line in fh: - data = json.loads(line) - tasks.append(data) - file_name = os.path.basename(file_name) - if not os.path.isdir(TASKS_DIR): - os.mkdir(TASKS_DIR) - - with open(os.path.join(TASKS_DIR, file_name), "wt") as fh: - for task in tasks: - if "answer" not in task or task["answer"] is None: - task["answer"] = "" - print(f"Converting: [{file_name}] {task['id']}") - template_cp_list = [template] - record = { - "id": task["id"], - "template": template_cp_list, - "substitutions": { - "scenario.py": { - "__FILE_NAME__": "", - }, - "expected_answer.txt": {"__EXPECTED_ANSWER__": task["answer"]}, - "prompt.txt": {"__PROMPT__": task["task"]}, - }, - "difficulty": task["difficulty"], - "explanation": 
task["explanation"], - "metadata": task["metadata"], - "gold_url": task["gold_url"], - "set": task["set"], - } - fh.write(json.dumps(record).strip() + "\n") - - -############################################################################### -def main(): - ab_validation_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_dev.jsonl") - ab_test_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_test.jsonl") - - if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files): - download_assistantbench() - - if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files): - sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the AssistantBench repository.") - - templates = {} - for entry in os.scandir(TEMPLATES_DIR): - if entry.is_dir(): - templates[re.sub(r"\s", "", entry.name)] = entry.path - print(templates) - # make a copy of the data in the Tasks directory - for t in templates.items(): - create_jsonl(ab_validation_files, f"assistant_bench_v1.0_dev__{t[0]}.jsonl", t[1]) - create_jsonl(ab_test_files, f"assistant_bench_v1.0_test__{t[0]}.jsonl", t[1]) - - -if __name__ == "__main__" and __package__ is None: - main() diff --git a/python/packages/agbench/benchmarks/GAIA/ENV.json.sample b/python/packages/agbench/benchmarks/GAIA/ENV.json.sample deleted file mode 100644 index 1f2c4915e3c7..000000000000 --- a/python/packages/agbench/benchmarks/GAIA/ENV.json.sample +++ /dev/null @@ -1,5 +0,0 @@ -{ - "BING_API_KEY": "YOUR_KEY_KEY", - "HOMEPAGE": "https://www.bing.com/", - "WEB_SURFER_DEBUG_DIR": "/autogen/debug" -} diff --git a/python/packages/agbench/benchmarks/GAIA/ENV.yaml b/python/packages/agbench/benchmarks/GAIA/ENV.yaml new file mode 100644 index 000000000000..e2778d51e07d --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/ENV.yaml @@ -0,0 +1,30 @@ +# ENV.yaml +# +# This file specifies environment variables to be passed to the Docker task +# instances or virtual environments. These values are ephemeral, and are +# discarded when the task concludes. This is useful for passing API keys, etc. +# since they will not be saved in logs or to any task output. +# +# String values can reference environment variable on the host machine. +# For example: +# +# OPENAI_API_KEY: ${OPENAI_API_KEY} +# +# Will copy the host's OPENAI_API_KEY environment variable to the corresponding +# variable in the task environment. +# +# Complex values will be converte to JSON, and then passed as a string to the +# task environment. 
For example: +# +# MODEL_CONFIG: +# provider: autogen_ext.models.openai.OpenAIChatCompletionClient +# config: +# model: gpt-4o +# +# Will be converted to: +# +# MODEL_CONFIG: >- +# {"provider": "autogen_ext.models.openai.OpenAIChatCompletionClient", "config": {"model": "gpt-4o"}} +# + +OPENAI_API_KEY: ${OPENAI_API_KEY} diff --git a/python/packages/agbench/benchmarks/GAIA/README.md b/python/packages/agbench/benchmarks/GAIA/README.md index 753d8e4ed51a..ef98a24e4b4e 100644 --- a/python/packages/agbench/benchmarks/GAIA/README.md +++ b/python/packages/agbench/benchmarks/GAIA/README.md @@ -10,31 +10,7 @@ Navigate to GAIA cd benchmarks/GAIA ``` -Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) - -```json -{ - "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", - "HOMEPAGE": "https://www.bing.com/", - "WEB_SURFER_DEBUG_DIR": "/autogen/debug", - "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", - "CHAT_COMPLETION_PROVIDER": "azure" -} -``` - -You can also use the openai client by replacing the last two entries in the ENV file by: - -- `CHAT_COMPLETION_PROVIDER='openai'` -- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: - -```json -{ - "api_key": "REPLACE_WITH_YOUR_API", - "model": "gpt-4o-2024-05-13" -} -``` - -You might need to add additional packages to the requirements.txt file inside the Templates/MagenticOne folder. +Update `config.yaml` to point to your model host, as appropriate. The default configuration points to 'gpt-4o'. Now initialize the tasks. diff --git a/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py index ec51863e9c7b..1b23ee219f7f 100644 --- a/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py +++ b/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py @@ -6,12 +6,14 @@ import pandas as pd import sqlite3 import glob +import string +import warnings import numpy as np EXCLUDE_DIR_NAMES = ["__pycache__"] -def normalize_answer(a): +def in_house_normalize_answer(a): # Lower case # Trim (left and right) # standardize comma separated values @@ -22,6 +24,106 @@ def normalize_answer(a): return norm_answer +def in_house_question_scorer( + model_answer: str, + ground_truth: str, +) -> bool: + n_ma = in_house_normalize_answer(model_answer) + n_gt = in_house_normalize_answer(ground_truth) + return (n_gt != "" and n_gt == n_ma) + + +def gaia_question_scorer( + model_answer: str, + ground_truth: str, +) -> bool: + #FROM: https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py + + def normalize_number_str(number_str: str) -> float: + # we replace these common units and commas to allow + # conversion to float + for char in ["$", "%", ","]: + number_str = number_str.replace(char, "") + try: + return float(number_str) + except ValueError: + print(f"String {number_str} cannot be normalized to number str.") + return float("inf") + + def split_string(s: str, char_list: list[str] = [",", ";"],) -> list[str]: + pattern = f"[{''.join(char_list)}]" + return re.split(pattern, s) + + def normalize_str(input_str, remove_punct=True) -> str: + """ + Normalize a string by: + - Removing all white spaces + - Optionally removing punctuation (if remove_punct is True) + - Converting 
to lowercase + Parameters: + - input_str: str, the string to normalize + - remove_punct: bool, whether to remove punctuation (default: True) + Returns: + - str, the normalized string + """ + # Remove all white spaces. Required e.g for seagull vs. sea gull + no_spaces = re.sub(r"\s", "", input_str) + + # Remove punctuation, if specified. + if remove_punct: + translator = str.maketrans("", "", string.punctuation) + return no_spaces.lower().translate(translator) + else: + return no_spaces.lower() + + + def is_float(element: any) -> bool: + try: + float(element) + return True + except ValueError: + return False + + # if gt is a number + if is_float(ground_truth): + normalized_answer = normalize_number_str(model_answer) + return normalized_answer == float(ground_truth) + + # if gt is a list + elif any(char in ground_truth for char in [",", ";"]): + # question with the fish: normalization removes punct + + gt_elems = split_string(ground_truth) + ma_elems = split_string(model_answer) + + # check length is the same + if len(gt_elems) != len(ma_elems): + #warnings.warn( + # "Answer lists have different lengths, returning False.", UserWarning + #) + return False + + # compare each element as float or str + comparisons = [] + for ma_elem, gt_elem in zip(ma_elems, gt_elems): + if is_float(gt_elem): + normalized_ma_elem = normalize_number_str(ma_elem) + comparisons.append(normalized_ma_elem == float(gt_elem)) + else: + # we do not remove punct since comparisons can include punct + comparisons.append( + normalize_str(ma_elem, remove_punct=False) + == normalize_str(gt_elem, remove_punct=False) + ) + return all(comparisons) + + # if gt is a str + else: + return normalize_str(model_answer) == normalize_str(ground_truth) + + +############## + def scorer(instance_dir): # Read the expected answer expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") @@ -51,147 +153,12 @@ def scorer(instance_dir): return None # Return true if they are equal after normalization - n_ex = normalize_answer(expected_answer) - n_final = normalize_answer(final_answer) - return ( - (n_ex != "" and n_ex == n_final), - n_ex, - n_final - ) - - -def get_number_of_chat_messages(chat_messages_dir): - result = 0 - for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): - with open(file, "r") as f: - content = json.load(f) - for agent, messages in content.items(): - result += len(messages) - return result + # return in_house_question_scorer(final_answer, expected_answer) + return gaia_question_scorer(final_answer, expected_answer) def main(args): - parsed_args, all_results = default_tabulate(args, scorer=scorer) - excel_path = parsed_args.excel - - if excel_path: - excel_dir = os.path.dirname(excel_path) or "." 
- if not os.path.exists(excel_dir): - os.makedirs(excel_dir, exist_ok=True) - - if not excel_path.endswith((".xlsx", ".xls")): - excel_path += ".xlsx" - - runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/" - - if os.path.isdir(runlogs): - task_ids = sorted( - [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES], - key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)), - ) - else: - raise ValueError("please input a valid directory to tabulate result") - - trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else [] - dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] for trial in trials] - - query = """ - SELECT cost, session_id, response, start_time, end_time - FROM ( - SELECT invocation_id, cost, session_id, response, start_time, end_time, - ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn - FROM chat_completions - ) - WHERE rn = 1; - """ - - with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: - for trial_index, each_trial in enumerate(dbnames): - result_df = pd.DataFrame( - columns=[ - "id", - "status", - "expected_answer", - "final_answer", - "cost", - "latency", - "num_of_llm_requests", - "num_of_chat_messages", - "prompt_tokens", - "completion_tokens", - "total_tokens", - "model", - ] - ) - - result_df_type_mapping = { - "id": str, - "status": bool, - "expected_answer": str, - "final_answer": str, - "cost": float, - "latency": float, - "num_of_llm_requests": int, - "num_of_chat_messages": int, - "prompt_tokens": int, - "completion_tokens": int, - "total_tokens": int, - } - - for dbname, scorer_results in zip(each_trial, all_results): - task_id = scorer_results[0] - scorer_result = scorer_results[trial_index + 1] - - status, expected_answer, final_answer = scorer_result if scorer_result else (False,"","") - - con = sqlite3.connect(dbname) - - # TODO: if large amount of data, add chunksize - telemetry_df = pd.read_sql_query(query, con) - - earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min() - latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max() - - num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname)) - result = { - "id": task_id, - "status": status, - "expected_answer": expected_answer, - "final_answer": final_answer, - "cost": telemetry_df["cost"].sum(), - "latency": (latest_endtime - earliest_starttime).total_seconds(), - "num_of_llm_requests": len(telemetry_df), - "num_of_chat_messages": num_of_chat_messages, - "prompt_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["prompt_tokens"] - if "usage" in json.loads(x) and "prompt_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "completion_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["completion_tokens"] - if "usage" in json.loads(x) and "completion_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "total_tokens": telemetry_df["response"] - .apply( - lambda x: json.loads(x)["usage"]["total_tokens"] - if "usage" in json.loads(x) and "total_tokens" in json.loads(x)["usage"] - else 0 - ) - .sum(), - "model": telemetry_df["response"] - .apply(lambda x: json.loads(x)["model"] if "model" in json.loads(x) else "") - .unique(), - } - - result_df = result_df.astype(result_df_type_mapping) - result_df = pd.concat([result_df, 
pd.DataFrame([result])], ignore_index=True) - result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False) - + default_tabulate(args, scorer=scorer) if __name__ == "__main__" and __package__ is None: main(sys.argv) diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt new file mode 100644 index 000000000000..8153c2bf8242 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt new file mode 100644 index 000000000000..3db8bfa55857 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt @@ -0,0 +1,5 @@ +tiktoken +pyyaml +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-ext[openai,magentic-one] +/autogen_python/packages/autogen-agentchat diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py new file mode 100644 index 000000000000..7f43c111e29a --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py @@ -0,0 +1,89 @@ +import asyncio +import os +import yaml +import warnings +from autogen_ext.agents.magentic_one import MagenticOneCoderAgent +from autogen_agentchat.teams import MagenticOneGroupChat +from autogen_agentchat.ui import Console +from autogen_core.models import ModelFamily +from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor +from autogen_agentchat.conditions import TextMentionTermination +from autogen_core.models import ChatCompletionClient +from autogen_ext.agents.web_surfer import MultimodalWebSurfer +from autogen_ext.agents.file_surfer import FileSurfer +from autogen_agentchat.agents import CodeExecutorAgent +from autogen_agentchat.messages import TextMessage + +# Suppress warnings about the requests.Session() not being closed +warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + +async def main() -> None: + + # Load model configuration and create the model client. 
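+    # config.yaml is assumed to define one ChatCompletionClient component for each
+    # of the roles read below (orchestrator_client, coder_client, web_surfer_client,
+    # and file_surfer_client), using the provider/config format illustrated in
+    # ENV.yaml, for example:
+    #
+    #   orchestrator_client:
+    #     provider: autogen_ext.models.openai.OpenAIChatCompletionClient
+    #     config:
+    #       model: gpt-4o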
+ with open("config.yaml", "r") as f: + config = yaml.safe_load(f) + + orchestrator_client = ChatCompletionClient.load_component(config["orchestrator_client"]) + coder_client = ChatCompletionClient.load_component(config["coder_client"]) + web_surfer_client = ChatCompletionClient.load_component(config["web_surfer_client"]) + file_surfer_client = ChatCompletionClient.load_component(config["file_surfer_client"]) + + # Read the prompt + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read().strip() + filename = "__FILE_NAME__".strip() + + # Set up the team + coder = MagenticOneCoderAgent( + "Assistant", + model_client = coder_client, + ) + + executor = CodeExecutorAgent("ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()) + + file_surfer = FileSurfer( + name="FileSurfer", + model_client = file_surfer_client, + ) + + web_surfer = MultimodalWebSurfer( + name="WebSurfer", + model_client = web_surfer_client, + downloads_folder=os.getcwd(), + debug_dir="logs", + to_save_screenshots=True, + ) + + team = MagenticOneGroupChat( + [coder, executor, file_surfer, web_surfer], + model_client=orchestrator_client, + max_turns=20, + final_answer_prompt= f""", +We have completed the following task: + +{prompt} + +The above messages contain the conversation that took place to complete the task. +Read the above conversation and output a FINAL ANSWER to the question. +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +""".strip() + ) + + # Prepare the prompt + filename_prompt = "" + if len(filename) > 0: + filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory." 
+ task = f"{prompt}\n\n{filename_prompt}" + + # Run the task + stream = team.run_stream(task=task.strip()) + await Console(stream) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/expected_answer.txt b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/expected_answer.txt new file mode 100644 index 000000000000..8153c2bf8242 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/prompt.txt b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/requirements.txt b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/requirements.txt new file mode 100644 index 000000000000..3db8bfa55857 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/requirements.txt @@ -0,0 +1,5 @@ +tiktoken +pyyaml +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-ext[openai,magentic-one] +/autogen_python/packages/autogen-agentchat diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/scenario.py b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/scenario.py new file mode 100644 index 000000000000..e2e1d8fae009 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/SelectorGroupChat/scenario.py @@ -0,0 +1,176 @@ +import asyncio +import os +import yaml +import warnings +from typing import Sequence +from autogen_ext.agents.magentic_one import MagenticOneCoderAgent +from autogen_agentchat.teams import SelectorGroupChat +from autogen_agentchat.conditions import MaxMessageTermination +from autogen_agentchat.ui import Console +from autogen_agentchat.utils import content_to_str +from autogen_core.models import ModelFamily +from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor +from autogen_agentchat.conditions import TextMentionTermination +from autogen_agentchat.base import TerminationCondition, TerminatedException +from autogen_core.models import ChatCompletionClient +from autogen_ext.agents.web_surfer import MultimodalWebSurfer +from autogen_ext.agents.file_surfer import FileSurfer +from autogen_agentchat.agents import CodeExecutorAgent +from autogen_agentchat.messages import TextMessage, AgentEvent, ChatMessage, HandoffMessage, MultiModalMessage, StopMessage +from autogen_core.models import LLMMessage, UserMessage, AssistantMessage + +# Suppress warnings about the requests.Session() not being closed +warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + +async def main() -> None: + + # Load model configuration and create the model client. 
+ with open("config.yaml", "r") as f: + config = yaml.safe_load(f) + + orchestrator_client = ChatCompletionClient.load_component(config["orchestrator_client"]) + coder_client = ChatCompletionClient.load_component(config["coder_client"]) + web_surfer_client = ChatCompletionClient.load_component(config["web_surfer_client"]) + file_surfer_client = ChatCompletionClient.load_component(config["file_surfer_client"]) + + # Read the prompt + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read().strip() + filename = "__FILE_NAME__".strip() + + # Set up the team + coder = MagenticOneCoderAgent( + "Assistant", + model_client = coder_client, + ) + + executor = CodeExecutorAgent("ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()) + + file_surfer = FileSurfer( + name="FileSurfer", + model_client = file_surfer_client, + ) + + web_surfer = MultimodalWebSurfer( + name="WebSurfer", + model_client = web_surfer_client, + downloads_folder=os.getcwd(), + debug_dir="logs", + to_save_screenshots=True, + ) + + # Prepare the prompt + filename_prompt = "" + if len(filename) > 0: + filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory." + task = f"{prompt}\n\n{filename_prompt}" + + # Termination conditions + max_messages_termination = MaxMessageTermination(max_messages=20) + llm_termination = LLMTermination( + prompt=f"""Consider the following task: +{task.strip()} + +Does the above conversation suggest that the task has been solved? +If so, reply "TERMINATE", otherwise reply "CONTINUE" +""", + model_client=orchestrator_client + ) + + termination = max_messages_termination | llm_termination + + # Create the team + team = SelectorGroupChat( + [coder, executor, file_surfer, web_surfer], + model_client=orchestrator_client, + termination_condition=termination, + ) + + # Run the task + stream = team.run_stream(task=task.strip()) + result = await Console(stream) + + # Do one more inference to format the results + final_context: Sequence[LLMMessage] = [] + for message in result.messages: + if isinstance(message, TextMessage): + final_context.append(UserMessage(content=message.content, source=message.source)) + elif isinstance(message, MultiModalMessage): + if orchestrator_client.model_info["vision"]: + final_context.append(UserMessage(content=message.content, source=message.source)) + else: + final_context.append(UserMessage(content=content_to_str(message.content), source=message.source)) + final_context.append(UserMessage( + content=f"""We have completed the following task: +{prompt} + +The above messages contain the conversation that took place to complete the task. +Read the above conversation and output a FINAL ANSWER to the question. +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. 
+If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+""".strip(),
+        source="user"))
+
+    # Call the model to evaluate
+    response = await orchestrator_client.create(final_context)
+    print(response.content, flush=True)
+
+
+class LLMTermination(TerminationCondition):
+    """Terminate the conversation if an LLM determines the task is complete.
+
+    Args:
+        prompt: The prompt to evaluate in the LLM
+        model_client: The LLM model_client to use
+        termination_phrase: The phrase to look for in the LLM output to trigger termination
+    """
+
+    def __init__(self, prompt: str, model_client: ChatCompletionClient, termination_phrase: str = "TERMINATE") -> None:
+        self._prompt = prompt
+        self._model_client = model_client
+        self._termination_phrase = termination_phrase
+        self._terminated = False
+        self._context: Sequence[LLMMessage] = []
+
+    @property
+    def terminated(self) -> bool:
+        return self._terminated
+
+    async def __call__(self, messages: Sequence[AgentEvent | ChatMessage]) -> StopMessage | None:
+        if self._terminated:
+            raise TerminatedException("Termination condition has already been reached")
+
+        # Build the context
+        for message in messages:
+            if isinstance(message, TextMessage):
+                self._context.append(UserMessage(content=message.content, source=message.source))
+            elif isinstance(message, MultiModalMessage):
+                if self._model_client.model_info["vision"]:
+                    self._context.append(UserMessage(content=message.content, source=message.source))
+                else:
+                    self._context.append(UserMessage(content=content_to_str(message.content), source=message.source))
+
+        if len(self._context) == 0:
+            return None
+
+        # Call the model to evaluate
+        response = await self._model_client.create(self._context + [UserMessage(content=self._prompt, source="user")])
+
+        # Check the model's reply (not the last chat message) for the termination phrase
+        if isinstance(response.content, str) and self._termination_phrase in response.content:
+            self._terminated = True
+            return StopMessage(content=response.content, source="LLMTermination")
+        return None
+
+    async def reset(self) -> None:
+        self._terminated = False
+        self._context = []
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/packages/agbench/benchmarks/GAIA/config.yaml b/python/packages/agbench/benchmarks/GAIA/config.yaml
new file mode 100644
index 000000000000..a13c1b1a4598
--- /dev/null
+++ b/python/packages/agbench/benchmarks/GAIA/config.yaml
@@ -0,0 +1,38 @@
+# config.yaml
+#
+# The contents of this file will be copied into the 'config.yaml' file of
+# every expanded Task, just prior to running the scenario. This provides a
+# good place to store model or other configurations important for the scenario.
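# Illustrative sketch only (not part of the change above): an Azure-hosted model could
# presumably be wired in the same anchored way by swapping the provider for
# AzureOpenAIChatCompletionClient from autogen-ext; the field names below are assumptions
# and should be verified against the installed autogen-ext version.
#
#model_config: &client
#  provider: autogen_ext.models.openai.AzureOpenAIChatCompletionClient
#  config:
#    model: gpt-4o
#    azure_deployment: YOUR_DEPLOYMENT_NAME
#    azure_endpoint: https://YOUR_RESOURCE.openai.azure.com/
#    api_version: "2024-06-01"
#    api_key: YOUR_API_KEY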
+
+###############################
+# Open AI model configuration #
+###############################
+model_config: &client
+  provider: autogen_ext.models.openai.OpenAIChatCompletionClient
+  config:
+    model: gpt-4o
+
+
+##############################
+# Ollama model configuration #
+##############################
+#model_config: &client
+#  provider: autogen_ext.models.openai.OpenAIChatCompletionClient
+#  config:
+#    model: deepseek-r1:7b
+#    base_url: http://localhost:11434/v1/
+#    api_key: ollama
+#    model_info:
+#      function_calling: false
+#      json_output: false
+#      vision: false
+#      family: r1
+#
+
+#######################
+# Used by MagenticOne #
+#######################
+orchestrator_client: *client
+coder_client: *client
+web_surfer_client: *client
+file_surfer_client: *client
diff --git a/python/packages/agbench/benchmarks/HumanEval/ENV.yaml b/python/packages/agbench/benchmarks/HumanEval/ENV.yaml
new file mode 100644
index 000000000000..e2778d51e07d
--- /dev/null
+++ b/python/packages/agbench/benchmarks/HumanEval/ENV.yaml
@@ -0,0 +1,30 @@
+# ENV.yaml
+#
+# This file specifies environment variables to be passed to the Docker task
+# instances or virtual environments. These values are ephemeral, and are
+# discarded when the task concludes. This is useful for passing API keys, etc.,
+# since they will not be saved in logs or to any task output.
+#
+# String values can reference environment variables on the host machine.
+# For example:
+#
+# OPENAI_API_KEY: ${OPENAI_API_KEY}
+#
+# Will copy the host's OPENAI_API_KEY environment variable to the corresponding
+# variable in the task environment.
+#
+# Complex values will be converted to JSON, and then passed as a string to the
+# task environment. For example:
+#
+# MODEL_CONFIG:
+#   provider: autogen_ext.models.openai.OpenAIChatCompletionClient
+#   config:
+#     model: gpt-4o
+#
+# Will be converted to:
+#
+# MODEL_CONFIG: >-
+#   {"provider": "autogen_ext.models.openai.OpenAIChatCompletionClient", "config": {"model": "gpt-4o"}}
+#
+
+OPENAI_API_KEY: ${OPENAI_API_KEY}
diff --git a/python/packages/agbench/benchmarks/HumanEval/README.md b/python/packages/agbench/benchmarks/HumanEval/README.md
index 25acc2630523..0c045af4cc1a 100644
--- a/python/packages/agbench/benchmarks/HumanEval/README.md
+++ b/python/packages/agbench/benchmarks/HumanEval/README.md
@@ -15,26 +15,8 @@ Navigate to HumanEval
 cd benchmarks/HumanEval
 ```
 
-Create a file called ENV.json with the following (required) contents (If you're using MagenticOne)
+Update `config.yaml` to point to your model host, as appropriate. The default configuration points to 'gpt-4o'.
 
-```json
-{
-    "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}",
-    "CHAT_COMPLETION_PROVIDER": "azure"
-}
-```
-
-You can also use the openai client by replacing the last two entries in the ENV file by:
-
-- `CHAT_COMPLETION_PROVIDER='openai'`
-- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure:
-
-```json
-{
-    "api_key": "REPLACE_WITH_YOUR_API",
-    "model": "gpt-4o-2024-05-13"
-}
-```
 
 Now initialize the tasks.
@@ -51,13 +33,13 @@ Once the script completes, you should now see a folder in your current directory Now to run a specific subset of HumanEval use: ```bash -agbench run Tasks/human_eval_MagenticOne.jsonl +agbench run Tasks/human_eval_AgentChat.jsonl ``` You should see the command line print the raw logs that shows the agents in action To see a summary of the results (e.g., task completion rates), in a new terminal run the following: ```bash -agbench tabulate Results/human_eval_MagenticOne +agbench tabulate Results/human_eval_AgentChat ``` diff --git a/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py index df4e6b194841..2dc7d4f0fb7b 100644 --- a/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py +++ b/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py @@ -93,9 +93,9 @@ def create_jsonl(name, tasks, template): "id": task["task_id"].replace("/", "_"), "template": template, "substitutions": { - "scenario.py": {"__ENTRY_POINT__": task["entry_point"]}, "prompt.txt": {"__PROMPT__": task["prompt"]}, - "unit_tests.py": {"__TEST__": task["test"]}, + "test.txt": {"__TEST__": task["test"]}, + "custom_code_executor.py": {"__ENTRY_POINT__": task["entry_point"]}, }, } diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/custom_code_executor.py b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/custom_code_executor.py new file mode 100644 index 000000000000..5d9893e057d0 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/custom_code_executor.py @@ -0,0 +1,54 @@ +import re +from typing import List, Sequence + +from autogen_core.code_executor import CodeBlock, CodeExecutor +from autogen_agentchat.agents import CodeExecutorAgent + + +class CustomCodeExecutorAgent(CodeExecutorAgent): + + def __init__( + self, + name: str, + code_executor: CodeExecutor, + *, + description: str = "A computer terminal that performs no other action than running Python scripts (provided to it quoted in ```python code blocks), or sh shell scripts (provided to it quoted in ```sh code blocks).", + sources: Sequence[str] | None = None, + ) -> None: + super().__init__(name=name, description=description, code_executor=code_executor, sources=sources) + self._test_code = "" + with open("test.txt", "rt") as fh: + self._test_code = fh.read() + + + def _extract_markdown_code_blocks(self, markdown_text: str) -> List[CodeBlock]: + code_blocks = super()._extract_markdown_code_blocks(markdown_text) + new_blocks: List[CodeBlock] = [] + for block in code_blocks: + + # Handle deepseek + code_content = block.code + #m = re.search(r"^\s*\s*(.*?)\s*\s*(.*?)\s*$", code_content, re.DOTALL) + #if m: + # code_content = m.group(2) + + # If python, wrap the extracted code in a unit testing harness + if block.language and block.language.lower() == "python": + code_content = self._test_code + """ + +def run_tests(candidate): + try: + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#") + print("TERMINATE") + except AssertionError: + print("SOME TESTS FAILED - TRY AGAIN !#!#") + +""" + code_content + """ + +run_tests(__ENTRY_POINT__) +""" + new_blocks.append(CodeBlock(code=code_content, language=block.language)) + + return new_blocks diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/prompt.txt b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/prompt.txt new file mode 100644 index 
000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/requirements.txt b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/requirements.txt new file mode 100644 index 000000000000..5ba1405ce6e0 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/requirements.txt @@ -0,0 +1,4 @@ +pyyaml +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-ext[openai] +/autogen_python/packages/autogen-agentchat diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py new file mode 100644 index 000000000000..96d8cd968bee --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py @@ -0,0 +1,54 @@ +import asyncio +import os +import yaml +from autogen_ext.agents.magentic_one import MagenticOneCoderAgent +from autogen_agentchat.teams import RoundRobinGroupChat +from autogen_agentchat.ui import Console +from autogen_core.models import ModelFamily +from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor +from autogen_agentchat.conditions import TextMentionTermination +from custom_code_executor import CustomCodeExecutorAgent +from autogen_core.models import ChatCompletionClient + +async def main() -> None: + + # Load model configuration and create the model client. + with open("config.yaml", "r") as f: + config = yaml.safe_load(f) + model_client = ChatCompletionClient.load_component(config["model_config"]) + + # Coder + coder_agent = MagenticOneCoderAgent( + name="coder", + model_client=model_client, + ) + + # Executor + executor = CustomCodeExecutorAgent( + name="executor", + code_executor=LocalCommandLineCodeExecutor(), + sources=["coder"], + ) + + # Termination condition + termination = TextMentionTermination(text="TERMINATE", sources=["executor"]) + + # Define a team + agent_team = RoundRobinGroupChat([coder_agent, executor], max_turns=12, termination_condition=termination) + + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read() + + task = f"""Complete the following python function. Format your output as Markdown python code block containing the entire function definition: + +```python +{prompt} +``` +""" + + # Run the team and stream messages to the console. + stream = agent_team.run_stream(task=task) + await Console(stream) + +asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/test.txt b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/test.txt new file mode 100644 index 000000000000..91318587b914 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/test.txt @@ -0,0 +1 @@ +__TEST__ diff --git a/python/packages/agbench/benchmarks/HumanEval/config.yaml b/python/packages/agbench/benchmarks/HumanEval/config.yaml new file mode 100644 index 000000000000..9e2f22819d7a --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/config.yaml @@ -0,0 +1,29 @@ +# config.yaml +# +# The contents of this file will be copied into the 'config.yaml' file of +# every expanded Task, just prior to running the scenario. This provides a +# good place to store model or other configurations important for the scenario. 
+ +############################### +# Open AI model configuration # +############################### +model_config: + provider: autogen_ext.models.openai.OpenAIChatCompletionClient + config: + model: gpt-4o + + +############################## +# Ollama model configuration # +############################## +#model_config: +# provider: autogen_ext.models.openai.OpenAIChatCompletionClient +# config: +# model: deepseek-r1:7b +# base_url: http://localhost:11434/v1/ +# api_key: ollama +# model_info: +# function_calling: false +# json_output: false +# vision: false +# family: r1 diff --git a/python/packages/agbench/benchmarks/WebArena/ENV.sample b/python/packages/agbench/benchmarks/WebArena/ENV.sample deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/packages/agbench/benchmarks/WebArena/README.md b/python/packages/agbench/benchmarks/WebArena/README.md deleted file mode 100644 index 74e17f892dad..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# WebArena Benchmark - -This scenario implements the [WebArena](https://github.com/web-arena-x/webarena/tree/main) benchmark. The evaluation code has been modified from WebArena in [evaluation_harness](Templates/Common/evaluation_harness) we retain the License from WebArena and include it here [LICENSE](Templates/Common/evaluation_harness/LICENSE). - - -## References - -Zhou, Shuyan, Frank F. Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng et al. "Webarena: A realistic web environment for building autonomous agents." arXiv preprint arXiv:2307.13854 (2023). \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py deleted file mode 100644 index 6697a08749a5..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -import sys -import re -from agbench.tabulate_cmd import default_tabulate - - -def scorer(instance_dir): - - # Read the console - console_log_file = os.path.join(instance_dir, "console_log.txt") - if not os.path.isfile(console_log_file): - return None - - console_log = "" - with open(console_log_file, "rt") as fh: - console_log = fh.read() - - final_score = None - m = re.search(r"FINAL SCORE:(.*?)\n", console_log, re.DOTALL) - if m: - final_score = m.group(1).strip() - - # Missing the final answer line - if final_score is None: - return None - else: - return float(final_score) > 0 - - -def main(args): - default_tabulate(args, scorer=scorer) - - -if __name__ == "__main__" and __package__ is None: - main(sys.argv) diff --git a/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py deleted file mode 100644 index 5ba3fd4d08f4..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py +++ /dev/null @@ -1,122 +0,0 @@ -# -# Run this file to download the human_eval dataset, and create a corresponding testbed scenario: -# (default: ../scenarios/human_eval_two_agents_gpt4.jsonl and ./scenarios/human_eval_two_agents_gpt35.jsonl) -# - -import requests -import tarfile -import hashlib -import io -import json -import os -import re -import sys - -URL = "https://raw.githubusercontent.com/web-arena-x/webarena/main/config_files/test.raw.json" - -SCRIPT_PATH = os.path.realpath(__file__) -SCRIPT_NAME = os.path.basename(SCRIPT_PATH) -SCRIPT_DIR = 
os.path.dirname(SCRIPT_PATH) - -SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) -TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") -TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") -DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") - - -def download(): - """Download the WebArena dataset (if not already downloaded). - Return a JSON list of problem instances.""" - - if not os.path.isdir(DOWNLOADS_DIR): - os.mkdir(DOWNLOADS_DIR) - - json_file = os.path.join(DOWNLOADS_DIR, "test.raw.json") - - if not os.path.isfile(json_file): - # Send a HTTP request to the URL - response = requests.get(URL, stream=True) - response.raise_for_status() - - # If the HTTP request returns a status code 200, proceed - with open(json_file, "wb") as fh: - for chunk in response.iter_content(chunk_size=512): - fh.write(chunk) - - # Load the problems - problems = None - with open(json_file, "rb") as fh: - problems = json.load(fh) - return problems - - -def create_jsonl(name, tasks, template): - """Creates a JSONL scenario file with a given name, dictionary of MATH problems, and template path.""" - - # Create a task directory if it doesn't exist - if not os.path.isdir(TASKS_DIR): - os.mkdir(TASKS_DIR) - - # Create the jsonl file - prompt_fields = ["task_id", "intent_template_id", "sites", "require_login", "start_url", "geolocation", "intent"] - with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh: - for task in tasks: - print(f"Converting: {name}, {task['task_id']}") - - task_prompt = {} - for field in prompt_fields: - task_prompt[field] = task[field] - - record = { - "id": str(task["task_id"]), - "template": [os.path.join(TEMPLATES_DIR, "Common"), template], - "substitutions": { - "task_prompt.json.txt": {"__TASK_PROMPT__": json.dumps(task_prompt, indent=4)}, - "full_task.json.txt": {"__FULL_TASK__": json.dumps(task, indent=4)}, - }, - } - - fh.write(json.dumps(record).strip() + "\n") - - -############################################################################### -def main(): - tasks = download() - - # list all directories in the Templates directory - # and populate a dictionary with the name and path - templates = {} - for entry in os.scandir(TEMPLATES_DIR): - if entry.is_dir(): - if entry.name == "Common": # Skip the common template, which will be included in all - continue - templates[re.sub(r"\s", "", entry.name)] = entry.path - - # Divide the tasks by their websites and if they are validation or test - page_groups = dict() - for task in tasks: - - # We don't know how the intent ids are distributed, so hash them to get a uniform distribution - template_hash = hashlib.md5(str(task["intent_template_id"]).encode("utf-8")).hexdigest() - - # The full hash will consist of 32 hexadecimal digits. 
We can get a 50/50 split by checking if the first digit is in the range (0-7) vs (8-F) - task_set = "validation" if template_hash[0] in "01234567" else "test" - - key = task["sites"][0] - if len(task["sites"]) > 1: - key = "several_sites" - key = task_set + "_" + key - - # key = "__".join(sorted([s for s in task["sites"]])) - if key not in page_groups: - page_groups[key] = list() - page_groups[key].append(task) - - # Create the json files - for t in templates.items(): - for pg in page_groups: - create_jsonl(f"webarena__{pg}_{t[0]}", page_groups[pg], t[1]) - - -if __name__ == "__main__" and __package__ is None: - main() diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION deleted file mode 100644 index 0713904fb45b..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION +++ /dev/null @@ -1,28 +0,0 @@ -The contents of this `evaluation_harness` folder are adapted from: - - https://github.com/web-arena-x/webarena - -under the following license: - -========================================================================================================= - -Copyright (c) 2024 Jing Yu Koh, Robert Lo, Lawrence Jang, Vikram Duvvur, Ming Chong Lim, and Po-Yu Huang - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE deleted file mode 100644 index f49a4e16e68b..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py deleted file mode 100644 index e942c1066769..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .evaluators import * -from .helper_functions import ( - shopping_get_latest_order_url, - shopping_get_sku_latest_review_author, - shopping_get_sku_latest_review_rating, -) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py deleted file mode 100644 index ed84ae4735ef..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py +++ /dev/null @@ -1,90 +0,0 @@ -# websites domain -import os - -REDDIT = os.environ.get("REDDIT", "") -SHOPPING = os.environ.get("SHOPPING", "") -SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "") -GITLAB = os.environ.get("GITLAB", "") -WIKIPEDIA = os.environ.get("WIKIPEDIA", "") -MAP = os.environ.get("MAP", "") -HOMEPAGE = os.environ.get("HOMEPAGE", "") - -REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME", "") -REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD", "") - -GITLAB_USERNAME = os.environ.get("GITLAB_USERNAME", "") -GITLAB_PASSWORD = os.environ.get("GITLAB_PASSWORD", "") - -SHOPPING_USERNAME = os.environ.get("SHOPPING_USERNAME", "") -SHOPPING_PASSWORD = os.environ.get("SHOPPING_PASSWORD", "") - -SHOPPING_ADMIN_USERNAME = os.environ.get("SHOPPING_ADMIN_USERNAME", "") -SHOPPING_ADMIN_PASSWORD = os.environ.get("SHOPPING_ADMIN_PASSWORD", "") - -assert REDDIT and SHOPPING and SHOPPING_ADMIN and GITLAB and WIKIPEDIA and MAP and HOMEPAGE, ( - "Please setup the URLs to each site. Current: \n" - + f"Reddit: {REDDIT}\n" - + f"Shopping: {SHOPPING}\n" - + f"Shopping Admin: {SHOPPING_ADMIN}\n" - + f"Gitlab: {GITLAB}\n" - + f"Wikipedia: {WIKIPEDIA}\n" - + f"Map: {MAP}\n" - + f"Homepage: {HOMEPAGE}\n" -) - -ACCOUNTS = { - "reddit": {"username": REDDIT_USERNAME, "password": REDDIT_PASSWORD}, - "gitlab": {"username": GITLAB_USERNAME, "password": GITLAB_PASSWORD}, - "shopping": {"username": SHOPPING_USERNAME, "password": SHOPPING_PASSWORD}, - "shopping_admin": {"username": SHOPPING_ADMIN_USERNAME, "password": SHOPPING_ADMIN_PASSWORD}, - "shopping_site_admin": {"username": SHOPPING_ADMIN_USERNAME, "password": SHOPPING_ADMIN_PASSWORD}, -} - -URL_MAPPINGS = { - REDDIT: "http://reddit.com", - SHOPPING: "http://onestopmarket.com", - SHOPPING_ADMIN: "http://luma.com/admin", - GITLAB: "http://gitlab.com", - WIKIPEDIA: "http://wikipedia.org", - MAP: "http://openstreetmap.org", - HOMEPAGE: "http://homepage.com", -} - -# ADDED BY MSR Frontiers -######################### -SITE_URLS = { - "reddit": REDDIT, - "gitlab": GITLAB, - "shopping": SHOPPING, - "shopping_admin": SHOPPING_ADMIN, - "shopping_site_admin": SHOPPING_ADMIN, - "map": MAP, - "wikipedia": WIKIPEDIA, -} - -LOGIN_PROMPTS = { - "reddit": f"Type '{REDDIT}' into the address bar to navigate to the site. Click 'Log in', type the username '{ACCOUNTS['reddit']['username']}', and password is '{ACCOUNTS['reddit']['password']}'. Finally click the login button.", - "gitlab": f"Type '{GITLAB}' into the address bar to navigate to the site. 
At the log in prompt, type the username '{ACCOUNTS['gitlab']['username']}', and the password '{ACCOUNTS['gitlab']['password']}'. Finally click the 'Sign in' button.", - "shopping": f"Type '{SHOPPING}' into the address bar to navigate to the site. Click 'Sign In' at the top of the page. Enter the Email '{ACCOUNTS['shopping']['username']}', and password '{ACCOUNTS['shopping']['password']}'. Finally click the 'Sign In' button.", - "shopping_admin": f"Type '{SHOPPING_ADMIN}' into the address bar to navigate to the site. At the log in prompt, enter the username '{ACCOUNTS['shopping_admin']['username']}', and the password '{ACCOUNTS['shopping_admin']['password']}'. Finally click the 'Sign In' button.", -} - -SITE_DESCRIPTIONS = { - "reddit": "a Postmill forum populated with a large sample of data crawled from Reddit. Postmill is similar to Reddit, but the UI is distinct, and 'subreddits' begin with /f/ rather than /r/", - "gitlab": "a Gitlab site populated with various programming projects. Gitlab is similar to GitHub, though the UIs are slightly different", - "shopping": "an online store built with the Magento open source eCommerce platform", - "shopping_admin": "the content management admin portal for an online store running the Magento open source eCommerce software", -} - - -def url_to_sitename(url): - if url.startswith(REDDIT): - return "reddit" - elif url.startswith(GITLAB): - return "gitlab" - elif url.startswith(SHOPPING): - return "shopping" - elif url.startswith(SHOPPING_ADMIN): - return "shopping_admin" - else: - return None diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py deleted file mode 100644 index 05c9a3bc15a1..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py +++ /dev/null @@ -1,387 +0,0 @@ -"""From WebArena. 
base class for evaluation""" - -# answer string match -import collections -import html -import importlib -import json -import time -import urllib -import inspect -from pathlib import Path -from typing import Any, Tuple, Union, TypedDict, Dict - -from beartype import beartype -from nltk.tokenize import word_tokenize # type: ignore -from playwright.async_api import CDPSession, Page - -import numpy as np -import numpy.typing as npt - -from .helper_functions import ( - PseudoPage, - gitlab_get_project_memeber_role, - llm_fuzzy_match, - llm_ua_match, - reddit_get_post_url, - shopping_get_latest_order_url, - shopping_get_sku_latest_review_author, - shopping_get_sku_latest_review_rating, -) - - -# Subset used for evaluation (added by: adamfo) -##################################################################### -class Action(TypedDict): - answer: str - - -Observation = str | npt.NDArray[np.uint8] - - -class StateInfo(TypedDict): - observation: dict[str, Observation] - info: Dict[str, Any] - - -Trajectory = list[Union[Action, StateInfo]] - - -def make_answer_trajecotry(answer: str) -> Trajectory: - ans = Action() - ans["answer"] = answer - return [ans] - - -##################################################################### -class Evaluator(object): - def __init__(self, eval_tag: str = "") -> None: - self.eval_tag = eval_tag - - @beartype - async def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | PseudoPage, - client: CDPSession, - azure_config: dict[str, Any] | None = None, - ) -> float: - raise NotImplementedError - - @staticmethod - def get_last_action(trajectory: Trajectory) -> Action: - try: - # is_bearable(trajectory[-1], Action) - last_action = trajectory[-1] - except Exception: - raise ValueError("The last element of trajectory should be an action, add a fake stop action if needed") - - return last_action # type: ignore[return-value] - - @staticmethod - def get_last_state(trajectory: Trajectory) -> StateInfo: - try: - # is_bearable(trajectory[-2], StateInfo) - last_state = trajectory[-2] - except Exception: - raise ValueError( - "The second last element of trajectory should be a state, add a fake stop action if needed" - ) - - return last_state # type: ignore[return-value] - - -class StringEvaluator(Evaluator): - """Check whether the answer is correct with: - exact match: the answer is exactly the same as the reference answer - must include: each phrase in the reference answer must be included in the answer - fuzzy match: the answer is similar to the reference answer, using LLM judge - """ - - @staticmethod - @beartype - def clean_answer(answer: str) -> str: - answer = answer.strip() - if answer.startswith("'") and answer.endswith("'"): - answer = answer[1:-1] - elif answer.startswith('"') and answer.endswith('"'): - answer = answer[1:-1] - return answer.lower() - - @staticmethod - @beartype - def exact_match(ref: str, pred: str) -> float: - return float(StringEvaluator.clean_answer(pred) == StringEvaluator.clean_answer(ref)) - - @staticmethod - @beartype - def must_include(ref: str, pred: str, tokenize: bool = False) -> float: - clean_ref = StringEvaluator.clean_answer(ref) - clean_pred = StringEvaluator.clean_answer(pred) - # tokenize the answer if the ref is a single word - # prevent false positive (e.g, 0) - if tokenize and len(clean_ref) == 1 and len(word_tokenize(clean_ref)) == 1: - tok_pred = word_tokenize(clean_pred) - return float(clean_ref in tok_pred) - else: - return float(clean_ref in clean_pred) - - @staticmethod - @beartype - def 
fuzzy_match(ref: str, pred: str, intent: str, azure_config: dict[str, Any] | None) -> float: - return llm_fuzzy_match(pred, ref, intent, azure_config) - - @staticmethod - @beartype - def ua_match(ref: str, pred: str, intent: str, azure_config: dict[str, Any] | None) -> float: - return llm_ua_match(pred, ref, intent, azure_config) - - async def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | PseudoPage | None = None, - client: CDPSession | None = None, - azure_config: dict[str, Any] | None = None, - ) -> float: - with open(config_file, "r") as f: - configs = json.load(f) - - last_action = self.get_last_action(trajectory) - pred = self.clean_answer(last_action["answer"]) - - score = 1.0 - for approach, value in configs["eval"]["reference_answers"].items(): - match approach: - case "exact_match": - score *= self.exact_match(ref=value, pred=pred) - - case "must_include": - assert isinstance(value, list) - for must_value in value: - score *= self.must_include( - ref=must_value, - pred=pred, - tokenize=(len(value) == 1), - ) - case "fuzzy_match": - intent = configs["intent"] - if value == "N/A": - # if the instruction only asks the model to generate N/A when encountering an unachievable task - # without more concrete reasons - score *= self.exact_match(ref=value, pred=pred) - # if the instruction also asks the model to generate the reason why the task is unachievable - # this should be the default as it will prevent false positive N/A` - if score != 1: - score = 1.0 * self.ua_match( - intent=configs["intent"], - ref=configs["eval"]["string_note"], - pred=pred, - azure_config=azure_config, - ) - else: - assert isinstance(value, list) - for reference in value: - score *= self.fuzzy_match( - ref=reference, pred=pred, intent=intent, azure_config=azure_config - ) - return score - - -class URLEvaluator(Evaluator): - """Check URL matching""" - - @beartype - async def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | PseudoPage, - client: CDPSession | None = None, - azure_config: dict[str, Any] | None = None, - ) -> float: - with open(config_file, "r") as f: - configs = json.load(f) - - def clean_url(url: str) -> str: - url = str(url) - url = url.rstrip("/") - return url - - def parse_url(url: str) -> tuple[str, dict[str, list[str]]]: - """Parse a URL into its base, path, and query components.""" - parsed_url = urllib.parse.urlparse(url) - base_path = parsed_url.netloc + parsed_url.path - query = urllib.parse.parse_qs(parsed_url.query) - return base_path, query - - def parse_urls( - urls: list[str], - ) -> tuple[list[str], dict[str, set[str]]]: - """Parse a list of URLs.""" - base_paths = [] - queries = collections.defaultdict(set) - for url in urls: - base_path, query = parse_url(url) - base_paths.append(base_path) - for k, v in query.items(): - queries[k].update(v) - return base_paths, queries - - pred = clean_url(page.url) - ref_urls = configs["eval"]["reference_url"].split(" |OR| ") - ref_urls = [clean_url(url) for url in ref_urls] - matching_rule = configs["eval"].get("url_note", "GOLD in PRED") - if matching_rule == "GOLD in PRED": - print(f"Pred: {pred}") - print(f"Ref: {ref_urls}") - ref_base_paths, ref_queries = parse_urls(ref_urls) - pred_base_paths, pred_query = parse_url(pred) - - base_score = float(any([ref_base_path in pred_base_paths for ref_base_path in ref_base_paths])) - query_score = 1.0 - for k, possible_values in ref_queries.items(): - query_score *= float( - any(possible_ref_value in pred_query.get(k, []) for 
possible_ref_value in possible_values) - ) - score = base_score * query_score - - else: - raise ValueError(f"Unknown matching rule: {matching_rule}") - - return score - - -class HTMLContentEvaluator(Evaluator): - """Check whether the contents appear in the page""" - - @beartype - async def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | PseudoPage, - client: CDPSession | None = None, - azure_config: dict[str, Any] | None = None, - ) -> float: - with open(config_file, "r") as f: - configs = json.load(f) - - targets = configs["eval"]["program_html"] - - score = 1.0 - for target in targets: - target_url: str = target["url"] # which url to check - if target_url.startswith("func"): - func = target_url.split("func:")[1] - func = func.replace("__last_url__", page.url) - target_url = eval(func) - if inspect.isawaitable(target_url): - target_url = await target_url - - locator: str = target["locator"] # js element locator - - # navigate to that url - if target_url != "last": - await page.goto(target_url) - time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep - - # empty, use the full page - if not locator.strip(): - selected_element = await page.content() - # use JS to select the element - elif locator.startswith("document.") or locator.startswith("[...document."): - if "prep_actions" in target: - try: - for prep_action in target["prep_actions"]: - await page.evaluate(f"() => {prep_action}") - except Exception: - pass - try: - selected_element = await page.evaluate(f"() => {locator}") - selected_element = str(selected_element) - if not selected_element: - selected_element = "" - except Exception: - # the page is wrong, return empty - selected_element = "" - # run program to call API - elif locator.startswith("func:"): # a helper function - func = locator.split("func:")[1] - func = func.replace("__page__", "page") - selected_element = eval(func) - if inspect.isawaitable(selected_element): - selected_element = await selected_element - else: - raise ValueError(f"Unknown locator: {locator}") - - selected_element = html.unescape(selected_element) - - if "exact_match" in target["required_contents"]: - required_contents = target["required_contents"]["exact_match"] - cur_score = StringEvaluator.exact_match(ref=required_contents, pred=selected_element) - score *= float(cur_score) - # print(f"[exact match] {cur_score}, selected element: {selected_element}, required contents: {required_contents}") - elif "must_include" in target["required_contents"]: - required_contents = target["required_contents"]["must_include"] - assert isinstance(required_contents, list) - for content in required_contents: - content_or = content.split(" |OR| ") - cur_score = any( - [ - StringEvaluator.must_include( - ref=content, - pred=selected_element, - tokenize=False, - ) - for content in content_or - ] - ) - score *= float(cur_score) - # print(f"[must include] {cur_score}, selected element: {selected_element}, required contents: {content_or}") - else: - raise ValueError(f"Unknown required_contents: {target['required_contents'].keys()}") - return score - - -class EvaluatorComb: - def __init__(self, evaluators: list[Evaluator]) -> None: - self.evaluators = evaluators - - @beartype - async def __call__( - self, - trajectory: Trajectory, - config_file: Path | str, - page: Page | PseudoPage, - client: CDPSession, - azure_config: dict[str, Any] | None = None, - ) -> float: - score = 1.0 - for evaluator in self.evaluators: - cur_score = await evaluator(trajectory, config_file, page, client, 
azure_config) - score *= cur_score - return score - - -@beartype -def evaluator_router(config_file: Path | str) -> EvaluatorComb: - """Router to get the evaluator class""" - with open(config_file, "r") as f: - configs = json.load(f) - - eval_types = configs["eval"]["eval_types"] - evaluators: list[Evaluator] = [] - for eval_type in eval_types: - match eval_type: - case "string_match": - evaluators.append(StringEvaluator()) - case "url_match": - evaluators.append(URLEvaluator()) - case "program_html": - evaluators.append(HTMLContentEvaluator()) - case _: - raise ValueError(f"eval_type {eval_type} is not supported") - - return EvaluatorComb(evaluators) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py deleted file mode 100644 index eff8520b5ab4..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py +++ /dev/null @@ -1,233 +0,0 @@ -"""From WebArena with minor modifications. Implements helper functions to assist evaluation cases where other evaluators are not suitable.""" - -import json -from typing import Any -from urllib.parse import urlparse - -import requests -from playwright.async_api import Page - -from .env_config import ( - ACCOUNTS, - GITLAB, - MAP, - REDDIT, - SHOPPING, - SHOPPING_ADMIN, - WIKIPEDIA, -) - -from .openai_utils import ( - generate_from_openai_chat_completion, -) - -import autogen - - -def shopping_get_auth_token() -> str: - response = requests.post( - url=f"{SHOPPING}/rest/default/V1/integration/admin/token", - headers={"content-type": "application/json"}, - data=json.dumps( - { - "username": ACCOUNTS["shopping_site_admin"]["username"], - "password": ACCOUNTS["shopping_site_admin"]["password"], - } - ), - ) - token: str = response.json() - return token - - -def shopping_get_latest_order_url() -> str: - """Get the latest order url from the shopping website.""" - - header = { - "Authorization": f"Bearer {shopping_get_auth_token()}", - "Content-Type": "application/json", - } - - params = { - "searchCriteria[sortOrders][0][field]": "created_at", - "searchCriteria[sortOrders][0][direction]": "DESC", - "searchCriteria[pageSize]": "1", - } - - response = requests.get(f"{SHOPPING}/rest/V1/orders", params=params, headers=header) - assert response.status_code == 200 - response_obj = response.json()["items"][0] - order_id = int(response_obj["increment_id"]) - order_url = f"{SHOPPING}/sales/order/view/order_id/{order_id}/" - return order_url - - -def shopping_get_sku_latest_review_author(sku: str) -> str: - """Get the latest review for shopping admin.""" - header = { - "Authorization": f"Bearer {shopping_get_auth_token()}", - "Content-Type": "application/json", - } - response = requests.get(f"{SHOPPING}/rest/V1/products/{sku}/reviews", headers=header) - assert response.status_code == 200 - response_obj = response.json() - if len(response_obj) == 0: - return "" - author: str = response_obj[-1]["nickname"] - return author - - -def shopping_get_sku_latest_review_rating(sku: str) -> str: - """Get the latest review for shopping admin.""" - header = { - "Authorization": f"Bearer {shopping_get_auth_token()}", - "Content-Type": "application/json", - } - response = requests.get(f"{SHOPPING}/rest/V1/products/{sku}/reviews", headers=header) - assert response.status_code == 200 - response_obj = response.json() - if len(response_obj) == 0: - return "" - assert 
response_obj[0]["ratings"][0]["rating_name"] == "Rating" - rating: str = str(response_obj[-1]["ratings"][0]["percent"]) - return rating - - -def reddit_get_post_url(url: str) -> str: - """Get the post url""" - # Url is http://domain/f/subreddit/post_id/... - # get domain, subreddit, post_id - domain = urlparse(url).netloc - tok_url = urlparse(url).path.split("/") - # not a valid post/comment url, return the url as is - if len(tok_url) < 4: - return url - if tok_url[1] != "f": - return url - subreddit = urlparse(url).path.split("/")[2] - post_id = urlparse(url).path.split("/")[3] - scheme = urlparse(url).scheme - post_url = f"{scheme}://{domain}/f/{subreddit}/{post_id}/" - return post_url - - -async def gitlab_get_project_memeber_role(page: Page, account_name: str) -> str: - # get the account index - try: - account_idx = await page.evaluate( - f"""(() => {{ - const elements = document.querySelectorAll("td[data-label='Account'] span.gl-avatar-labeled-sublabel"); - let index = -1; // Default value if not found - - for(let i = 0; i < elements.length; i++) {{ - if(elements[i].outerText === '@{account_name}') {{ - index = i; - break; - }} - }} - - return index; - }})()""" - ) - - # get the role - role: str = await page.evaluate( - f"""(() => {{ - return document.querySelectorAll("td.col-max-role span")[{account_idx}].outerText; - }})()""" - ) - except Exception: - role = "" - - return role - - -def llm_fuzzy_match(pred: str, reference: str, question: str, azure_config: dict[str, Any] | None) -> float: - """Check whether the prediction matches the reference with GPT4-turbo""" - messages: list[dict[str, Any]] = [] - # construct the question to ask - message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n" - message += f"question: {question}\n" - message += f"reference answer: {reference}\n" - message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n" - message += f"student answer: {pred}\n" - message += "Conclude the judgement by correct/incorrect/partially correct." - messages = [ - {"role": "system", "content": "You are a helpful assistant"}, - {"role": "user", "content": message}, - ] - - response = None - if azure_config is None: - response = generate_from_openai_chat_completion( - model="gpt-4-1106-preview", - messages=messages, - temperature=0, - max_tokens=768, - top_p=1.0, - context_length=0, - ).lower() - else: - client = autogen.OpenAIWrapper(**azure_config) - raw_response = client.create(context=None, messages=messages) - response = client.extract_text_or_completion_object(raw_response)[0].lower() - - if "partially correct" in response or "incorrect" in response: - return 0.0 - else: - assert "correct" in response - return 1.0 - - -def llm_ua_match(pred: str, reference: str, question: str, azure_config: dict[str, Any] | None) -> float: - """Check whether the prediction matches the reference with GPT-turbo""" - messages: list[dict[str, Any]] = [] - # construct the question to ask - message = "" - message += f"task: {question}\n" - message += f"actual unachievable reason: {reference}\n" - message += f"reported unachievable reason: {pred}\n" - message += ( - "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. 
" - "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, " - "which is listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. " - "Determine if the reported reason aligns with the actual reason, even if implicitly. " - "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'." - ) - messages = [ - {"role": "system", "content": "You are a helpful assistant"}, - {"role": "user", "content": message}, - ] - - response = None - if azure_config is None: - response = generate_from_openai_chat_completion( - model="gpt-4-1106-preview", - messages=messages, - temperature=0, - max_tokens=768, - top_p=1.0, - context_length=0, - ).lower() - else: - client = autogen.OpenAIWrapper(**azure_config) - raw_response = client.create(context=None, messages=messages) - response = client.extract_text_or_completion_object(raw_response)[0].lower() - - if "different" in response: - return 0.0 - else: - assert "same" in response - return 1.0 - - -class PseudoPage: - def __init__(self, original_page: Page, url: str): - self.url = url - self.original_page = original_page - - def __getattr__(self, attr: str) -> Any: - # Delegate attribute access to the original page object - if attr not in ["url"]: - return getattr(self.original_page, attr) - else: - return getattr(self, attr) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py deleted file mode 100644 index 1381f392cdf2..000000000000 --- a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py +++ /dev/null @@ -1,275 +0,0 @@ -"""Tools to generate from OpenAI prompts. 
-Adopted from https://github.com/zeno-ml/zeno-build/""" - -import asyncio -import logging -import os -import random -import time -from typing import Any - -import aiolimiter -import openai -from openai import AsyncOpenAI, OpenAI - -client = None -aclient = None -if "OPENAI_API_KEY" not in os.environ and "OAI_CONFIG_LIST" not in os.environ: - raise ValueError("Neither OPENAI_API_KEY nor OAI_CONFIG_LIST is defined in the environment.") - -if "OPENAI_API_KEY" in os.environ: - client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) - aclient = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]) -from tqdm.asyncio import tqdm_asyncio - - -def retry_with_exponential_backoff( # type: ignore - func, - initial_delay: float = 1, - exponential_base: float = 2, - jitter: bool = True, - max_retries: int = 3, - errors: tuple[Any] = ( - openai.RateLimitError, - openai.BadRequestError, - openai.InternalServerError, - ), -): - """Retry a function with exponential backoff.""" - - def wrapper(*args, **kwargs): # type: ignore - # Initialize variables - num_retries = 0 - delay = initial_delay - - # Loop until a successful response or max_retries is hit or an exception is raised - while True: - try: - - return func(*args, **kwargs) - - # Retry on specified errors - except errors: - # Increment retries - num_retries += 1 - - # Check if max retries has been reached - if num_retries > max_retries: - raise Exception(f"Maximum number of retries ({max_retries}) exceeded.") - - # Increment the delay - delay *= exponential_base * (1 + jitter * random.random()) - - # Sleep for the delay - time.sleep(delay) - - # Raise exceptions for any errors not specified - except Exception as e: - raise e - - return wrapper - - -async def _throttled_openai_completion_acreate( - engine: str, - prompt: str, - temperature: float, - max_tokens: int, - top_p: float, - limiter: aiolimiter.AsyncLimiter, -) -> dict[str, Any]: - async with limiter: - for _ in range(3): - try: - return await aclient.completions.create( - engine=engine, - prompt=prompt, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - ) - except openai.RateLimitError: - logging.warning("OpenAI API rate limit exceeded. Sleeping for 10 seconds.") - await asyncio.sleep(10) - except openai.APIError as e: - logging.warning(f"OpenAI API error: {e}") - break - return {"choices": [{"message": {"content": ""}}]} - - -async def agenerate_from_openai_completion( - prompts: list[str], - engine: str, - temperature: float, - max_tokens: int, - top_p: float, - context_length: int, - requests_per_minute: int = 300, -) -> list[str]: - """Generate from OpenAI Completion API. - - Args: - prompts: list of prompts - temperature: Temperature to use. - max_tokens: Maximum number of tokens to generate. - top_p: Top p to use. - context_length: Length of context to use. - requests_per_minute: Number of requests per minute to allow. - - Returns: - List of generated responses. 
- """ - if "OPENAI_API_KEY" not in os.environ: - raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") - - limiter = aiolimiter.AsyncLimiter(requests_per_minute) - async_responses = [ - _throttled_openai_completion_acreate( - engine=engine, - prompt=prompt, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - limiter=limiter, - ) - for prompt in prompts - ] - responses = await tqdm_asyncio.gather(*async_responses) - return [x["choices"][0]["text"] for x in responses] - - -@retry_with_exponential_backoff -def generate_from_openai_completion( - prompt: str, - engine: str, - temperature: float, - max_tokens: int, - top_p: float, - context_length: int, - stop_token: str | None = None, -) -> str: - if "OPENAI_API_KEY" not in os.environ: - raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") - - response = client.completions.create( - prompt=prompt, - engine=engine, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - stop=[stop_token], - ) - answer: str = response["choices"][0]["text"] - return answer - - -async def _throttled_openai_chat_completion_acreate( - model: str, - messages: list[dict[str, str]], - temperature: float, - max_tokens: int, - top_p: float, - limiter: aiolimiter.AsyncLimiter, -) -> dict[str, Any]: - async with limiter: - for _ in range(3): - try: - return await aclient.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - ) - except openai.RateLimitError: - logging.warning("OpenAI API rate limit exceeded. Sleeping for 10 seconds.") - await asyncio.sleep(10) - except asyncio.exceptions.TimeoutError: - logging.warning("OpenAI API timeout. Sleeping for 10 seconds.") - await asyncio.sleep(10) - except openai.APIError as e: - logging.warning(f"OpenAI API error: {e}") - break - return {"choices": [{"message": {"content": ""}}]} - - -async def agenerate_from_openai_chat_completion( - messages_list: list[list[dict[str, str]]], - engine: str, - temperature: float, - max_tokens: int, - top_p: float, - context_length: int, - requests_per_minute: int = 300, -) -> list[str]: - """Generate from OpenAI Chat Completion API. - - Args: - messages_list: list of message list - temperature: Temperature to use. - max_tokens: Maximum number of tokens to generate. - top_p: Top p to use. - context_length: Length of context to use. - requests_per_minute: Number of requests per minute to allow. - - Returns: - List of generated responses. 
- """ - if "OPENAI_API_KEY" not in os.environ: - raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") - - limiter = aiolimiter.AsyncLimiter(requests_per_minute) - async_responses = [ - _throttled_openai_chat_completion_acreate( - model=engine, - messages=message, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - limiter=limiter, - ) - for message in messages_list - ] - responses = await tqdm_asyncio.gather(*async_responses) - return [x["choices"][0]["message"]["content"] for x in responses] - - -@retry_with_exponential_backoff -def generate_from_openai_chat_completion( - messages: list[dict[str, str]], - model: str, - temperature: float, - max_tokens: int, - top_p: float, - context_length: int, - stop_token: str | None = None, -) -> str: - if "OPENAI_API_KEY" not in os.environ: - raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - top_p=top_p, - ) - answer: str = response.choices[0].message.content - return answer - - -@retry_with_exponential_backoff -# debug only -def fake_generate_from_openai_chat_completion( - messages: list[dict[str, str]], - model: str, - temperature: float, - max_tokens: int, - top_p: float, - context_length: int, - stop_token: str | None = None, -) -> str: - if "OPENAI_API_KEY" not in os.environ: - raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") - - answer = "Let's think step-by-step. This page shows a list of links and buttons. There is a search box with the label 'Search query'. I will click on the search box to type the query. So the action I will perform is \"click [60]\"." 
- return answer diff --git a/python/packages/agbench/src/agbench/res/Dockerfile b/python/packages/agbench/src/agbench/res/Dockerfile index a7da943f343d..033c92162658 100644 --- a/python/packages/agbench/src/agbench/res/Dockerfile +++ b/python/packages/agbench/src/agbench/res/Dockerfile @@ -11,23 +11,20 @@ RUN ln -snf /usr/share/zoneinfo/US/Pacific /etc/localtime && echo "US/Pacific" > # Upgrade pip RUN pip install --upgrade pip -# Pre-load autogen_core dependencies, but not autogen_core itself since we'll often want to install the latest from source -RUN pip install openai pillow aiohttp typing-extensions pydantic types-aiofiles grpcio protobuf +# Pre-load autogen to get the dependencies, but then uninstall them (leaving dependencies in place) +RUN pip install autogen-core autogen-agentchat autogen-ext pyyaml +RUN pip uninstall --yes autogen-core autogen-agentchat autogen-ext + +# Optional markitdown dependencies +RUN pip install markitdown SpeechRecognition pydub youtube_transcript_api==0.6.0 # Pre-load popular packages as per https://learnpython.com/blog/most-popular-python-packages/ RUN pip install numpy pandas matplotlib seaborn scikit-learn requests urllib3 nltk pytest -# Pre-load packages needed for mdconvert file utils -RUN pip install python-docx pdfminer.six python-pptx SpeechRecognition openpyxl pydub mammoth puremagic youtube_transcript_api==0.6.0 - # Pre-load Playwright RUN pip install playwright RUN playwright install --with-deps chromium -# Fix an incompatibility with numpy -RUN pip uninstall --yes numpy -RUN pip install "numpy<2.0" - # Webarena (evaluation code) -RUN pip install beartype aiolimiter -RUN /usr/bin/echo -e "import nltk\nnltk.download('punkt')" | python +#RUN pip install beartype aiolimiter +#RUN /usr/bin/echo -e "import nltk\nnltk.download('punkt')" | python diff --git a/python/packages/agbench/src/agbench/run_cmd.py b/python/packages/agbench/src/agbench/run_cmd.py index 9d94d79170fe..181088d44be8 100644 --- a/python/packages/agbench/src/agbench/run_cmd.py +++ b/python/packages/agbench/src/agbench/run_cmd.py @@ -5,6 +5,7 @@ import os import pathlib import random +import re import shutil import subprocess import sys @@ -14,6 +15,7 @@ from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast import docker +import yaml from azure.core.exceptions import ClientAuthenticationError from azure.identity import DefaultAzureCredential, get_bearer_token_provider from docker.errors import APIError, DockerException, ImageNotFound @@ -38,7 +40,9 @@ # Do not use this field to specify the name of an existing image (e.g., on Dockerhub) DEFAULT_DOCKER_IMAGE_TAG = "agbench" -DEFAULT_ENV_FILE = "ENV.json" +DEFAULT_ENV_FILE_JSON = "ENV.json" +DEFAULT_ENV_FILE_YAML = "ENV.yaml" +DEFAULT_CONFIG_YAML = "config.yaml" # Get a random number generator for subsampling subsample_rng = random.Random(425) @@ -55,10 +59,12 @@ def run_scenarios( scenario: str, n_repeats: int, is_native: bool, + config_file: Union[None, str], token_provider: Optional[Callable[[], str]], docker_image: Optional[str] = None, results_dir: str = "Results", subsample: Union[None, int, float] = None, + env_file: Union[None, str] = None, ) -> None: """ Run a set agbench scenarios a given number of times. 
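The hunk above threads two new parameters, `config_file` and `env_file`, through `run_scenarios` so the CLI can hand a `config.yaml` and an `ENV.yaml` down to scenario expansion and environment preparation. A minimal sketch of how a caller might invoke the updated signature, assuming `agbench.run_cmd` is importable and that local `config.yaml` and `ENV.yaml` files exist (both file names are only the defaults introduced in this diff):

```python
# Hypothetical driver script; assumes the run_scenarios signature shown in
# this diff (config_file / env_file keyword arguments) and an installed agbench.
from agbench.run_cmd import run_scenarios

run_scenarios(
    scenario="Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl",
    n_repeats=1,
    is_native=False,            # run inside the agbench Docker image
    config_file="config.yaml",  # copied into each expanded task as config.yaml
    token_provider=None,        # or an Azure bearer-token callable
    docker_image=None,          # fall back to the default "agbench" image tag
    results_dir="Results",
    subsample=None,             # run every task in the JSONL file
    env_file="ENV.yaml",        # preferred over the deprecated ENV.json
)
```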
@@ -151,10 +157,10 @@ def run_scenarios( print(f"Running scenario {results_repetition}") # Expand the scenario - expand_scenario(scenario_dir, instance, results_repetition) + expand_scenario(scenario_dir, instance, results_repetition, config_file) # Prepare the environment (keys/values that need to be added) - env = get_scenario_env(token_provider) + env = get_scenario_env(token_provider=token_provider, env_file=env_file) # Run the scenario if is_native: @@ -171,7 +177,9 @@ def run_scenarios( file_handle.close() -def expand_scenario(scenario_dir: str, scenario: ScenarioInstance, output_dir: str) -> None: +def expand_scenario( + scenario_dir: str, scenario: ScenarioInstance, output_dir: str, config_file: Union[str, None] +) -> None: """ Expand a scenario into a folder. Despite some awkwardness created by backwards compatibility and notational conveniences, expansion is conceptually simple. @@ -244,16 +252,26 @@ def expand_scenario(scenario_dir: str, scenario: ScenarioInstance, output_dir: s line = line.replace(k, v) fh.write(line) + # Copy the config + if config_file is None: + if os.path.isfile(DEFAULT_CONFIG_YAML): + config_file = DEFAULT_CONFIG_YAML + + if config_file is not None: + src_path = pathlib.Path(config_file).absolute() + dest_path = pathlib.Path(os.path.join(output_dir, "config.yaml")).absolute() + shutil.copyfile(src_path, dest_path) + else: + logging.warning(f"No {DEFAULT_CONFIG_YAML} file found.") -def get_scenario_env( - token_provider: Optional[Callable[[], str]] = None, env_file: str = DEFAULT_ENV_FILE -) -> Dict[str, str]: + +def get_scenario_env(token_provider: Optional[Callable[[], str]] = None, env_file: str | None = None) -> Dict[str, str]: """ Return a dictionary of environment variables needed to run a scenario. Args: config_list (list): An AutoGen OAI_CONFIG_LIST to be used when running scenarios. - env_file (str): The path to the env_file to read. (default: DEFAULT_ENV_FILE) + env_file (str): The path to the env_file to read. (if None, default to DEFAULT_ENV_FILE) Returns: A dictionary of keys and values that need to be added to the system environment. """ @@ -264,10 +282,6 @@ def get_scenario_env( if openai_api_key is not None and len(openai_api_key.strip()) > 0: env["OPENAI_API_KEY"] = openai_api_key - bing_api_key = os.environ.get("BING_API_KEY") - if bing_api_key is not None and len(bing_api_key.strip()) > 0: - env["BING_API_KEY"] = bing_api_key - ## Support Azure auth tokens azure_openai_ad_token = os.environ.get("AZURE_OPENAI_AD_TOKEN") if not azure_openai_ad_token and token_provider: @@ -282,13 +296,91 @@ def get_scenario_env( env["AZURE_OPENAI_AD_TOKEN"] = azure_openai_ad_token # Update with any values from the ENV.json file - if os.path.isfile(env_file): + env_file_contents: Dict[str, Any] = {} + if env_file is None: + # Env file was not specified, so read the default, or warn if the default file is missing. + if os.path.isfile(DEFAULT_ENV_FILE_YAML): + with open(DEFAULT_ENV_FILE_YAML, "r") as fh: + env_file_contents = yaml.safe_load(fh) + elif os.path.isfile(DEFAULT_ENV_FILE_JSON): + with open(DEFAULT_ENV_FILE_JSON, "rt") as fh: + env_file_contents = json.loads(fh.read()) + logging.warning(f"JSON environment files are deprecated. Migrate to '{DEFAULT_ENV_FILE_YAML}'") + else: + logging.warning( + f"The environment file '{DEFAULT_ENV_FILE_YAML}' was not found. A default environment will be provided, containing the keys: {env.keys()}" + ) + else: + # Env file was specified. Throw an error if the file can't be read. 
with open(env_file, "rt") as fh: - env.update(json.loads(fh.read())) + if env_file.endswith(".json"): + logging.warning("JSON environment files are deprecated. Migrate to YAML") + env_file_contents = json.loads(fh.read()) + else: + env_file_contents = yaml.safe_load(fh) + + # Apply substitutions in-place + substitute_env_variables(env_file_contents) + + # Flatten any structures + for key, value in env_file_contents.items(): + if isinstance(value, dict) or isinstance(value, list): + env_file_contents[key] = json.dumps(value) + + # Warn about carrying env variables + if "OPENAI_API_KEY" in env and "OPENAI_API_KEY" not in env_file_contents: + logging.warning( + f"Implicit inclusion of OPENAI_API_KEY in the task environment is deprecated. Add it to {DEFAULT_ENV_FILE_YAML} instead. E.g.,\n" + + """ + +OPENAI_API_KEY: ${OPENAI_API_KEY} + +""" + ) + + # Apply the loaded variables + env.update(cast(Dict[str, str], env_file_contents)) return env +def substitute_env_variables(json_data: Any) -> None: + """ + Recursively replaces any instance of "${ENV_VARIABLE}" with os.environ("ENV_VARIABLE") in a structure returned from json.loads() + """ + + def replace_env_var(match: Any) -> str: + var_name = match.group(1) + return os.environ.get(var_name, "") + + pattern = re.compile(r"\$\{(\w+)\}") + + def replace_in_dict(d: Dict[str, Any]) -> None: + for key, value in d.items(): + if isinstance(value, str): + d[key] = pattern.sub(replace_env_var, value) + elif isinstance(value, dict): + replace_in_dict(cast(Dict[str, Any], value)) + elif isinstance(value, list): + # Note: with the task mypy complains of a redundant cast + # without the cast, pyright complains the type is unknown + replace_in_list(cast(List[Any], value)) # type: ignore + + def replace_in_list(lst: List[Any]) -> None: + for i, item in enumerate(lst): + if isinstance(item, str): + lst[i] = pattern.sub(replace_env_var, item) + elif isinstance(item, dict): + replace_in_dict(cast(Dict[str, Any], item)) + elif isinstance(item, list): + replace_in_list(cast(List[Any], item)) # type: ignore + + if isinstance(json_data, dict): + replace_in_dict(cast(Dict[str, Any], json_data)) + elif isinstance(json_data, list): + replace_in_list(cast(List[Any], json_data)) # type: ignore + + def run_scenario_natively(work_dir: str, env: Mapping[str, str], timeout: int = TASK_TIMEOUT) -> None: """ Run a scenario in the native environment. @@ -647,9 +739,11 @@ def run_scenarios_subset( scenarios: List[Dict[str, Any]], n_repeats: int, is_native: bool, + config_file: Union[None, str], docker_image: Optional[str] = None, results_dir: str = "Results", subsample: Union[None, int, float] = None, + env_file: Union[None, str] = None, ) -> None: """ Run a subset of agbench scenarios a given number of times. 
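The `substitute_env_variables` helper above expands `${VAR}` references against the process environment before the values are flattened and passed into the task environment. A standalone sketch of the same substitution pattern, with invented keys and values purely for illustration:

```python
# Standalone illustration of the ${VAR} expansion performed by
# substitute_env_variables before values are handed to a task.
import os
import re

pattern = re.compile(r"\$\{(\w+)\}")  # matches ${NAME}

def expand(value: str) -> str:
    # Replace each ${NAME} with os.environ["NAME"], or "" when unset.
    return pattern.sub(lambda m: os.environ.get(m.group(1), ""), value)

os.environ["OPENAI_API_KEY"] = "sk-example"      # demonstration value only
print(expand("api_key: ${OPENAI_API_KEY}"))      # -> "api_key: sk-example"
print(expand("missing: ${NOT_SET_ANYWHERE}"))    # -> "missing: "
```

The real helper applies this recursively to nested dicts and lists, after which `get_scenario_env` serializes any remaining structures to JSON strings before exporting them.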
@@ -680,10 +774,10 @@ def run_scenarios_subset( print(f"Running scenario {results_repetition}") # Expand the scenario - expand_scenario(".", instance, results_repetition) # type: ignore + expand_scenario(".", instance, results_repetition, config_file) # type: ignore # Prepare the environment (keys/values that need to be added) - env = get_scenario_env() + env = get_scenario_env(env_file=env_file) # Run the scenario if is_native: @@ -715,9 +809,11 @@ def run_parallel(args: argparse.Namespace) -> None: scenario_subset, args.repeat, args.native, + args.config, args.docker_image, "Results", args.subsample, + args.env, ) for scenario_subset in scenarios ] @@ -742,7 +838,7 @@ def get_azure_token_provider() -> Optional[Callable[[], str]]: except ClientAuthenticationError: error_message = traceback.format_exc() print( - f"Azure token provider failed loading. Try using 'az login --use-device-code':\n{error_message}\n\nContinuing without Azure token provider..." + f"Azure token provider failed loading. Try using 'az login --use-device-code'\n\nError details:\n{error_message}\n\nContinuing without Azure token provider..." ) logging.disable(logging.NOTSET) return None @@ -776,7 +872,6 @@ def run_cli(args: Sequence[str]) -> None: help='Run on a subsample of the tasks in the JSONL file(s). If a decimal value is specified, then run on the given proportion of tasks in each file. For example "0.7" would run on 70%% of tasks, and "1.0" would run on 100%% of tasks. If an integer value is specified, then randomly select *that* number of tasks from each specified JSONL file. For example "7" would run tasks, while "1" would run only 1 task from each specified JSONL file. (default: 1.0; which is 100%%)', default=None, ) - parser.add_argument( "-p", "--parallel", @@ -784,7 +879,22 @@ def run_cli(args: Sequence[str]) -> None: help="The number of parallel processes to run (default: 1).", default=1, ) - + parser.add_argument( + "-e", + "--env", + type=str, + help="The environment file to load into Docker, or into the native task context (default: '" + + DEFAULT_ENV_FILE_YAML + + "').", + default=None, + ) + parser.add_argument( + "-c", + "--config", + type=str, + help="The config file to copy into the Task (default: '" + DEFAULT_CONFIG_YAML + "').", + default=None, + ) parser.add_argument( "-d", "--docker-image", @@ -802,6 +912,11 @@ def run_cli(args: Sequence[str]) -> None: parsed_args = parser.parse_args(args) + if parsed_args.config is not None: + # Make sure the config file is readable, so that we fail early + with open(parsed_args.config, "r"): + pass + # don't support parallel and subsample together if parsed_args.parallel > 1 and parsed_args.subsample is not None: sys.exit("The options --parallel and --subsample can not be used together currently. 
Exiting.") @@ -861,7 +976,9 @@ def run_cli(args: Sequence[str]) -> None: scenario=parsed_args.scenario, n_repeats=parsed_args.repeat, is_native=True if parsed_args.native else False, + config_file=parsed_args.config, token_provider=azure_token_provider, docker_image=parsed_args.docker_image, subsample=subsample, + env_file=parsed_args.env, ) diff --git a/python/packages/agbench/src/agbench/tabulate_cmd.py b/python/packages/agbench/src/agbench/tabulate_cmd.py index 4aac0e2d03d0..2b1eb37b933c 100644 --- a/python/packages/agbench/src/agbench/tabulate_cmd.py +++ b/python/packages/agbench/src/agbench/tabulate_cmd.py @@ -19,6 +19,10 @@ "ALL TESTS PASSED !#!#", ] +COMPLETED_STRINGS = [ + "SCENARIO.PY COMPLETE !#!#", +] + EXCLUDE_DIR_NAMES = ["__pycache__"] @@ -63,10 +67,19 @@ def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRIN if os.path.isfile(console_log): with open(console_log, "rt") as fh: content = fh.read() + + # It succeeded for s in success_strings: if s in content: return True - return False + + # It completed without succeeding + for s in COMPLETED_STRINGS: + if s in content: + return False + + # Has not, or did not, complete + return None else: return None @@ -187,7 +200,9 @@ def _count_equals(value: Optional[bool], trial: int) -> int: failures = 0 for row in all_results: if isinstance(row[i + 1], tuple): - failures += row[i + 1][0] != 1 + failures += row[i + 1][0] not in [1, None] + else: + failures += row[i + 1] not in [1, None] footer_row.append(failures) footer.append(footer_row)