
Commit 26421e8

Additional queries file and added to tool definition
1 parent 77c8cc4 commit 26421e8

6 files changed (+309, -91 lines)

evaluator/algorithms/tool_rag_algorithm.py

Lines changed: 33 additions & 32 deletions
@@ -216,7 +216,6 @@ def _compose_tool_text(self, tool: BaseTool) -> str:
         parts_to_include = self._settings["indexed_tool_def_parts"]
         if not parts_to_include:
             raise ValueError("indexed_tool_def_parts must be a non-empty list")
-
         segments = []
         for p in parts_to_include:
             if p.lower() == "name":
@@ -236,18 +235,15 @@ def _compose_tool_text(self, tool: BaseTool) -> str:
                 if tags:
                     segments.append(f"tags: {' '.join(tags)}")
             elif p.lower() == "additional_queries":
-                # Append example queries supplied via settings["additional_queries"][tool.name]
                 examples_map = self._settings.get("additional_queries") or {}
                 examples_list = examples_map.get(tool.name) or []
                 if examples_list:
                     rendered = self._render_examples(examples_list)
                     if rendered:
                         segments.append(f"ex: {rendered}")
-
         if not segments:
             raise ValueError(f"The following tool contains none of the fields listed in indexed_tool_def_parts:\n{tool}")
         text = " | ".join(segments)
-
         # one-pass preprocess + truncation
         text = self._preprocess_text(text)
         text = self._truncate(text)
@@ -260,7 +256,31 @@ def _create_docs_from_tools(self, tools: List[BaseTool]) -> List[Document]:
             documents.append(Document(page_content=page_content, metadata={"name": tool.name}))
         return documents
 
-    def _index_tools(self, tools: List[BaseTool], queries: List[QuerySpecification]) -> None:
+    def _collect_examples_from_tool_specs(self, tool_specs: Dict[str, Dict[str, Any]]) -> Dict[str, List[str]]:
+        """
+        Build {tool_name: [example1, example2, ...]} from a tools dict where each
+        value may contain an 'additional_queries' dict mapping query keys to strings.
+        """
+        examples: Dict[str, List[str]] = {}
+        for tool_name, spec in (tool_specs or {}).items():
+            if not isinstance(spec, dict):
+                continue
+            aq = spec.get("additional_queries")
+            if isinstance(aq, dict):
+                for _, qtext in aq.items():
+                    if isinstance(qtext, str) and qtext.strip():
+                        examples.setdefault(tool_name, []).append(qtext.strip())
+        # de-duplicate while preserving order
+        for k, v in list(examples.items()):
+            seen, out = set(), []
+            for s in v:
+                if s not in seen:
+                    seen.add(s)
+                    out.append(s)
+            examples[k] = out
+        return examples
+
+    def _index_tools(self, tools: List[BaseTool]) -> None:
         self.tool_name_to_base_tool = {tool.name: tool for tool in tools}
 
         self.embeddings = HuggingFaceEmbeddings(model_name=self._settings["embedding_model_id"])
@@ -319,7 +339,7 @@ def _index_tools(self, tools: List[BaseTool], queries: List[QuerySpecification])
             search_params=search_params,
         )
 
-    def set_up(self, model: BaseChatModel, tools: List[BaseTool], queries: List[QuerySpecification]) -> None:
+    def set_up(self, model: BaseChatModel, tools: List[BaseTool], tool_specs: Any) -> None:
         super().set_up(model, tools)
 
         if self._settings["cross_encoder_model_name"]:
@@ -331,34 +351,15 @@ def set_up(self, model: BaseChatModel, tools: List[BaseTool], queries: List[Quer
         if self._settings["enable_query_decomposition"] or self._settings["enable_query_rewriting"]:
             self.query_rewriting_model = self._get_llm(self._settings["query_rewriting_model_id"])
 
-        # Build additional_queries mapping from provided QuerySpecifications so YAML is not required.
+        # Build additional_queries mapping from provided specs (accept dict of tool specs or list of QuerySpecifications)
        try:
-            tool_examples: Dict[str, List[str]] = {}
-            for spec in (queries or []):
-                add_q = getattr(spec, "additional_queries", None) or {}
-                # Flatten wrapper {"additional_queries": {...}} if present
-                if isinstance(add_q, dict) and "additional_queries" in add_q and len(add_q) == 1:
-                    add_q = add_q["additional_queries"]
-                for tool_name, qmap in add_q.items():
-                    if isinstance(qmap, dict):
-                        for _, qtext in qmap.items():
-                            if isinstance(qtext, str) and qtext.strip():
-                                tool_examples.setdefault(tool_name, []).append(qtext.strip())
-            # Dedupe while preserving order
-            for k, v in list(tool_examples.items()):
-                seen = set()
-                deduped = []
-                for s in v:
-                    if s not in seen:
-                        seen.add(s)
-                        deduped.append(s)
-                tool_examples[k] = deduped
-            if tool_examples:
-                self._settings["additional_queries"] = tool_examples
+            examples_map: Dict[str, List[str]] = {}
+            if isinstance(tool_specs, dict):
+                examples_map = self._collect_examples_from_tool_specs(tool_specs)
+            self._settings["additional_queries"] = examples_map
         except Exception:
             pass
-
-        self._index_tools(tools, queries)
+        self._index_tools(tools)
 
     def _threshold_results(self, docs_and_scores: List[Tuple[Document, float]]) -> List[Document]:
         """
@@ -619,4 +620,4 @@ def _dedup_keep_order(xs: List[str]) -> List[str]:
 
     @staticmethod
     def _strip_numbering(s: str) -> str:
-        return re.sub(r"^\s*(?:[-*]|\d+[).:]?)\s*", "", s).strip().rstrip(".")
+        return re.sub(r"^\s*(?:[-*]|\d+[).:]?)\s*", "", s).strip().rstrip(".")
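For orientation, here is a minimal sketch of the data flow this file now implements, assuming the spec shape that _collect_examples_from_tool_specs and set_up expect; the tool name and query strings are invented for illustration.

    from typing import Any, Dict

    # Hypothetical tool_specs dict of the shape set_up() now accepts.
    tool_specs: Dict[str, Dict[str, Any]] = {
        "weather_lookup": {
            "description": "Returns current weather for a city.",
            "additional_queries": {
                "q1": "What's the weather like in Paris right now?",
                "q2": "Is it raining in Tokyo today?",
            },
        },
    }

    # _collect_examples_from_tool_specs(tool_specs) would return:
    #   {"weather_lookup": ["What's the weather like in Paris right now?",
    #                       "Is it raining in Tokyo today?"]}
    # set_up() stores that mapping under settings["additional_queries"], and
    # _compose_tool_text() renders it as an "ex: ..." segment whenever
    # "additional_queries" is listed in indexed_tool_def_parts.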

evaluator/components/data_provider.py

Lines changed: 53 additions & 16 deletions
@@ -315,7 +315,7 @@ def _load_queries_from_single_file(
        root_dataset_path: str or Path,
        experiment_environment: EnvironmentConfig,
        dataset_config: DatasetConfig,
-) -> Tuple[List[QuerySpecification], List[Dict[str, Any]]]:
+) -> List[QuerySpecification]:
     with open(query_file_path, 'r') as f:
         data = json.load(f)
 
@@ -334,13 +334,6 @@ def _load_queries_from_single_file(
             log(f"Invalid query spec, skipping this query.")
         else:
             query = raw_query_spec.get("query")
-            if raw_query_spec.get("additional_queries"):
-                additional_queries = raw_query_spec.get("additional_queries")
-                print(f"Additional queries provided: {additional_queries}")
-
-            else:
-                print(f"No additional queries provided")
-                additional_queries = None
             query_id = int(raw_query_spec.get("query_id"))
             golden_tools, additional_tools = (
                 _parse_raw_query_tool_definitions(raw_query_spec, experiment_environment, dataset_config))
@@ -354,8 +347,6 @@ def _load_queries_from_single_file(
                 QuerySpecification(
                     id=query_id,
                     query=query,
-                    path=str(query_file_path),
-                    additional_queries=additional_queries,
                     reference_answer=reference_answer,
                     golden_tools=golden_tools,
                     additional_tools=additional_tools or None
@@ -373,7 +364,7 @@ def get_queries(
        experiment_environment: EnvironmentConfig,
        dataset_config: DatasetConfig,
        fine_tuning_mode=False
-) -> Tuple[List[QuerySpecification], List[Dict[str, Any]]]:
+) -> List[QuerySpecification]:
     """Load queries from the dataset."""
     root_dataset_path = Path(os.getenv("ROOT_DATASET_PATH"))
     if not root_dataset_path:
@@ -390,14 +381,14 @@ def get_queries(
     queries_num = None if fine_tuning_mode else dataset_config.queries_num
     queries = []
     for path in local_paths:
-        print(f"\n\n")
-        print(f"--------------------------------")
-        print(f"Loading queries from file: {path}")
-        print(f"\n\n")
         remaining_queries_num = None if queries_num is None else queries_num - len(queries)
         if remaining_queries_num == 0:
             break
-        new_queries= _load_queries_from_single_file(path, remaining_queries_num, root_dataset_path, experiment_environment, dataset_config)
+        new_queries = _load_queries_from_single_file(path,
+                                                     remaining_queries_num,
+                                                     root_dataset_path,
+                                                     experiment_environment,
+                                                     dataset_config)
         queries.extend(new_queries)
 
     return queries
@@ -406,9 +397,55 @@ def get_queries(
 def get_tools_from_queries(queries: List[QuerySpecification]) -> ToolSet:
     tools = {}
 
+    # Base tools from the dataset
     for query_spec in queries:
         tools.update(query_spec.golden_tools)
         if query_spec.additional_tools:
             tools.update(query_spec.additional_tools)
 
+        # Merge per-query additional queries from centralized store under the correct tool entry
+        aq = get_additional_query(query_spec.id)
+        if isinstance(aq, dict):
+            golden_tools = query_spec.golden_tools
+            for tool in golden_tools:
+                additional_queries = aq.get(tool)
+                tools[tool]["additional_queries"] = additional_queries
+
     return tools
+
+
+def load_additional_queries_store(path: str | None = None) -> List[Dict[str, Any]]:
+    """
+    Load the centralized additional queries store.
+    Expected format: a JSON list of objects {"query_id": int, "additional_queries": {...}}.
+    Returns an empty list if the file doesn't exist or cannot be parsed.
+    """
+    try:
+        store_path = Path(path) if path else (Path("data") / "additional_queries.json")
+        if not store_path.exists():
+            return []
+        with store_path.open("r", encoding="utf-8") as f:
+            loaded = json.load(f)
+        return loaded if isinstance(loaded, list) else []
+    except Exception:
+        return []
+
+
+def get_additional_query(query_id: int) -> Dict[str, Any] | None:
+    """
+    Return the additional_queries dict for the given query_id from data/additional_queries.json,
+    or None if not found or invalid.
+    """
+    store = load_additional_queries_store()
+    for item in store:
+        if not isinstance(item, dict):
+            continue
+        if "query_id" not in item or "additional_queries" not in item:
+            continue
+        try:
+            qid = int(item["query_id"])
+        except Exception:
+            continue
+        if qid == query_id and isinstance(item["additional_queries"], dict):
+            return item["additional_queries"]
+    return None
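As a concrete reference, the snippet below writes a hypothetical data/additional_queries.json in the format the two loaders above expect; the query ID, tool name, and query text are invented.

    import json
    from pathlib import Path

    # One entry per query: {"query_id": int, "additional_queries": {...}}.
    store = [
        {
            "query_id": 42,
            "additional_queries": {
                "weather_lookup": {"q1": "Will it snow in Oslo tomorrow?"},
            },
        },
    ]
    Path("data").mkdir(exist_ok=True)
    Path("data/additional_queries.json").write_text(json.dumps(store, indent=2))

    # get_additional_query(42) then returns {"weather_lookup": {"q1": "..."}},
    # and get_tools_from_queries() attaches each per-tool sub-dict to the matching
    # tool entry, e.g. tools["weather_lookup"]["additional_queries"].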

evaluator/config/yaml/tool_rag_experiments.yaml

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ data:
   reference_answers_path: "https://huggingface.co/datasets/stabletoolbench/baselines/resolve/main/data_baselines.zip"
   reference_model_id: "chatgpt_cot"
   queries_num: null
+  additional_queries_model_id: "Qwen/Qwen3-8B"
 
 models:
   - id: "Qwen/Qwen3-8B"

evaluator/evaluator.py

Lines changed: 22 additions & 8 deletions
@@ -1,5 +1,6 @@
 import asyncio
 import os
+from re import S
 import time
 import traceback
 from typing import List, Tuple
@@ -68,6 +69,18 @@ async def run(self) -> None:
         # Actually run the experiments
         metadata_columns = ["Experiment ID", "Algorithm ID", "Algorithm Details", "Environment", "Number of Queries"]
         with CSVLogger(metric_collectors, os.getenv("OUTPUT_DIR_PATH"), metadata_columns=metadata_columns) as logger:
+            # generate additional queries here (optional)
+            try:
+                log(f"Generating additional queries...")
+                environment = experiment_specs[0][1]
+                gen_model_id = self.config.data.additional_queries_model_id
+                llm = get_llm(model_id=gen_model_id, model_config=self.config.models)
+                queries = get_queries(environment, self.config.data)
+                generate_and_save_additional_queries(llm, queries)
+            except Exception as _:
+                log("Skipping additional query generation due to error.")
+
+            # generate queries here
             for i, spec in enumerate(experiment_specs):
                 algorithm, environment = spec
                 log(f"{'-' * 60}\nRunning Experiment {i+1} of {len(experiment_specs)}: {self._spec_to_str(spec)}...\n{'-' * 60}")
@@ -114,13 +127,15 @@ async def _run_experiment(self,
         Runs the specified experiment and returns the number of evaluated queries.
         """
         processed_queries_num = 0
+
         try:
             queries = await self._set_up_experiment(spec, metric_collectors, mcp_proxy_manager)
             algorithm, environment = spec
 
             try:
                 for i, query_spec in enumerate(queries):
                     log(f"Processing query #{query_spec.id} (Experiment {exp_index} of {total_exp_num}, query {i+1} of {len(queries)})...")
+
                     for mc in metric_collectors:
                         mc.prepare_for_measurement(query_spec)
 
@@ -195,29 +210,28 @@ async def _set_up_experiment(self,
                                  mcp_proxy_manager: MCPProxyManager,
                                  ) -> List[QuerySpecification]:
         algorithm, environment = spec
-
         log(f"Initializing LLM connection: {environment.model_id}")
-        llm = get_llm(model_id=environment.model_id, model_config=self.config.models)
         log("Connection established successfully.\n")
         log("Fetching queries for the current experiment...")
         queries = get_queries(environment, self.config.data)
         log(f"Successfully loaded {len(queries)} queries.\n")
         print_iterable_verbose("The following queries will be executed:\n", queries)
-        log(f"Generating additional queries.\n")
-        generate_and_save_additional_queries(llm, queries)
+        llm = get_llm(model_id=environment.model_id, model_config=self.config.models)
         queries = get_queries(environment, self.config.data)
         log("Retrieving tool definitions for the current experiment...")
         tool_specs = get_tools_from_queries(queries)
         tools = await mcp_proxy_manager.run_mcp_proxy(tool_specs, init_client=True).get_tools()
         print_iterable_verbose("The following tools will be available during evaluation:\n", tools)
         log(f"The experiment will proceed with {len(tools)} tool(s).\n")
-
         log("Setting up the algorithm and the metric collectors...")
-
-        algorithm.set_up(llm, tools, queries)
+        # Pass queries to algorithms that accept them; fall back for others
+        if algorithm.__module__ == "evaluator.algorithms.tool_rag_algorithm":
+            algorithm.set_up(llm, tools, tool_specs)
+        else:
+            algorithm.set_up(llm, tools)
         for mc in metric_collectors:
             mc.set_up()
-        log("All set!\n")
+        log("Setup complete!")
 
         return queries
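Net effect of the hunks above: additional-query generation now runs once in run(), using the model named by additional_queries_model_id, before any experiment starts, and _set_up_experiment only routes the merged tool_specs to algorithms that accept them. A stand-in sketch of that dispatch (the helper name dispatch_set_up is hypothetical; the module check mirrors the diff):

    def dispatch_set_up(algorithm, llm, tools, tool_specs):
        # Algorithms defined in evaluator/algorithms/tool_rag_algorithm.py take the
        # extra tool_specs argument; every other algorithm keeps the original call.
        if algorithm.__module__ == "evaluator.algorithms.tool_rag_algorithm":
            algorithm.set_up(llm, tools, tool_specs)
        else:
            algorithm.set_up(llm, tools)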
evaluator/metric_collectors/tool_selection_metric_collector.py

Lines changed: 5 additions & 8 deletions
@@ -16,10 +16,10 @@ class ToolSelectionMetricCollector(MetricCollector):
     def __init__(self, settings: Dict, model_config: List[ModelConfig]):
         super().__init__(settings, model_config)
 
-        self.total_queries = 0
-        self.exact_matches = 0
-        self.precision_sum = 0.0
-        self.recall_sum = 0.0
+        self.total_queries = None
+        self.exact_matches = None
+        self.precision_sum = None
+        self.recall_sum = None
 
     def get_collected_metrics_names(self) -> List[str]:
         return ["Exact Tool Selection Match Rate",
@@ -96,10 +96,7 @@ def report_results(self) -> Dict[str, Any] or None:
             raise RuntimeError("No measurements registered, cannot produce results.")
 
         results = {
-            "Exact Tool Selection Match Rate": (
-                (self.exact_matches or 0) / (self.total_queries or 1)
-                if self.total_queries else 0.0
-            ),
+            "Exact Tool Selection Match Rate": self.exact_matches / self.total_queries,
             "Tool Selection Precision": self.precision_sum / self.total_queries,
             "Tool Selection Recall": self.recall_sum / self.total_queries,
             "Spurious Tool Calling Rate": 1.0 - (self.precision_sum / self.total_queries),