diff --git a/.github/workflows/workflow.yaml b/.github/workflows/workflow.yaml new file mode 100644 index 000000000..34e5b8e90 --- /dev/null +++ b/.github/workflows/workflow.yaml @@ -0,0 +1,59 @@ +name: PZ Merge Checks + +on: + pull_request: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install . + + - name: Download and register testdata + run: | + pushd testdata + wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/enron-eval-tiny.tar.gz + wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/real-estate-eval-tiny.tar.gz + tar -xzf enron-eval-tiny.tar.gz + tar -xzf real-estate-eval-tiny.tar.gz + rm *.tar.gz + popd + pz reg --path testdata/enron-eval-tiny --name enron-eval-tiny + pz reg --path testdata/real-estate-eval-tiny --name real-estate-eval-tiny + + - name: Test with pytest + env: # Or as an environment variable + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }} + run: | + pip install pytest + pytest -v tests/pytest + + lint-and-format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install the code linting and formatting tool Ruff + run: pip install "ruff>=0.9.0" + - name: check version + run: ruff --version + - name: Lint code with Ruff + run: ruff check --output-format=github --target-version=py38 + - name: Check code formatting with Ruff + run: ruff check --no-fix . --target-version=py38 + continue-on-error: true diff --git a/demos/askem-var.py b/demos/askem-var.py index 2cb58e457..0d208c269 100644 --- a/demos/askem-var.py +++ b/demos/askem-var.py @@ -9,6 +9,7 @@ import pandas as pd import streamlit as st + from palimpzest.constants import Cardinality from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.fields import Field @@ -33,18 +34,26 @@ class Variable(Schema): value = Field(desc="The value of the variable, optional, set 'null' if not found") +dict_of_excerpts = [ + {"id": 0, "text": "ne of the few states producing detailed daily reports of COVID-19 confirmed cases, COVID-19 related cumulative hospitalizations, intensive care unit (ICU) admissions, and deaths per county. Likewise, Ohio is a state with marked variation of demographic and geographic attributes among counties along with substantial differences in the capacity of healthcare within the state. Our aim is to predict the spatiotemporal dynamics of the COVID-19 pandemic in relation with the distribution of the capacity of healthcare in Ohio. 2. Methods 2.1. Mathematical model We developed a spatial mathematical model to simulate the transmission dynamics of COVID-19 disease infection and spread. The spatially-explicit model incorporates geographic connectivity information at county level. The Susceptible-Infected-Hospitalized-Recovered- Dead (SIHRD) COVID-19 model classified the population into susceptibles (S), confirmed infections (I), hospitalized and ICU admitted (H), recovered (R) and dead (D). Based on a previous study that identified local air hubs and main roads as important geospatial attributes lio residing in the county. 
In the second scenario, we used the model to generate projections of the impact of potential easing on the non-pharmaceutical interventions in the critical care capacity of each county in Ohio. We assessed the impact of 50% reduction on the estimated impact of non-pharmaceutical interventions in reducing the hazard rate of infection. Under this scenario we calculated the proportion of ICU \n'"}, + {"id": 1, "text": "t model incorporates geographic connectivity information at county level. The Susceptible-Infected-Hospitalized-Recovered- Dead (SIHRD) COVID-19 model classified the population into susceptibles (S), confirmed infections (I), hospitalized and ICU admitted (H), recovered (R) and dead (D). Based on a previous study that identified local air hubs and main roads as important geospatial attributes linked to differential COVID-19 related hospitalizations and mortality (Correa-Agudelo et a"} +] + +list_of_strings = ["I have a variable a, the value is 1", "I have a variable b, the value is 2"] +list_of_numbers = [1, 2, 3, 4, 5] + if __name__ == "__main__": run_pz = True dataset = "askem" + file_path = "testdata/askem-tiny/" if run_pz: # reference, plan, stats = run_workload() - excerpts = Dataset(dataset, schema=TextFile) + df_input = pd.DataFrame(dict_of_excerpts) + excerpts = Dataset(df_input) output = excerpts.convert( - Variable, desc="A variable used or introduced in the paper snippet", cardinality=Cardinality.ONE_TO_MANY - ) - - engine = StreamingSequentialExecution + Variable, desc="A variable used or introduced in the context", cardinality=Cardinality.ONE_TO_MANY + ).filter("The value name is 'a'", depends_on="name") policy = MaxQuality() engine = StreamingSequentialExecution( policy=policy, @@ -55,8 +64,7 @@ class Variable(Schema): allow_bonded_query=True, ) engine.generate_plan(output, policy) - - print(engine.plan) + print("Generated plan:\n", engine.plan) with st.container(): st.write("### Executed plan: \n") # st.write(" " + str(plan).replace("\n", " \n ")) @@ -66,9 +74,9 @@ class Variable(Schema): st.write(strop) input_records = engine.get_input_records() - input_df = DataRecord.to_df(input_records) + input_df = DataRecord.to_df(input_records, fields_in_schema=True) print(input_df) - + variables = [] statistics = [] start_time = time.time() @@ -81,9 +89,9 @@ class Variable(Schema): total_plan_time = time.time() - start_time engine.plan_stats.finalize(total_plan_time) - record_time = time.time() statistics.append(engine.plan_stats) - + intermediate_vars = DataRecord.to_df(vars, fields_in_schema=True) + print(intermediate_vars) for var in vars: # ref.key = ref.first_author.split()[0] + ref.title.split()[0] + str(ref.year) try: @@ -119,8 +127,8 @@ class Variable(Schema): st.write(" **value:** ", var.value, "\n") # write variables to a json file with readable format - with open(f"askem-variables-{dataset}.json", "w") as f: - json.dump(variables, f, indent=4) + # with open(f"askem-variables-{dataset}.json", "w") as f: + # json.dump(variables, f, indent=4) vars_df = pd.DataFrame(variables) # G = nx.DiGraph() @@ -150,7 +158,7 @@ class Variable(Schema): # # nx.write_gexf(G, "demos/bdf-usecase3.gexf") - print("References:", vars_df) + # print("References:", vars_df) # st.write(table.title, table.author, table.abstract) # endTime = time.time() # print("Elapsed time:", endTime - startTime) diff --git a/demos/bdf-suite.py b/demos/bdf-suite.py index f96003669..a0ade70ee 100644 --- a/demos/bdf-suite.py +++ b/demos/bdf-suite.py @@ -10,6 +10,7 @@ import networkx as nx import pandas as pd 
import streamlit as st + from palimpzest.constants import Cardinality from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import URL, File, PDFFile, Schema, Table, XLSFile diff --git a/demos/bdf-usecase3.py b/demos/bdf-usecase3.py index 6e3b2aa4c..1f3b8b174 100644 --- a/demos/bdf-usecase3.py +++ b/demos/bdf-usecase3.py @@ -11,6 +11,7 @@ import networkx as nx import pandas as pd import streamlit as st # type: ignore + from palimpzest.constants import Cardinality from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import PDFFile, Schema diff --git a/demos/biofabric-demo-matching.ipynb b/demos/biofabric-demo-matching.ipynb index 03517d8e6..b962abc37 100644 --- a/demos/biofabric-demo-matching.ipynb +++ b/demos/biofabric-demo-matching.ipynb @@ -10,6 +10,7 @@ "import os\n", "\n", "import pandas as pd # type: ignore\n", + "\n", "from palimpzest.constants import Cardinality\n", "from palimpzest.core.lib.fields import Field\n", "from palimpzest.core.lib.schemas import Schema, Table, XLSFile\n", diff --git a/demos/demo_core.py b/demos/demo_core.py index 8fddedcbe..f50c321d1 100644 --- a/demos/demo_core.py +++ b/demos/demo_core.py @@ -3,13 +3,14 @@ import os import pandas as pd +from tabulate import tabulate + from palimpzest.core.elements.groupbysig import GroupBySig from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import ImageFile, Number, PDFFile, TextFile from palimpzest.query import Execute from palimpzest.sets import Dataset -from tabulate import tabulate class ScientificPaper(PDFFile): diff --git a/demos/fever-demo.py b/demos/fever-demo.py index 1cd249704..d984ab524 100644 --- a/demos/fever-demo.py +++ b/demos/fever-demo.py @@ -4,6 +4,8 @@ import random from pathlib import Path +from ragatouille import RAGPretrainedModel + from palimpzest.constants import Model, OptimizationStrategy from palimpzest.core.data.datasources import ValidationDataSource from palimpzest.core.elements.records import DataRecord @@ -21,7 +23,6 @@ ) from palimpzest.sets import Dataset from palimpzest.utils.model_helpers import get_models -from ragatouille import RAGPretrainedModel class FeverClaimsSchema(Schema): diff --git a/demos/image-demo.py b/demos/image-demo.py index 54458f449..dcebfd74a 100644 --- a/demos/image-demo.py +++ b/demos/image-demo.py @@ -7,13 +7,14 @@ import gradio as gr import numpy as np +from PIL import Image + from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import ImageFile from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.policy import MaxQuality from palimpzest.query import Execute, NoSentinelSequentialSingleThreadExecution from palimpzest.sets import Dataset -from PIL import Image if not os.environ.get("OPENAI_API_KEY"): from palimpzest.utils.env_helpers import load_env diff --git a/demos/optimizer-demo.py b/demos/optimizer-demo.py index c1e773b76..b9d899a94 100644 --- a/demos/optimizer-demo.py +++ b/demos/optimizer-demo.py @@ -6,6 +6,8 @@ from pathlib import Path import datasets +from ragatouille import RAGPretrainedModel + from palimpzest.constants import Model, OptimizationStrategy from palimpzest.core.data.datasources import ValidationDataSource from palimpzest.core.elements.records import DataRecord @@ -25,7 +27,6 @@ ) from palimpzest.sets import Dataset from palimpzest.utils.model_helpers import get_models -from ragatouille import RAGPretrainedModel # Addresses far from MIT; we use a simple lookup 
like this to make the # experiments re-producible w/out needed a Google API key for geocoding lookups diff --git a/demos/paper-demo.py b/demos/paper-demo.py index 8c00d88c2..97c04ced6 100644 --- a/demos/paper-demo.py +++ b/demos/paper-demo.py @@ -5,22 +5,18 @@ import gradio as gr import numpy as np -from palimpzest.constants import Cardinality, OptimizationStrategy +from PIL import Image + +from palimpzest.constants import Cardinality from palimpzest.core.data.datasources import UserSource from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.fields import BooleanField, Field, ImageFilepathField, ListField, NumericField, StringField from palimpzest.core.lib.schemas import Schema, Table, TextFile, XLSFile from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.policy import MaxQuality, MinCost, MinTime -from palimpzest.query import ( - Execute, - NoSentinelPipelinedParallelExecution, - NoSentinelPipelinedSingleThreadExecution, - NoSentinelSequentialSingleThreadExecution, -) +from palimpzest.query.processor.config import QueryProcessorConfig from palimpzest.sets import Dataset from palimpzest.utils.udfs import xls_to_tables -from PIL import Image # Addresses far from MIT; we use a simple lookup like this to make the # experiments re-producible w/out needed a Google API key for geocoding lookups @@ -210,18 +206,6 @@ def get_item(self, idx: int): print("Policy not supported for this demo") exit(1) - execution_engine = None - executor = args.executor - if executor == "sequential": - execution_engine = NoSentinelSequentialSingleThreadExecution - elif executor == "pipelined": - execution_engine = NoSentinelPipelinedSingleThreadExecution - elif executor == "parallel": - execution_engine = NoSentinelPipelinedParallelExecution - else: - print("Executor not supported for this demo") - exit(1) - if os.getenv("OPENAI_API_KEY") is None and os.getenv("TOGETHER_API_KEY") is None: print("WARNING: Both OPENAI_API_KEY and TOGETHER_API_KEY are unset") @@ -261,15 +245,25 @@ def get_item(self, idx: int): plan = plan.filter("The rows of the table contain the patient age") plan = plan.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY) - # execute pz plan - records, execution_stats = Execute( - plan, - policy, - nocache=True, - optimization_strategy=OptimizationStrategy.PARETO, - execution_engine=execution_engine, - verbose=verbose, - ) + + config = QueryProcessorConfig(nocache=True, policy=policy, max_workers=10) + # # Option1: Create a basic processor + # # We could pass this process around to different service if needed. 
+ # from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory + # processor = QueryProcessorFactory.create_processor( + # datasource=plan, + # processing_strategy="no_sentinel", + # execution_strategy="sequential", + # optimizer_strategy="pareto", + # config=config + # ) + # records, execution_stats = processor.execute() + + # Option2: Use the new interface + records, execution_stats = plan.run(config, + optimizer_strategy="pareto", + execution_strategy="sequential", + processing_strategy="no_sentinel") # save statistics if profile: diff --git a/demos/simple-demo.ipynb b/demos/simple-demo.ipynb index 84924cd5d..d9f3d0267 100644 --- a/demos/simple-demo.ipynb +++ b/demos/simple-demo.ipynb @@ -10,6 +10,7 @@ "# [Cell 1] - Imports\n", "from demo_core import execute_task, format_results_table\n", "from IPython.display import HTML, display\n", + "\n", "from palimpzest.policy import MinCost\n", "from palimpzest.query import (\n", " NoSentinelPipelinedParallelExecution,\n", diff --git a/demos/simple-demo.py b/demos/simple-demo.py index e9645100d..c84766dfc 100755 --- a/demos/simple-demo.py +++ b/demos/simple-demo.py @@ -4,6 +4,7 @@ import time from demo_core import execute_task, format_results_table + from palimpzest.policy import MaxQuality, MinCost, MinTime from palimpzest.query import ( NoSentinelPipelinedParallelExecution, diff --git a/pyproject.toml b/pyproject.toml index 822a27445..043b8fd05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "click>=8.1.7", "click-aliases>=1.0.4", "colorama>=0.4.6", - "fastapi~=0.100.0", + "fastapi~=0.115.0", "fuzzywuzzy>=0.18.0", "google-generativeai>=0.8.0", "gradio>=4.20.1", @@ -46,6 +46,7 @@ dependencies = [ "pyyaml>=6.0.1", "requests>=2.25", "requests-html>=0.10.0", + "ruff>=0.9.0", "scikit-learn>=1.5.2", "scipy>=1.9.0", "setuptools>=70.1.1", diff --git a/quickstart.ipynb b/quickstart.ipynb index bb49a7762..280b1eb68 100644 --- a/quickstart.ipynb +++ b/quickstart.ipynb @@ -34,6 +34,7 @@ "outputs": [], "source": [ "import os\n", + "\n", "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-api-key\"\n", "# os.environ[\"TOGETHER_API_KEY\"] = \"your-together-api-key\"" ] @@ -75,8 +76,8 @@ "outputs": [], "source": [ "import palimpzest.datamanager.datamanager as pzdm\n", - "from palimpzest.sets import Dataset\n", "from palimpzest.core.lib.schemas import TextFile\n", + "from palimpzest.sets import Dataset\n", "\n", "# Dataset registration\n", "dataset_path = \"testdata/enron-tiny\"\n", @@ -106,8 +107,9 @@ "metadata": {}, "outputs": [], "source": [ - "from palimpzest.core.lib.schemas import Schema\n", "from palimpzest.core.lib.fields import Field\n", + "from palimpzest.core.lib.schemas import Schema\n", + "\n", "\n", "class Email(Schema):\n", " \"\"\"Represents an email, which in practice is usually from a text file\"\"\"\n", @@ -204,8 +206,8 @@ } ], "source": [ + "from palimpzest.policy import MaxQuality, MinCost\n", "from palimpzest.query import Execute\n", - "from palimpzest.policy import MinCost, MaxQuality\n", "\n", "policy = MinCost()\n", "results, execution_stats = Execute(dataset, policy)" @@ -281,6 +283,7 @@ ], "source": [ "import pandas as pd\n", + "\n", "output_df = pd.DataFrame([r.to_dict() for r in results])[[\"date\",\"sender\",\"subject\"]]\n", "display(output_df)\n" ] diff --git a/ruff.toml b/ruff.toml index 3994fd7a1..c54f252f8 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,6 +1,7 @@ # Config https://docs.astral.sh/ruff/configuration/ line-length = 120 indent-width = 4 +exclude = ["*.ipynb"] 
 [lint]
 ignore = ["E501"]
diff --git a/src/cli/README.md b/src/cli/README.md
index fc1e1ff65..71f7a2591 100644
--- a/src/cli/README.md
+++ b/src/cli/README.md
@@ -132,6 +132,22 @@ name: together-conf
 parallel: true
 ```
 
+You can update an existing config using the `pz update` command (also aliased as `pz uc`):
+```bash
+$ pz update --name default --settings parallel=true,pdfprocessor=pdfplumber
+Updated config: default
+
+$ pz config
+--- default ---
+filecachedir: /some/local/filepath
+llmservice: anthropic
+name: default
+parallel: true
+pdfprocessor: pdfplumber
+```
+
+The `--name` parameter specifies which config to update, and `--settings` takes the parameters to change as a comma-separated list of `param_name=param_value` pairs.
+
 Finally, you can delete a config with the `pz rm-config` command (also aliased as `pz rmc`):
 ```bash
 $ pz rmc --name together-conf
diff --git a/src/cli/cli_main.py b/src/cli/cli_main.py
index cc7a3a903..919879fc6 100644
--- a/src/cli/cli_main.py
+++ b/src/cli/cli_main.py
@@ -325,6 +325,53 @@ def set_config(name: str) -> None:
     _print_msg(f"Set config: {name}")
 
 
+@cli.command(aliases=["uc", "update"])
+@click.option("--name", type=str, default=None, required=True, help="Name of the config to update.")
+@click.option(
+    "--settings",
+    type=str,
+    required=True,
+    help="Parameters to update in format 'param1=value1,param2=value2'. Example: 'llmservice=openai,parallel=true,pdfprocessor=pdfplumber'"
+)
+def update_config(name: str, settings: str) -> None:
+    """
+    Update multiple parameters in an existing Palimpzest config.
+
+    Parameters
+    ----------
+    name: str
+        Name of the config to update
+    settings: str
+        Comma-separated list of parameter=value pairs to update
+    """
+    from palimpzest.config import Config
+    from palimpzest.constants import PZ_DIR
+
+    # check that config exists
+    if not os.path.exists(os.path.join(PZ_DIR, f"config_{name}.yaml")):
+        raise InvalidCommandError(f"Config with name {name} does not exist.")
+
+    # load the specified config
+    config = Config(name)
+
+    # parse the settings string into a dictionary
+    try:
+        param_pairs = settings.split(',')
+        updates = {}
+        for pair in param_pairs:
+            if pair.strip() == "":
+                continue
+            param, value = pair.split('=')
+            updates[param.strip()] = value.strip()
+    except Exception as e:
+        raise InvalidCommandError("Invalid settings format. Use: param1=value1,param2=value2") from e
+
+    # update each parameter
+    for param, value in updates.items():
+        config.set(param, value)
+
+    _print_msg(f"Updated config {name} with: {updates}")
+
 def main():
     """
     Entrypoint for Palimpzest CLI tool implemented using Click.
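(Editorial note, not part of the patch: a minimal sketch of the `--settings` parsing performed by `update_config` above. Based only on the code shown, each parsed value reaches `config.set()` as a plain string, e.g. `parallel=true` yields the string `'true'` rather than a boolean.)

```python
# Illustrative sketch mirroring the settings parsing in update_config above.
settings = "llmservice=openai,parallel=true,pdfprocessor=pdfplumber"

updates = {}
for pair in settings.split(","):
    if pair.strip() == "":
        continue
    param, value = pair.split("=")
    updates[param.strip()] = value.strip()

print(updates)
# -> {'llmservice': 'openai', 'parallel': 'true', 'pdfprocessor': 'pdfplumber'}
```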
@@ -339,4 +386,5 @@ def main(): cli.add_command(create_config) cli.add_command(rm_config) cli.add_command(set_config) + cli.add_command(update_config) cli() diff --git a/src/palimpzest/__init__.py b/src/palimpzest/__init__.py index 4f53846ae..28dda3d57 100644 --- a/src/palimpzest/__init__.py +++ b/src/palimpzest/__init__.py @@ -1,4 +1,4 @@ -from palimpzest.constants import MAX_ROWS, Cardinality, OptimizationStrategy +from palimpzest.constants import MAX_ROWS, Cardinality # Dataset functionality #from palimpzest.sets import Dataset @@ -20,7 +20,6 @@ # constants "MAX_ROWS", "Cardinality", - "OptimizationStrategy", # datamanager "DataDirectory", # policy diff --git a/src/palimpzest/constants.py b/src/palimpzest/constants.py index 1596e7b7f..f4e5f0419 100644 --- a/src/palimpzest/constants.py +++ b/src/palimpzest/constants.py @@ -36,18 +36,6 @@ class PromptStrategy(str, Enum): COT_MOA_AGG = "chain-of-thought-mixture-of-agents-aggregation" -class OptimizationStrategy(str, Enum): - """ - OptimizationStrategy determines which (set of) plan(s) the Optimizer - will return to the Execution layer. - """ - GREEDY = "greedy" - CONFIDENCE_INTERVAL = "confidence-interval" - PARETO = "pareto" - SENTINEL = "sentinel" - NONE = "none" - - class AggFunc(str, Enum): COUNT = "count" AVERAGE = "average" diff --git a/src/palimpzest/core/data/datasources.py b/src/palimpzest/core/data/datasources.py index e1a1627eb..c9e809ba0 100644 --- a/src/palimpzest/core/data/datasources.py +++ b/src/palimpzest/core/data/datasources.py @@ -151,7 +151,6 @@ def __init__(self, vals: Any, dataset_id: str = "default_memory_input"): self.vals = list(vals) else: self.vals = vals - schema = Schema.from_df(self.vals) if isinstance(self.vals, pd.DataFrame) else DefaultSchema super().__init__(schema, dataset_id) @@ -166,7 +165,6 @@ def get_size(self): def get_item(self, idx: int) -> DataRecord: dr = DataRecord(self.schema, source_id=idx) - if isinstance(self.vals, pd.DataFrame): row = self.vals.iloc[idx] for field_name in row.index: diff --git a/src/palimpzest/core/elements/records.py b/src/palimpzest/core/elements/records.py index 645116a80..aa7f3c752 100644 --- a/src/palimpzest/core/elements/records.py +++ b/src/palimpzest/core/elements/records.py @@ -5,11 +5,11 @@ import pandas as pd -from palimpzest.constants import DERIVED_SCHEMA_PREFIX, FROM_DF_PREFIX +from palimpzest.constants import FROM_DF_PREFIX from palimpzest.core.data.dataclasses import RecordOpStats from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import Schema -from palimpzest.utils.hash_helpers import hash_for_id, hash_for_temp_schema +from palimpzest.utils.hash_helpers import hash_for_id class DataRecord: diff --git a/src/palimpzest/core/lib/schemas.py b/src/palimpzest/core/lib/schemas.py index 7ae6ea3c1..612b81c2e 100644 --- a/src/palimpzest/core/lib/schemas.py +++ b/src/palimpzest/core/lib/schemas.py @@ -5,7 +5,9 @@ import numpy as np from typing import Any as TypingAny -from palimpzest.constants import MAX_ROWS +import pandas as pd + +from palimpzest.constants import DERIVED_SCHEMA_PREFIX, MAX_ROWS from palimpzest.core.lib.fields import ( BooleanField, BytesField, @@ -276,6 +278,38 @@ def class_name(cls) -> str: """Return the name of this class""" return cls.__name__ + @staticmethod + def from_df(df: pd.DataFrame) -> Schema: + # Create a unique schema name based on columns + schema_name = f"{DERIVED_SCHEMA_PREFIX}{hash_for_temp_schema(str(tuple(sorted(df.columns))))}" + + # consider to save to temp file and load from there + if schema_name 
in globals(): + return globals()[schema_name] + + # NOTE: we will not be able to infer more complicated types like ImageFilepathField + # without some input from the user + # construct attributes for schema (i.e. its fields and metadata) + desc = "Schema derived from DataFrame" + attributes = {"_desc": desc, "__doc__": desc, "__module__": Schema.__module__} + for col, dtype in zip(df.columns, df.dtypes): + if dtype == "object": + attributes[col] = StringField(desc=col) + elif dtype == "bool": + attributes[col] = BooleanField(desc=col) + elif dtype == "int64": + attributes[col] = IntField(desc=col) + elif dtype == "float64": + attributes[col] = FloatField(desc=col) + else: + attributes[col] = Field(desc=col) + + # Create new schema only if it doesn't exist + new_schema = type(schema_name, (Schema,), attributes) + + # Store the schema class globally + globals()[schema_name] = new_schema + return new_schema ################################################################################### # "Core" useful Schemas. These are Schemas that almost everyone will need. @@ -289,7 +323,6 @@ class DefaultSchema(Schema): value = Field(desc="The context data.") - class Download(Schema): """A download is a URL and the contents of the download.""" diff --git a/src/palimpzest/datamanager/datamanager.py b/src/palimpzest/datamanager/datamanager.py index 8290d5960..c4dccf83c 100644 --- a/src/palimpzest/datamanager/datamanager.py +++ b/src/palimpzest/datamanager/datamanager.py @@ -3,6 +3,7 @@ from threading import Lock import pandas as pd import yaml + from palimpzest import constants from palimpzest.config import Config from palimpzest.constants import PZ_DIR, DEFAULT_DATASET_ID_CHARS, MAX_DATASET_ID_CHARS diff --git a/src/palimpzest/policy.py b/src/palimpzest/policy.py index 9f8c20399..ba2d273f0 100644 --- a/src/palimpzest/policy.py +++ b/src/palimpzest/policy.py @@ -1,5 +1,5 @@ from palimpzest.core.data.dataclasses import PlanCost - +import json class Policy: """ @@ -44,6 +44,12 @@ def choose(self, plan: PlanCost, other_plan: PlanCost) -> float: """ raise NotImplementedError("Calling this method from an abstract base class.") + def to_json_str(self) -> str: + """Convert policy configuration to a JSON-serializable dictionary.""" + return json.dumps({ + "type": self.__class__.__name__, + "config": self.get_dict() + }, indent=2) class MaxQuality(Policy): """ diff --git a/src/palimpzest/query/__init__.py b/src/palimpzest/query/__init__.py index 321641138..1eace9046 100644 --- a/src/palimpzest/query/__init__.py +++ b/src/palimpzest/query/__init__.py @@ -1,18 +1,3 @@ -from palimpzest.query.execution.execute import Execute -from palimpzest.query.execution.mab_sentinel_execution import ( - MABSequentialParallelSentinelExecution, - MABSequentialSingleThreadSentinelExecution, -) -from palimpzest.query.execution.nosentinel_execution import ( - NoSentinelPipelinedParallelExecution, - NoSentinelPipelinedSingleThreadExecution, - NoSentinelSequentialSingleThreadExecution, -) -from palimpzest.query.execution.random_sampling_sentinel_execution import ( - RandomSamplingSequentialParallelSentinelExecution, - RandomSamplingSequentialSingleThreadSentinelExecution, -) -from palimpzest.query.execution.streaming_execution import StreamingSequentialExecution from palimpzest.query.operators.aggregate import AggregateOp, ApplyGroupByOp, AverageAggregateOp, CountAggregateOp from palimpzest.query.operators.convert import ( ConvertOp, @@ -37,22 +22,34 @@ RetrieveScan, ) from palimpzest.query.operators.physical import PhysicalOperator 
+from palimpzest.query.processor.mab_sentinel_processor import ( + MABSentinelPipelinedParallelProcessor, + MABSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.nosentinel_processor import ( + NoSentinelPipelinedParallelProcessor, + NoSentinelPipelinedSinglelProcessor, + NoSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.random_sampling_sentinel_processor import ( + RandomSamplingSentinelPipelinedProcessor, + RandomSamplingSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor __all__ = [ - # execute - "Execute", - # mab_sentinel_execution - "MABSequentialParallelSentinelExecution", - "MABSequentialSingleThreadSentinelExecution", - # nosentinel_execution - "NoSentinelPipelinedParallelExecution", - "NoSentinelPipelinedSingleThreadExecution", - "NoSentinelSequentialSingleThreadExecution", - # random_sampling_sentinel_execution - "RandomSamplingSequentialParallelSentinelExecution", - "RandomSamplingSequentialSingleThreadSentinelExecution", - # streaming_execution - "StreamingSequentialExecution", + # mab_sentinel_processor + "MABSentinelPipelinedParallelProcessor", + "MABSentinelSequentialSingleThreadProcessor", + # nosentinel_processor + "NoSentinelPipelinedParallelProcessor", + "NoSentinelPipelinedSinglelProcessor", + "NoSentinelSequentialSingleThreadProcessor", + # random_sampling_sentinel_processor + "RandomSamplingSentinelPipelinedProcessor", + "RandomSamplingSentinelSequentialSingleThreadProcessor", + # streaming_processor + "StreamingQueryProcessor", # aggregate "AggregateOp", "ApplyGroupByOp", diff --git a/src/palimpzest/query/execution/execute.py b/src/palimpzest/query/execution/execute.py deleted file mode 100644 index 94aeef36c..000000000 --- a/src/palimpzest/query/execution/execute.py +++ /dev/null @@ -1,67 +0,0 @@ -from palimpzest.constants import Model, OptimizationStrategy -from palimpzest.core.data.datasources import DataSource -from palimpzest.datamanager.datamanager import DataDirectory -from palimpzest.policy import Policy -from palimpzest.query.execution.execution_engine import ExecutionEngine -from palimpzest.query.execution.nosentinel_execution import NoSentinelSequentialSingleThreadExecution -from palimpzest.sets import Set - - -class Execute: - @classmethod - def get_datasource(cls, dataset: Set | DataSource) -> str: - """ - Gets the DataSource for the given dataset. 
- """ - # iterate until we reach DataSource - while isinstance(dataset, Set): - dataset = dataset._source - - # this will throw an exception if datasource is not registered with PZ - return DataDirectory().get_registered_dataset(dataset.dataset_id) - - def __new__( - cls, - dataset: Set, - policy: Policy, - num_samples: int = 20, - nocache: bool = True, - include_baselines: bool = False, - min_plans: int | None = None, - max_workers: int = 1, - verbose: bool = False, - available_models: list[Model] | None = None, - allow_bonded_query: bool = True, - allow_conventional_query: bool = False, - allow_model_selection: bool = True, - allow_code_synth: bool = False, - allow_token_reduction: bool = False, - allow_rag_reduction: bool = True, - allow_mixtures: bool = True, - optimization_strategy: OptimizationStrategy = OptimizationStrategy.PARETO, - execution_engine: ExecutionEngine = NoSentinelSequentialSingleThreadExecution, - *args, - **kwargs, - ): - if available_models is None: - available_models = [] - return execution_engine( - *args, - **kwargs, - datasource=cls.get_datasource(dataset), - num_samples=num_samples, - nocache=nocache, - include_baselines=include_baselines, - min_plans=min_plans, - max_workers=max_workers, - verbose=verbose, - available_models=available_models, - allow_bonded_query=allow_bonded_query, - allow_conventional_query=allow_conventional_query, - allow_code_synth=allow_code_synth, - allow_model_selection=allow_model_selection, - allow_token_reduction=allow_token_reduction, - allow_rag_reduction=allow_rag_reduction, - allow_mixtures=allow_mixtures, - optimization_strategy=optimization_strategy, - ).execute(dataset=dataset, policy=policy) diff --git a/src/palimpzest/query/execution/execution_strategy.py b/src/palimpzest/query/execution/execution_strategy.py new file mode 100644 index 000000000..1edbfc423 --- /dev/null +++ b/src/palimpzest/query/execution/execution_strategy.py @@ -0,0 +1,77 @@ +import time +from abc import ABC, abstractmethod +from enum import Enum + +from palimpzest.core.data.dataclasses import ExecutionStats, PlanStats +from palimpzest.core.elements.records import DataRecord +from palimpzest.datamanager.datamanager import DataDirectory +from palimpzest.query.optimizer.plan import PhysicalPlan + + +class ExecutionStrategyType(str, Enum): + """Available execution strategy types""" + SEQUENTIAL = "sequential" + PIPELINED_SINGLE_THREAD = "pipelined" + PIPELINED_PARALLEL = "pipelined_parallel" + AUTO = "auto" + + +class ExecutionStrategy(ABC): + """ + Base strategy for executing query plans. + Defines how to execute a single plan. 
+ """ + def __init__(self, + scan_start_idx: int = 0, + datadir: DataDirectory | None = None, + max_workers: int | None = None, + nocache: bool = True, + verbose: bool = False): + self.scan_start_idx = scan_start_idx + self.datadir = datadir + self.nocache = nocache + self.verbose = verbose + self.max_workers = max_workers + self.execution_stats = [] + + + @abstractmethod + def execute_plan( + self, + plan: PhysicalPlan, + num_samples: int | float = float("inf"), + workers: int = 1 + ) -> tuple[list[DataRecord], PlanStats]: + """Execute a single plan according to strategy""" + pass + + + @abstractmethod + def _should_stop_execution( + self, + records: list[DataRecord], + plan_stats: list[PlanStats] + ) -> bool: + """Override to implement early stopping logic""" + return False + + def _create_execution_stats( + self, + plan_stats: list[PlanStats], + start_time: float + ) -> ExecutionStats: + """Create execution statistics""" + return ExecutionStats( + execution_id=f"exec_{int(start_time)}", + plan_stats={ps.plan_id: ps for ps in plan_stats}, + total_execution_time=time.time() - start_time, + total_execution_cost=sum(ps.total_cost for ps in plan_stats) + ) + + def _should_stop_execution( + self, + records: list[DataRecord], + plan_stats: list[PlanStats] + ) -> bool: + """Override to implement early stopping logic""" + return False diff --git a/src/palimpzest/query/execution/plan_executors/parallel_plan_execution.py b/src/palimpzest/query/execution/parallel_execution_strategy.py similarity index 86% rename from src/palimpzest/query/execution/plan_executors/parallel_plan_execution.py rename to src/palimpzest/query/execution/parallel_execution_strategy.py index 64d9af1cf..c8e4f6841 100644 --- a/src/palimpzest/query/execution/plan_executors/parallel_plan_execution.py +++ b/src/palimpzest/query/execution/parallel_execution_strategy.py @@ -1,3 +1,4 @@ +import multiprocessing import time from concurrent.futures import ThreadPoolExecutor, wait @@ -5,19 +6,17 @@ from palimpzest.core.data.dataclasses import OperatorStats, PlanStats from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import SourceRecord -from palimpzest.query.execution.execution_engine import ExecutionEngine +from palimpzest.query.execution.execution_strategy import ExecutionStrategy from palimpzest.query.operators.aggregate import AggregateOp -from palimpzest.query.operators.datasource import MarshalAndScanDataOp +from palimpzest.query.operators.datasource import DataSourcePhysicalOp from palimpzest.query.operators.limit import LimitScanOp from palimpzest.query.operators.physical import PhysicalOperator from palimpzest.query.optimizer.plan import PhysicalPlan -class PipelinedParallelPlanExecutor(ExecutionEngine): +class PipelinedParallelExecutionStrategy(ExecutionStrategy): """ - This class implements the abstract execute_plan() method from the ExecutionEngine. - This class still needs to be sub-classed by another Execution class which implements - the higher-level execute() method. + A parallel execution strategy that processes data through a pipeline of operators using thread-based parallelism. 
""" def __init__(self, *args, **kwargs): @@ -28,8 +27,7 @@ def __init__(self, *args, **kwargs): else self.max_workers ) - @staticmethod - def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: + def execute_op_wrapper(self, operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: """ Wrapper function around operator execution which also and returns the operator. This is useful in the parallel setting(s) where operators are executed by a worker pool, @@ -38,6 +36,16 @@ def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[D record_set = operator(op_input) return record_set, operator + + def get_parallel_max_workers(self): + # for now, return the number of system CPUs; + # in the future, we may want to consider the models the user has access to + # and whether or not they will encounter rate-limits. If they will, we should + # set the max workers in a manner that is designed to avoid hitting them. + # Doing this "right" may require considering their logical, physical plan, + # and tier status with LLM providers. It may also be worth dynamically + # changing the max_workers in response to 429 errors. + return max(int(0.8 * multiprocessing.cpu_count()), 1) def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1): """Initialize the stats and the execute the plan.""" @@ -76,12 +84,9 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf # get handle to DataSource and pre-compute its op_id and size source_operator = plan.operators[0] + assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" source_op_id = source_operator.get_op_id() - datasource = ( - self.datadir.get_registered_dataset(source_operator.dataset_id) - if isinstance(source_operator, MarshalAndScanDataOp) - else self.datadir.get_cached_result(source_operator.dataset_id) - ) + datasource = source_operator.get_datasource() datasource_len = len(datasource) # get limit of final limit operator (if one exists) @@ -98,7 +103,7 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx) candidate.idx = current_scan_idx candidate.get_item_fn = datasource.get_item - futures.append(executor.submit(PipelinedParallelPlanExecutor.execute_op_wrapper, source_operator, candidate)) + futures.append(executor.submit(self.execute_op_wrapper, source_operator, candidate)) op_id_to_futures_in_flight[source_op_id] += 1 current_scan_idx += 1 @@ -157,7 +162,7 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx) candidate.idx = current_scan_idx candidate.get_item_fn = datasource.get_item - new_futures.append(executor.submit(PipelinedParallelPlanExecutor.execute_op_wrapper, source_operator, candidate)) + new_futures.append(executor.submit(self.execute_op_wrapper, source_operator, candidate)) op_id_to_futures_in_flight[source_op_id] += 1 current_scan_idx += 1 @@ -172,7 +177,7 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf for operator, candidate in processing_queue: # if the candidate is not an input to an aggregate, execute it right away if not isinstance(operator, AggregateOp): - future = 
executor.submit(PipelinedParallelPlanExecutor.execute_op_wrapper, operator, candidate) + future = executor.submit(self.execute_op_wrapper, operator, candidate) new_futures.append(future) op_id_to_futures_in_flight[operator.get_op_id()] += 1 @@ -208,7 +213,7 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf if upstream_ops_are_finished: operator = op_id_to_operator[agg_op_id] candidates = list(map(lambda tup: tup[1], candidate_tuples)) - future = executor.submit(PipelinedParallelPlanExecutor.execute_op_wrapper, operator, candidates) + future = executor.submit(self.execute_op_wrapper, operator, candidates) new_futures.append(future) op_id_to_futures_in_flight[operator.get_op_id()] += 1 diff --git a/src/palimpzest/query/execution/plan_executors/single_threaded_plan_execution.py b/src/palimpzest/query/execution/single_threaded_execution_strategy.py similarity index 89% rename from src/palimpzest/query/execution/plan_executors/single_threaded_plan_execution.py rename to src/palimpzest/query/execution/single_threaded_execution_strategy.py index 08a3b9f4f..3cb237992 100644 --- a/src/palimpzest/query/execution/plan_executors/single_threaded_plan_execution.py +++ b/src/palimpzest/query/execution/single_threaded_execution_strategy.py @@ -3,19 +3,23 @@ from palimpzest.core.data.dataclasses import OperatorStats, PlanStats from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.schemas import SourceRecord -from palimpzest.query.execution.execution_engine import ExecutionEngine +from palimpzest.query.execution.execution_strategy import ExecutionStrategy from palimpzest.query.operators.aggregate import AggregateOp -from palimpzest.query.operators.datasource import DataSourcePhysicalOp, MarshalAndScanDataOp +from palimpzest.query.operators.datasource import DataSourcePhysicalOp from palimpzest.query.operators.filter import FilterOp from palimpzest.query.operators.limit import LimitScanOp from palimpzest.query.optimizer.plan import PhysicalPlan -class SequentialSingleThreadPlanExecutor(ExecutionEngine): +class SequentialSingleThreadExecutionStrategy(ExecutionStrategy): """ - This class implements the abstract execute_plan() method from the ExecutionEngine. - This class still needs to be sub-classed by another Execution class which implements - the higher-level execute() method. + A single-threaded execution strategy that processes operators sequentially. + + This strategy processes all records through one operator completely before moving to the next operator + in the execution plan. For example, if we have operators A -> B -> C and records [1,2,3]: + 1. First processes records [1,2,3] through operator A + 2. Then takes A's output and processes all of it through operator B + 3. 
Finally processes all of B's output through operator C """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -44,11 +48,8 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf # get handle to DataSource and pre-compute its size source_operator = plan.operators[0] - datasource = ( - self.datadir.get_registered_dataset(source_operator.dataset_id) - if isinstance(source_operator, MarshalAndScanDataOp) - else self.datadir.get_cached_result(source_operator.dataset_id) - ) + assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" + datasource = source_operator.get_datasource() datasource_len = len(datasource) # initialize processing queues for each operation @@ -139,12 +140,20 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf return output_records, plan_stats -class PipelinedSingleThreadPlanExecutor(ExecutionEngine): +class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy): """ - This class implements the abstract execute_plan() method from the ExecutionEngine. - This class still needs to be sub-classed by another Execution class which implements - the higher-level execute() method. + A single-threaded execution strategy that processes records through a pipeline of operators. + + This strategy implements a pipelined execution model where each record flows through + the entire operator chain before the next record is processed. + + Example Flow: + For operators A -> B -> C and records [1,2,3]: + 1. Record 1: A -> B -> C + 2. Record 2: A -> B -> C + 3. Record 3: A -> B -> C """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.max_workers = 1 if self.max_workers is None else self.max_workers @@ -174,11 +183,8 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf # get handle to DataSource and pre-compute its size source_operator = plan.operators[0] - datasource = ( - self.datadir.get_registered_dataset(source_operator.dataset_id) - if isinstance(source_operator, MarshalAndScanDataOp) - else self.datadir.get_cached_result(source_operator.dataset_id) - ) + assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" + datasource = source_operator.get_datasource() datasource_len = len(datasource) # initialize processing queues for each operation diff --git a/src/palimpzest/query/operators/datasource.py b/src/palimpzest/query/operators/datasource.py index 49b6122b7..f3ce18c5d 100644 --- a/src/palimpzest/query/operators/datasource.py +++ b/src/palimpzest/query/operators/datasource.py @@ -1,6 +1,7 @@ from __future__ import annotations import time +from abc import ABC, abstractmethod from palimpzest.constants import ( LOCAL_SCAN_TIME_PER_KB, @@ -12,7 +13,7 @@ from palimpzest.query.operators.physical import PhysicalOperator -class DataSourcePhysicalOp(PhysicalOperator): +class DataSourcePhysicalOp(PhysicalOperator, ABC): """ Physical operators which implement DataSources require slightly more information in order to accurately compute naive cost estimates. Thus, we use a slightly @@ -57,7 +58,12 @@ def naive_cost_estimates( at least ballpark correct estimates of this quantity). 
""" raise NotImplementedError("Abstract method") - + + # TODO: we need to revisit this to make get_datasource() unified for DataScan operators + @abstractmethod + def get_datasource(self): + raise NotImplementedError("Abstract method") + class MarshalAndScanDataOp(DataSourcePhysicalOp): def naive_cost_estimates( @@ -127,6 +133,12 @@ def __call__(self, candidate: DataRecord) -> DataRecordSet: return record_set + def get_datasource(self): + return self.datadir.get_registered_dataset(self.dataset_id) + + def get_datasource_type(self): + return self.datadir.get_registered_dataset_type(self.dataset_id) + class CacheScanDataOp(DataSourcePhysicalOp): def naive_cost_estimates( @@ -186,3 +198,6 @@ def __call__(self, candidate: DataRecord) -> DataRecordSet: record_set = DataRecordSet(records, record_op_stats_lst) return record_set + + def get_datasource(self): + return self.datadir.get_cached_result(self.dataset_id) diff --git a/src/palimpzest/query/optimizer/cost_model.py b/src/palimpzest/query/optimizer/cost_model.py index 4e5dfdbf5..ea54caf61 100644 --- a/src/palimpzest/query/optimizer/cost_model.py +++ b/src/palimpzest/query/optimizer/cost_model.py @@ -192,7 +192,7 @@ def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCost # create source_op_estimates for datasources if they are not provided if isinstance(operator, DataSourcePhysicalOp): # get handle to DataSource and pre-compute its size (number of records) - datasource = self.datadir.get_registered_dataset(operator.dataset_id) + datasource = operator.get_datasource() datasource_len = len(datasource) source_op_estimates = OperatorCostEstimates( @@ -611,8 +611,8 @@ def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCost # initialize estimates of operator metrics based on naive (but sometimes precise) logic if isinstance(operator, MarshalAndScanDataOp): # get handle to DataSource and pre-compute its size (number of records) - datasource = self.datadir.get_registered_dataset(operator.dataset_id) - dataset_type = self.datadir.get_registered_dataset_type(operator.dataset_id) + datasource = operator.get_datasource() + dataset_type = operator.get_datasource_type() datasource_len = len(datasource) datasource_memsize = datasource.get_size() diff --git a/src/palimpzest/query/optimizer/optimizer.py b/src/palimpzest/query/optimizer/optimizer.py index 6218d8412..ac95fbf08 100644 --- a/src/palimpzest/query/optimizer/optimizer.py +++ b/src/palimpzest/query/optimizer/optimizer.py @@ -2,7 +2,7 @@ from copy import deepcopy -from palimpzest.constants import Model, OptimizationStrategy +from palimpzest.constants import Model from palimpzest.core.data.datasources import DataSource from palimpzest.core.lib.fields import Field from palimpzest.datamanager.datamanager import DataDirectory @@ -24,7 +24,11 @@ TRANSFORMATION_RULES, ) from palimpzest.query.optimizer.cost_model import CostModel -from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan +from palimpzest.query.optimizer.optimizer_strategy import ( + OptimizationStrategyType, + OptimizerStrategyRegistry, +) +from palimpzest.query.optimizer.plan import PhysicalPlan from palimpzest.query.optimizer.primitives import Group, LogicalExpression from palimpzest.query.optimizer.rules import ( CodeSynthesisConvertRule, @@ -83,11 +87,11 @@ def __init__( allow_token_reduction: bool = False, allow_rag_reduction: bool = True, allow_mixtures: bool = True, - optimization_strategy: OptimizationStrategy = OptimizationStrategy.PARETO, + optimization_strategy_type: 
OptimizationStrategyType = OptimizationStrategyType.PARETO, use_final_op_quality: bool = False, # TODO: make this func(plan) -> final_quality ): # store the policy - if available_models is None: + if available_models is None or len(available_models) == 0: available_models = [] self.policy = policy @@ -110,13 +114,15 @@ def __init__( self.implementation_rules = IMPLEMENTATION_RULES self.transformation_rules = TRANSFORMATION_RULES + self.strategy = OptimizerStrategyRegistry.get_strategy(optimization_strategy_type.value) + # if we are doing SENTINEL / NONE optimization; remove transformation rules - if optimization_strategy in [OptimizationStrategy.SENTINEL, OptimizationStrategy.NONE]: + if optimization_strategy_type in [OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE]: self.transformation_rules = [] # if we are not performing optimization, set available models to be single model # and remove all optimizations (except for bonded queries) - if optimization_strategy == OptimizationStrategy.NONE: + if optimization_strategy_type == OptimizationStrategyType.NONE: self.allow_bonded_query = True self.allow_conventional_query = False self.allow_code_synth = False @@ -135,7 +141,7 @@ def __init__( self.allow_token_reduction = allow_token_reduction self.allow_rag_reduction = allow_rag_reduction self.allow_mixtures = allow_mixtures - self.optimization_strategy = optimization_strategy + self.optimization_strategy_type = optimization_strategy_type self.use_final_op_quality = use_final_op_quality # prune implementation rules based on boolean flags @@ -177,6 +183,7 @@ def __init__( def update_cost_model(self, cost_model: CostModel): self.cost_model = cost_model + def get_physical_op_params(self): return { "verbose": self.verbose, @@ -186,6 +193,23 @@ def get_physical_op_params(self): "conventional_fallback_model": get_conventional_fallback_model(self.available_models), } + + def deepcopy_clean_optimizer(self): + optimizer = Optimizer( + policy=self.policy, + cost_model=CostModel(), + no_cache=self.no_cache, + verbose=self.verbose, + available_models=self.available_models, + allow_bonded_query=self.allow_bonded_query, + allow_conventional_query=self.allow_conventional_query, + allow_code_synth=self.allow_code_synth, + allow_token_reduction=self.allow_token_reduction, + optimization_strategy_type=self.optimization_strategy_type, + use_final_op_quality=self.use_final_op_quality, + ) + return optimizer + def construct_group_tree(self, dataset_nodes: list[Set]) -> tuple[list[int], dict[str, Field], dict[str, set[str]]]: # get node, output_schema, and input_schema(if applicable) node = dataset_nodes[-1] @@ -408,142 +432,11 @@ def search_optimization_space(self, group_id: int) -> None: context = {"costed_phys_op_ids": self.costed_phys_op_ids} new_tasks = task.perform(self.groups, self.expressions, context=context, **self.get_physical_op_params()) elif isinstance(task, OptimizePhysicalExpression): - context = {"optimization_strategy": self.optimization_strategy} + context = {"optimization_strategy_type": self.optimization_strategy_type} new_tasks = task.perform(self.cost_model, self.groups, self.policy, context=context) self.tasks_stack.extend(new_tasks) - def get_sentinel_plan(self, group_id: int) -> SentinelPlan: - """ - Create and return a SentinelPlan object. - """ - # get all the physical expressions for this group - phys_exprs = self.groups[group_id].physical_expressions - phys_op_set = [expr.operator for expr in phys_exprs] - - # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), - # create and return the physical plan - best_phys_expr = self.groups[group_id].best_physical_expression - if len(best_phys_expr.input_group_ids) == 0: - return SentinelPlan(operator_sets=[phys_op_set]) - - # TODO: need to handle joins - # get the best physical plan(s) for this group's inputs - best_phys_subplan = SentinelPlan(operator_sets=[]) - for input_group_id in best_phys_expr.input_group_ids: - input_best_phys_plan = self.get_sentinel_plan(input_group_id) - best_phys_subplan = SentinelPlan.from_ops_and_sub_plan(best_phys_subplan.operator_sets, input_best_phys_plan) - - # add this operator set to best physical plan and return - return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan) - - def get_greedy_physical_plan(self, group_id: int) -> PhysicalPlan: - """ - Return the best plan with respect to the user provided policy. - """ - # get the best physical expression for this group - best_phys_expr = self.groups[group_id].best_physical_expression - - # if this expression has no inputs (i.e. it is a BaseScan or CacheScan), - # create and return the physical plan - if len(best_phys_expr.input_group_ids) == 0: - return PhysicalPlan(operators=[best_phys_expr.operator], plan_cost=best_phys_expr.plan_cost) - - # get the best physical plan(s) for this group's inputs - input_group_id = best_phys_expr.input_group_ids[0] # TODO: need to handle joins - input_best_phys_plan = self.get_greedy_physical_plan(input_group_id) - - # add this operator to best physical plan and return - return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost) - - - def get_candidate_pareto_physical_plans(self, group_id: int, policy: Policy) -> list[PhysicalPlan]: - """ - Return a list of plans which will contain all of the pareto optimal plans (and some additional - plans which may not be pareto optimal). - - TODO: can we cache group_id --> final_pareto_optimal_plans to avoid re-computing upstream - groups' pareto-optimal plans for each expression? - """ - # get the pareto optimal physical expressions for this group - pareto_optimal_phys_exprs = self.groups[group_id].pareto_optimal_physical_expressions - - # construct list of pareto optimal plans - pareto_optimal_plans = [] - for phys_expr in pareto_optimal_phys_exprs: - # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), - # create and return the physical plan - if len(phys_expr.input_group_ids) == 0: - for plan_cost, _ in phys_expr.pareto_optimal_plan_costs: - plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=plan_cost) - pareto_optimal_plans.append(plan) - - # otherwise, get the pareto optimal physical plan(s) for this group's inputs - else: - # get the pareto optimal physical plan(s) for this group's inputs - input_group_id = phys_expr.input_group_ids[0] # TODO: need to handle joins - pareto_optimal_phys_subplans = self.get_candidate_pareto_physical_plans(input_group_id, policy) - - # iterate over the input subplans and find the one(s) which combine with this physical expression - # to make a pareto-optimal plan - for plan_cost, input_plan_cost in phys_expr.pareto_optimal_plan_costs: - for subplan in pareto_optimal_phys_subplans: - if ( - subplan.plan_cost.cost == input_plan_cost.cost - and subplan.plan_cost.time == input_plan_cost.time - and subplan.plan_cost.quality == input_plan_cost.quality - ): - # TODO: The plan_cost gets summed with subplan.plan_cost; - # am I defining expression.best_plan_cost to be the cost of that operator, - # and expression.pareto_optimal_plan_costs to be the cost(s) of the subplan including that operator? - # i.e. are my definitions inconsistent? - plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, plan_cost) - pareto_optimal_plans.append(plan) - - return pareto_optimal_plans - - def get_confidence_interval_optimal_plans(self, group_id: int) -> list[PhysicalPlan]: - """ - Return all physical plans whose upper bound on the primary policy metric is greater than the - best plan's lower bound on the primary policy metric (subject to satisfying the policy constraint). - - The OptimizePhysicalExpression task guarantees that each group's `ci_best_physical_expressions` - maintains a list of expressions with overlapping CI's on the primary policy metric (while also - satisfying the policy constraint). - - This function computes the cross-product of all such expressions across all groups. - """ - # get all the physical expressions which could be the best for this group - best_phys_exprs = self.groups[group_id].ci_best_physical_expressions - - best_plans = [] - for phys_expr in best_phys_exprs: - # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), - # create the physical plan and append it to the best_plans for this group - if len(phys_expr.input_group_ids) == 0: - plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=phys_expr.plan_cost) - best_plans.append(plan) - - # otherwise, get the best physical plan(s) for this group's inputs - else: - # TODO: need to handle joins - best_phys_subplans = [PhysicalPlan(operators=[])] - for input_group_id in phys_expr.input_group_ids: - input_best_phys_plans = self.get_confidence_interval_optimal_plans(input_group_id) - best_phys_subplans = [ - PhysicalPlan.from_ops_and_sub_plan(subplan.operators, input_subplan, subplan.plan_cost) - for subplan in best_phys_subplans - for input_subplan in input_best_phys_plans - ] - - # add this operator to best physical plan and return - for subplan in best_phys_subplans: - plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, phys_expr.plan_cost) - best_plans.append(plan) - - return best_plans - - def optimize(self, query_plan: Dataset, policy: Policy | None = None) -> list[PhysicalPlan]: """ The optimize function takes in an initial query plan and searches the space of @@ -558,42 +451,5 @@ def optimize(self, query_plan: Dataset, policy: Policy | None = None) -> list[Ph # search the optimization space by applying logical and physical transformations to the initial group tree self.search_optimization_space(final_group_id) - - # construct the optimal physical plan(s) by traversing the memo table - plans = [] - if self.optimization_strategy == OptimizationStrategy.SENTINEL: - plans = [self.get_sentinel_plan(final_group_id)] - - elif self.optimization_strategy == OptimizationStrategy.GREEDY: - plans = [self.get_greedy_physical_plan(final_group_id)] - - elif self.optimization_strategy == OptimizationStrategy.PARETO: - # compute all of the pareto optimal physical plans - plans = self.get_candidate_pareto_physical_plans(final_group_id, policy) - - # adjust plans' plan_cost.quality to reflect only the quality of the final operator - if self.use_final_op_quality: - for plan in plans: - plan.plan_cost.quality = plan.plan_cost.op_estimates.quality - - # filter pareto optimal plans for ones which satisfy policy constraint (if at least one of them does) - # import pdb; pdb.set_trace() - if any([policy.constraint(plan.plan_cost) for plan in plans]): - plans = [plan for plan in plans if policy.constraint(plan.plan_cost)] - - # select the plan which is best for the given policy - optimal_plan, plans = plans[0], plans[1:] - for plan in plans: - optimal_plan = optimal_plan if policy.choose(optimal_plan.plan_cost, plan.plan_cost) else plan - - plans = [optimal_plan] - - elif self.optimization_strategy == OptimizationStrategy.CONFIDENCE_INTERVAL: - # TODO: fix this to properly handle multiple potential plans - raise Exception("NotImplementedError") - plans = self.get_confidence_interval_optimal_plans(final_group_id) - - elif self.optimization_strategy == OptimizationStrategy.NONE: - plans = [self.get_greedy_physical_plan(final_group_id)] - - return plans + + return self.strategy.get_optimal_plans(self.groups, final_group_id, policy) diff --git a/src/palimpzest/query/optimizer/optimizer_strategy.py b/src/palimpzest/query/optimizer/optimizer_strategy.py new file mode 100644 index 000000000..373046e00 --- /dev/null +++ b/src/palimpzest/query/optimizer/optimizer_strategy.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from enum import Enum + +from palimpzest.policy import 
Policy +from palimpzest.query.optimizer.plan import PhysicalPlan, SentinelPlan + + +class OptimizationStrategyType(str, Enum): + """ + OptimizationStrategyType determines which (set of) plan(s) the Optimizer + will return to the Execution layer. + """ + GREEDY = "greedy" + CONFIDENCE_INTERVAL = "confidence-interval" + PARETO = "pareto" + SENTINEL = "sentinel" + NONE = "none" + AUTO = "auto" + + +class OptimizationStrategy(ABC): + @abstractmethod + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[PhysicalPlan | SentinelPlan]: + """Strategy decides how to search through the groups for optimal plan(s)""" + pass + + @classmethod + def get_strategy(cls, strategy_type: str) -> OptimizationStrategy: + """Factory method to create strategy instances""" + return OptimizerStrategyRegistry.get_strategy(strategy_type) + + +class GreedyStrategy(OptimizationStrategy): + def _get_greedy_physical_plan(self, groups: dict, group_id: int) -> list[PhysicalPlan]: + """ + Return the best plan with respect to the user provided policy. + """ + # get the best physical expression for this group + best_phys_expr = groups[group_id].best_physical_expression + + # if this expression has no inputs (i.e. it is a BaseScan or CacheScan), + # create and return the physical plan + if len(best_phys_expr.input_group_ids) == 0: + return PhysicalPlan(operators=[best_phys_expr.operator], plan_cost=best_phys_expr.plan_cost) + + # get the best physical plan(s) for this group's inputs + input_group_id = best_phys_expr.input_group_ids[0] # TODO: need to handle joins + input_best_phys_plan = self._get_greedy_physical_plan(groups, input_group_id) + + # add this operator to best physical plan and return + return PhysicalPlan.from_ops_and_sub_plan([best_phys_expr.operator], input_best_phys_plan, best_phys_expr.plan_cost) + + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[PhysicalPlan]: + return [self._get_greedy_physical_plan(groups, final_group_id)] + + +class ParetoStrategy(OptimizationStrategy): + def _get_candidate_pareto_physical_plans(self, groups: dict, group_id: int, policy: Policy) -> list[PhysicalPlan]: + """ + Return a list of plans which will contain all of the pareto optimal plans (and some additional + plans which may not be pareto optimal). + + TODO: can we cache group_id --> final_pareto_optimal_plans to avoid re-computing upstream + groups' pareto-optimal plans for each expression? + """ + # get the pareto optimal physical expressions for this group + pareto_optimal_phys_exprs = groups[group_id].pareto_optimal_physical_expressions + + # construct list of pareto optimal plans + pareto_optimal_plans = [] + for phys_expr in pareto_optimal_phys_exprs: + # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), + # create and return the physical plan + if len(phys_expr.input_group_ids) == 0: + for plan_cost, _ in phys_expr.pareto_optimal_plan_costs: + plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=plan_cost) + pareto_optimal_plans.append(plan) + + # otherwise, get the pareto optimal physical plan(s) for this group's inputs + else: + # get the pareto optimal physical plan(s) for this group's inputs + input_group_id = phys_expr.input_group_ids[0] # TODO: need to handle joins + pareto_optimal_phys_subplans = self._get_candidate_pareto_physical_plans(groups, input_group_id, policy) + + # iterate over the input subplans and find the one(s) which combine with this physical expression + # to make a pareto-optimal plan + for plan_cost, input_plan_cost in phys_expr.pareto_optimal_plan_costs: + for subplan in pareto_optimal_phys_subplans: + if ( + subplan.plan_cost.cost == input_plan_cost.cost + and subplan.plan_cost.time == input_plan_cost.time + and subplan.plan_cost.quality == input_plan_cost.quality + ): + # TODO: The plan_cost gets summed with subplan.plan_cost; + # am I defining expression.best_plan_cost to be the cost of that operator, + # and expression.pareto_optimal_plan_costs to be the cost(s) of the subplan including that operator? + # i.e. are my definitions inconsistent? + plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, plan_cost) + pareto_optimal_plans.append(plan) + + return pareto_optimal_plans + + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[PhysicalPlan]: + # compute all of the pareto optimal physical plans + plans = self._get_candidate_pareto_physical_plans(groups, final_group_id, policy) + + # adjust plans' plan_cost.quality to reflect only the quality of the final operator + # if self.use_final_op_quality: + # TODO(JUN): use_final_op_quality=true by default. Think about how to make this configurable + for plan in plans: + plan.plan_cost.quality = plan.plan_cost.op_estimates.quality + + # filter pareto optimal plans for ones which satisfy policy constraint (if at least one of them does) + # import pdb; pdb.set_trace() + if any([policy.constraint(plan.plan_cost) for plan in plans]): + plans = [plan for plan in plans if policy.constraint(plan.plan_cost)] + + # select the plan which is best for the given policy + optimal_plan, plans = plans[0], plans[1:] + for plan in plans: + optimal_plan = optimal_plan if policy.choose(optimal_plan.plan_cost, plan.plan_cost) else plan + + plans = [optimal_plan] + return plans + + +class SentinelStrategy(OptimizationStrategy): + def _get_sentinel_plan(self, groups: dict, group_id: int) -> SentinelPlan: + """ + Create and return a SentinelPlan object. + """ + # get all the physical expressions for this group + phys_exprs = groups[group_id].physical_expressions + phys_op_set = [expr.operator for expr in phys_exprs] + + # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), + # create and return the physical plan + best_phys_expr = groups[group_id].best_physical_expression + if len(best_phys_expr.input_group_ids) == 0: + return SentinelPlan(operator_sets=[phys_op_set]) + + # TODO: need to handle joins + # get the best physical plan(s) for this group's inputs + best_phys_subplan = SentinelPlan(operator_sets=[]) + for input_group_id in best_phys_expr.input_group_ids: + input_best_phys_plan = self._get_sentinel_plan(groups, input_group_id) + best_phys_subplan = SentinelPlan.from_ops_and_sub_plan(best_phys_subplan.operator_sets, input_best_phys_plan) + + # add this operator set to best physical plan and return + return SentinelPlan.from_ops_and_sub_plan([phys_op_set], best_phys_subplan) + + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[SentinelPlan]: + return [self._get_sentinel_plan(groups, final_group_id)] + + +class NoOptimizationStrategy(GreedyStrategy): + """ + NoOptimizationStrategy is used to intentionally construct a PhysicalPlan without applying any + logical transformations or optimizations. + """ + + +class ConfidenceIntervalStrategy(OptimizationStrategy): + def _get_confidence_interval_optimal_plans(self, groups: dict, group_id: int) -> list[PhysicalPlan]: + """ + Return all physical plans whose upper bound on the primary policy metric is greater than the + best plan's lower bound on the primary policy metric (subject to satisfying the policy constraint). + + The OptimizePhysicalExpression task guarantees that each group's `ci_best_physical_expressions` + maintains a list of expressions with overlapping CI's on the primary policy metric (while also + satisfying the policy constraint). + + This function computes the cross-product of all such expressions across all groups. + """ + # get all the physical expressions which could be the best for this group + best_phys_exprs = groups[group_id].ci_best_physical_expressions + + best_plans = [] + for phys_expr in best_phys_exprs: + # if this expression has no inputs (i.e. 
it is a BaseScan or CacheScan), + # create the physical plan and append it to the best_plans for this group + if len(phys_expr.input_group_ids) == 0: + plan = PhysicalPlan(operators=[phys_expr.operator], plan_cost=phys_expr.plan_cost) + best_plans.append(plan) + + # otherwise, get the best physical plan(s) for this group's inputs + else: + # TODO: need to handle joins + best_phys_subplans = [PhysicalPlan(operators=[])] + for input_group_id in phys_expr.input_group_ids: + input_best_phys_plans = self._get_confidence_interval_optimal_plans(groups, input_group_id) + best_phys_subplans = [ + PhysicalPlan.from_ops_and_sub_plan(subplan.operators, input_subplan, subplan.plan_cost) + for subplan in best_phys_subplans + for input_subplan in input_best_phys_plans + ] + + # add this operator to best physical plan and return + for subplan in best_phys_subplans: + plan = PhysicalPlan.from_ops_and_sub_plan([phys_expr.operator], subplan, phys_expr.plan_cost) + best_plans.append(plan) + + return best_plans + + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[PhysicalPlan]: + # TODO: fix this to properly handle multiple potential plans + raise Exception("NotImplementedError") + # plans = self._get_confidence_interval_optimal_plans(final_group_id) + + +class AutoOptimizationStrategy(OptimizationStrategy): + def get_optimal_plans(self, groups: dict, final_group_id: int, policy: Policy) -> list[PhysicalPlan]: + raise NotImplementedError("Auto optimization strategy not implemented") + + +class OptimizerStrategyRegistry: + """Registry to map strategy types to their implementations""" + + _strategies: dict[str, type[OptimizationStrategy]] = { + OptimizationStrategyType.GREEDY.value: GreedyStrategy, + OptimizationStrategyType.CONFIDENCE_INTERVAL.value: ConfidenceIntervalStrategy, + OptimizationStrategyType.PARETO.value: ParetoStrategy, + OptimizationStrategyType.SENTINEL.value: SentinelStrategy, + OptimizationStrategyType.NONE.value: NoOptimizationStrategy, + OptimizationStrategyType.AUTO.value: AutoOptimizationStrategy, + } + + @classmethod + def get_strategy(cls, strategy_type: str) -> OptimizationStrategy: + """Get strategy instance by type""" + strategy_class = cls._strategies.get(strategy_type) + if not strategy_class: + raise ValueError(f"Unknown optimization strategy: {strategy_type}") + return strategy_class() diff --git a/src/palimpzest/query/optimizer/tasks.py b/src/palimpzest/query/optimizer/tasks.py index 13cc01ee4..0b96b574a 100644 --- a/src/palimpzest/query/optimizer/tasks.py +++ b/src/palimpzest/query/optimizer/tasks.py @@ -2,10 +2,10 @@ from typing import Any -from palimpzest.constants import OptimizationStrategy from palimpzest.core.data.dataclasses import PlanCost from palimpzest.policy import Policy from palimpzest.query.optimizer.cost_model import BaseCostModel +from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType from palimpzest.query.optimizer.primitives import Expression, Group from palimpzest.query.optimizer.rules import ImplementationRule, Rule, TransformationRule @@ -464,13 +464,13 @@ def perform( # return if we've already computed the cost of this physical expression if ( # noqa: SIM114 - context['optimization_strategy'] in [OptimizationStrategy.GREEDY, OptimizationStrategy.SENTINEL, OptimizationStrategy.NONE] + context['optimization_strategy_type'] in [OptimizationStrategyType.GREEDY, OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE] and self.physical_expression.plan_cost is not None ): return [] elif ( 
- context['optimization_strategy'] == OptimizationStrategy.PARETO + context['optimization_strategy_type'] == OptimizationStrategyType.PARETO and self.physical_expression.pareto_optimal_plan_costs is not None ): return [] @@ -486,21 +486,21 @@ def perform( # compute the input plan cost or list of input plan costs new_tasks = [] if ( - context['optimization_strategy'] in [OptimizationStrategy.GREEDY, OptimizationStrategy.SENTINEL, OptimizationStrategy.NONE] + context['optimization_strategy_type'] in [OptimizationStrategyType.GREEDY, OptimizationStrategyType.SENTINEL, OptimizationStrategyType.NONE] and input_group.best_physical_expression is not None ): # TODO: apply policy constraint here best_input_plan_cost = input_group.best_physical_expression.plan_cost elif ( - context['optimization_strategy'] == OptimizationStrategy.CONFIDENCE_INTERVAL + context['optimization_strategy_type'] == OptimizationStrategyType.CONFIDENCE_INTERVAL and input_group.ci_best_physical_expressions is not None ): # TODO: fix this to properly compute set of potential input plan costs raise Exception("NotImplementedError") elif ( - context['optimization_strategy'] == OptimizationStrategy.PARETO + context['optimization_strategy_type'] == OptimizationStrategyType.PARETO and input_group.pareto_optimal_physical_expressions is not None ): # TODO: apply policy constraint here @@ -524,12 +524,12 @@ def perform( return [self] + new_tasks group = groups[self.physical_expression.group_id] - if context['optimization_strategy'] == OptimizationStrategy.CONFIDENCE_INTERVAL: + if context['optimization_strategy_type'] == OptimizationStrategyType.CONFIDENCE_INTERVAL: # TODO: fix this to properly compute and update set of possible plan costs raise Exception("NotImplementedError") group = self.update_ci_best_physical_expressions(group, policy) - elif context['optimization_strategy'] == OptimizationStrategy.PARETO: + elif context['optimization_strategy_type'] == OptimizationStrategyType.PARETO: # compute all possible plan costs for this physical expression given the pareto optimal input plan costs all_possible_plan_costs = [] for input_plan_cost in input_plan_costs: diff --git a/src/palimpzest/query/processor/config.py b/src/palimpzest/query/processor/config.py new file mode 100644 index 000000000..4118b73e2 --- /dev/null +++ b/src/palimpzest/query/processor/config.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass, field + +from palimpzest.constants import Model +from palimpzest.policy import MaxQuality, Policy +import json + + +# TODO: Separate out the config for the Optimizer, ExecutionStrategy, and QueryProcessor +@dataclass +class QueryProcessorConfig: + """Shared context for query processors""" + policy: Policy = field(default_factory=MaxQuality) + scan_start_idx: int = field(default=0) + num_samples: int = field(default=float("inf")) + nocache: bool = field(default=True) # NOTE: until we properly implement caching, let's set the default to True + include_baselines: bool = field(default=False) + min_plans: int | None = field(default=None) + verbose: bool = field(default=False) + available_models: list[Model] | None = field(default=None) + + max_workers: int | None = field(default=None) + num_workers_per_plan: int = field(default=1) + + allow_bonded_query: bool = field(default=True) + allow_conventional_query: bool = field(default=False) + allow_model_selection: bool = field(default=True) + allow_code_synth: bool = field(default=False) + allow_token_reduction: bool = field(default=False) + allow_rag_reduction: bool = 
field(default=True) + allow_mixtures: bool = field(default=True) + use_final_op_quality: bool = field(default=False) + + def to_json_str(self): + return json.dumps({ + "policy": self.policy.to_json_str(), + "scan_start_idx": self.scan_start_idx, + "num_samples": self.num_samples, + "nocache": self.nocache, + "include_baselines": self.include_baselines, + "min_plans": self.min_plans, + "verbose": self.verbose, + "available_models": self.available_models, + "max_workers": self.max_workers, + "num_workers_per_plan": self.num_workers_per_plan, + "allow_bonded_query": self.allow_bonded_query, + "allow_conventional_query": self.allow_conventional_query, + "allow_model_selection": self.allow_model_selection, + "allow_code_synth": self.allow_code_synth, + "allow_token_reduction": self.allow_token_reduction, + "allow_rag_reduction": self.allow_rag_reduction, + "allow_mixtures": self.allow_mixtures, + "use_final_op_quality": self.use_final_op_quality, + }, indent=4) diff --git a/src/palimpzest/query/execution/mab_sentinel_execution.py b/src/palimpzest/query/processor/mab_sentinel_processor.py similarity index 92% rename from src/palimpzest/query/execution/mab_sentinel_execution.py rename to src/palimpzest/query/processor/mab_sentinel_processor.py index 3711fa1fe..c7cc0c7b1 100644 --- a/src/palimpzest/query/execution/mab_sentinel_execution.py +++ b/src/palimpzest/query/processor/mab_sentinel_processor.py @@ -4,31 +4,28 @@ from typing import Callable import numpy as np - -from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS, OptimizationStrategy +from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS from palimpzest.core.data.dataclasses import ExecutionStats, OperatorStats, PlanStats, RecordOpStats from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import SourceRecord from palimpzest.policy import Policy -from palimpzest.query.execution.execution_engine import ExecutionEngine -from palimpzest.query.execution.plan_executors.parallel_plan_execution import PipelinedParallelPlanExecutor -from palimpzest.query.execution.plan_executors.single_threaded_plan_execution import SequentialSingleThreadPlanExecutor +from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy +from palimpzest.query.execution.single_threaded_execution_strategy import SequentialSingleThreadExecutionStrategy from palimpzest.query.operators.convert import ConvertOp, LLMConvert from palimpzest.query.operators.datasource import CacheScanDataOp, MarshalAndScanDataOp from palimpzest.query.operators.filter import FilterOp, LLMFilter from palimpzest.query.operators.physical import PhysicalOperator from palimpzest.query.operators.retrieve import RetrieveOp from palimpzest.query.optimizer.cost_model import CostModel, SampleBasedCostModel -from palimpzest.query.optimizer.optimizer import Optimizer from palimpzest.query.optimizer.plan import SentinelPlan +from palimpzest.query.processor.query_processor import QueryProcessor from palimpzest.sets import Set -class MABSentinelExecutionEngine(ExecutionEngine): +class MABSentinelQueryProcessor(QueryProcessor): """ - This class implements the abstract execute() method from the ExecutionEngine. - This class still needs to be sub-classed by another Execution class which implements - the higher-level execute_plan() method. + Specialized query processor that implements MAB sentinel strategy + for coordinating optimization and execution. 
""" def __init__( self, @@ -53,6 +50,7 @@ def __init__( self.pick_output_fn = self.pick_ensemble_output self.rng = np.random.default_rng(seed=seed) + def update_frontier_ops( self, frontier_ops, @@ -489,8 +487,7 @@ def pick_highest_quality_output(self, op_set_record_sets: list[tuple[DataRecordS return DataRecordSet(out_records, []) - @staticmethod - def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: + def execute_op_wrapper(self, operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: """ Wrapper function around operator execution which also and returns the operator. This is useful in the parallel setting(s) where operators are executed by a worker pool, @@ -510,7 +507,7 @@ def execute_op_set(self, op_candidate_pairs): # create futures futures = [] for operator, candidate in op_candidate_pairs: - future = executor.submit(MABSentinelExecutionEngine.execute_op_wrapper, operator, candidate) + future = executor.submit(self.execute_op_wrapper, operator, candidate) futures.append(future) # compute output record_set for each (operator, candidate) pair @@ -784,29 +781,16 @@ def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan: """ # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up # initialize the optimizer - optimizer = Optimizer( - policy=policy, - cost_model=CostModel(), - no_cache=True, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=OptimizationStrategy.SENTINEL, - ) - # use optimizer to generate sentinel plans - sentinel_plans = optimizer.optimize(dataset, policy) + # TODO: Do we need to re-initialize the optimizer here? 
+ self.optimizer.update_cost_model(CostModel()) + sentinel_plans = self.optimizer.optimize(dataset, policy) sentinel_plan = sentinel_plans[0] return sentinel_plan - def execute(self, dataset: Set, policy: Policy): + def execute(self, dry_run: bool = False): execution_start_time = time.time() # for now, enforce that we are using validation data; we can relax this after paper submission @@ -818,10 +802,10 @@ def execute(self, dataset: Set, policy: Policy): self.clear_cached_examples() # create sentinel plan - sentinel_plan = self.create_sentinel_plan(dataset, policy) + sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy) # generate sample execution data - all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, policy) + all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, self.policy) # put sentinel plan execution stats into list and prepare list of output records all_plan_stats = [plan_stats] @@ -829,35 +813,14 @@ def execute(self, dataset: Set, policy: Policy): # construct the CostModel with any sample execution data we've gathered cost_model = SampleBasedCostModel(sentinel_plan, all_execution_data, self.verbose) - # (re-)initialize the optimizer - optimizer = Optimizer( - policy=policy, - cost_model=cost_model, - no_cache=self.nocache, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=self.optimization_strategy, - use_final_op_quality=self.use_final_op_quality, - ) + optimizer = self.deepcopy_clean_optimizer().update_cost_model(cost_model) total_optimization_time = time.time() - execution_start_time # execute plan(s) according to the optimization strategy - if self.optimization_strategy == OptimizationStrategy.CONFIDENCE_INTERVAL: - records, plan_stats = self.execute_confidence_interval_strategy(dataset, policy, optimizer) - all_records.extend(records) - all_plan_stats.extend(plan_stats) - - else: - records, plan_stats = self.execute_strategy(dataset, policy, optimizer) - all_records.extend(records) - all_plan_stats.extend(plan_stats) + records, plan_stats = self._execute_with_optimizer(self.dataset, self.policy, optimizer) + all_records.extend(records) + all_plan_stats.extend(plan_stats) # aggregate plan stats aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats) @@ -873,21 +836,36 @@ def execute(self, dataset: Set, policy: Policy): ) return all_records, execution_stats + -class MABSequentialSingleThreadSentinelExecution(MABSentinelExecutionEngine, SequentialSingleThreadPlanExecutor): +class MABSentinelSequentialSingleThreadProcessor(MABSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy): """ This class performs sentinel execution while executing plans in a sequential, single-threaded fashion. 
""" def __init__(self, *args, **kwargs): - MABSentinelExecutionEngine.__init__(self, *args, **kwargs) - SequentialSingleThreadPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = SequentialSingleThreadExecutionStrategy( + scan_start_idx=self.can_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + nocache=self.nocache, + verbose=self.verbose + ) + self.progress_manager = None -class MABSequentialParallelSentinelExecution(MABSentinelExecutionEngine, SequentialSingleThreadPlanExecutor): +class MABSentinelPipelinedParallelProcessor(MABSentinelQueryProcessor, PipelinedParallelExecutionStrategy): """ This class performs sentinel execution while executing plans in a pipelined, parallel fashion. """ def __init__(self, *args, **kwargs): - MABSentinelExecutionEngine.__init__(self, *args, **kwargs) - PipelinedParallelPlanExecutor.__init__(self, *args, **kwargs) + MABSentinelQueryProcessor.__init__(self, *args, **kwargs) + self.strategy = PipelinedParallelExecutionStrategy( + scan_start_idx=self.can_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + nocache=self.nocache, + verbose=self.verbose + ) + self.progress_manager = None \ No newline at end of file diff --git a/src/palimpzest/query/execution/nosentinel_execution.py b/src/palimpzest/query/processor/nosentinel_processor.py similarity index 87% rename from src/palimpzest/query/execution/nosentinel_execution.py rename to src/palimpzest/query/processor/nosentinel_processor.py index ce757eab2..5932d5711 100644 --- a/src/palimpzest/query/execution/nosentinel_execution.py +++ b/src/palimpzest/query/processor/nosentinel_processor.py @@ -1,70 +1,37 @@ import time -from concurrent.futures import ThreadPoolExecutor, wait -from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS, OptimizationStrategy from palimpzest.core.data.dataclasses import ExecutionStats, OperatorStats, PlanStats from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.schemas import SourceRecord -from palimpzest.policy import Policy -from palimpzest.query.execution.execution_engine import ExecutionEngine -from palimpzest.query.execution.plan_executors.parallel_plan_execution import ( - PipelinedParallelPlanExecutor, -) -from palimpzest.query.execution.plan_executors.single_threaded_plan_execution import ( - PipelinedSingleThreadPlanExecutor, - SequentialSingleThreadPlanExecutor, +from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy +from palimpzest.query.execution.single_threaded_execution_strategy import ( + PipelinedSingleThreadExecutionStrategy, + SequentialSingleThreadExecutionStrategy, ) from palimpzest.query.operators.aggregate import AggregateOp from palimpzest.query.operators.datasource import DataSourcePhysicalOp, MarshalAndScanDataOp from palimpzest.query.operators.filter import FilterOp from palimpzest.query.operators.limit import LimitScanOp -from palimpzest.query.optimizer.cost_model import CostModel -from palimpzest.query.optimizer.optimizer import Optimizer from palimpzest.query.optimizer.plan import PhysicalPlan -from palimpzest.sets import Set +from palimpzest.query.processor.query_processor import QueryProcessor from palimpzest.utils.progress import create_progress_manager + - -class NoSentinelExecutionEngine(ExecutionEngine): +class NoSentinelQueryProcessor(QueryProcessor): """ - This class implements the abstract execute() method from the ExecutionEngine. 
- This class still needs to be sub-classed by another Execution class which implements - the execute_plan() method. + Specialized query processor that implements no sentinel strategy + for coordinating optimization and execution. """ - def execute(self, dataset: Set, policy: Policy): + def execute(self, dry_run: bool = False): execution_start_time = time.time() # if nocache is True, make sure we do not re-use codegen examples if self.nocache: self.clear_cached_examples() - # construct the CostModel - cost_model = CostModel() - - # initialize the optimizer - optimizer = Optimizer( - policy=policy, - cost_model=cost_model, - no_cache=self.nocache, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=self.optimization_strategy, - ) - # execute plan(s) according to the optimization strategy - records, plan_stats = [], [] - if self.optimization_strategy == OptimizationStrategy.CONFIDENCE_INTERVAL: - records, plan_stats = self.execute_confidence_interval_strategy(dataset, policy, optimizer) - - else: - records, plan_stats = self.execute_strategy(dataset, policy, optimizer) + records, plan_stats = self._execute_with_optimizer(self.dataset, self.policy, self.optimizer) # aggregate plan stats aggregate_plan_stats = self.aggregate_plan_stats(plan_stats) @@ -83,13 +50,19 @@ def execute(self, dataset: Set, policy: Policy): return records, execution_stats -class NoSentinelSequentialSingleThreadExecution(NoSentinelExecutionEngine, SequentialSingleThreadPlanExecutor): +class NoSentinelSequentialSingleThreadProcessor(NoSentinelQueryProcessor): """ This class performs non-sample based execution while executing plans in a sequential, single-threaded fashion. 
""" def __init__(self, *args, **kwargs): - NoSentinelExecutionEngine.__init__(self, *args, **kwargs) - SequentialSingleThreadPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = SequentialSingleThreadExecutionStrategy( + scan_start_idx=self.scan_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + nocache=self.nocache, + verbose=self.verbose + ) self.progress_manager = None def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1): @@ -119,11 +92,8 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf # get handle to DataSource and pre-compute its size source_operator = plan.operators[0] - datasource = ( - self.datadir.get_registered_dataset(source_operator.dataset_id) - if isinstance(source_operator, MarshalAndScanDataOp) - else self.datadir.get_cached_result(source_operator.dataset_id) - ) + assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" + datasource = source_operator.get_datasource() datasource_len = len(datasource) # Calculate total work units - each record needs to go through each operator @@ -256,13 +226,19 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf return output_records, plan_stats -class NoSentinelPipelinedSingleThreadExecution(NoSentinelExecutionEngine, PipelinedSingleThreadPlanExecutor): +class NoSentinelPipelinedSinglelProcessor(NoSentinelQueryProcessor, PipelinedSingleThreadExecutionStrategy): """ - This class performs non-sample based execution while executing plans in a pipelined, single-threaded fashion. + This class performs non-sample based execution while executing plans in a pipelined, parallel fashion. """ def __init__(self, *args, **kwargs): - NoSentinelExecutionEngine.__init__(self, *args, **kwargs) - PipelinedSingleThreadPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = PipelinedParallelExecutionStrategy( + scan_start_idx=self.scan_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + nocache=self.nocache, + verbose=self.verbose + ) self.progress_manager = None def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1): @@ -293,11 +269,8 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf # get handle to DataSource and pre-compute its size source_operator = plan.operators[0] - datasource = ( - self.datadir.get_registered_dataset(source_operator.dataset_id) - if isinstance(source_operator, MarshalAndScanDataOp) - else self.datadir.get_cached_result(source_operator.dataset_id) - ) + assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" + datasource = source_operator.get_datasource() datasource_len = len(datasource) # Calculate total work units - each record needs to go through each operator @@ -446,13 +419,19 @@ def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf return output_records, plan_stats -class NoSentinelPipelinedParallelExecution(NoSentinelExecutionEngine, PipelinedParallelPlanExecutor): +class NoSentinelPipelinedParallelProcessor(NoSentinelQueryProcessor, PipelinedParallelExecutionStrategy): """ This class performs non-sample based execution while executing plans in a pipelined, parallel fashion. 
""" def __init__(self, *args, **kwargs): - NoSentinelExecutionEngine.__init__(self, *args, **kwargs) - PipelinedParallelPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = PipelinedParallelExecutionStrategy( + scan_start_idx=self.scan_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + nocache=self.nocache, + verbose=self.verbose + ) self.progress_manager = None # def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1): @@ -483,11 +462,8 @@ def __init__(self, *args, **kwargs): # # get handle to DataSource and pre-compute its size # source_operator = plan.operators[0] - # datasource = ( - # self.datadir.get_registered_dataset(source_operator.dataset_id) - # if isinstance(source_operator, MarshalAndScanDataOp) - # else self.datadir.get_cached_result(source_operator.dataset_id) - # ) + # assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp" + # datasource = source_operator.get_datasource() # datasource_len = len(datasource) # # Calculate total work units - each record needs to go through each operator @@ -574,5 +550,3 @@ def __init__(self, *args, **kwargs): # self.progress_manager.finish() # return output_records, plan_stats - - diff --git a/src/palimpzest/query/execution/execution_engine.py b/src/palimpzest/query/processor/query_processor.py similarity index 71% rename from src/palimpzest/query/execution/execution_engine.py rename to src/palimpzest/query/processor/query_processor.py index a57f8e2c4..4d5db77b2 100644 --- a/src/palimpzest/query/execution/execution_engine.py +++ b/src/palimpzest/query/processor/query_processor.py @@ -1,73 +1,78 @@ -import multiprocessing +from abc import abstractmethod from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass -from palimpzest.constants import Model, OptimizationStrategy -from palimpzest.core.data.dataclasses import PlanStats, RecordOpStats +from palimpzest.core.data.dataclasses import ExecutionStats, PlanStats, RecordOpStats from palimpzest.core.data.datasources import DataSource, ValidationDataSource from palimpzest.core.elements.records import DataRecord from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.policy import Policy from palimpzest.query.optimizer.cost_model import CostModel from palimpzest.query.optimizer.optimizer import Optimizer +from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType from palimpzest.query.optimizer.plan import PhysicalPlan +from palimpzest.query.processor.config import QueryProcessorConfig from palimpzest.sets import Dataset, Set from palimpzest.utils.hash_helpers import hash_for_id from palimpzest.utils.model_helpers import get_models -class ExecutionEngine: +@dataclass +class QueryResult: + """Container for query processing results""" + records: list[DataRecord] + execution_stats: ExecutionStats + optimization_stats: dict + physical_plans: list[PhysicalPlan] + + +class QueryProcessor: + """ + Processes queries through the complete pipeline: + 1. Optimization phase: Plan generation and selection + 2. Execution phase: Plan execution and result collection + 3. 
Result phase: Statistics gathering and result formatting + """ def __init__( - self, + self, datasource: DataSource, - num_samples: int = float("inf"), - scan_start_idx: int = 0, - nocache: bool = True, # NOTE: until we properly implement caching, let's set the default to True - include_baselines: bool = False, - min_plans: int | None = None, - verbose: bool = False, - available_models: list[Model] | None = None, - allow_bonded_query: bool = True, - allow_conventional_query: bool = False, - allow_model_selection: bool = True, - allow_code_synth: bool = True, - allow_token_reduction: bool = False, - allow_rag_reduction: bool = True, - allow_mixtures: bool = True, - optimization_strategy: OptimizationStrategy = OptimizationStrategy.PARETO, - max_workers: int | None = None, - num_workers_per_plan: int = 1, - *args, - **kwargs, - ) -> None: - self.num_samples = num_samples - self.scan_start_idx = scan_start_idx - self.nocache = nocache - if not self.nocache: - raise NotImplementedError("Caching is not yet implemented! Please set nocache=True.") - self.include_baselines = include_baselines - self.min_plans = min_plans - self.verbose = verbose - self.available_models = available_models + optimizer: Optimizer = None, + config: QueryProcessorConfig = None, + ): + """ + Initialize QueryProcessor with optional custom components. + + Args: + datasource: Data source to process + optimizer: Custom optimizer (optional) + execution_engine: Custom execution engine (optional) + config: Configuration dictionary for default components + """ + assert config is not None, "QueryProcessorConfig is required for QueryProcessor" + + self.config = config or QueryProcessorConfig() + self.datasource = datasource + self.using_validation_data = isinstance(self.datasource, ValidationDataSource) + self.scan_start_idx = self.config.scan_start_idx + self.nocache = self.config.nocache + self.verbose = self.config.verbose + self.max_workers = self.config.max_workers + self.num_workers_per_plan = self.config.num_workers_per_plan + self.datadir = DataDirectory() + + self.policy = self.config.policy + self.dataset = self.datasource + + self.available_models = self.config.available_models if self.available_models is None or len(self.available_models) == 0: self.available_models = get_models(include_vision=True) + if self.verbose: print("Available models: ", self.available_models) - self.allow_bonded_query = allow_bonded_query - self.allow_conventional_query = allow_conventional_query - self.allow_model_selection = allow_model_selection - self.allow_code_synth = allow_code_synth - self.allow_token_reduction = allow_token_reduction - self.allow_rag_reduction = allow_rag_reduction - self.allow_mixtures = allow_mixtures - self.optimization_strategy = optimization_strategy - self.max_workers = max_workers - self.num_workers_per_plan = num_workers_per_plan - self.datadir = DataDirectory() - - # datasource; should be set by execute() with call to get_datasource() - self.datasource = datasource - self.using_validation_data = isinstance(self.datasource, ValidationDataSource) + # Initialize optimizer and execution engine + assert optimizer is not None, "Optimizer is required. Please use QueryProcessorFactory.create_processor() to initialize a QueryProcessor." 
+ self.optimizer = optimizer def execution_id(self) -> str: @@ -88,16 +93,6 @@ def clear_cached_examples(self): cache = self.datadir.get_cache_service() cache.rm_cache() - def get_parallel_max_workers(self): - # for now, return the number of system CPUs; - # in the future, we may want to consider the models the user has access to - # and whether or not they will encounter rate-limits. If they will, we should - # set the max workers in a manner that is designed to avoid hitting them. - # Doing this "right" may require considering their logical, physical plan, - # and tier status with LLM providers. It may also be worth dynamically - # changing the max_workers in response to 429 errors. - return max(int(0.8 * multiprocessing.cpu_count()), 1) - def get_max_quality_plan_id(self, plans: list[PhysicalPlan]) -> str: """ Return the plan_id for the plan with the highest quality in the list of plans. @@ -182,34 +177,26 @@ def execute_plans( return_records = records return all_sample_execution_data, return_records, all_plan_stats - - def execute_strategy( + + + def _execute_with_optimizer( self, - dataset: Set, + dataset: Dataset, policy: Policy, optimizer: Optimizer, execution_data: list[RecordOpStats] | None = None, ) -> tuple[list[DataRecord], list[PlanStats]]: - if execution_data is None: - execution_data = [] - - # get the optimal plan according to the optimizer - plans = optimizer.optimize(dataset, policy) - final_plan = plans[0] - - # execute the plan - # TODO: for some reason this is not picking up change to self.max_workers from PipelinedParallelPlanExecutor.__init__() - records, plan_stats = self.execute_plan( - plan=final_plan, - plan_workers=self.max_workers, - ) + records, plan_stats = [], [] + if optimizer.optimization_strategy_type == OptimizationStrategyType.CONFIDENCE_INTERVAL: + records, plan_stats = self._execute_confidence_interval_strategy(dataset, policy, optimizer) + else: + records, plan_stats = self._execute_strategy(dataset, policy, optimizer) + return records, plan_stats - # return the output records and plan stats - return records, [plan_stats] - def execute_confidence_interval_strategy( + def _execute_confidence_interval_strategy( self, - dataset: Set, + dataset: Dataset, policy: Policy, optimizer: Optimizer, execution_data: list[RecordOpStats] | None = None, @@ -258,13 +245,33 @@ def execute_confidence_interval_strategy( return records, plan_stats - def execute_plan(self, plan: PhysicalPlan, num_samples: int | float = float("inf"), plan_workers: int = 1): - """Execute the given plan and return the output records and plan stats.""" - raise NotImplementedError("Abstract method to be overwritten by sub-classes") + def _execute_strategy( + self, + dataset: Set, + policy: Policy, + optimizer: Optimizer, + execution_data: list[RecordOpStats] | None = None, + ) -> tuple[list[DataRecord], list[PlanStats]]: + if execution_data is None: + execution_data = [] + + # get the optimal plan according to the optimizer + plans = optimizer.optimize(dataset, policy) + final_plan = plans[0] + # execute the plan + # TODO: for some reason this is not picking up change to self.max_workers from PipelinedParallelPlanExecutor.__init__() + records, plan_stats = self.execute_plan( + plan=final_plan, + plan_workers=self.max_workers, + ) + # return the output records and plan stats + return records, [plan_stats] - def execute(self, dataset: Dataset, policy: Policy): - """ - Execute the workload specified by the given dataset according to the policy provided by the user. 
- """ - raise NotImplementedError("Abstract method to be overwritten by sub-classes") + # let's keep the same name as the old one + @abstractmethod + def execute( + self, + dry_run: bool = False + ) -> QueryResult: + raise NotImplementedError("Abstract method to be overwritten by sub-classes") \ No newline at end of file diff --git a/src/palimpzest/query/processor/query_processor_factory.py b/src/palimpzest/query/processor/query_processor_factory.py new file mode 100644 index 000000000..517dfcbd7 --- /dev/null +++ b/src/palimpzest/query/processor/query_processor_factory.py @@ -0,0 +1,151 @@ +from enum import Enum +from typing import Type + +from palimpzest.query.execution.execution_strategy import ExecutionStrategyType +from palimpzest.query.optimizer.cost_model import CostModel +from palimpzest.query.optimizer.optimizer import Optimizer +from palimpzest.query.optimizer.optimizer_strategy import OptimizationStrategyType +from palimpzest.query.processor.config import QueryProcessorConfig +from palimpzest.query.processor.mab_sentinel_processor import ( + MABSentinelPipelinedParallelProcessor, + MABSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.nosentinel_processor import ( + NoSentinelPipelinedParallelProcessor, + NoSentinelPipelinedSinglelProcessor, + NoSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.query_processor import QueryProcessor +from palimpzest.query.processor.random_sampling_sentinel_processor import ( + RandomSamplingSentinelPipelinedProcessor, + RandomSamplingSentinelSequentialSingleThreadProcessor, +) +from palimpzest.query.processor.streaming_processor import StreamingQueryProcessor +from palimpzest.sets import Dataset +from palimpzest.utils.model_helpers import get_models + + +class ProcessingStrategyType(Enum): + """How to generate and optimize query plans""" + MAB_SENTINEL = "mab_sentinel" + NO_SENTINEL = "nosentinel" + RANDOM_SAMPLING = "random_sampling" + STREAMING = "streaming" + AUTO = "auto" + +def convert_to_enum(enum_type: Type[Enum], value: str) -> Enum: + if value == "pipelined": + value = "pipelined_single_thread" + value = value.upper().replace('-', '_') + try: + return enum_type[value] + except KeyError as e: + raise ValueError(f"Unsupported {enum_type.__name__}: {value}") from e + +class QueryProcessorFactory: + PROCESSOR_MAPPING = { + (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.SEQUENTIAL): + lambda ds, opt, cfg: NoSentinelSequentialSingleThreadProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_SINGLE_THREAD): + lambda ds, opt, cfg: NoSentinelPipelinedSinglelProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.NO_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL): + lambda ds, opt, cfg: NoSentinelPipelinedParallelProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.SEQUENTIAL): + lambda ds, opt, cfg: MABSentinelSequentialSingleThreadProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.MAB_SENTINEL, ExecutionStrategyType.PIPELINED_PARALLEL): + lambda ds, opt, cfg: MABSentinelPipelinedParallelProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.STREAMING, ExecutionStrategyType.SEQUENTIAL): + lambda ds, opt, cfg: StreamingQueryProcessor(datasource=ds, optimizer=opt, config=cfg), + (ProcessingStrategyType.STREAMING, ExecutionStrategyType.PIPELINED_PARALLEL): + lambda ds, opt, cfg: 
StreamingQueryProcessor(datasource=ds, optimizer=opt, config=cfg),
+        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.SEQUENTIAL):
+            lambda ds, opt, cfg: RandomSamplingSentinelSequentialSingleThreadProcessor(datasource=ds, optimizer=opt, config=cfg),
+        (ProcessingStrategyType.RANDOM_SAMPLING, ExecutionStrategyType.PIPELINED_PARALLEL):
+            lambda ds, opt, cfg: RandomSamplingSentinelPipelinedProcessor(datasource=ds, optimizer=opt, config=cfg),
+    }
+
+
+    @staticmethod
+    def create_processor(
+        datasource: Dataset,
+        processing_strategy: str | ProcessingStrategyType = ProcessingStrategyType.NO_SENTINEL,
+        execution_strategy: str | ExecutionStrategyType = ExecutionStrategyType.SEQUENTIAL,
+        optimizer_strategy: str | OptimizationStrategyType = OptimizationStrategyType.PARETO,
+        config: QueryProcessorConfig | None = None,
+    ) -> QueryProcessor:
+        """
+        Creates a QueryProcessor with the specified processing and execution strategies.
+
+        Args:
+            datasource: The data source to process
+            processing_strategy: How to generate/optimize query plans and execute them
+            execution_strategy: How to execute the plans
+            optimizer_strategy: How to find the optimal plan
+            config: Additional configuration parameters
+        """
+        if config is None:
+            config = QueryProcessorConfig()
+
+        # Normalize enum values
+        if isinstance(processing_strategy, str):
+            try:
+                processing_strategy = convert_to_enum(ProcessingStrategyType, processing_strategy)
+            except ValueError as e:
+                raise ValueError(f"""Unsupported processing strategy: {processing_strategy}.
+                                 The supported strategies are: {ProcessingStrategyType.__members__.keys()}""") from e
+        if isinstance(execution_strategy, str):
+            try:
+                execution_strategy = convert_to_enum(ExecutionStrategyType, execution_strategy)
+            except ValueError as e:
+                raise ValueError(f"""Unsupported execution strategy: {execution_strategy}.
+                                 The supported strategies are: {ExecutionStrategyType.__members__.keys()}""") from e
+        if isinstance(optimizer_strategy, str):
+            try:
+                optimizer_strategy = convert_to_enum(OptimizationStrategyType, optimizer_strategy)
+            except ValueError as e:
+                raise ValueError(f"""Unsupported optimizer strategy: {optimizer_strategy}.
+                                 The supported strategies are: {OptimizationStrategyType.__members__.keys()}""") from e
+
+        # initialize an optimizer with the strategy
+        available_models = getattr(config, 'available_models', [])
+        if available_models is None or len(available_models) == 0:
+            available_models = get_models(include_vision=True)
+
+        if config.policy is None:
+            raise ValueError("Policy is required for optimizer")
+        config.available_models = available_models
+
+        optimizer = QueryProcessorFactory._create_optimizer(optimizer_strategy, config)
+
+        # Get the appropriate processor based on strategy combination
+        processor_key = (processing_strategy, execution_strategy)
+        processor_factory = QueryProcessorFactory.PROCESSOR_MAPPING.get(processor_key)
+
+        if processor_factory is None:
+            raise ValueError(f"Unsupported combination of processing strategy {processing_strategy} "
+                             f"and execution strategy {execution_strategy}")
+
+        return processor_factory(datasource, optimizer, config)
+
+
+    # TODO(Jun): All available plans could be generated earlier and outside the Optimizer.
+ @staticmethod + def _create_optimizer(optimizer_strategy: OptimizationStrategyType, config: QueryProcessorConfig) -> Optimizer: + available_models = getattr(config, 'available_models', []) or get_models(include_vision=True) + + if config.policy is None: + raise ValueError("Policy is required for optimizer") + + return Optimizer( + policy=config.policy, + cost_model=CostModel(), + no_cache=config.nocache, + verbose=config.verbose, + available_models=available_models, + allow_bonded_query=config.allow_bonded_query, + allow_conventional_query=config.allow_conventional_query, + allow_code_synth=config.allow_code_synth, + allow_token_reduction=config.allow_token_reduction, + optimization_strategy_type=optimizer_strategy, + use_final_op_quality=config.use_final_op_quality + ) diff --git a/src/palimpzest/query/execution/random_sampling_sentinel_execution.py b/src/palimpzest/query/processor/random_sampling_sentinel_processor.py similarity index 88% rename from src/palimpzest/query/execution/random_sampling_sentinel_execution.py rename to src/palimpzest/query/processor/random_sampling_sentinel_processor.py index 014aff38b..28c82bc87 100644 --- a/src/palimpzest/query/execution/random_sampling_sentinel_execution.py +++ b/src/palimpzest/query/processor/random_sampling_sentinel_processor.py @@ -4,31 +4,28 @@ from typing import Callable import numpy as np - -from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS, OptimizationStrategy +from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS from palimpzest.core.data.dataclasses import ExecutionStats, OperatorStats, PlanStats, RecordOpStats from palimpzest.core.data.datasources import ValidationDataSource from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import SourceRecord from palimpzest.policy import Policy -from palimpzest.query.execution.execution_engine import ExecutionEngine -from palimpzest.query.execution.plan_executors.single_threaded_plan_execution import SequentialSingleThreadPlanExecutor +from palimpzest.query.execution.parallel_execution_strategy import PipelinedParallelExecutionStrategy +from palimpzest.query.execution.single_threaded_execution_strategy import SequentialSingleThreadExecutionStrategy from palimpzest.query.operators.convert import ConvertOp, LLMConvert from palimpzest.query.operators.datasource import CacheScanDataOp, MarshalAndScanDataOp from palimpzest.query.operators.filter import FilterOp, LLMFilter from palimpzest.query.operators.physical import PhysicalOperator from palimpzest.query.operators.retrieve import RetrieveOp -from palimpzest.query.optimizer.cost_model import CostModel, SampleBasedCostModel -from palimpzest.query.optimizer.optimizer import Optimizer +from palimpzest.query.optimizer.cost_model import SampleBasedCostModel from palimpzest.query.optimizer.plan import SentinelPlan +from palimpzest.query.processor.query_processor import QueryProcessor from palimpzest.sets import Set -class RandomSamplingSentinelExecutionEngine(ExecutionEngine): +class RandomSamplingSentinelQueryProcessor(QueryProcessor): """ - This class implements the abstract execute() method from the ExecutionEngine. - This class still needs to be sub-classed by another Execution class which implements - the higher-level execute_plan() method. 
+ """ def __init__( self, @@ -308,8 +305,7 @@ def pick_ensemble_output(self, op_set_record_sets: list[tuple[DataRecordSet, Phy return DataRecordSet(out_records, []) - @staticmethod - def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: + def execute_op_wrapper(self, operator: PhysicalOperator, op_input: DataRecord | list[DataRecord]) -> tuple[DataRecordSet, PhysicalOperator]: """ Wrapper function around operator execution which also and returns the operator. This is useful in the parallel setting(s) where operators are executed by a worker pool, @@ -317,7 +313,7 @@ def execute_op_wrapper(operator: PhysicalOperator, op_input: DataRecord | list[D """ record_set = operator(op_input) - return record_set, operator, op_input + return record_set, operator def execute_op_set(self, candidates, op_set): @@ -330,7 +326,7 @@ def execute_op_set(self, candidates, op_set): futures = [] for candidate in candidates: for operator in op_set: - future = executor.submit(RandomSamplingSentinelExecutionEngine.execute_op_wrapper, operator, candidate) + future = executor.submit(self.execute_op_wrapper, operator, candidate) futures.append(future) # compute output record_set for each (operator, candidate) pair @@ -526,21 +522,8 @@ def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan: """ # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up # initialize the optimizer - optimizer = Optimizer( - policy=policy, - cost_model=CostModel(), - no_cache=True, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=OptimizationStrategy.SENTINEL, - ) - + # TODO: Do we need to re-initialize the optimizer here? 
+ optimizer = self.optimizer.deepcopy_clean_optimizer() # use optimizer to generate sentinel plans sentinel_plans = optimizer.optimize(dataset, policy) sentinel_plan = sentinel_plans[0] @@ -548,7 +531,7 @@ def create_sentinel_plan(self, dataset: Set, policy: Policy) -> SentinelPlan: return sentinel_plan - def execute(self, dataset: Set, policy: Policy): + def execute(self, dry_run: bool = False): execution_start_time = time.time() # for now, enforce that we are using validation data; we can relax this after paper submission @@ -560,10 +543,10 @@ def execute(self, dataset: Set, policy: Policy): self.clear_cached_examples() # create sentinel plan - sentinel_plan = self.create_sentinel_plan(dataset, policy) + sentinel_plan = self.create_sentinel_plan(self.dataset, self.policy) # generate sample execution data - all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, policy) + all_execution_data, plan_stats = self.generate_sample_observations(sentinel_plan, self.policy) # put sentinel plan execution stats into list and prepare list of output records all_plan_stats = [plan_stats] @@ -571,35 +554,13 @@ def execute(self, dataset: Set, policy: Policy): # construct the CostModel with any sample execution data we've gathered cost_model = SampleBasedCostModel(sentinel_plan, all_execution_data, self.verbose, self.exp_name) - - # (re-)initialize the optimizer - optimizer = Optimizer( - policy=policy, - cost_model=cost_model, - no_cache=self.nocache, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=self.optimization_strategy, - use_final_op_quality=self.use_final_op_quality, - ) + optimizer = self.optimizer.deepcopy_clean_optimizer().update_cost_model(cost_model) total_optimization_time = time.time() - execution_start_time # execute plan(s) according to the optimization strategy - if self.optimization_strategy == OptimizationStrategy.CONFIDENCE_INTERVAL: - records, plan_stats = self.execute_confidence_interval_strategy(dataset, policy, optimizer) - all_records.extend(records) - all_plan_stats.extend(plan_stats) - - else: - records, plan_stats = self.execute_strategy(dataset, policy, optimizer) - all_records.extend(records) - all_plan_stats.extend(plan_stats) + records, plan_stats = self._execute_with_optimizer(self.dataset, self.policy, optimizer) + all_records.extend(records) + all_plan_stats.extend(plan_stats) # aggregate plan stats aggregate_plan_stats = self.aggregate_plan_stats(all_plan_stats) @@ -617,20 +578,29 @@ def execute(self, dataset: Set, policy: Policy): return all_records, execution_stats -class RandomSamplingSequentialSingleThreadSentinelExecution(RandomSamplingSentinelExecutionEngine, SequentialSingleThreadPlanExecutor): +class RandomSamplingSentinelSequentialSingleThreadProcessor(RandomSamplingSentinelQueryProcessor, SequentialSingleThreadExecutionStrategy): """ This class performs sentinel execution while executing plans in a sequential, single-threaded fashion. 
""" def __init__(self, *args, **kwargs): - RandomSamplingSentinelExecutionEngine.__init__(self, *args, **kwargs) - SequentialSingleThreadPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = SequentialSingleThreadExecutionStrategy( + scan_start_idx=self.can_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + verbose=self.verbose + ) -class RandomSamplingSequentialParallelSentinelExecution(RandomSamplingSentinelExecutionEngine, SequentialSingleThreadPlanExecutor): +class RandomSamplingSentinelPipelinedProcessor(RandomSamplingSentinelQueryProcessor, PipelinedParallelExecutionStrategy): """ This class performs sentinel execution while executing plans in a pipelined, parallel fashion. """ def __init__(self, *args, **kwargs): - RandomSamplingSentinelExecutionEngine.__init__(self, *args, **kwargs) - # TODO: post-submission, change to parallel plan executor - SequentialSingleThreadPlanExecutor.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.strategy = PipelinedParallelExecutionStrategy( + scan_start_idx=self.can_start_idx, + datadir=self.datadir, + max_workers=self.max_workers, + verbose=self.verbose + ) diff --git a/src/palimpzest/query/execution/streaming_execution.py b/src/palimpzest/query/processor/streaming_processor.py similarity index 80% rename from src/palimpzest/query/execution/streaming_execution.py rename to src/palimpzest/query/processor/streaming_processor.py index c2a78001e..f8974f05c 100644 --- a/src/palimpzest/query/execution/streaming_execution.py +++ b/src/palimpzest/query/processor/streaming_processor.py @@ -4,18 +4,16 @@ from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.schemas import SourceRecord from palimpzest.policy import Policy -from palimpzest.query.execution.execution_engine import ExecutionEngine from palimpzest.query.operators.aggregate import AggregateOp from palimpzest.query.operators.datasource import DataSourcePhysicalOp, MarshalAndScanDataOp from palimpzest.query.operators.filter import FilterOp from palimpzest.query.operators.limit import LimitScanOp -from palimpzest.query.optimizer.cost_model import CostModel -from palimpzest.query.optimizer.optimizer import Optimizer from palimpzest.query.optimizer.plan import PhysicalPlan +from palimpzest.query.processor.query_processor import QueryProcessor from palimpzest.sets import Dataset -class StreamingSequentialExecution(ExecutionEngine): +class StreamingQueryProcessor(QueryProcessor): """This class can be used for a streaming, record-based execution. Results are returned as an iterable that can be consumed by the caller.""" @@ -52,23 +50,9 @@ def generate_plan(self, dataset: Dataset, policy: Policy): self.clear_cached_examples() start_time = time.time() - cost_model = CostModel() - optimizer = Optimizer( - policy=policy, - cost_model=cost_model, - no_cache=self.nocache, - verbose=self.verbose, - available_models=self.available_models, - allow_bonded_query=self.allow_bonded_query, - allow_conventional_query=self.allow_conventional_query, - allow_code_synth=self.allow_code_synth, - allow_token_reduction=self.allow_token_reduction, - allow_rag_reduction=self.allow_rag_reduction, - allow_mixtures=self.allow_mixtures, - optimization_strategy=self.optimization_strategy, - ) - - # Effectively always use the optimal strategy + # TODO: Do we need to re-initialize the optimizer here? 
+ # Effectively always use the optimal strategy + optimizer = self.optimizer.deepcopy_clean_optimizer() plans = optimizer.optimize(dataset, policy) self.plan = plans[0] self.plan_stats = PlanStats(plan_id=self.plan.plan_id) @@ -81,17 +65,18 @@ def generate_plan(self, dataset: Dataset, policy: Policy): self.plan_stats.operator_stats[op_id] = OperatorStats(op_id=op_id, op_name=op_name, op_details=op_details) print("Time for planning: ", time.time() - start_time) self.plan_generated = True + print("Generated plan:\n", self.plan) return self.plan - def execute( - self, - dataset: Dataset, - policy: Policy, - ): + def execute(self, dry_run: bool = False): start_time = time.time() # Always delete cache if not self.plan_generated: - self.generate_plan(dataset, policy) + self.generate_plan(self.dataset, self.policy) + + if dry_run: + yield [], self.plan, self.plan_stats + return input_records = self.get_input_records() for idx, record in enumerate(input_records): @@ -106,7 +91,7 @@ def execute( def get_input_records(self): scan_operator = self.plan.operators[0] datasource = ( - self.datadir.get_registered_dataset(scan_operator.dataset_id) + scan_operator.get_datasource() if isinstance(scan_operator, MarshalAndScanDataOp) else self.datadir.get_cached_result(scan_operator.dataset_id) ) @@ -122,9 +107,9 @@ def get_input_records(self): candidate = DataRecord(schema=SourceRecord, source_id=idx) candidate.idx = idx candidate.get_item_fn = datasource.get_item - records, record_op_stats_lst = scan_operator(candidate) - input_records += records - record_op_stats += record_op_stats_lst + record_set = scan_operator(candidate) + input_records += record_set.data_records + record_op_stats += record_set.record_op_stats op_id = scan_operator.get_op_id() self.plan_stats.operator_stats[op_id].add_record_op_stats( @@ -159,9 +144,9 @@ def execute_opstream(self, plan, record): break else: for r in input_records: - record_out, stats = operator(r) - output_records += record_out - record_op_stats_lst += stats + record_set = operator(r) + output_records += record_set.data_records + record_op_stats_lst += record_set.record_op_stats if isinstance(operator, FilterOp): # delete all records that did not pass the filter diff --git a/src/palimpzest/query/execution/plan_executors/__init__.py b/src/palimpzest/schemabuilder/__init__.py similarity index 100% rename from src/palimpzest/query/execution/plan_executors/__init__.py rename to src/palimpzest/schemabuilder/__init__.py diff --git a/src/palimpzest/schemabuilder/schema_builder.py b/src/palimpzest/schemabuilder/schema_builder.py index b89c149ac..363e45aef 100644 --- a/src/palimpzest/schemabuilder/schema_builder.py +++ b/src/palimpzest/schemabuilder/schema_builder.py @@ -8,13 +8,14 @@ import json import os -import palimpzest.core.lib.fields as pz_fields -import palimpzest.core.lib.schemas as pz_schemas import pandas as pd import yaml +from pyld import jsonld + +import palimpzest.core.lib.fields as pz_fields +import palimpzest.core.lib.schemas as pz_schemas from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import Schema -from pyld import jsonld class SchemaBuilder: diff --git a/src/palimpzest/sets.py b/src/palimpzest/sets.py index c0dccfc9b..9c2ea9d08 100644 --- a/src/palimpzest/sets.py +++ b/src/palimpzest/sets.py @@ -4,6 +4,8 @@ import pandas as pd from typing import Callable +import pandas as pd + from palimpzest.constants import AggFunc, Cardinality from palimpzest.core.data.datasources import DataSource, TextFile from 
palimpzest.core.elements.filters import Filter @@ -12,6 +14,7 @@ from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.utils.hash_helpers import hash_for_id from palimpzest.utils.index_helpers import get_index_str +from palimpzest.query.processor.config import QueryProcessorConfig ##################################################### @@ -270,3 +273,29 @@ def project(self, project_cols: list[str] | str) -> Dataset: project_cols=project_cols if isinstance(project_cols, list) else [project_cols], nocache=self._nocache, ) + + def _processor_hashid(self, config: QueryProcessorConfig, + optimizer_strategy: str = "pareto", + execution_strategy: str = "sequential", + processing_strategy: str = "no_sentinel"): + return hash_for_id(config.to_json_str() + optimizer_strategy + execution_strategy + processing_strategy) + + def run(self, config: QueryProcessorConfig, + optimizer_strategy: str = "pareto", + execution_strategy: str = "sequential", + processing_strategy: str = "no_sentinel"): + + processor_hashid = self._processor_hashid(config, optimizer_strategy, execution_strategy, processing_strategy) + if processor_hashid in self._processor_cache: + processor = self._processor_cache[processor_hashid] + else: + from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory + processor = QueryProcessorFactory.create_processor( + datasource=self, + processing_strategy=processing_strategy, + execution_strategy=execution_strategy, + optimizer_strategy=optimizer_strategy, + config=config + ) + self._processor_cache[processor_hashid] = processor + return processor.execute() diff --git a/src/palimpzest/tools/pdfparser.py b/src/palimpzest/tools/pdfparser.py index ea89b79cf..74235b77f 100644 --- a/src/palimpzest/tools/pdfparser.py +++ b/src/palimpzest/tools/pdfparser.py @@ -215,6 +215,8 @@ def cosmos_client(name: str, data: BinaryIO, output_dir: str, delay=10): # pieces which are related to setting / reading external configurations (like "pdfprocessor"). # However, given that I can fix this in two minutes by adding this is a kwarg, I'm going to # do that for now and revisit the issue if/when this matters. + +# TODO(Jun): 1. cosmos returns 202 for me. 2. why only accept "pypdf" and "cosmos" as pdfprocessor? 
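For context on the Dataset.run() method added to sets.py above: it caches one query processor per combination of config and strategy strings, keyed by _processor_hashid, so repeated runs with identical settings reuse the same processor. A minimal usage sketch follows; the toy DataFrame and the default-constructed QueryProcessorConfig are assumptions, and the actual config fields live in config.py.

    import pandas as pd

    from palimpzest.query.processor.config import QueryProcessorConfig
    from palimpzest.sets import Dataset

    # a small in-memory dataset; any registered data source would work the same way
    dataset = Dataset(pd.DataFrame({"text": ["variable a is 1", "variable b is 2"]}))

    config = QueryProcessorConfig()  # field values omitted here; defaults are assumed to exist
    result = dataset.run(
        config,
        optimizer_strategy="pareto",
        execution_strategy="sequential",
        processing_strategy="no_sentinel",
    )

    # a second call with the same config and strategy strings reuses the processor cached
    # under _processor_hashid(config, ...) instead of rebuilding it via the factory
    result_again = dataset.run(config, optimizer_strategy="pareto")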
def get_text_from_pdf(filename, pdf_bytes, pdfprocessor="cosmos", enable_file_cache=True, file_cache_dir="/tmp"): pdf_filename = filename file_name = os.path.basename(pdf_filename) diff --git a/src/palimpzest/utils/model_helpers.py b/src/palimpzest/utils/model_helpers.py index 926d65537..adee173f5 100644 --- a/src/palimpzest/utils/model_helpers.py +++ b/src/palimpzest/utils/model_helpers.py @@ -35,36 +35,36 @@ def get_models(include_vision: Optional[bool] = False) -> List[Model]: return models - -def get_champion_model(available_models, vision=False): - champion_model = None - - # non-vision - if not vision and Model.GPT_4o in available_models: - champion_model = Model.GPT_4o - elif not vision and Model.GPT_4o_MINI in available_models: - champion_model = Model.GPT_4o_MINI - elif not vision and Model.LLAMA3 in available_models: - champion_model = Model.LLAMA3 - elif not vision and Model.MIXTRAL in available_models: - champion_model = Model.MIXTRAL - - # vision - elif vision and Model.GPT_4o_V in available_models: - champion_model = Model.GPT_4o_V - elif vision and Model.GPT_4o_MINI_V in available_models: - champion_model = Model.GPT_4o_MINI_V - elif vision and Model.LLAMA3_V in available_models: - champion_model = Model.LLAMA3_V - - else: - raise Exception( - "No models available to create physical plans! You must set at least one of the following environment" - " variables: [OPENAI_API_KEY, TOGETHER_API_KEY, GOOGLE_API_KEY]\n" - f"available_models: {available_models}" - ) - - return champion_model +# The order is the priority of the model +TEXT_MODEL_PRIORITY = [ + Model.GPT_4o, + Model.GPT_4o_MINI, + Model.LLAMA3, + Model.MIXTRAL +] + +VISION_MODEL_PRIORITY = [ + Model.GPT_4o_V, + Model.GPT_4o_MINI_V, + Model.LLAMA3_V +] +def get_champion_model(available_models, vision=False): + # Select appropriate priority list based on task + model_priority = VISION_MODEL_PRIORITY if vision else TEXT_MODEL_PRIORITY + + # Return first available model from priority list + for model in model_priority: + if model in available_models: + return model + + # If no suitable model found, raise informative error + task_type = "vision" if vision else "text" + raise Exception( + f"No {task_type} models available to create physical plans!\n" + "You must set at least one of the following environment variables:\n" + "[OPENAI_API_KEY, TOGETHER_API_KEY, GOOGLE_API_KEY]\n" + f"Available models: {available_models}" + ) def get_conventional_fallback_model(available_models, vision=False): diff --git a/tests/pytest/conftest.py b/tests/pytest/conftest.py index 817b7038d..5218017ac 100644 --- a/tests/pytest/conftest.py +++ b/tests/pytest/conftest.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.policy import MaxQuality, MaxQualityAtFixedCost, MinCost, MinCostAtFixedQuality @@ -46,7 +47,6 @@ def workload( request, enron_workload, real_estate_workload, - biofabric_workload, three_converts_workload, one_filter_one_convert_workload, two_converts_two_filters_workload, @@ -55,7 +55,6 @@ def workload( workload_id_to_workload = { "enron-workload": enron_workload, "real-estate-workload": real_estate_workload, - "biofabric-workload": biofabric_workload, "three-converts": three_converts_workload, "one-filter-one-convert": one_filter_one_convert_workload, "two-converts-two-filters": two_converts_two_filters_workload, diff --git a/tests/pytest/fixtures/champion_outputs.py b/tests/pytest/fixtures/champion_outputs.py index c2737ffef..52e37d8a1 100644 --- a/tests/pytest/fixtures/champion_outputs.py +++ 
b/tests/pytest/fixtures/champion_outputs.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import TextFile diff --git a/tests/pytest/fixtures/cost_est_data.py b/tests/pytest/fixtures/cost_est_data.py index 964f2e30b..9d68c61a2 100644 --- a/tests/pytest/fixtures/cost_est_data.py +++ b/tests/pytest/fixtures/cost_est_data.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.core.data.dataclasses import RecordOpStats diff --git a/tests/pytest/fixtures/datasets.py b/tests/pytest/fixtures/datasets.py index 879ed016a..87caf0931 100644 --- a/tests/pytest/fixtures/datasets.py +++ b/tests/pytest/fixtures/datasets.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest + from palimpzest.core.data.datasources import UserSource from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.fields import ListField, NumericField, StringField diff --git a/tests/pytest/fixtures/execution_data.py b/tests/pytest/fixtures/execution_data.py index 4b2ec7853..5773f4b01 100644 --- a/tests/pytest/fixtures/execution_data.py +++ b/tests/pytest/fixtures/execution_data.py @@ -1,6 +1,7 @@ import re import pytest + from palimpzest.constants import Model from palimpzest.core.data.dataclasses import RecordOpStats from palimpzest.core.elements.records import DataRecord, DataRecordSet diff --git a/tests/pytest/fixtures/expected_cost_est_results.py b/tests/pytest/fixtures/expected_cost_est_results.py index 2c515d1bc..64b256a53 100644 --- a/tests/pytest/fixtures/expected_cost_est_results.py +++ b/tests/pytest/fixtures/expected_cost_est_results.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.query.operators.convert import ConvertOp from palimpzest.query.operators.datasource import MarshalAndScanDataOp diff --git a/tests/pytest/fixtures/expected_physical_plans.py b/tests/pytest/fixtures/expected_physical_plans.py index 102517b1a..04b8bc373 100644 --- a/tests/pytest/fixtures/expected_physical_plans.py +++ b/tests/pytest/fixtures/expected_physical_plans.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.core.data.dataclasses import PlanCost from palimpzest.core.elements.filters import Filter diff --git a/tests/pytest/fixtures/expected_qualities.py b/tests/pytest/fixtures/expected_qualities.py index 8bc1ddde2..5d1f30b71 100644 --- a/tests/pytest/fixtures/expected_qualities.py +++ b/tests/pytest/fixtures/expected_qualities.py @@ -1,6 +1,7 @@ import re import pytest + from palimpzest.constants import Model diff --git a/tests/pytest/fixtures/expected_records.py b/tests/pytest/fixtures/expected_records.py index 871167b9a..ee6e98718 100644 --- a/tests/pytest/fixtures/expected_records.py +++ b/tests/pytest/fixtures/expected_records.py @@ -1,6 +1,7 @@ import os import pytest + from palimpzest.constants import Model from palimpzest.core.elements.records import DataRecord, DataRecordSet from palimpzest.core.lib.schemas import File diff --git a/tests/pytest/fixtures/operator_to_stats.py b/tests/pytest/fixtures/operator_to_stats.py index 2212c927d..3b1915af7 100644 --- a/tests/pytest/fixtures/operator_to_stats.py +++ b/tests/pytest/fixtures/operator_to_stats.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Model from palimpzest.core.elements.filters import Filter from palimpzest.core.lib.schemas import TextFile diff --git 
a/tests/pytest/fixtures/physical_plans.py b/tests/pytest/fixtures/physical_plans.py index 853ac4b61..97732ab58 100644 --- a/tests/pytest/fixtures/physical_plans.py +++ b/tests/pytest/fixtures/physical_plans.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.constants import Cardinality, Model from palimpzest.core.elements.filters import Filter from palimpzest.core.lib.schemas import File, Schema, StringField, TextFile diff --git a/tests/pytest/fixtures/schemas.py b/tests/pytest/fixtures/schemas.py index 8cec695d5..6b1c0d526 100644 --- a/tests/pytest/fixtures/schemas.py +++ b/tests/pytest/fixtures/schemas.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.core.lib.fields import BooleanField, Field, ImageFilepathField, ListField, NumericField, StringField from palimpzest.core.lib.schemas import Schema, TextFile diff --git a/tests/pytest/fixtures/side_effects.py b/tests/pytest/fixtures/side_effects.py index 6f5038236..f501eb37c 100644 --- a/tests/pytest/fixtures/side_effects.py +++ b/tests/pytest/fixtures/side_effects.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.core.data.dataclasses import GenerationStats diff --git a/tests/pytest/fixtures/workloads.py b/tests/pytest/fixtures/workloads.py index ac93c1e9d..b84297ac0 100644 --- a/tests/pytest/fixtures/workloads.py +++ b/tests/pytest/fixtures/workloads.py @@ -1,8 +1,7 @@ import pytest -from palimpzest.constants import Cardinality -from palimpzest.core.lib.schemas import Table, TextFile, XLSFile + +from palimpzest.core.lib.schemas import TextFile from palimpzest.sets import Dataset -from palimpzest.utils import udfs ### UDFs ### @@ -70,18 +69,6 @@ def real_estate_workload( return listings -@pytest.fixture -def biofabric_workload(biofabric_tiny, case_data_schema): - xls = Dataset(biofabric_tiny, schema=XLSFile) - # patient_tables = xls.convert( - # pz.Table, desc="All tables in the file", cardinality=pz.Cardinality.ONE_TO_MANY) - patient_tables = xls.convert(Table, udf=udfs.xls_to_tables, cardinality=Cardinality.ONE_TO_MANY) - patient_tables = patient_tables.filter("The rows of the table contain the patient age") - case_data = patient_tables.convert( - case_data_schema, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY - ) - return case_data - @pytest.fixture def three_converts_workload(enron_eval_tiny, email_schema, foobar_schema, baz_schema): # construct plan with three converts diff --git a/tests/pytest/test_convert.py b/tests/pytest/test_convert.py index 0e0981495..5280ef62f 100644 --- a/tests/pytest/test_convert.py +++ b/tests/pytest/test_convert.py @@ -23,8 +23,15 @@ load_env() -@pytest.mark.parametrize("convert_op", [LLMConvertBonded, LLMConvertConventional]) -def test_convert(convert_op, email_schema, enron_eval_tiny): +@pytest.mark.parametrize( + argnames=("convert_op", "side_effect"), + argvalues=[ + pytest.param(LLMConvertBonded, "enron-convert", id="bonded-llm-convert"), + pytest.param(LLMConvertConventional, "enron-convert", id="conventional-llm-convert"), + ], + indirect=["side_effect"], +) +def test_convert(mocker, convert_op, side_effect, email_schema, enron_eval_tiny): """Test whether convert operators""" model = Model.GPT_4o scan_op = MarshalAndScanDataOp(output_schema=TextFile, dataset_id=enron_eval_tiny) @@ -34,18 +41,23 @@ def test_convert(convert_op, email_schema, enron_eval_tiny): model=model, prompt_strategy=PromptStrategy.COT_QA, ) + + # mock out calls to generators used by the plans which parameterize this test + mocker.patch.object(LLMConvertBonded, "convert", side_effect=side_effect) + 
mocker.patch.object(LLMConvertConventional, "convert", side_effect=side_effect) datasource = DataDirectory().get_registered_dataset(enron_eval_tiny) candidate = DataRecord(schema=File, source_id=0) candidate.idx = 0 candidate.get_item_fn = datasource.get_item - # run DataSourcePhysicalOp on record - outputs = [] - record_set = scan_op(candidate) - for record in record_set: - output = convert_op(record) - outputs.extend(output.data_records) + # run scan and convert operators + record_op_stats_lst, outputs = [], [] + for record in scan_op(candidate): + record_set = convert_op(record) + record_op_stats_lst.extend(record_set.record_op_stats) + outputs.extend(record_set.data_records) - for record in outputs: - print(record.sender, record.subject) + assert len(outputs) == 1 + assert outputs[0].schema == email_schema.union(TextFile) + assert sorted(outputs[0].get_field_names()) == ["contents", "filename", "sender", "subject"] diff --git a/tests/pytest/test_cost_model.py b/tests/pytest/test_cost_model.py index d6b3a1c21..927b560ad 100644 --- a/tests/pytest/test_cost_model.py +++ b/tests/pytest/test_cost_model.py @@ -1,4 +1,5 @@ import pytest + from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.query.optimizer.cost_model import CostModel from palimpzest.utils.model_helpers import get_models diff --git a/tests/pytest/test_datasource.py b/tests/pytest/test_datasource.py new file mode 100644 index 000000000..ce7ad00d9 --- /dev/null +++ b/tests/pytest/test_datasource.py @@ -0,0 +1,73 @@ +import pytest +from palimpzest.core.data.datasources import MemorySource +from palimpzest.core.elements.records import DataRecord +from palimpzest.core.lib.schemas import Schema, List +from palimpzest.query.operators.datasource import MarshalAndScanDataOp +from palimpzest.core.lib.schemas import SourceRecord + +def test_marshal_and_scan_memory_source(): + # Create test data + test_data = ["test1", "test2", "test3"] + + # Create MemorySource with test data + memory_source = MemorySource(test_data, dataset_id="test_dataset") + + # Create MarshalAndScanDataOp + op = MarshalAndScanDataOp(output_schema=List, dataset_id="test_dataset") + + current_scan_idx = 0 + candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx) + candidate.idx = current_scan_idx + candidate.get_item_fn = memory_source.get_item + + # Execute the operator + result = op(candidate) + + assert len(result.data_records) == 1 + assert result.data_records[0].value == "test1" + + # Test stats + assert len(result.record_op_stats) == 1 + stats = result.record_op_stats[0] + assert stats.op_name == "MarshalAndScanDataOp" + assert stats.op_details["dataset_id"] == "test_dataset" + assert stats.time_per_record > 0 + assert stats.cost_per_record == 0.0 + +# def test_marshal_and_scan_memory_source_multiple_records(): +# # Test with numeric data +# test_data = [1, 2, 3, 4, 5] +# memory_source = MemorySource(test_data, schema=List, dataset_id="test_numbers") + +# op = MarshalAndScanDataOp(dataset_id="test_numbers") + +# # Test each index +# for idx in range(len(test_data)): +# mock_record = DataRecord(Schema()) +# mock_record.idx = idx +# mock_record.get_item_fn = memory_source.get_item + +# result = op(mock_record) + +# # Verify results +# assert len(result.records) == 1 +# assert result.records[0].value == test_data[idx] +# assert len(result.record_op_stats) == 1 + +# def test_marshal_and_scan_empty_source(): +# # Test with empty data +# memory_source = MemorySource([], schema=List, dataset_id="empty_dataset") + +# op = 
MarshalAndScanDataOp(dataset_id="empty_dataset") + +# mock_record = DataRecord(Schema()) +# mock_record.idx = 0 +# mock_record.get_item_fn = memory_source.get_item + +# # Should raise IndexError when trying to access empty source +# with pytest.raises(IndexError): +# op(mock_record) + + + + diff --git a/tests/pytest/test_datasources.py b/tests/pytest/test_datasources.py new file mode 100644 index 000000000..9d5fd168a --- /dev/null +++ b/tests/pytest/test_datasources.py @@ -0,0 +1,146 @@ +# write tests for src/palimpzest/core/data/datasources.py + +import os +import pytest +import pandas as pd +from palimpzest.core.data.datasources import ( + FileSource, + TextFileDirectorySource, + ImageFileDirectorySource, + MemorySource, + HTMLFileDirectorySource +) +from palimpzest.core.lib.fields import ListField +from palimpzest.core.elements.records import DataRecord +from palimpzest.core.lib.schemas import List, Schema, Number, File, TextFile, WebPage, ImageFile + +@pytest.fixture +def temp_text_file(): + file_path = "testdata/tmp_test.txt" + with open(file_path, "w") as f: + f.write("Hello, World!") + return file_path + +@pytest.fixture +def temp_text_dir(): + dir_path = "testdata/text_dir" + os.makedirs(dir_path, exist_ok=True) + with open(dir_path + "/file1.txt", "w") as f: + f.write("Content 1") + with open(dir_path + "/file2.txt", "w") as f: + f.write("Content 2") + return dir_path + +@pytest.fixture +def list_values(): + return [1, 2, 3, 4] + +@pytest.fixture +def df_values(): + return pd.DataFrame({"a": [10, 20, 30, 40], "b": [50, 60, 70, 80]}) + + +def test_file_source_initialization(temp_text_file): + source = FileSource(temp_text_file, "test_dataset") + assert source.filepath == temp_text_file + assert source.dataset_id == "test_dataset" + assert source.schema == File + +def test_file_source(temp_text_file): + source = FileSource(temp_text_file, "test_dataset") + record = source.get_item(0) + + assert isinstance(record, DataRecord) + assert record.filename == temp_text_file + assert record.contents == b"Hello, World!" 
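+    # a FileSource wraps exactly one file, so its length is 1; copy() should preserve the path, dataset_id, and schema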
+ assert len(source) == 1 + + copied = source.copy() + assert copied.filepath == source.filepath + assert copied.dataset_id == source.dataset_id + assert copied.schema == source.schema + +def test_text_directory_source(temp_text_dir): + source = TextFileDirectorySource(temp_text_dir, "test_dataset") + assert len(source) == 2 + assert source.schema == TextFile + + record = source.get_item(0) + assert isinstance(record, DataRecord) + assert record.contents == "Content 1" + + record = source.get_item(1) + assert record.contents == "Content 2" + +def test_memory_source_list(list_values): + source = MemorySource(list_values, dataset_id="test_memory") + assert len(source) == len(list_values) + assert source.dataset_id == "test_memory" + + record = source.get_item(0) + assert record.value == list_values[0] + record = source.get_item(3) + assert record.value == list_values[3] + copied = source.copy() + assert copied.vals == source.vals + assert copied.dataset_id == source.dataset_id + +def test_memory_source_df(df_values): + source = MemorySource(df_values, dataset_id="test_memory") + assert len(source) == len(df_values) + assert source.dataset_id == "test_memory" + + record = source.get_item(0) + assert record.a == df_values.iloc[0]['a'] + assert record.b == df_values.iloc[0]['b'] + + copied = source.copy() + assert copied.vals.equals(source.vals) + assert copied.dataset_id == source.dataset_id + + +def test_memory_source_copy(): + values = [1, 2, 3] + source = MemorySource(values, dataset_id="test_memory") + copied = source.copy() + + assert copied.vals == source.vals + assert copied.dataset_id == source.dataset_id + +@pytest.fixture +def temp_html_dir(tmp_path): + dir_path = tmp_path / "html_files" + dir_path.mkdir() + html_content = """ + + + Example Link +

+        Some text
+ + + """ + (dir_path / "page1.html").write_text(html_content) + return str(dir_path) + +def test_html_directory_source(temp_html_dir): + source = HTMLFileDirectorySource(temp_html_dir, "test_dataset") + assert len(source) == 1 + assert source.schema == WebPage + + record = source.get_item(0) + assert isinstance(record, DataRecord) + assert "Example Link (http://example.com)" in record.text + assert "" in record.html + +def test_invalid_directory(): + with pytest.raises(FileNotFoundError): + ImageFileDirectorySource("/nonexistent/path", "test_dataset") + +def test_source_serialization(temp_text_file): + source = FileSource(temp_text_file, "test_dataset") + serialized = source.serialize() + + assert "schema" in serialized + assert "path" in serialized + assert "source_type" in serialized + assert serialized["source_type"] == "file" \ No newline at end of file diff --git a/tests/pytest/test_dynamicschema.py b/tests/pytest/test_dynamicschema.py index ed9916304..6a7721941 100644 --- a/tests/pytest/test_dynamicschema.py +++ b/tests/pytest/test_dynamicschema.py @@ -1,12 +1,12 @@ """This testing class tests whether we can run a workload by defining a schema dynamically.""" - from palimpzest.constants import Model from palimpzest.core.lib.schemas import TextFile from palimpzest.policy import MinCost from palimpzest.query.execution.execute import Execute from palimpzest.query.execution.nosentinel_execution import NoSentinelSequentialSingleThreadExecution +from palimpzest.query.operators.convert import LLMConvertBonded +from palimpzest.query.operators.filter import LLMFilter from palimpzest.schemabuilder.schema_builder import SchemaBuilder -from palimpzest.sets import Dataset data_path = "tests/pytest/data/" @@ -19,22 +19,18 @@ def test_dynamicschema_csv(): clinical_schema = SchemaBuilder.from_file(data_path + "/synapse_schema.csv", schema_type=TextFile) assert clinical_schema is not None -def test_dynamicschema_json(): + +def test_dynamicschema_json(mocker, enron_workload, enron_convert, enron_filter): email_schema = SchemaBuilder.from_file(data_path + "/email_schema.json") assert email_schema is not None assert issubclass(email_schema, TextFile) - dataset_id = "enron-eval-tiny" - emails = Dataset(dataset_id, schema=email_schema) - emails = emails.filter( - 'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")' - ) - emails = emails.filter( - "The email is not quoting from a news article or an article written by someone outside of Enron" - ) + # mock out calls to generators used by the plans which parameterize this test + mocker.patch.object(LLMFilter, "filter", side_effect=enron_filter) + mocker.patch.object(LLMConvertBonded, "convert", side_effect=enron_convert) - records, stats = Execute( - emails, + records, _ = Execute( + enron_workload, policy=MinCost(), available_models=[Model.GPT_4o_MINI], num_samples=3, @@ -51,22 +47,17 @@ def test_dynamicschema_json(): print(rec.to_dict()) -def test_dynamicschema_yml(): +def test_dynamicschema_yml(mocker, enron_workload, enron_convert, enron_filter): email_schema = SchemaBuilder.from_file(data_path + "/email_schema.yml") assert email_schema is not None assert issubclass(email_schema, TextFile) - dataset_id = "enron-eval-tiny" - emails = Dataset(dataset_id, schema=email_schema) - emails = emails.filter( - 'The email refers to a fraudulent scheme (i.e., "Raptor", "Deathstar", "Chewco", and/or "Fat Boy")' - ) - emails = emails.filter( - "The email is not quoting from a news article or an article written by someone 
outside of Enron" - ) + # mock out calls to generators used by the plans which parameterize this test + mocker.patch.object(LLMFilter, "filter", side_effect=enron_filter) + mocker.patch.object(LLMConvertBonded, "convert", side_effect=enron_convert) - records, stats = Execute( - emails, + records, _ = Execute( + enron_workload, policy=MinCost(), available_models=[Model.GPT_4o_MINI], num_samples=3, diff --git a/tests/pytest/test_execution_no_cache.py b/tests/pytest/test_execution_no_cache.py index a895fc4d2..20025a045 100644 --- a/tests/pytest/test_execution_no_cache.py +++ b/tests/pytest/test_execution_no_cache.py @@ -1,6 +1,7 @@ import time import pytest + from palimpzest.datamanager.datamanager import DataDirectory from palimpzest.query.execution.nosentinel_execution import ( NoSentinelPipelinedParallelExecution, diff --git a/tests/pytest/test_optimizer.py b/tests/pytest/test_optimizer.py index 4fcfecbfd..166070137 100644 --- a/tests/pytest/test_optimizer.py +++ b/tests/pytest/test_optimizer.py @@ -1,5 +1,5 @@ import pytest -from palimpzest.constants import Cardinality, Model, OptimizationStrategy +from palimpzest.constants import Cardinality, Model from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost from palimpzest.core.elements.filters import Filter from palimpzest.core.lib.schemas import TextFile @@ -96,8 +96,8 @@ def test_group_id_equality(self, email_schema): @pytest.mark.parametrize( argnames=("opt_strategy",), argvalues=[ - pytest.param(OptimizationStrategy.GREEDY, id="greedy"), - pytest.param(OptimizationStrategy.PARETO, id="pareto"), + pytest.param(OptimizationStrategy.GREEDY, id="greedy"), # TODO: fix + pytest.param(OptimizationStrategy.PARETO, id="pareto"), # TODO: fix ] ) class TestOptimizer: @@ -333,7 +333,7 @@ def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCost # create source_op_estimates for datasources if they are not provided if isinstance(operator, DataSourcePhysicalOp): # get handle to DataSource and pre-compute its size (number of records) - datasource = self.datadir.get_registered_dataset(operator.dataset_id) + datasource = operator.get_datasource() datasource_len = len(datasource) source_op_estimates = OperatorCostEstimates( @@ -388,7 +388,7 @@ def test_pareto_optimization_strategy(self, workload, policy, operator_to_stats, no_cache=True, verbose=True, available_models=[Model.GPT_4o, Model.GPT_4o_MINI, Model.LLAMA3], - optimization_strategy=OptimizationStrategy.PARETO, + optimization_strategy=OptimizationStrategy.PARETO, # TODO: fix # TODO: remove allow_code_synth=False, allow_conventional_query=False, diff --git a/tests/pytest/test_physical.py b/tests/pytest/test_physical.py index 76554f845..35e1dcea6 100644 --- a/tests/pytest/test_physical.py +++ b/tests/pytest/test_physical.py @@ -2,6 +2,10 @@ import os import sys +import pytest +from palimpzest.core.lib.schemas import Schema +from palimpzest.query.operators.physical import PhysicalOperator +from palimpzest.core.lib.fields import StringField, NumericField sys.path.append("./tests/") sys.path.append("./tests/refactor-tests/") @@ -12,6 +16,92 @@ load_env() + +class SimpleSchema(Schema): + name = StringField(desc="The name of the person") + age = NumericField(desc="The age of the person") + +def test_physical_operator_init(): + """Test basic initialization of PhysicalOperator""" + + op = PhysicalOperator( + output_schema=SimpleSchema, + input_schema=SimpleSchema, + depends_on=["op1", "op2"], + logical_op_id="logical1", + verbose=True + ) + + assert 
op.output_schema == SimpleSchema + assert op.input_schema == SimpleSchema + assert op.depends_on == ["op1", "op2"] + assert op.logical_op_id == "logical1" + assert op.verbose is True + +def test_physical_operator_equality(): + """Test equality comparison between PhysicalOperators""" + schema1 = SimpleSchema() + schema2 = SimpleSchema() + + op1 = PhysicalOperator(output_schema=schema1) + op2 = PhysicalOperator(output_schema=schema1) + op3 = PhysicalOperator(output_schema=schema2, verbose=True) + + assert op1 == op2 # Same output schema + assert op1 == op1 # Same instance + assert op1 == op1.copy() # Copy should be equal + assert op1 != op3 # Different parameters + +def test_physical_operator_str(): + """Test string representation of PhysicalOperator""" + + op = PhysicalOperator( + output_schema=SimpleSchema, + input_schema=SimpleSchema + ) + + str_rep = str(op) + assert "SimpleSchema -> PhysicalOperator -> SimpleSchema" in str_rep + assert "age, name" in str_rep + +def test_physical_operator_id_generation(): + """Test operator ID generation and hashing""" + op = PhysicalOperator(output_schema=SimpleSchema) + + # Test that op_id is initially None + assert op.op_id is None + + # Get op_id and verify it's generated + op_id = op.get_op_id() + assert op_id is not None + assert isinstance(op_id, str) + + # Test that subsequent calls return the same id + assert op.get_op_id() == op_id + + # Test that hash is based on op_id + assert hash(op) == int(op_id, 16) + +def test_physical_operator_copy(): + """Test copying of PhysicalOperator""" + original = PhysicalOperator( + output_schema=SimpleSchema, + input_schema=SimpleSchema, + depends_on=["op1"], + logical_op_id="logical1", + verbose=True + ) + + copied = original.copy() + + assert copied is not original # Different instances + assert copied == original # But equal in content + assert copied.get_op_id() == original.get_op_id() # Same op_id + assert copied.depends_on == original.depends_on + assert copied.logical_op_id == original.logical_op_id + assert copied.verbose == original.verbose + + # TODO: uncomment once I understand what is supposed to be happening with # ParallelConvertFromCandidateOp and ParallelFilterCandidateOp (I don't # have these on my branch; possibly came from another branch) diff --git a/tests/pytest/test_records.py b/tests/pytest/test_records.py index 2aaa20231..c4113027f 100644 --- a/tests/pytest/test_records.py +++ b/tests/pytest/test_records.py @@ -1,5 +1,6 @@ import pandas as pd import pytest + from palimpzest.core.elements.records import DataRecord from palimpzest.core.lib.fields import Field from palimpzest.core.lib.schemas import Schema diff --git a/tests/pytest/test_rules.py b/tests/pytest/test_rules.py new file mode 100644 index 000000000..28e9aefad --- /dev/null +++ b/tests/pytest/test_rules.py @@ -0,0 +1,53 @@ +import pytest +from palimpzest.query.optimizer.rules import ( + PushDownFilter, NonLLMConvertRule, LLMConvertBondedRule, + BasicSubstitutionRule, NonLLMFilterRule +) +from palimpzest.query.optimizer.primitives import LogicalExpression, Group +from palimpzest.query.operators.logical import ( + ConvertScan, FilteredScan, BaseScan +) +from palimpzest.query.operators.filter import Filter +from palimpzest.core.lib.schemas import Schema, StringField + + +@pytest.fixture +def schema(): + class SimpleSchema(Schema): + filename = StringField(desc="The filename of the file") + text = StringField(desc="The text of the file") + return SimpleSchema + +@pytest.fixture +def base_scan_op(schema): + return BaseScan( + 
dataset_id="test_dataset", + output_schema=schema + ) + +def test_substitute_methods(base_scan_op): + # Create a logical expression with the BaseScan operator + logical_expr = LogicalExpression( + operator=base_scan_op, + input_group_ids=[], + input_fields=set(), + generated_fields=set(["id", "text"]), + group_id=1 + ) + + # Apply the BasicSubstitutionRule + physical_exprs = BasicSubstitutionRule.substitute(logical_expr, verbose=False) + + # Verify the substitution + assert len(physical_exprs) == 1 + physical_expr = list(physical_exprs)[0] + + # Check that the operator was correctly converted to MarshalAndScanDataOp + assert physical_expr.operator.__class__.__name__ == "MarshalAndScanDataOp" + + # Verify that the important properties were preserved + assert physical_expr.operator.dataset_id == base_scan_op.dataset_id + assert physical_expr.input_group_ids == logical_expr.input_group_ids + assert physical_expr.input_fields == logical_expr.input_fields + assert physical_expr.generated_fields == logical_expr.generated_fields + assert physical_expr.group_id == logical_expr.group_id \ No newline at end of file diff --git a/tests/pytest/test_workloads.py b/tests/pytest/test_workloads.py deleted file mode 100644 index 068c24716..000000000 --- a/tests/pytest/test_workloads.py +++ /dev/null @@ -1,202 +0,0 @@ -import os - -import pandas as pd -import pytest -from palimpzest.policy import MinCost -from palimpzest.query.execution.execute import Execute -from palimpzest.query.execution.nosentinel_execution import ( - NoSentinelPipelinedParallelExecution, - NoSentinelPipelinedSingleThreadExecution, - NoSentinelSequentialSingleThreadExecution, -) -from palimpzest.utils.model_helpers import get_models -from sklearn.metrics import precision_recall_fscore_support - - -def score_biofabric_plans(dataset, records, policy_str=None, reopt=False) -> float: - """ - Computes the results of all biofabric plans - """ - # parse records - matching_columns = [ - "age_at_diagnosis", - "ajcc_pathologic_n", - "ajcc_pathologic_stage", - "ajcc_pathologic_t", - "case_submitter_id", - "ethnicity", - "gender", - "morphology", - "primary_diagnosis", - "race", - "tissue_or_organ_of_origin", - "tumor_focality", - "tumor_grade", - "tumor_largest_dimension_diameter", - "vital_status", - ] - output_rows = [] - for rec in records: - dct = {k: v for k, v in rec.as_dict().items() if k in matching_columns} - filename = os.path.basename(rec.as_dict()["filename"]) - dct["study"] = os.path.basename(filename).split("_")[0] - output_rows.append(dct) - - records_df = pd.DataFrame(output_rows) - - # if not reopt: - # records_df.to_csv( - # f"final-eval-results/{workload}/preds-{plan_idx}.csv", index=False - # ) - # else: - # records_df.to_csv( - # f"final-eval-results/reoptimization/{workload}/{policy_str}.csv", - # index=False, - # ) - - if records_df.empty: - return 0.0 - - output = records_df - index = [x for x in output.columns if x != "study"] - # target_matching = pd.read_csv(os.path.join(f'final-eval-results/{opt}/{workload}/', "target_matching.csv"), index_col=0).reindex(index) - target_matching = pd.read_csv(os.path.join("testdata/", "target_matching.csv"), index_col=0).reindex(index) - - studies = output["study"].unique() - # Group by output by the "study" column and split it into many dataframes indexed by the "study" column - df = pd.DataFrame(columns=target_matching.columns, index=index) - predicted = [] - targets = [] - - for study in studies: - output_study = output[output["study"] == study] - try: - input_df = 
pd.read_excel(os.path.join("testdata/biofabric-matching/", f"{study}.xlsx")) - except Exception: - print("Cannot find the study", study) - targets += [study] * 5 - predicted += ["missing"] * 5 - continue - # for every column in output_study, check which column in input_df is the closest, i.e. the one with the highest number of matching values - for col in matching_columns: - max_matches = 0 - max_col = "missing" - for input_col in input_df.columns: - matches = sum([1 for idx, x in enumerate(output_study[col]) if x == input_df[input_col][idx]]) - if matches > max_matches: - max_matches = matches - max_col = input_col - df.loc[col, study] = max_col - - # build a matrix that has the study on the columns and the predicted column names on the rows - df.fillna("missing", inplace=True) - - targets += list(target_matching[study].values) - predicted += list(df[study].values) - - # print(df) - p, r, f1, sup = precision_recall_fscore_support(targets, predicted, average="micro", zero_division=0) - - return f1 - - -def score_plan(dataset, records, policy_str=None, reopt=False) -> float: - """ - Computes the F1 score of the plan - """ - # special handling for biofabric dataset - if "biofabric" in dataset: - return score_biofabric_plans(dataset, records, policy_str, reopt) - - records_df = pd.DataFrame([rec.as_dict() for rec in records]) - - # save predictions for this plan - # if not reopt: - # records_df.to_csv( - # f"final-eval-results/{dataset}/preds-{plan_idx}.csv", index=False - # ) - # else: - # records_df.to_csv( - # f"final-eval-results/reoptimization/{dataset}/{policy_str}.csv", - # index=False, - # ) - - if records_df.empty: - return 0.0 - - # get lists of predictions and groundtruth answers - preds, targets = None, None - if "enron" in dataset: - preds = records_df.filename.apply(lambda fn: os.path.basename(fn)).tolist() - gt_df = pd.read_csv("testdata/groundtruth/enron-eval-tiny.csv") - targets = list(gt_df[gt_df.label == 1].filename) - elif "real-estate" in dataset: - preds = list(records_df.listing) - gt_df = pd.read_csv("testdata/groundtruth/real-estate-eval-tiny.csv") - targets = list(gt_df[gt_df.label == 1].listing) - - # compute true and false positives - tp, fp = 0, 0 - for pred in preds: - if pred in targets: - tp += 1 - else: - fp += 1 - - # compute false negatives - fn = 0 - for target in targets: - if target not in preds: - fn += 1 - - # compute precision, recall, f1 score - precision = tp / (tp + fp) if tp + fp > 0 else 0.0 - recall = tp / (tp + fn) if tp + fn > 0 else 0.0 - f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0 - - return f1_score - - -@pytest.mark.parametrize( - argnames=("execution_engine"), - argvalues=[ - pytest.param(NoSentinelSequentialSingleThreadExecution, id="seq-single-thread"), - pytest.param(NoSentinelPipelinedSingleThreadExecution, id="pipe-single-thread"), - pytest.param(NoSentinelPipelinedParallelExecution, id="pipe-parallel"), - ], -) -@pytest.mark.parametrize( - argnames=("dataset", "workload"), - argvalues=[ - ("real-estate-eval-tiny", "real-estate-workload"), - ("biofabric-tiny", "biofabric-workload"), - ("enron-eval-tiny", "enron-workload"), - ], - indirect=True, -) -def test_workload(dataset, workload, execution_engine): - # workload_to_dataset_size = {"enron": 1000, "real-estate": 100, "biofabric": 11} - dataset_to_size = {"enron-eval-tiny": 10, "real-estate-eval-tiny": 5, "biofabric-tiny": 3} - dataset_size = dataset_to_size[dataset] - num_samples = int(0.05 * dataset_size) if dataset != 
"biofabric-tiny" else 1 - - available_models = get_models(include_vision=True) - records, stats = Execute( - workload, - policy=MinCost(), - available_models=available_models, - num_samples=num_samples, - nocache=True, - allow_bonded_query=True, - allow_code_synth=False, - allow_token_reduction=False, - execution_engine=execution_engine, - ) - - # NOTE: f1 score calculation will be low for biofabric b/c the - # evaluation function still checks against the full dataset's labels - # print(f"Plan: {result_dict['plan_info']['plan_label']}") - f1_score = score_plan(dataset=dataset, records=records) - print(f" F1: {f1_score}") - print(f" rt: {stats.total_execution_time}") - print(f" $$: {stats.total_execution_cost}")