merging changes from PR #58 in

mitdbg · Jan 23, 2025 · b73c0ba · b73c0ba
2 parents ae9da64 + 5d1f3fa
commit b73c0ba
Show file tree

Hide file tree

Showing 71 changed files with 1,610 additions and 1,007 deletions.
diff --git a/.github/workflows/workflow.yaml b/.github/workflows/workflow.yaml
@@ -0,0 +1,59 @@
+name: PZ Merge Checks
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .
+
+    - name: Download and register testdata
+      run: |
+        pushd testdata
+        wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/enron-eval-tiny.tar.gz
+        wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/real-estate-eval-tiny.tar.gz
+        tar -xzf enron-eval-tiny.tar.gz
+        tar -xzf real-estate-eval-tiny.tar.gz
+        rm *.tar.gz
+        popd
+        pz reg --path testdata/enron-eval-tiny --name enron-eval-tiny
+        pz reg --path testdata/real-estate-eval-tiny --name real-estate-eval-tiny
+
+    - name: Test with pytest
+      env: # Or as an environment variable
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      run: |
+        pip install pytest
+        pytest -v tests/pytest
+  
+  lint-and-format:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+    - name: Install the code linting and formatting tool Ruff
+      run: pip install "ruff>=0.9.0"
+    - name: check version
+      run: ruff --version
+    - name: Lint code with Ruff
+      run: ruff check --output-format=github --target-version=py38
+    - name: Check code formatting with Ruff
+      run: ruff check --no-fix . --target-version=py38
+      continue-on-error: true
diff --git a/demos/askem-var.py b/demos/askem-var.py
@@ -9,6 +9,7 @@
 
 import pandas as pd
 import streamlit as st
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.core.lib.fields import Field
@@ -33,18 +34,26 @@ class Variable(Schema):
     value = Field(desc="The value of the variable, optional, set 'null' if not found")
 
 
+dict_of_excerpts = [
+    {"id": 0, "text": "ne of the few states producing detailed daily reports of COVID-19 confirmed cases, COVID-19 related cumulative hospitalizations, intensive care unit (ICU) admissions, and deaths per county. Likewise, Ohio is a state with marked variation of demographic and geographic attributes among counties along with substantial differences in the capacity of healthcare within the state. Our aim is to predict the spatiotemporal dynamics of the COVID-19 pandemic in relation with the distribution of the capacity of healthcare in Ohio. 2. Methods 2.1. Mathematical model We developed a spatial mathematical model to simulate the transmission dynamics of COVID-19 disease infection and spread. The spatially-explicit model incorporates geographic connectivity information at county level. The Susceptible-Infected-Hospitalized-Recovered- Dead (SIHRD) COVID-19 model classified the population into susceptibles (S), confirmed infections (I), hospitalized and ICU admitted (H), recovered (R) and dead (D). Based on a previous study that identified local air hubs and main roads as important geospatial attributes lio residing in the county. In the second scenario, we used the model to generate projections of the impact of potential easing on the non-pharmaceutical interventions in the critical care capacity of each county in Ohio. We assessed the impact of 50% reduction on the estimated impact of non-pharmaceutical interventions in reducing the hazard rate of infection. Under this scenario we calculated the proportion of ICU \n'"},
+    {"id": 1, "text": "t model incorporates geographic connectivity information at county level. The Susceptible-Infected-Hospitalized-Recovered- Dead (SIHRD) COVID-19 model classified the population into susceptibles (S), confirmed infections (I), hospitalized and ICU admitted (H), recovered (R) and dead (D). Based on a previous study that identified local air hubs and main roads as important geospatial attributes linked to differential COVID-19 related hospitalizations and mortality (Correa-Agudelo et a"}
+]
+
+list_of_strings = ["I have a variable a, the value is 1", "I have a variable b, the value is 2"]
+list_of_numbers = [1, 2, 3, 4, 5]
+
 if __name__ == "__main__":
     run_pz = True
     dataset = "askem"
+    file_path = "testdata/askem-tiny/"
 
     if run_pz:
         # reference, plan, stats = run_workload()
-        excerpts = Dataset(dataset, schema=TextFile)
+        df_input = pd.DataFrame(dict_of_excerpts)
+        excerpts = Dataset(df_input)
         output = excerpts.convert(
-            Variable, desc="A variable used or introduced in the paper snippet", cardinality=Cardinality.ONE_TO_MANY
-        )
-
-        engine = StreamingSequentialExecution
+            Variable, desc="A variable used or introduced in the context", cardinality=Cardinality.ONE_TO_MANY
+        ).filter("The value name is 'a'", depends_on="name")
         policy = MaxQuality()
         engine = StreamingSequentialExecution(
             policy=policy,
@@ -55,8 +64,7 @@ class Variable(Schema):
             allow_bonded_query=True,
         )
         engine.generate_plan(output, policy)
-
-        print(engine.plan)
+        print("Generated plan:\n", engine.plan)
         with st.container():
             st.write("### Executed plan: \n")
             # st.write(" " + str(plan).replace("\n", "  \n "))
@@ -66,9 +74,9 @@ class Variable(Schema):
                 st.write(strop)
 
         input_records = engine.get_input_records()
-        input_df = DataRecord.to_df(input_records)
+        input_df = DataRecord.to_df(input_records, fields_in_schema=True)
         print(input_df)
-        
+
         variables = []
         statistics = []
         start_time = time.time()
@@ -81,9 +89,9 @@ class Variable(Schema):
                 total_plan_time = time.time() - start_time
                 engine.plan_stats.finalize(total_plan_time)
 
-            record_time = time.time()
             statistics.append(engine.plan_stats)
-
+            intermediate_vars = DataRecord.to_df(vars, fields_in_schema=True)
+            print(intermediate_vars)
             for var in vars:
                 # ref.key = ref.first_author.split()[0] + ref.title.split()[0] + str(ref.year)
                 try:
@@ -119,8 +127,8 @@ class Variable(Schema):
                     st.write(" **value:** ", var.value, "\n")
 
         # write variables to a json file with readable format
-        with open(f"askem-variables-{dataset}.json", "w") as f:
-            json.dump(variables, f, indent=4)
+        # with open(f"askem-variables-{dataset}.json", "w") as f:
+        #     json.dump(variables, f, indent=4)
         vars_df = pd.DataFrame(variables)
 
     # G = nx.DiGraph()
@@ -150,7 +158,7 @@ class Variable(Schema):
     #
     # nx.write_gexf(G, "demos/bdf-usecase3.gexf")
 
-    print("References:", vars_df)
+    # print("References:", vars_df)
     # st.write(table.title, table.author, table.abstract)
     # endTime = time.time()
     # print("Elapsed time:", endTime - startTime)
diff --git a/demos/bdf-suite.py b/demos/bdf-suite.py
@@ -10,6 +10,7 @@
 import networkx as nx
 import pandas as pd
 import streamlit as st
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import URL, File, PDFFile, Schema, Table, XLSFile

diff --git a/demos/bdf-usecase3.py b/demos/bdf-usecase3.py
@@ -11,6 +11,7 @@
 import networkx as nx
 import pandas as pd
 import streamlit as st  # type: ignore
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import PDFFile, Schema

diff --git a/demos/biofabric-demo-matching.ipynb b/demos/biofabric-demo-matching.ipynb
@@ -10,6 +10,7 @@
     "import os\n",
     "\n",
     "import pandas as pd  # type: ignore\n",
+    "\n",
     "from palimpzest.constants import Cardinality\n",
     "from palimpzest.core.lib.fields import Field\n",
     "from palimpzest.core.lib.schemas import Schema, Table, XLSFile\n",

diff --git a/demos/demo_core.py b/demos/demo_core.py
@@ -3,13 +3,14 @@
 import os
 
 import pandas as pd
+from tabulate import tabulate
+
 from palimpzest.core.elements.groupbysig import GroupBySig
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import ImageFile, Number, PDFFile, TextFile
 from palimpzest.query import Execute
 from palimpzest.sets import Dataset
-from tabulate import tabulate
 
 
 class ScientificPaper(PDFFile):

diff --git a/demos/fever-demo.py b/demos/fever-demo.py
@@ -4,6 +4,8 @@
 import random
 from pathlib import Path
 
+from ragatouille import RAGPretrainedModel
+
 from palimpzest.constants import Model, OptimizationStrategy
 from palimpzest.core.data.datasources import ValidationDataSource
 from palimpzest.core.elements.records import DataRecord
@@ -21,7 +23,6 @@
 )
 from palimpzest.sets import Dataset
 from palimpzest.utils.model_helpers import get_models
-from ragatouille import RAGPretrainedModel
 
 
 class FeverClaimsSchema(Schema):

diff --git a/demos/image-demo.py b/demos/image-demo.py
@@ -7,13 +7,14 @@
 
 import gradio as gr
 import numpy as np
+from PIL import Image
+
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import ImageFile
 from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.policy import MaxQuality
 from palimpzest.query import Execute, NoSentinelSequentialSingleThreadExecution
 from palimpzest.sets import Dataset
-from PIL import Image
 
 if not os.environ.get("OPENAI_API_KEY"):
     from palimpzest.utils.env_helpers import load_env

diff --git a/demos/optimizer-demo.py b/demos/optimizer-demo.py
@@ -6,6 +6,8 @@
 from pathlib import Path
 
 import datasets
+from ragatouille import RAGPretrainedModel
+
 from palimpzest.constants import Model, OptimizationStrategy
 from palimpzest.core.data.datasources import ValidationDataSource
 from palimpzest.core.elements.records import DataRecord
@@ -25,7 +27,6 @@
 )
 from palimpzest.sets import Dataset
 from palimpzest.utils.model_helpers import get_models
-from ragatouille import RAGPretrainedModel
 
 # Addresses far from MIT; we use a simple lookup like this to make the
 # experiments re-producible w/out needed a Google API key for geocoding lookups

diff --git a/demos/paper-demo.py b/demos/paper-demo.py
@@ -5,22 +5,18 @@
 
 import gradio as gr
 import numpy as np
-from palimpzest.constants import Cardinality, OptimizationStrategy
+from PIL import Image
+
+from palimpzest.constants import Cardinality
 from palimpzest.core.data.datasources import UserSource
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.core.lib.fields import BooleanField, Field, ImageFilepathField, ListField, NumericField, StringField
 from palimpzest.core.lib.schemas import Schema, Table, TextFile, XLSFile
 from palimpzest.datamanager.datamanager import DataDirectory
 from palimpzest.policy import MaxQuality, MinCost, MinTime
-from palimpzest.query import (
-    Execute,
-    NoSentinelPipelinedParallelExecution,
-    NoSentinelPipelinedSingleThreadExecution,
-    NoSentinelSequentialSingleThreadExecution,
-)
+from palimpzest.query.processor.config import QueryProcessorConfig
 from palimpzest.sets import Dataset
 from palimpzest.utils.udfs import xls_to_tables
-from PIL import Image
 
 # Addresses far from MIT; we use a simple lookup like this to make the
 # experiments re-producible w/out needed a Google API key for geocoding lookups
@@ -210,18 +206,6 @@ def get_item(self, idx: int):
         print("Policy not supported for this demo")
         exit(1)
 
-    execution_engine = None
-    executor = args.executor
-    if executor == "sequential":
-        execution_engine = NoSentinelSequentialSingleThreadExecution
-    elif executor == "pipelined":
-        execution_engine = NoSentinelPipelinedSingleThreadExecution
-    elif executor == "parallel":
-        execution_engine = NoSentinelPipelinedParallelExecution
-    else:
-        print("Executor not supported for this demo")
-        exit(1)
-
     if os.getenv("OPENAI_API_KEY") is None and os.getenv("TOGETHER_API_KEY") is None:
         print("WARNING: Both OPENAI_API_KEY and TOGETHER_API_KEY are unset")
 
@@ -261,15 +245,25 @@ def get_item(self, idx: int):
         plan = plan.filter("The rows of the table contain the patient age")
         plan = plan.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)
 
-    # execute pz plan
-    records, execution_stats = Execute(
-        plan,
-        policy,
-        nocache=True,
-        optimization_strategy=OptimizationStrategy.PARETO,
-        execution_engine=execution_engine,
-        verbose=verbose,
-    )
+
+    config = QueryProcessorConfig(nocache=True, policy=policy, max_workers=10)
+    # # Option1: Create a basic processor
+    # # We could pass this process around to different service if needed.
+    # from palimpzest.query.processor.query_processor_factory import QueryProcessorFactory
+    # processor = QueryProcessorFactory.create_processor(
+    #     datasource=plan,
+    #     processing_strategy="no_sentinel",
+    #     execution_strategy="sequential",
+    #     optimizer_strategy="pareto", 
+    #     config=config
+    # )
+    # records, execution_stats = processor.execute()
+
+    # Option2: Use the new interface
+    records, execution_stats = plan.run(config, 
+                                        optimizer_strategy="pareto", 
+                                        execution_strategy="sequential", 
+                                        processing_strategy="no_sentinel")
 
     # save statistics
     if profile:

diff --git a/demos/simple-demo.ipynb b/demos/simple-demo.ipynb
@@ -10,6 +10,7 @@
     "# [Cell 1] - Imports\n",
     "from demo_core import execute_task, format_results_table\n",
     "from IPython.display import HTML, display\n",
+    "\n",
     "from palimpzest.policy import MinCost\n",
     "from palimpzest.query import (\n",
     "    NoSentinelPipelinedParallelExecution,\n",

diff --git a/demos/simple-demo.py b/demos/simple-demo.py
@@ -4,6 +4,7 @@
 import time
 
 from demo_core import execute_task, format_results_table
+
 from palimpzest.policy import MaxQuality, MinCost, MinTime
 from palimpzest.query import (
     NoSentinelPipelinedParallelExecution,

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
     "click>=8.1.7",
     "click-aliases>=1.0.4",
     "colorama>=0.4.6",
-    "fastapi~=0.100.0",
+    "fastapi~=0.115.0",
     "fuzzywuzzy>=0.18.0",
     "google-generativeai>=0.8.0",
     "gradio>=4.20.1",
@@ -46,6 +46,7 @@ dependencies = [
     "pyyaml>=6.0.1",
     "requests>=2.25",
     "requests-html>=0.10.0",
+    "ruff>=0.9.0",
     "scikit-learn>=1.5.2",
     "scipy>=1.9.0",
     "setuptools>=70.1.1",