resolving merge conflicts

mitdbg · Jan 22, 2025 · d1939bb · d1939bb
2 parents 1fe61a2 + e9f4f05
commit d1939bb
Show file tree

Hide file tree

Showing 90 changed files with 949 additions and 1,158 deletions.
diff --git a/.github/workflows/workflow.yaml b/.github/workflows/workflow.yaml
@@ -0,0 +1,59 @@
+name: PZ Merge Checks
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .
+
+    - name: Download and register testdata
+      run: |
+        pushd testdata
+        wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/enron-eval-tiny.tar.gz
+        wget -nc https://people.csail.mit.edu/gerarvit/PalimpzestData/real-estate-eval-tiny.tar.gz
+        tar -xzf enron-eval-tiny.tar.gz
+        tar -xzf real-estate-eval-tiny.tar.gz
+        rm *.tar.gz
+        popd
+        pz reg --path testdata/enron-eval-tiny --name enron-eval-tiny
+        pz reg --path testdata/real-estate-eval-tiny --name real-estate-eval-tiny
+
+    - name: Test with pytest
+      env: # Or as an environment variable
+        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      run: |
+        pip install pytest
+        pytest -v tests/pytest
+  
+  lint-and-format:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+    - name: Install the code linting and formatting tool Ruff
+      run: pip install "ruff>=0.9.0"
+    - name: check version
+      run: ruff --version
+    - name: Lint code with Ruff
+      run: ruff check --output-format=github --target-version=py38
+    - name: Check code formatting with Ruff
+      run: ruff check --no-fix . --target-version=py38
+      continue-on-error: true
diff --git a/README.md b/README.md
@@ -1,21 +1,81 @@
-![pz-banner](logos/palimpzest-cropped.png)
+![pz-banner](src/static/palimpzest-cropped.png)
 
 # Palimpzest (PZ)
-- **Read our (pre-print) paper:** [**read the paper**](https://arxiv.org/pdf/2405.14696)
-- Join our Discord: [discord](https://discord.gg/znFN2baN)
-- Read our short blog post: [read the blog post](https://dsg.csail.mit.edu/projects/palimpzest/)
-- Check out our Colab Demo: [colab demo](https://colab.research.google.com/drive/1zqOxnh_G6eZ8_xax6PvDr-EjMt7hp4R5?usp=sharing)
-- Check out the video: [MIT 2024](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu)
-
-# Getting started
-You can install the Palimpzest package and CLI on your machine by cloning this repository and running:
+[![Paper](https://img.shields.io/badge/Paper-arXiv-red)](https://arxiv.org/pdf/2405.14696)
+[![Blog Post](https://img.shields.io/badge/Website-PZ-green)](https://dsg.csail.mit.edu/projects/palimpzest/)
+[![Colab Demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zqOxnh_G6eZ8_xax6PvDr-EjMt7hp4R5?usp=sharing)
+[![Video](https://img.shields.io/badge/Website-Talk-purple)](https://youtu.be/T8VQfyBiki0?si=eiph57DSEkDNbEIu)
+[![PyPI](https://img.shields.io/pypi/v/palimpzest)](https://pypi.org/project/palimpzest/)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/palimpzest)](https://pypi.org/project/palimpzest/)
+
+## Getting started
+You can find a stable version of the Palimpzest package on PyPI [here](https://pypi.org/project/palimpzest/). To install the package, run:
+```bash
+$ pip install palimpzest
+```
+
+Alternatively, to install the latest version of the package from this repository, you can clone this repository and run the following commands:
 ```bash
 $ git clone git@github.com:mitdbg/palimpzest.git
 $ cd palimpzest
 $ pip install .
 ```
 
-## Downloading test data
+## Quick Start
+The easiest way to get started with Palimpzest is to run the `quickstart.ipynb` Jupyter notebook. We provide a simple use case to showcase the workflow of working with Palimpzest, including registering a dataset, running a workload, and accessing the results.
+To run the notebook, you can use the following command:
+```bash
+$ jupyter notebook
+```
+And then access the notebook from the jupyter interface in your browser at `localhost:8888`.
+
+### Even Quicker Start
+For eager readers, the code in the notebook can be found in the following condensed snippet. However, we do suggest reading the notebook as it contains more insight into each element of the program.
+```python
+import pandas as pd
+import palimpzest.datamanager.datamanager as pzdm
+from palimpzest.sets import Dataset
+from palimpzest.core.lib.fields import Field
+from palimpzest.core.lib.schemas import Schema, TextFile
+from palimpzest.policy import MinCost, MaxQuality
+from palimpzest.query import Execute
+
+# Dataset registration
+dataset_path = "testdata/enron-tiny"
+dataset_name = "enron-tiny"
+pzdm.DataDirectory().register_local_directory(dataset_path, dataset_name)
+
+# Dataset loading
+dataset = Dataset(dataset_name, schema=TextFile)
+
+# Schema definition for the fields we wish to compute
+class Email(Schema):
+    """Represents an email, which in practice is usually from a text file"""
+    sender = Field(desc="The email address of the sender")
+    subject = Field(desc="The subject of the email")
+    date = Field(desc="The date the email was sent")
+
+# Lazy construction of computation to filter for emails about holidays sent in July
+dataset = dataset.convert(Email, desc="An email from the Enron dataset")
+dataset = dataset.filter("The email was sent in July")
+dataset = dataset.filter("The email is about holidays")
+
+# Executing the computation
+policy = MinCost()
+results, execution_stats = Execute(dataset, policy)
+
+# Writing output to disk
+output_df = pd.DataFrame([r.as_dict() for r in results])[["date","sender","subject"]]
+output_df.to_csv("july_holiday_emails.csv")
+```
+
+## Palimpzest CLI
+Installing Palimpzest also installs its CLI tool `pz` which provides users with basic utilities at the command line for creating and managing their own Palimpzest system. Please read the readme in [src/cli/README.md](./src/cli/README.md) for instructions on how to use it.
+
+## Python Demos
+Below are simple instructions to run PZ on a test data set of Enron emails that is included with the system.
+
+### Downloading test data
 To run the provided demos, you will need to download the test data. Due to the size of the data, we are unable to include it in the repository. You can download the test data by running the following command from a unix terminal (requires `wget` and `tar`):
 ```
 chmod +x testdata/download-testdata.sh
@@ -27,11 +87,7 @@ chmod +x testdata/register-sources.sh
 ./testdata/register-sources.sh
 ```
 
-
-## Python Demos
-#### NOTE: we are in the process of refactoring our demos; please reach out to us or create an issue if you'd like support getting started
-Below are simple instructions to run pz on a test data set of enron emails that is included with the system:
-
+### Running the Demos
 - Initialize the configuration by running `pz init`.
 
 - Palimpzest defaults to using OpenAI. You’ll need to export an environment variable `OPENAI_API_KEY`
@@ -41,177 +97,3 @@ Below are simple instructions to run pz on a test data set of enron emails that
 
 - Finally, run the simple test program with:
       `python demos/simpleDemo.py --task enron --datasetid enron-eval-tiny --verbose`
-
-- If you would like to try running our in our execution mode which first optimizes on a subset of the data, you can run:
-      `python demos/optimizerDemo.py --verbose --workload enron --datasetid enron-eval-tiny --executor sequential-mab --sample-budget 15 --exp-name demo` 
-
-
-
-## Palimpzest CLI
-Installing Palimpzest also installs its CLI tool `pz` which provides users with basic utilities for creating and managing their own Palimpzest system. Running `pz --help` diplays an overview of the CLI's commands:
-```bash
-$ pz --help
-Usage: pz [OPTIONS] COMMAND [ARGS]...
-
-  The CLI tool for Palimpzest.
-
-Options:
-  --help  Show this message and exit.
-
-Commands:
-  help (h)                        Print the help message for PZ.
-  init (i)                        Initialize data directory for PZ.
-  ls-data (ls,lsdata)             Print a table listing the datasets
-                                  registered with PZ.
-  register-data (r,reg,register)  Register a data file or data directory with
-                                  PZ.
-  rm-data (rm,rmdata)             Remove a dataset that was registered with
-                                  PZ.
-```
-
-Users can initialize their own system by running `pz init`. This will create Palimpzest's working directory in `~/.palimpzest`:
-```bash
-$ pz init
-Palimpzest system initialized in: /Users/matthewrusso/.palimpzest
-```
-
-If we list the set of datasets registered with Palimpzest, we'll see there currently are none:
-```bash
-$ pz ls
-+------+------+------+
-| Name | Type | Path |
-+------+------+------+
-+------+------+------+
-
-Total datasets: 0
-```
-
-### Registering Datasets
-To add (or "register") a dataset with Palimpzest, we can use the `pz register-data` command (also aliased as `pz reg`) to specify that a file or directory at a given `--path` should be registered as a dataset with the specified `--name`:
-```bash
-$ pz reg --path README.md --name rdme
-Registered rdme
-```
-
-If we list Palimpzest's datasets again we will see that `README.md` has been registered under the dataset named `rdme`:
-```bash
-$ pz ls
-+------+------+------------------------------------------+
-| Name | Type |                   Path                   |
-+------+------+------------------------------------------+
-| rdme | file | /Users/matthewrusso/palimpzest/README.md |
-+------+------+------------------------------------------+
-
-Total datasets: 1
-```
-
-To remove a dataset from Palimpzest, simply use the `pz rm-data` command (also aliased as `pz rm`) and specify the `--name` of the dataset you would like to remove:
-```bash
-$ pz rm --name rdme
-Deleted rdme
-```
-
-Finally, listing our datasets once more will show that the dataset has been deleted:
-```bash
-$ pz ls
-+------+------+------+
-| Name | Type | Path |
-+------+------+------+
-+------+------+------+
-
-Total datasets: 0
-```
-
-### Cache Management
-Palimpzest will cache intermediate results by default. It can be useful to remove them from the cache when trying to evaluate the performance improvement(s) of code changes. We provide a utility command `pz clear-cache` (also aliased as `pz clr`) to clear the cache:
-```bash
-$ pz clr
-Cache cleared
-```
-
-### Config Management
-You may wish to work with multiple configurations of Palimpzest in order to, e.g., evaluate the difference in performance between various LLM services for your data extraction task. To see the config Palimpzest is currently using, you can run the `pz print-config` command (also aliased as `pz config`):
-```bash
-$ pz config
---- default ---
-filecachedir: /some/local/filepath
-llmservice: openai
-name: default
-parallel: false
-```
-By default, Palimpzest uses the configuration named `default`. As shown above, if you run a script using Palimpzest out-of-the-box, it will use OpenAI endpoints for all of its API calls.
-
-Now, let's say you wanted to try using [together.ai's](https://www.together.ai/) for your API calls, you could do this by creating a new config with the `pz create-config` command (also aliased as `pz cc`):
-```bash
-$ pz cc --name together-conf --llmservice together --parallel True --set
-Created and set config: together-conf
-```
-The `--name` parameter is required and specifies the unique name for your config. The `--llmservice` and `--parallel` options specify the service to use and whether or not to process files in parallel. Finally, if the `--set` flag is present, Palimpzest will update its current config to point to the newly created config.
-
-We can confirm that Palimpzest checked out our new config by running `pz config`:
-```bash
-$ pz config
---- together-conf ---
-filecachedir: /some/local/filepath
-llmservice: together
-name: together-conf
-parallel: true
-```
-
-You can switch which config you are using at any time by using the `pz set-config` command (also aliased as `pz set`):
-```bash
-$ pz set --name default
-Set config: default
-
-$ pz config
---- default ---
-filecachedir: /some/local/filepath
-llmservice: openai
-name: default
-parallel: false
-
-$ pz set --name together-conf
-Set config: together-conf
-
-$ pz config
---- together-conf ---
-filecachedir: /some/local/filepath
-llmservice: together
-name: together-conf
-parallel: true
-```
-
-You can update an existing config using the `pz update` command (also aliased as `pz uc`):
-```bash
-$ pz update --name default --settings parallel=true,pdfprocessor=pdfplumber
-Updated config: default
-
-$ pz config
---- default ---
-filecachedir: /some/local/filepath
-llmservice: anthropic
-name: default
-parallel: true
-pdfprocessor: pdfplumber
-```
-
-The `--name` parameter specifies which config to update. `--settings` specifies all the parameter name and value pairs in the format `param_name=param_value`, separated by commas.
-
-
-Finally, you can delete a config with the `pz rm-config` command (also aliased as `pz rmc`):
-```bash
-$ pz rmc --name together-conf
-Deleted config: together-conf
-```
-Note that you cannot delete the `default` config, and if you delete the config that you currently have set, Palimpzest will set the current config to be `default`.
-
-## Configuring for Parallel Execution
-
-There are a few things you need to do in order to use remote parallel services.
-
-If you want to use parallel LLM execution on together.ai, you have to modify the config.yaml (by default, Palimpzest uses `~/.palimpzest/config_default.yaml`) so that `llmservice: together` and `parallel: True` are set.
-
-If you want to use parallel PDF processing at modal.com, you have to:
-1. Set `pdfprocessor: modal` in the config.yaml file.
-2. Run `modal deploy src/palimpzest/tools/allenpdf.py`.  This will remotely install the modal function so you can run it. (Actually, it's probably already installed there, but do this just in case.  Also do it if there's been a change to the server-side function inside that file.)
-
diff --git a/demos/askem-var.py b/demos/askem-var.py
@@ -9,6 +9,7 @@
 
 import pandas as pd
 import streamlit as st
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.core.lib.fields import Field

diff --git a/demos/bdf-suite.py b/demos/bdf-suite.py
@@ -10,6 +10,7 @@
 import networkx as nx
 import pandas as pd
 import streamlit as st
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import URL, File, PDFFile, Schema, Table, XLSFile

diff --git a/demos/bdf-usecase3.py b/demos/bdf-usecase3.py
@@ -11,6 +11,7 @@
 import networkx as nx
 import pandas as pd
 import streamlit as st  # type: ignore
+
 from palimpzest.constants import Cardinality
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import PDFFile, Schema

diff --git a/demos/biofabric-demo-matching.ipynb b/demos/biofabric-demo-matching.ipynb
@@ -10,6 +10,7 @@
     "import os\n",
     "\n",
     "import pandas as pd  # type: ignore\n",
+    "\n",
     "from palimpzest.constants import Cardinality\n",
     "from palimpzest.core.lib.fields import Field\n",
     "from palimpzest.core.lib.schemas import Schema, Table, XLSFile\n",

diff --git a/demos/demo_core.py b/demos/demo_core.py
@@ -3,13 +3,14 @@
 import os
 
 import pandas as pd
+from tabulate import tabulate
+
 from palimpzest.core.elements.groupbysig import GroupBySig
 from palimpzest.core.elements.records import DataRecord
 from palimpzest.core.lib.fields import Field
 from palimpzest.core.lib.schemas import ImageFile, Number, PDFFile, TextFile
 from palimpzest.query import Execute
 from palimpzest.sets import Dataset
-from tabulate import tabulate
 
 
 class ScientificPaper(PDFFile):