unit tests passing

mdr223 · mdr223 · commit 80e53a2316bb · 2025-01-23T23:19:36.000-05:00
diff --git a/src/palimpzest/query/processor/__init__.py b/src/palimpzest/query/processor/__init__.py
diff --git a/src/palimpzest/utils/model_helpers.py b/src/palimpzest/utils/model_helpers.py
@@ -1,10 +1,9 @@
 import os
-from typing import List, Optional
 
 from palimpzest.constants import Model
 
 
-def get_vision_models() -> List[Model]:
+def get_vision_models() -> list[Model]:
     """
     Return the set of vision models which the system has access to based on the set of environment variables.
     """
@@ -18,7 +17,7 @@ def get_vision_models() -> List[Model]:
     return models
 
 
-def get_models(include_vision: Optional[bool] = False) -> List[Model]:
+def get_models(include_vision: bool = False) -> list[Model]:
     """
     Return the set of models which the system has access to based on the set environment variables.
     """
diff --git a/tests/pytest/test_cost_model.py b/tests/pytest/test_cost_model.py
@@ -52,12 +52,8 @@ def test_compute_operator_estimates(
     )
     def test_estimate_plan_cost(self, simple_plan_sample_execution_data, physical_plan, expected_cost_est_results):
         # register a fake dataset
-        dataset_id = "foobar"
         vals = [1, 2, 3, 4, 5, 6]
-        DataDirectory().register_dataset(
-            vals=vals,
-            dataset_id=dataset_id,
-        )
+        DataDirectory().get_or_register_memory_source(vals=vals)
         input_cardinality = len(vals)
 
         # TODO: if we test with a plan other than the simple test plan; this will break
diff --git a/tests/pytest/test_datasource.py b/tests/pytest/test_datasource.py
@@ -1,9 +1,13 @@
-import pytest
 from palimpzest.core.data.datasources import MemorySource
 from palimpzest.core.elements.records import DataRecord
-from palimpzest.core.lib.schemas import Schema, List
+from palimpzest.core.lib.fields import Field
+from palimpzest.core.lib.schemas import Schema, SourceRecord
 from palimpzest.query.operators.datasource import MarshalAndScanDataOp
-from palimpzest.core.lib.schemas import SourceRecord
+
+
+class List(Schema):
+    value = Field(desc="List item")
+
 
 def test_marshal_and_scan_memory_source():
     # Create test data
diff --git a/tests/pytest/test_datasources.py b/tests/pytest/test_datasources.py
@@ -1,18 +1,20 @@
 # write tests for src/palimpzest/core/data/datasources.py
 
 import os
-import pytest
+
 import pandas as pd
+import pytest
+
 from palimpzest.core.data.datasources import (
-    FileSource, 
-    TextFileDirectorySource,
+    FileSource,
+    HTMLFileDirectorySource,
     ImageFileDirectorySource,
     MemorySource,
-    HTMLFileDirectorySource
+    TextFileDirectorySource,
 )
-from palimpzest.core.lib.fields import ListField
 from palimpzest.core.elements.records import DataRecord
-from palimpzest.core.lib.schemas import List, Schema, Number, File, TextFile, WebPage, ImageFile
+from palimpzest.core.lib.schemas import File, TextFile, WebPage
+
 
 @pytest.fixture
 def temp_text_file():
diff --git a/tests/pytest/test_execution_no_cache.py b/tests/pytest/test_execution_no_cache.py
@@ -17,6 +17,15 @@
 )
 
 
+@pytest.fixture
+def optimizer():
+    return Optimizer(policy=MaxQuality(), cost_model=CostModel())
+
+@pytest.fixture
+def config():
+    return QueryProcessorConfig(nocache=True)
+
+
 @pytest.mark.parametrize(
     argnames=("query_processor",),
     argvalues=[
@@ -97,16 +106,7 @@ class TestParallelExecutionNoCache:
         ],
         indirect=True,
     )
-
-    @pytest.fixture
-    def optimizer(self):
-        return Optimizer(policy=MaxQuality(), cost_model=CostModel())
-
-    @pytest.fixture
-    def config(self):
-        return QueryProcessorConfig(nocache=True)
-
-    def test_execute_full_plan(self, mocker, query_processor, dataset, optimizer, config, physical_plan, expected_records, side_effect):
+    def test_execute_full_plan(self, mocker, query_processor, optimizer, config, dataset, physical_plan, expected_records, side_effect):
         """
         This test executes the given
         """
diff --git a/tests/pytest/test_physical.py b/tests/pytest/test_physical.py
@@ -2,10 +2,10 @@
 
 import os
 import sys
-import pytest
+
+from palimpzest.core.lib.fields import NumericField, StringField
 from palimpzest.core.lib.schemas import Schema
 from palimpzest.query.operators.physical import PhysicalOperator
-from palimpzest.core.lib.fields import StringField, NumericField
 
 sys.path.append("./tests/")
 sys.path.append("./tests/refactor-tests/")
@@ -16,11 +16,15 @@
     load_env()
 
 
-
 class SimpleSchema(Schema):
     name = StringField(desc="The name of the person")
     age = NumericField(desc="The age of the person")
 
+class SimpleSchemaTwo(Schema):
+    name = StringField(desc="The name of the person")
+    age = NumericField(desc="The age of the person")
+    height = NumericField(desc="The height of the person")
+
 def test_physical_operator_init():
     """Test basic initialization of PhysicalOperator"""
     
@@ -31,7 +35,7 @@ def test_physical_operator_init():
         logical_op_id="logical1",
         verbose=True
     )
-    
+
     assert op.output_schema == SimpleSchema
     assert op.input_schema == SimpleSchema
     assert op.depends_on == ["op1", "op2"]
@@ -41,44 +45,44 @@ def test_physical_operator_init():
 def test_physical_operator_equality():
     """Test equality comparison between PhysicalOperators"""
     schema1 = SimpleSchema()
-    schema2 = SimpleSchema()
-    
+    schema2 = SimpleSchemaTwo()
+
     op1 = PhysicalOperator(output_schema=schema1)
     op2 = PhysicalOperator(output_schema=schema1)
     op3 = PhysicalOperator(output_schema=schema2, verbose=True)
-    
+
     assert op1 == op2  # Same output schema
     assert op1 == op1  # Same instance
     assert op1 == op1.copy()  # Copy should be equal
     assert op1 != op3  # Different parameters
 
 def test_physical_operator_str():
     """Test string representation of PhysicalOperator"""
-    
+
     op = PhysicalOperator(
         output_schema=SimpleSchema,
         input_schema=SimpleSchema
     )
-    
+
     str_rep = str(op)
     assert "SimpleSchema -> PhysicalOperator -> SimpleSchema" in str_rep
     assert "age, name" in str_rep
 
 def test_physical_operator_id_generation():
     """Test operator ID generation and hashing"""
     op = PhysicalOperator(output_schema=SimpleSchema)
-    
+
     # Test that op_id is initially None
     assert op.op_id is None
-    
+
     # Get op_id and verify it's generated
     op_id = op.get_op_id()
     assert op_id is not None
     assert isinstance(op_id, str)
-    
+
     # Test that subsequent calls return the same id
     assert op.get_op_id() == op_id
-    
+
     # Test that hash is based on op_id
     assert hash(op) == int(op_id, 16)
 
@@ -91,72 +95,12 @@ def test_physical_operator_copy():
         logical_op_id="logical1",
         verbose=True
     )
-    
+
     copied = original.copy()
-    
+
     assert copied is not original  # Different instances
     assert copied == original  # But equal in content
     assert copied.get_op_id() == original.get_op_id()  # Same op_id
     assert copied.depends_on == original.depends_on
     assert copied.logical_op_id == original.logical_op_id
     assert copied.verbose == original.verbose
-
-
-# TODO: uncomment once I understand what is supposed to be happening with
-#       ParallelConvertFromCandidateOp and ParallelFilterCandidateOp (I don't
-#       have these on my branch; possibly came from another branch)
-
-# def test_convert(email_schema):
-#     """Test the physical operators equality sign"""
-#     remove_cache()
-
-#     params = {
-#         "output_schema": email_schema,
-#         "input_schema": File,
-#         "model": pz.Model.GPT_4o_MINI,
-#         "cardinality": "oneToOne",
-#     }
-
-#     # simpleConvert = pz.Convert(**params)
-#     parallelConvert = pz.ParallelConvertFromCandidateOp(**params, streaming="")
-#     monolityhConvert = pz.ConvertOp(**params)
-
-#     assert parallelConvert == parallelConvert
-#     assert monolityhConvert == monolityhConvert
-#     assert parallelConvert != monolityhConvert
-
-#     print(str(parallelConvert))
-#     print(str(monolityhConvert))
-
-#     a = parallelConvert.copy()
-#     b = monolityhConvert.copy()
-#     assert a == parallelConvert
-#     assert b == monolityhConvert
-#     assert a != b
-
-# def test_filter(email_schema):
-#     """Test the physical operators filter"""
-#     remove_cache()
-
-#     params = {
-#         "output_schema": email_schema,
-#         "input_schema": email_schema,
-#         "filter": pz.Filter("This is a sample filter"),
-#     }
-
-#     # simpleConvert = pz.Convert(**params)
-#     parallelFilter = pz.ParallelFilterCandidateOp(**params, streaming="")
-#     monoFilter = pz.NonLLMFilter(**params)
-
-#     assert parallelFilter == parallelFilter
-#     assert monoFilter == monoFilter
-#     assert parallelFilter != monoFilter
-
-#     print(str(parallelFilter))
-#     print(str(monoFilter))
-
-#     a = parallelFilter.copy()
-#     b = monoFilter.copy()
-#     assert a == parallelFilter
-#     assert b == monoFilter
-#     assert a != b
diff --git a/tests/pytest/test_rules.py b/tests/pytest/test_rules.py
@@ -1,14 +1,9 @@
 import pytest
-from palimpzest.query.optimizer.rules import (
-    PushDownFilter, NonLLMConvertRule, LLMConvertBondedRule,
-    BasicSubstitutionRule, NonLLMFilterRule
-)
-from palimpzest.query.optimizer.primitives import LogicalExpression, Group
-from palimpzest.query.operators.logical import (
-    ConvertScan, FilteredScan, BaseScan
-)
-from palimpzest.query.operators.filter import Filter
+
 from palimpzest.core.lib.schemas import Schema, StringField
+from palimpzest.query.operators.logical import BaseScan
+from palimpzest.query.optimizer.primitives import LogicalExpression
+from palimpzest.query.optimizer.rules import BasicSubstitutionRule
 
 
 @pytest.fixture
@@ -30,24 +25,26 @@ def test_substitute_methods(base_scan_op):
     logical_expr = LogicalExpression(
         operator=base_scan_op,
         input_group_ids=[],
-        input_fields=set(),
-        generated_fields=set(["id", "text"]),
+        input_fields={},
+        generated_fields={"some_id": StringField(desc="id"),  "text": StringField(desc="text")},
+        depends_on_field_names=set(),
         group_id=1
     )
-    
+
     # Apply the BasicSubstitutionRule
     physical_exprs = BasicSubstitutionRule.substitute(logical_expr, verbose=False)
-    
+
     # Verify the substitution
     assert len(physical_exprs) == 1
     physical_expr = list(physical_exprs)[0]
-    
+
     # Check that the operator was correctly converted to MarshalAndScanDataOp
     assert physical_expr.operator.__class__.__name__ == "MarshalAndScanDataOp"
-    
+
     # Verify that the important properties were preserved
     assert physical_expr.operator.dataset_id == base_scan_op.dataset_id
     assert physical_expr.input_group_ids == logical_expr.input_group_ids
     assert physical_expr.input_fields == logical_expr.input_fields
     assert physical_expr.generated_fields == logical_expr.generated_fields
-    assert physical_expr.group_id == logical_expr.group_id
+    assert physical_expr.depends_on_field_names == logical_expr.depends_on_field_names
+    assert physical_expr.group_id == logical_expr.group_id