Skip to content

Commit 96d1feb

Browse files
committed
Apply lint fixes and fix a subtle data-registration error in a unit test
1 parent 80e53a2 commit 96d1feb

File tree

4 files changed

+15
-47
lines changed

4 files changed

+15
-47
lines changed

src/palimpzest/core/data/datasources.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,16 @@
1515

1616
from palimpzest import constants
1717
from palimpzest.core.elements.records import DataRecord
18-
from palimpzest.core.lib.schemas import File, ImageFile, Number, PDFFile, Schema, TextFile, WebPage, XLSFile, DefaultSchema
18+
from palimpzest.core.lib.schemas import (
19+
DefaultSchema,
20+
File,
21+
ImageFile,
22+
PDFFile,
23+
Schema,
24+
TextFile,
25+
WebPage,
26+
XLSFile,
27+
)
1928
from palimpzest.tools.pdfparser import get_text_from_pdf
2029

2130

src/palimpzest/core/lib/schemas.py

Lines changed: 2 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
from __future__ import annotations
22

33
import json
4-
import pandas as pd
5-
import numpy as np
64
from typing import Any as TypingAny
75

86
import pandas as pd
@@ -21,7 +19,7 @@
2119
StringField,
2220
)
2321
from palimpzest.utils.hash_helpers import hash_for_temp_schema
24-
from palimpzest.constants import DERIVED_SCHEMA_PREFIX, FROM_DF_PREFIX
22+
2523

2624
class SchemaMetaclass(type):
2725
"""
@@ -240,7 +238,7 @@ def project(cls, project_cols: list[str]) -> Schema:
240238
return type(new_schema_name, (Schema,), attributes)
241239

242240
@staticmethod
243-
def from_df(df: pd.DataFrame) -> "Schema":
241+
def from_df(df: pd.DataFrame) -> Schema:
244242
# Create a unique schema name based on columns
245243
schema_name = f"{DERIVED_SCHEMA_PREFIX}{hash_for_temp_schema(str(tuple(sorted(df.columns))))}"
246244

@@ -278,39 +276,6 @@ def class_name(cls) -> str:
278276
"""Return the name of this class"""
279277
return cls.__name__
280278

281-
@staticmethod
282-
def from_df(df: pd.DataFrame) -> Schema:
283-
# Create a unique schema name based on columns
284-
schema_name = f"{DERIVED_SCHEMA_PREFIX}{hash_for_temp_schema(str(tuple(sorted(df.columns))))}"
285-
286-
# consider to save to temp file and load from there
287-
if schema_name in globals():
288-
return globals()[schema_name]
289-
290-
# NOTE: we will not be able to infer more complicated types like ImageFilepathField
291-
# without some input from the user
292-
# construct attributes for schema (i.e. its fields and metadata)
293-
desc = "Schema derived from DataFrame"
294-
attributes = {"_desc": desc, "__doc__": desc, "__module__": Schema.__module__}
295-
for col, dtype in zip(df.columns, df.dtypes):
296-
if dtype == "object":
297-
attributes[col] = StringField(desc=col)
298-
elif dtype == "bool":
299-
attributes[col] = BooleanField(desc=col)
300-
elif dtype == "int64":
301-
attributes[col] = IntField(desc=col)
302-
elif dtype == "float64":
303-
attributes[col] = FloatField(desc=col)
304-
else:
305-
attributes[col] = Field(desc=col)
306-
307-
# Create new schema only if it doesn't exist
308-
new_schema = type(schema_name, (Schema,), attributes)
309-
310-
# Store the schema class globally
311-
globals()[schema_name] = new_schema
312-
return new_schema
313-
314279
###################################################################################
315280
# "Core" useful Schemas. These are Schemas that almost everyone will need.
316281
# File, TextFile, Image, PDF, etc.

src/palimpzest/policy.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import json
22

33
from palimpzest.core.data.dataclasses import PlanCost
4-
import json
4+
55

66
class Policy:
77
"""
@@ -45,13 +45,6 @@ def choose(self, plan: PlanCost, other_plan: PlanCost) -> float:
4545
Return True if plan is better than other_plan and return False otherwise.
4646
"""
4747
raise NotImplementedError("Calling this method from an abstract base class.")
48-
49-
def to_json_str(self) -> str:
50-
"""Convert policy configuration to a JSON-serializable dictionary."""
51-
return json.dumps({
52-
"type": self.__class__.__name__,
53-
"config": self.get_dict()
54-
}, indent=4)
5548

5649
def to_json_str(self) -> str:
5750
"""Convert policy configuration to a JSON-serializable dictionary."""

tests/pytest/test_cost_model.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,8 +52,9 @@ def test_compute_operator_estimates(
5252
)
5353
def test_estimate_plan_cost(self, simple_plan_sample_execution_data, physical_plan, expected_cost_est_results):
5454
# register a fake dataset
55+
dataset_id = "foobar"
5556
vals = [1, 2, 3, 4, 5, 6]
56-
DataDirectory().get_or_register_memory_source(vals=vals)
57+
DataDirectory().register_memory_source(vals=vals, dataset_id=dataset_id)
5758
input_cardinality = len(vals)
5859

5960
# TODO: if we test with a plan other than the simple test plan; this will break

0 commit comments

Comments
 (0)