diff --git a/packages/ordeq-pandas/src/ordeq_pandas/__init__.py b/packages/ordeq-pandas/src/ordeq_pandas/__init__.py
index 8847ca14..7a0583ea 100644
--- a/packages/ordeq-pandas/src/ordeq_pandas/__init__.py
+++ b/packages/ordeq-pandas/src/ordeq_pandas/__init__.py
@@ -1,5 +1,6 @@
 from ordeq_pandas.csv import PandasCSV
+from ordeq_pandas.dataframe import PandasDataFrame
 from ordeq_pandas.excel import PandasExcel
 from ordeq_pandas.parquet import PandasParquet
 
-__all__ = ("PandasCSV", "PandasExcel", "PandasParquet")
+__all__ = ("PandasCSV", "PandasDataFrame", "PandasExcel", "PandasParquet")
diff --git a/packages/ordeq-pandas/src/ordeq_pandas/dataframe.py b/packages/ordeq-pandas/src/ordeq_pandas/dataframe.py
new file mode 100644
--- /dev/null
+++ b/packages/ordeq-pandas/src/ordeq_pandas/dataframe.py
@@ -0,0 +1,63 @@
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from typing import Any
+from uuid import uuid4
+
+import pandas as pd
+from ordeq import Input
+from pandas import Index
+
+
+@dataclass(frozen=True, kw_only=True)
+class PandasDataFrame(Input[pd.DataFrame]):
+    """Allows a Pandas DataFrame to be hard-coded as IO. This is suitable
+    for small tables such as simple dimension tables that are unlikely to
+    change. It is also useful for unit testing.
+
+    Every separately constructed instance gets its own random identifier,
+    so two instances compare unequal even when they hold the same data.
+
+    Example usage:
+
+    ```pycon
+    >>> from ordeq_pandas import PandasDataFrame
+    >>> df = PandasDataFrame(
+    ...     data=(
+    ...         (2022, "file_2022.xlsx"),
+    ...         (2023, "file_2023.xlsx"),
+    ...         (2024, "file_2024.xlsx"),
+    ...     ),
+    ...     columns=("year", "datafile"),
+    ... ).load()
+    >>> print(df.shape)
+    (3, 2)
+
+    ```
+
+    """
+
+    # Per-instance identity token; the only field used by `==` and `hash()`.
+    _idx: str = field(
+        init=False, default_factory=lambda: str(uuid4()), repr=False
+    )
+    # The payload is kept out of both equality and hashing: it may be
+    # unhashable (e.g. a list), and array-like values compare element-wise
+    # with `==` instead of yielding a plain bool.
+    data: Iterable = field(compare=False)
+    columns: Index | Iterable[str] | None = field(compare=False, default=None)
+
+    def load(self, **load_options: Any) -> pd.DataFrame:
+        """Loads the DataFrame from the provided data and columns.
+
+        Args:
+            **load_options: Additional options passed to `pd.DataFrame`.
+
+        Returns:
+            Loaded Pandas DataFrame.
+        """
+
+        return pd.DataFrame(
+            data=self.data,
+            columns=self.columns,  # type: ignore[arg-type]
+            **load_options,
+        )
diff --git a/packages/ordeq-pandas/tests/test_dataframe.py b/packages/ordeq-pandas/tests/test_dataframe.py
new file mode 100644
index 00000000..d3b723fc
--- /dev/null
+++ b/packages/ordeq-pandas/tests/test_dataframe.py
@@ -0,0 +1,37 @@
+import pandas as pd
+from ordeq_pandas import PandasDataFrame
+
+
+def test_it_loads():
+    data = [
+        (2022, "file_2022.xlsx"),
+        (2023, "file_2023.xlsx"),
+        (2024, "file_2024.xlsx"),
+    ]
+    columns = ["year", "datafile"]
+    expected = pd.DataFrame(data, columns=columns)
+    actual = PandasDataFrame(data=data, columns=columns).load()
+    pd.testing.assert_frame_equal(actual, expected)
+
+
+def test_its_unique():
+    data = [
+        (2022, "file_2022.xlsx"),
+        (2023, "file_2023.xlsx"),
+        (2024, "file_2024.xlsx"),
+    ]
+    columns = ["year", "datafile"]
+    df1 = PandasDataFrame(data=data, columns=columns)
+    df2 = PandasDataFrame(data=data, columns=columns)
+    assert df1 != df2
+
+
+def test_its_hashable():
+    data = [
+        (2022, "file_2022.xlsx"),
+        (2023, "file_2023.xlsx"),
+        (2024, "file_2024.xlsx"),
+    ]
+    columns = ["year", "datafile"]
+    df = PandasDataFrame(data=data, columns=columns)
+    assert isinstance(hash(df), int)