Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/ordeq-pandas/src/ordeq_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ordeq_pandas.csv import PandasCSV
from ordeq_pandas.dataframe import PandasDataFrame
from ordeq_pandas.excel import PandasExcel
from ordeq_pandas.parquet import PandasParquet

# NOTE(review): these two assignments appear to be the removed and the added
# line of the diff hunk above. If both are executed, the second one (which adds
# "PandasDataFrame") overrides the first and is the effective export list.
__all__ = ("PandasCSV", "PandasExcel", "PandasParquet")
__all__ = ("PandasCSV", "PandasDataFrame", "PandasExcel", "PandasParquet")
56 changes: 56 additions & 0 deletions packages/ordeq-pandas/src/ordeq_pandas/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from collections.abc import Iterable
from dataclasses import dataclass, field
from typing import Any
from uuid import uuid4

import pandas as pd
from ordeq import Input
from pandas import Index


@dataclass(frozen=True, kw_only=True)
class PandasDataFrame(Input[pd.DataFrame]):
    """Input that builds a Pandas DataFrame from values written in code.

    Meant for small tables that rarely change (for instance simple
    dimension tables) and for fixtures in unit tests.

    Example usage:

    ```pycon
    >>> from ordeq_pandas import PandasDataFrame
    >>> df = PandasDataFrame(
    ...     data=(
    ...         (2022, "file_2022.xlsx"),
    ...         (2023, "file_2023.xlsx"),
    ...         (2024, "file_2024.xlsx"),
    ...     ),
    ...     columns=("year", "datafile"),
    ... ).load()
    >>> print(df.shape)
    (3, 2)

    ```

    Attributes:
        data: Values forwarded to `pd.DataFrame` as its `data` argument.
        columns: Optional column labels forwarded to `pd.DataFrame`.

    """

    # Random identifier generated per instance; it cannot be passed to the
    # constructor and is left out of the repr. Of the fields declared here it
    # is the only one that feeds the generated hash.
    _idx: str = field(
        default_factory=lambda: str(uuid4()), init=False, repr=False
    )
    # Both payload fields opt out of hashing via `hash=False`.
    data: Iterable = field(hash=False)
    columns: Index | Iterable[str] | None = field(default=None, hash=False)

    def load(self, **load_options: Any) -> pd.DataFrame:
        """Build the DataFrame from the stored data and columns.

        Args:
            **load_options: Extra keyword arguments forwarded to
                `pd.DataFrame`.

        Returns:
            The newly constructed Pandas DataFrame.
        """

        # NOTE(review): `data` is forwarded unchanged on every call, so a
        # one-shot iterator would presumably be exhausted after the first
        # load -- confirm that callers pass re-iterable data.
        frame = pd.DataFrame(
            data=self.data,
            columns=self.columns,  # type: ignore[arg-type]
            **load_options,
        )
        return frame
37 changes: 37 additions & 0 deletions packages/ordeq-pandas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
from ordeq_pandas import PandasDataFrame


def test_it_loads():
    """Loading gives the same frame as calling `pd.DataFrame` directly."""
    rows = [
        (2022, "file_2022.xlsx"),
        (2023, "file_2023.xlsx"),
        (2024, "file_2024.xlsx"),
    ]
    header = ["year", "datafile"]
    # Build the IO first, then the reference frame from the very same inputs.
    io = PandasDataFrame(data=rows, columns=header)
    reference = pd.DataFrame(rows, columns=header)
    pd.testing.assert_frame_equal(io.load(), reference)


def test_its_unique():
    """Two IOs created from identical arguments do not compare equal."""
    shared_kwargs = {
        "data": [
            (2022, "file_2022.xlsx"),
            (2023, "file_2023.xlsx"),
            (2024, "file_2024.xlsx"),
        ],
        "columns": ["year", "datafile"],
    }
    first = PandasDataFrame(**shared_kwargs)
    second = PandasDataFrame(**shared_kwargs)
    assert first != second


def test_its_hashable():
    """Hashing an IO works and produces an integer."""
    rows = [
        (2022, "file_2022.xlsx"),
        (2023, "file_2023.xlsx"),
        (2024, "file_2024.xlsx"),
    ]
    io = PandasDataFrame(data=rows, columns=["year", "datafile"])
    digest = hash(io)
    assert isinstance(digest, int)