Skip to content

Refactor _validate_data_input to simplify the code #3818

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 31 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
df88e02
Figure.plot/plot3d/text: Pass a dict of vectors rather than x/y/extra…
seisman Mar 3, 2025
ce57c59
Check if a dict of vectors contains None
seisman Mar 3, 2025
2cb9295
clib.virtualfile_in: Remove the 'extra_arrays' parameter
seisman Mar 3, 2025
362e5bd
Figure.plot3d: Add one more test to increase code coverage
seisman Mar 3, 2025
8f72e4c
Clarify that dictionary will be recognized as vectors
seisman Mar 3, 2025
5d3d308
Rename 'required_z' to 'required_ncols' in _validate_data_input
seisman Mar 5, 2025
406cb39
Rename 'required_z' to 'required_ncols' in Session.virtualfile_in
seisman Mar 5, 2025
af189e6
Rename 'required_z' to 'required_ncols' in Session.virtualfile_in tests
seisman Mar 5, 2025
2d55e5a
Rename 'required_z' to 'required_ncols' in module wrappers
seisman Mar 5, 2025
f017adb
Merge branch 'main' into refactor/virtualfile_in_extra_arrays
seisman Mar 6, 2025
c199dbf
Clarify that the data parameter can accept dicts
seisman Mar 6, 2025
07924ca
Merge branch 'main' into refactor/virtualfile_in_extra_arrays
seisman Mar 10, 2025
1d0d0dc
Fix a typo
seisman Mar 10, 2025
84e8d8a
Merge branch 'main' into refactor/virtualfile_in
seisman Mar 10, 2025
5eeb37b
Rename required_ncols to the shorter ncols
seisman Mar 10, 2025
a7ee253
Merge branch 'main' into refactor/virtualfile_in_extra_arrays
seisman Mar 24, 2025
5cd8275
Merge branch 'main' into refactor/virtualfile_in_extra_arrays
seisman Mar 26, 2025
e44e712
Improve docstrings for the new test
seisman Mar 26, 2025
5330d0a
Raise warnings when extra_arrays is used
seisman Mar 26, 2025
0a63ef0
Add TODO comments
seisman Mar 26, 2025
2851af8
Merge branch 'main' into refactor/virtualfile_in
seisman Mar 26, 2025
0e0c1d8
Deprecate required_z in a backward-compatible way
seisman Mar 26, 2025
f8293cd
Add one test with the deprecated 'required_z' parameter to increase c…
seisman Mar 26, 2025
60f1758
Refactor _validate_data_input
seisman Feb 21, 2025
3c61fea
Merge branch 'refactor/virtualfile_in' into refactor/validate_data_input
seisman Mar 26, 2025
312aa39
Merge branch 'main' into refactor/virtualfile_in
seisman Mar 28, 2025
b7ca239
Move TODO comment to the top
seisman Mar 28, 2025
a01afbb
Merge branch 'main' into refactor/virtualfile_in
seisman Apr 15, 2025
da874d1
Put ncols before required_data
seisman Apr 15, 2025
ea367c8
Merge branch 'refactor/virtualfile_in' into refactor/validate_data_input
seisman Apr 15, 2025
72c1730
Merge branch 'main' into refactor/validate_data_input
seisman Apr 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 19 additions & 13 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1838,23 +1838,25 @@
)
mincols = 3

# Specify either data or x/y/z.
if data is not None and any(v is not None for v in (x, y, z)):
msg = "Too much data. Use either data or x/y/z."
raise GMTInvalidInput(msg)

# Determine the kind of data.
kind = data_kind(data, required=required_data)
_validate_data_input(
data=data,
x=x,
y=y,
z=z,
mincols=mincols,
required_data=required_data,
kind=kind,
)

# Check if the kind of data is valid.
if check_kind:
valid_kinds = ("file", "arg") if required_data is False else ("file",)
if check_kind == "raster":
valid_kinds += ("grid", "image")
elif check_kind == "vector":
valid_kinds += ("empty", "matrix", "vectors", "geojson")
match check_kind:
case "raster":
valid_kinds += ("grid", "image")
case "vector":
valid_kinds += ("empty", "matrix", "vectors", "geojson")
case _:
msg = f"Invalid value for check_kind: '{check_kind}'."
raise GMTInvalidInput(msg)

Check warning on line 1859 in pygmt/clib/session.py

View check run for this annotation

Codecov / codecov/patch

pygmt/clib/session.py#L1857-L1859

Added lines #L1857 - L1859 were not covered by tests
if kind not in valid_kinds:
msg = f"Unrecognized data type for {check_kind}: {type(data)}."
raise GMTInvalidInput(msg)
Expand Down Expand Up @@ -1888,6 +1890,7 @@
_data = [x, y]
if z is not None:
_data.append(z)
# TODO(PyGMT>=0.20.0): Remove the deprecated parameter 'extra_arrays'.
if extra_arrays:
msg = (
"The parameter 'extra_arrays' will be removed in v0.20.0. "
Expand All @@ -1913,6 +1916,9 @@
_virtualfile_from = self.virtualfile_from_vectors
_data = data.T

# Check if _data to be passed to the virtualfile_from_ function is valid.
_validate_data_input(data=_data, kind=kind, mincols=mincols)

# Finally create the virtualfile from the data, to be passed into GMT
file_context = _virtualfile_from(_data)
return file_context
Expand Down
118 changes: 54 additions & 64 deletions pygmt/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,102 +40,96 @@
"ISO-8859-15",
"ISO-8859-16",
]
# Type hints for the list of possible data kinds.
Kind = Literal[
"arg", "empty", "file", "geojson", "grid", "image", "matrix", "stringio", "vectors"
]


def _validate_data_input( # noqa: PLR0912
data=None, x=None, y=None, z=None, mincols=2, required_data=True, kind=None
) -> None:
def _validate_data_input(data: Any, kind: Kind, mincols=2) -> None: # noqa: PLR0912
"""
Check if the combination of data/x/y/z is valid.
Check if the data to be passed to the virtualfile_from_ functions is valid.

Examples
--------
>>> _validate_data_input(data="infile")
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6])
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], z=[7, 8, 9])
>>> _validate_data_input(data=None, required_data=False)
>>> _validate_data_input()
The "empty" kind means the data is given via a series of vectors like x/y/z.

>>> _validate_data_input(data=[[1, 2, 3], [4, 5, 6]], kind="empty")
>>> _validate_data_input(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], kind="empty")
>>> _validate_data_input(data=[None, [4, 5, 6]], kind="empty")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: No input data provided.
>>> _validate_data_input(x=[1, 2, 3])
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(data=[[1, 2, 3], None], kind="empty")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(y=[4, 5, 6])
>>> _validate_data_input(data=[None, None], kind="empty")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], mincols=3)
>>> _validate_data_input(data=[[1, 2, 3], [4, 5, 6]], kind="empty", mincols=3)
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide x, y, and z.

The "matrix" kind means the data is given via a 2-D numpy.ndarray.

>>> import numpy as np
>>> import pandas as pd
>>> import xarray as xr
>>> data = np.arange(8).reshape((4, 2))
>>> _validate_data_input(data=data, mincols=3, kind="matrix")
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
pygmt.exceptions.GMTInvalidInput: Need at least 3 columns but 2 column(s) are given.

The "vectors" kind means the original data is either a dictionary, list, tuple,
pandas.DataFrame, pandas.Series, xarray.Dataset, or xarray.DataArray.

>>> _validate_data_input(
... data=pd.DataFrame(data, columns=["x", "y"]),
... mincols=3,
... kind="vectors",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
pygmt.exceptions.GMTInvalidInput: Need at least 3 columns but 2 column(s) are given.
>>> _validate_data_input(
... data=xr.Dataset(pd.DataFrame(data, columns=["x", "y"])),
... mincols=3,
... kind="vectors",
... )
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: data must provide x, y, and z columns.
>>> _validate_data_input(data="infile", x=[1, 2, 3])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", y=[4, 5, 6])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", x=[1, 2, 3], y=[4, 5, 6])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", z=[7, 8, 9])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
pygmt.exceptions.GMTInvalidInput: Need at least 3 columns but 2 column(s) are given.

Raises
------
GMTInvalidInput
If the data input is not valid.
"""
required_z = mincols >= 3
if data is None: # data is None
if x is None and y is None: # both x and y are None
if required_data: # data is not optional
msg = "No input data provided."

match kind:
case "empty": # data = [x, y], [x, y, z], [x, y, z, ...]
if len(data) < 2 or any(v is None for v in data[:2]):
msg = "Must provide both x and y."
raise GMTInvalidInput(msg)
elif x is None or y is None: # either x or y is None
msg = "Must provide both x and y."
raise GMTInvalidInput(msg)
if required_z and z is None: # both x and y are not None, now check z
msg = "Must provide x, y, and z."
raise GMTInvalidInput(msg)
else: # data is not None
if x is not None or y is not None or z is not None:
msg = "Too much data. Use either data or x/y/z."
raise GMTInvalidInput(msg)
# check if data has the required z column
if required_z:
msg = "data must provide x, y, and z columns."
if kind == "matrix" and data.shape[1] < 3:
if mincols >= 3 and (len(data) < 3 or data[:3] is None):
msg = "Must provide x, y, and z."
raise GMTInvalidInput(msg)
case "matrix": # 2-D numpy.ndarray
if (actual_cols := data.shape[1]) < mincols:
msg = f"Need at least {mincols} columns but {actual_cols} column(s) are given."
raise GMTInvalidInput(msg)
case "vectors":
# "vectors" means the original data is either a dictionary, list, tuple,
# pandas.DataFrame, pandas.Series, xarray.Dataset, or xarray.DataArray.
# The original data is converted to a list of vectors or a 2-D numpy.ndarray
# in the virtualfile_in function.
if (actual_cols := len(data)) < mincols:
msg = f"Need at least {mincols} columns but {actual_cols} column(s) are given."
raise GMTInvalidInput(msg)
if kind == "vectors":
if hasattr(data, "shape") and (
Expand All @@ -145,15 +139,15 @@
raise GMTInvalidInput(msg)
if hasattr(data, "data_vars") and len(data.data_vars) < 3: # xr.Dataset
raise GMTInvalidInput(msg)
if kind == "vectors" and isinstance(data, dict):
# Iterator over the up-to-3 first elements.
arrays = list(islice(data.values(), 3))
if len(arrays) < 2 or any(v is None for v in arrays[:2]): # Check x/y
msg = "Must provide x and y."
raise GMTInvalidInput(msg)
if required_z and (len(arrays) < 3 or arrays[2] is None): # Check z
msg = "Must provide x, y, and z."
raise GMTInvalidInput(msg)
if kind == "vectors" and isinstance(data, dict):
# Iterator over the up-to-3 first elements.
arrays = list(islice(data.values(), 3))
if len(arrays) < 2 or any(v is None for v in arrays[:2]): # Check x/y
msg = "Must provide x and y."
raise GMTInvalidInput(msg)
if required_z and (len(arrays) < 3 or arrays[2] is None): # Check z
msg = "Must provide x, y, and z."
raise GMTInvalidInput(msg)

Check warning on line 150 in pygmt/helpers/utils.py

View check run for this annotation

Codecov / codecov/patch

pygmt/helpers/utils.py#L144-L150

Added lines #L144 - L150 were not covered by tests


def _is_printable_ascii(argstr: str) -> bool:
Expand Down Expand Up @@ -272,11 +266,7 @@
return "ISOLatin1+"


def data_kind(
data: Any, required: bool = True
) -> Literal[
"arg", "empty", "file", "geojson", "grid", "image", "matrix", "stringio", "vectors"
]:
def data_kind(data: Any, required: bool = True) -> Kind:
r"""
Check the kind of data that is provided to a module.

Expand Down
Loading