Skip to content

Commit

Permalink
correctly encode/decode _FillValues/missing_values/dtypes for packed …
Browse files Browse the repository at this point in the history
…data (#8713)

* correctly encode/decode _FillValues

* fix mypy

* fix CFMaskCode test

* fix scale/offset

* avert zarr issue

* add whats-new.rst entry

* refactor fillvalue/missing value check to catch non-conforming values, apply review suggestions

* fix typing

* suppress warning in doc

* FIX: silence SerializationWarnings

* FIX: silence mypy by casting to string early

* Update xarray/tests/test_conventions.py

Co-authored-by: Deepak Cherian <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Shorten test, add comment checking for two warnings

* make test even shorter

---------

Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Mar 15, 2024
1 parent c9d3084 commit fbcac76
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 99 deletions.
1 change: 1 addition & 0 deletions doc/user-guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ __ https://numpy.org/doc/stable/user/basics.indexing.html#assigning-values-to-in
You can also assign values to all variables of a :py:class:`Dataset` at once:

.. ipython:: python
:okwarning:
ds_org = xr.tutorial.open_dataset("eraint_uvz").isel(
latitude=slice(56, 59), longitude=slice(255, 258), level=0
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ Bug fixes
when used in :py:meth:`DataArray.expand_dims` and
::py:meth:`Dataset.expand_dims` (:pull:`8781`). By `Spencer
Clark <https://github.com/spencerkclark>`_.
- CF conform handling of `_FillValue`/`missing_value` and `dtype` in
`CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`,
:issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
183 changes: 126 additions & 57 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,44 @@ def _is_time_like(units):
return any(tstr == units for tstr in time_strings)


def _check_fill_values(attrs, name, dtype):
""" "Check _FillValue and missing_value if available.
Return dictionary with raw fill values and set with encoded fill values.
Issue SerializationWarning if appropriate.
"""
raw_fill_dict = {}
[
pop_to(attrs, raw_fill_dict, attr, name=name)
for attr in ("missing_value", "_FillValue")
]
encoded_fill_values = set()
for k in list(raw_fill_dict):
v = raw_fill_dict[k]
kfill = {fv for fv in np.ravel(v) if not pd.isnull(fv)}
if not kfill and np.issubdtype(dtype, np.integer):
warnings.warn(
f"variable {name!r} has non-conforming {k!r} "
f"{v!r} defined, dropping {k!r} entirely.",
SerializationWarning,
stacklevel=3,
)
del raw_fill_dict[k]
else:
encoded_fill_values |= kfill

if len(encoded_fill_values) > 1:
warnings.warn(
f"variable {name!r} has multiple fill values "
f"{encoded_fill_values} defined, decoding all values to NaN.",
SerializationWarning,
stacklevel=3,
)

return raw_fill_dict, encoded_fill_values


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

Expand All @@ -283,67 +321,56 @@ def encode(self, variable: Variable, name: T_Name = None):
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

# special case DateTime to properly handle NaT
is_time_like = _is_time_like(attrs.get("units"))

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

# apply fillna
if not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)

def decode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_decoding(variable)

raw_fill_values = [
pop_to(attrs, encoding, attr, name=name)
for attr in ("missing_value", "_FillValue")
]
if raw_fill_values:
encoded_fill_values = {
fv
for option in raw_fill_values
for fv in np.ravel(option)
if not pd.isnull(fv)
}

if len(encoded_fill_values) > 1:
warnings.warn(
"variable {!r} has multiple fill values {}, "
"decoding all values to NaN.".format(name, encoded_fill_values),
SerializationWarning,
stacklevel=3,
)
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)

# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
Expand All @@ -366,20 +393,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
return data


def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
def _choose_float_dtype(
dtype: np.dtype, mapping: MutableMapping
) -> type[np.floating[Any]]:
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
# check scale/offset first to derive wanted float dtype
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
scale_factor = mapping.get("scale_factor")
add_offset = mapping.get("add_offset")
if scale_factor is not None or add_offset is not None:
# get the type from scale_factor/add_offset to determine
# the needed floating point type
if scale_factor is not None:
scale_type = np.dtype(type(scale_factor))
if add_offset is not None:
offset_type = np.dtype(type(add_offset))
# CF conforming, both scale_factor and add-offset are given and
# of same floating point type (float32/64)
if (
add_offset is not None
and scale_factor is not None
and offset_type == scale_type
and scale_type in [np.float32, np.float64]
):
# in case of int32 -> we need upcast to float64
# due to precision issues
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
return np.float64
return scale_type.type
# Not CF conforming and add_offset given:
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if add_offset is not None:
return np.float64
# return dtype depending on given scale_factor
return scale_type.type
# If no scale_factor or add_offset is given, use some general rules.
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64
Expand All @@ -396,7 +454,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
# if we have a _FillValue/masked_value we do not want to cast now
# but leave that to CFMaskCoder
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(data.dtype, encoding)
# but still we need a copy prevent changing original data
data = duck_array_ops.astype(data, dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
Expand All @@ -412,11 +475,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
# if we have a _FillValue/masked_value we already have the wanted
# floating point dtype here (via CFMaskCoder), so no check is necessary
# only check in other cases
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(dtype, encoding)

transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
Expand Down
Loading

0 comments on commit fbcac76

Please sign in to comment.