From df87f692ea3d68ec90bc19fb227996413ee083a0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 21 Oct 2024 09:54:57 -0600 Subject: [PATCH] Fix multiple grouping with missing groups (#9650) * Fix multiple grouping with missing groups Closes #9360 * Small repr improvement * Small optimization in mask * Add whats-new * fix doctests --- doc/whats-new.rst | 2 ++ xarray/core/dataarray.py | 6 +++--- xarray/core/dataset.py | 6 +++--- xarray/core/groupby.py | 19 +++++++------------ xarray/tests/test_groupby.py | 36 ++++++++++++++++++++++++++++++++++-- 5 files changed, 49 insertions(+), 20 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c47b184faa9..d31efcbae0e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -68,6 +68,8 @@ Bug fixes - Fix the safe_chunks validation option on the to_zarr method (:issue:`5511`, :pull:`9559`). By `Joseph Nowak <https://github.com/josephnowak>`_. +- Fix binning by multiple variables where some bins have no observations. (:issue:`9630`). + By `Deepak Cherian <https://github.com/dcherian>`_. 
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a23e5b35915..9b5291fc553 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6800,7 +6800,7 @@ def groupby( >>> da.groupby("letters") + 'letters': 2/2 groups present with labels 'a', 'b'> Execute a reduction @@ -6816,8 +6816,8 @@ def groupby( >>> da.groupby(["letters", "x"]) + 'letters': 2/2 groups present with labels 'a', 'b' + 'x': 4/4 groups present with labels 10, 20, 30, 40> Use Grouper objects to express more complicated GroupBy operations diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d433cbcec18..f8cf23d188c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -10403,7 +10403,7 @@ def groupby( >>> ds.groupby("letters") + 'letters': 2/2 groups present with labels 'a', 'b'> Execute a reduction @@ -10420,8 +10420,8 @@ def groupby( >>> ds.groupby(["letters", "x"]) + 'letters': 2/2 groups present with labels 'a', 'b' + 'x': 4/4 groups present with labels 10, 20, 30, 40> Use Grouper objects to express more complicated GroupBy operations diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 5536c5d2e26..5a5a241f6c1 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -454,6 +454,7 @@ def factorize(self) -> EncodedGroups: # At this point all arrays have been factorized. 
codes = tuple(grouper.codes for grouper in groupers) shape = tuple(grouper.size for grouper in groupers) + masks = tuple((code == -1) for code in codes) # We broadcast the codes against each other broadcasted_codes = broadcast(*codes) # This fully broadcasted DataArray is used as a template later @@ -464,24 +465,18 @@ def factorize(self) -> EncodedGroups: ) # NaNs; as well as values outside the bins are coded by -1 # Restore these after the raveling - mask = functools.reduce( - np.logical_or, # type: ignore[arg-type] - [(code == -1) for code in broadcasted_codes], - ) + broadcasted_masks = broadcast(*masks) + mask = functools.reduce(np.logical_or, broadcasted_masks) # type: ignore[arg-type] _flatcodes[mask] = -1 - midx = pd.MultiIndex.from_product( - (grouper.unique_coord.data for grouper in groupers), + full_index = pd.MultiIndex.from_product( + (grouper.full_index.values for grouper in groupers), names=tuple(grouper.name for grouper in groupers), ) # Constructing an index from the product is wrong when there are missing groups # (e.g. binning, resampling). Account for that now. 
- midx = midx[np.sort(pd.unique(_flatcodes[~mask]))] + midx = full_index[np.sort(pd.unique(_flatcodes[~mask]))] - full_index = pd.MultiIndex.from_product( - (grouper.full_index.values for grouper in groupers), - names=tuple(grouper.name for grouper in groupers), - ) dim_name = "stacked_" + "_".join(str(grouper.name) for grouper in groupers) coords = Coordinates.from_pandas_multiindex(midx, dim=dim_name) @@ -684,7 +679,7 @@ def __repr__(self) -> str: for grouper in self.groupers: coord = grouper.unique_coord labels = ", ".join(format_array_flat(coord, 30).split()) - text += f"\n {grouper.name!r}: {coord.size} groups with labels {labels}" + text += f"\n {grouper.name!r}: {coord.size}/{grouper.full_index.size} groups present with labels {labels}" return text + ">" def _iter_grouped(self) -> Iterator[T_Xarray]: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 3c321166619..3d948e7840e 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -606,7 +606,7 @@ def test_groupby_repr(obj, dim) -> None: N = len(np.unique(obj[dim])) expected = f"<{obj.__class__.__name__}GroupBy" expected += f", grouped over 1 grouper(s), {N} groups in total:" - expected += f"\n {dim!r}: {N} groups with labels " + expected += f"\n {dim!r}: {N}/{N} groups present with labels " if dim == "x": expected += "1, 2, 3, 4, 5>" elif dim == "y": @@ -623,7 +623,7 @@ def test_groupby_repr_datetime(obj) -> None: actual = repr(obj.groupby("t.month")) expected = f"<{obj.__class__.__name__}GroupBy" expected += ", grouped over 1 grouper(s), 12 groups in total:\n" - expected += " 'month': 12 groups with labels " + expected += " 'month': 12/12 groups present with labels " expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>" assert actual == expected @@ -2953,3 +2953,35 @@ def test_groupby_transpose(): second = data.groupby("x").sum() assert_identical(first, second.transpose(*first.dims)) + + +def test_groupby_multiple_bin_grouper_missing_groups(): + from numpy 
import nan + + ds = xr.Dataset( + {"foo": (("z"), np.arange(12))}, + coords={"x": ("z", np.arange(12)), "y": ("z", np.arange(12))}, + ) + + actual = ds.groupby( + x=BinGrouper(np.arange(0, 13, 4)), y=BinGrouper(bins=np.arange(0, 16, 2)) + ).count() + expected = Dataset( + { + "foo": ( + ("x_bins", "y_bins"), + np.array( + [ + [2.0, 2.0, nan, nan, nan, nan, nan], + [nan, nan, 2.0, 2.0, nan, nan, nan], + [nan, nan, nan, nan, 2.0, 1.0, nan], + ] + ), + ) + }, + coords={ + "x_bins": ("x_bins", pd.IntervalIndex.from_breaks(np.arange(0, 13, 4))), + "y_bins": ("y_bins", pd.IntervalIndex.from_breaks(np.arange(0, 16, 2))), + }, + ) + assert_identical(actual, expected)