Begin testing

dcherian · dcherian · commit c2b83fd9d479 · 2023-06-02T22:18:04.000-06:00
diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py
@@ -14,7 +14,9 @@ def _prepare_for_flox(group_idx, array):
     if issorted:
         ordered_array = array
     else:
-        perm = group_idx.argsort(kind="stable")
+        kind = "stable" if isinstance(group_idx, np.ndarray) else None
+
+        perm = np.argsort(group_idx, kind=kind)
         group_idx = group_idx[..., perm]
         ordered_array = array[..., perm]
     return group_idx, ordered_array
diff --git a/flox/core.py b/flox/core.py
@@ -570,6 +570,8 @@ def factorize_(
                 else:
                     assert sort
                     groups, idx = np.unique(flat, return_inverse=True)
+                    idx[np.isnan(flat)] = -1
+                    groups = groups[~np.isnan(groups)]
 
             found_groups.append(groups)
         factorized.append(idx.reshape(groupvar.shape))
@@ -1261,7 +1263,7 @@ def subset_to_blocks(
     layer = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys}
     graph = HighLevelGraph.from_collections(name, layer, dependencies=[array])
 
-    return dask.array.Array(graph, name, chunks, meta=array)
+    return dask.array.Array(graph, name, chunks, meta=array._meta)
 
 
 def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]:
@@ -1494,6 +1496,7 @@ def dask_groupby_agg(
         reduced,
         inds,
         adjust_chunks=dict(zip(out_inds, output_chunks)),
+        meta=array._meta,
         dtype=agg.dtype["final"],
         key=agg.name,
         name=f"{name}-{token}",
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -24,6 +24,13 @@
 except ImportError:
     xr_types = ()  # type: ignore
 
+try:
+    import cupy as cp
+
+    cp_types = (cp.ndarray,)
+except ImportError:
+    cp_types = ()  # type: ignore
+
 
 def _importorskip(modname, minversion=None):
     try:
@@ -80,6 +87,15 @@ def raise_if_dask_computes(max_computes=0):
     return dask.config.set(scheduler=scheduler)
 
 
+def to_numpy(a):
+    a_np = a
+    if isinstance(a_np, dask_array_type):
+        a_np = a_np.compute()
+    if isinstance(a_np, cp_types):
+        a_np = a_np.get()
+    return a_np
+
+
 def assert_equal(a, b, tolerance=None):
     __tracebackhide__ = True
 
@@ -102,16 +118,20 @@ def assert_equal(a, b, tolerance=None):
     else:
         tolerance = {}
 
-    if has_dask and isinstance(a, dask_array_type) or isinstance(b, dask_array_type):
+    if has_dask and (isinstance(a, dask_array_type) or isinstance(b, dask_array_type)):
         # sometimes it's nice to see values and shapes
         # rather than being dropped into some file in dask
-        np.testing.assert_allclose(a, b, **tolerance)
+        np.testing.assert_allclose(to_numpy(a), to_numpy(b), **tolerance)
         # does some validation of the dask graph
         da.utils.assert_eq(a, b, equal_nan=True)
     else:
         if a.dtype != b.dtype:
             raise AssertionError(f"a and b have different dtypes: (a: {a.dtype}, b: {b.dtype})")
 
+        if isinstance(a, cp_types):
+            a = a.get()
+        if isinstance(b, cp_types):
+            b = b.get()
         np.testing.assert_allclose(a, b, equal_nan=True, **tolerance)
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -9,3 +9,18 @@ def engine(request):
         except ImportError:
             pytest.xfail()
     return request.param
+
+
+@pytest.fixture(scope="module", params=["numpy", "cupy"])
+def array_module(request):
+    if request.param == "cupy":
+        try:
+            import cupy  # noqa
+
+            return cupy
+        except ImportError:
+            pytest.xfail()
+    elif request.param == "numpy":
+        import numpy
+
+        return numpy
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -178,31 +178,53 @@ def test_groupby_reduce(
     assert_equal(expected_result, result)
 
 
-def gen_array_by(size, func):
-    by = np.ones(size[-1])
-    rng = np.random.default_rng(12345)
+def maybe_skip_cupy(array_module, func, engine):
+    if array_module is np:
+        return
+
+    import cupy
+
+    assert array_module is cupy
+
+    if engine == "numba":
+        pytest.skip()
+
+    if engine == "numpy" and ("prod" in func or "first" in func or "last" in func):
+        pytest.xfail()
+    elif engine == "flox" and not (
+        "sum" in func or "mean" in func or "std" in func or "var" in func
+    ):
+        pytest.xfail()
+
+
+def gen_array_by(size, func, array_module):
+    xp = array_module
+    by = xp.ones(size[-1])
+    rng = xp.random.default_rng(12345)
     array = rng.random(size)
     if "nan" in func and "nanarg" not in func:
-        array[[1, 4, 5], ...] = np.nan
+        array[[1, 4, 5], ...] = xp.nan
     elif "nanarg" in func and len(size) > 1:
-        array[[1, 4, 5], 1] = np.nan
+        array[[1, 4, 5], 1] = xp.nan
     if func in ["any", "all"]:
         array = array > 0.5
     return array, by
 
 
-@pytest.mark.parametrize("chunks", [None, -1, 3, 4])
 @pytest.mark.parametrize("nby", [1, 2, 3])
 @pytest.mark.parametrize("size", ((12,), (12, 9)))
-@pytest.mark.parametrize("add_nan_by", [True, False])
+@pytest.mark.parametrize("chunks", [None, -1, 3, 4])
 @pytest.mark.parametrize("func", ALL_FUNCS)
-def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine):
+@pytest.mark.parametrize("add_nan_by", [True, False])
+def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine, array_module):
     if chunks is not None and not has_dask:
         pytest.skip()
     if "arg" in func and engine == "flox":
         pytest.skip()
 
-    array, by = gen_array_by(size, func)
+    maybe_skip_cupy(array_module, func, engine)
+
+    array, by = gen_array_by(size, func, array_module)
     if chunks:
         array = dask.array.from_array(array, chunks=chunks)
     by = (by,) * nby
@@ -254,10 +276,12 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine):
         assert expected.ndim == (array.ndim + nby - 1)
         expected_groups = tuple(np.array([idx + 1.0]) for idx in range(nby))
         for actual_group, expect in zip(groups, expected_groups):
-            assert_equal(actual_group, expect)
+            assert_equal(actual_group, array_module.asarray(expect))
         if "arg" in func:
             assert actual.dtype.kind == "i"
-        assert_equal(actual, expected, tolerance)
+        if chunks is not None:
+            assert isinstance(actual._meta, type(array._meta))
+        assert_equal(actual, array_module.asarray(expected), tolerance)
 
         if not has_dask or chunks is None:
             continue
@@ -287,6 +311,8 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine):
                 assert_equal(actual_group, expect, tolerance)
             if "arg" in func:
                 assert actual.dtype.kind == "i"
+            if chunks is not None:
+                assert isinstance(actual._meta, type(array._meta))
             assert_equal(actual, expected, tolerance)
 
 
@@ -313,18 +339,18 @@ def test_arg_reduction_dtype_is_int(size, func):
     assert actual.dtype.kind == "i"
 
 
-def test_groupby_reduce_count():
-    array = np.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
-    labels = np.array(["a", "b", "b", "b", "c", "c", "c"])
+def test_groupby_reduce_count(array_module):
+    array = array_module.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
+    labels = array_module.array(["a", "b", "b", "b", "c", "c", "c"])
     result, _ = groupby_reduce(array, labels, func="count")
     assert_equal(result, np.array([1, 1, 2], dtype=np.intp))
 
 
-def test_func_is_aggregation():
+def test_func_is_aggregation(array_module):
     from flox.aggregations import mean
 
-    array = np.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
-    labels = np.array(["a", "b", "b", "b", "c", "c", "c"])
+    array = array_module.array([0, 0, np.nan, np.nan, np.nan, 1, 1])
+    labels = array_module.array(["a", "b", "b", "b", "c", "c", "c"])
     expected, _ = groupby_reduce(array, labels, func="mean")
     actual, _ = groupby_reduce(array, labels, func=mean)
     assert_equal(actual, expected)