From aa4b273a4b2e4abaa242a3226714ec6d9de19628 Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 25 Jun 2021 16:54:29 +1000 Subject: [PATCH 1/8] handle in-memory xrarry --- libs/algo/odc/algo/_percentile.py | 25 +++++++++++++++---------- libs/algo/tests/test_percentile.py | 10 ++++++---- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py index 67e2b581c..69d9b28a5 100644 --- a/libs/algo/odc/algo/_percentile.py +++ b/libs/algo/odc/algo/_percentile.py @@ -4,6 +4,7 @@ import numpy as np from ._masking import keep_good_np from dask.base import tokenize +import dask from functools import partial @@ -58,20 +59,24 @@ def xr_percentile( for band, xx in src.data_vars.items(): xx_data = xx.data - if len(xx.chunks[0]) > 1: - xx_data = xx_data.rechunk({0: -1}) + + if dask.is_dask_collection(xx_data): + if len(xx.chunks[0]) > 1: + xx_data = xx_data.rechunk({0: -1}) tk = tokenize(xx_data, percentiles, nodata) for percentile in percentiles: name = f"{band}_pc_{int(100 * percentile)}" - yy = da.map_blocks( - partial(np_percentile, percentile=percentile, nodata=nodata), - xx_data, - drop_axis=0, - meta=np.array([], dtype=xx.dtype), - name=f"{name}-{tk}", - ) - + if dask.is_dask_collection(xx_data): + yy = da.map_blocks( + partial(np_percentile, percentile=percentile, nodata=nodata), + xx_data, + drop_axis=0, + meta=np.array([], dtype=xx.dtype), + name=f"{name}-{tk}", + ) + else: + yy = np_percentile(xx_data, percentile=percentile, nodata=nodata) data_vars[name] = (xx.dims[1:], yy) coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py index cfec7a616..67bfe47b0 100644 --- a/libs/algo/tests/test_percentile.py +++ b/libs/algo/tests/test_percentile.py @@ -55,8 +55,9 @@ def test_np_percentile_bad_data(nodata): np.testing.assert_equal(np_percentile(arr, 0.0, nodata), np.array([nodata, 3])) -@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) #should do -1 -def test_xr_percentile(nodata): +@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) +@pytest.mark.parametrize("use_dask", [False, True]) +def test_xr_percentile(nodata, use_dask): band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) @@ -69,8 +70,9 @@ def test_xr_percentile(nodata): true_results["band_1_pc_60"] = np_percentile(band_1, 0.6, nodata) true_results["band_2_pc_60"] = np_percentile(band_2, 0.6, nodata) - band_1 = da.from_array(band_1, chunks=(2, 20, 20)) - band_2 = da.from_array(band_2, chunks=(2, 20, 20)) + if use_dask: + band_1 = da.from_array(band_1, chunks=(2, 20, 20)) + band_2 = da.from_array(band_2, chunks=(2, 20, 20)) attrs = {"test": "attrs"} coords = { From ffab635486ffdb6553a366a580b90d58e5a300a1 Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 9 Jul 2021 14:20:49 +1000 Subject: [PATCH 2/8] dimension implementation --- libs/algo/odc/algo/_percentile.py | 72 +++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py index 69d9b28a5..50e1ca5d1 100644 --- a/libs/algo/odc/algo/_percentile.py +++ b/libs/algo/odc/algo/_percentile.py @@ -35,9 +35,9 @@ def np_percentile(xx, percentile, nodata): return keep_good_np(xx, (valid_counts >= 3), nodata) -def xr_percentile( +def xr_quantile_bands( src: xr.Dataset, - percentiles: Sequence, + quantiles: Sequence, nodata, ) -> xr.Dataset: @@ -50,7 +50,7 @@ def xr_percentile( float or integer with `nodata` values to indicate gaps in data. `nodata` must be the largest or smallest values in the dataset or NaN. - :param percentiles: A sequence of percentiles in the [0.0, 1.0] range + :param percentiles: A sequence of quantiles in the [0.0, 1.0] range :param nodata: The `nodata` value """ @@ -64,20 +64,76 @@ def xr_percentile( if len(xx.chunks[0]) > 1: xx_data = xx_data.rechunk({0: -1}) - tk = tokenize(xx_data, percentiles, nodata) - for percentile in percentiles: - name = f"{band}_pc_{int(100 * percentile)}" + tk = tokenize(xx_data, quantiles, nodata) + for quantile in quantiles: + name = f"{band}_pc_{int(100 * quantile)}" if dask.is_dask_collection(xx_data): yy = da.map_blocks( - partial(np_percentile, percentile=percentile, nodata=nodata), + partial(np_percentile, percentile=quantile, nodata=nodata), xx_data, drop_axis=0, meta=np.array([], dtype=xx.dtype), name=f"{name}-{tk}", ) else: - yy = np_percentile(xx_data, percentile=percentile, nodata=nodata) + yy = np_percentile(xx_data, percentile=quantile, nodata=nodata) data_vars[name] = (xx.dims[1:], yy) coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) + return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs) + + +def xr_quantile( + src: xr.Dataset, + quantiles: Sequence, + nodata, +) -> xr.Dataset: + + """ + Calculates the percentiles of the input data along the time dimension. + + This approach is approximately 700x faster than the `numpy` and `xarray` nanpercentile functions. + + :param src: xr.Dataset, bands can be either + float or integer with `nodata` values to indicate gaps in data. + `nodata` must be the largest or smallest values in the dataset or NaN. + + :param percentiles: A sequence of quantiles in the [0.0, 1.0] range + + :param nodata: The `nodata` value + """ + + data_vars = {} + for band, xx in src.data_vars.items(): + + xx_data = xx.data + out_dims = ('quantile',) + xx.dims[1:] + + if dask.is_dask_collection(xx_data): + if len(xx.chunks[0]) > 1: + xx_data = xx_data.rechunk({0: -1}) + + tk = tokenize(xx_data, quantiles, nodata) + data = [] + for quantile in quantiles: + name = f"{band}_pc_{int(100 * quantile)}" + if dask.is_dask_collection(xx_data): + yy = da.map_blocks( + partial(np_percentile, percentile=quantile, nodata=nodata), + xx_data, + drop_axis=0, + meta=np.array([], dtype=xx.dtype), + name=f"{name}-{tk}", + ) + else: + yy = np_percentile(xx_data, percentile=quantile, nodata=nodata) + data.append(yy) + + if dask.is_dask_collection(yy): + data_vars[name] = (out_dims, da.stack(data, axis=0)) + else: + data_vars[name] = (out_dims, np.stack(data, axis=0)) + + coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) + coords['quantile'] = np.array(quantiles) return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs) \ No newline at end of file From fdea86b7c74a43dac88be7bd7a44a54ee8891c0d Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 9 Jul 2021 04:38:09 +0000 Subject: [PATCH 3/8] add tests --- libs/algo/odc/algo/_percentile.py | 4 +-- libs/algo/tests/test_percentile.py | 39 +++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py index 50e1ca5d1..9d64c2794 100644 --- a/libs/algo/odc/algo/_percentile.py +++ b/libs/algo/odc/algo/_percentile.py @@ -130,9 +130,9 @@ def xr_quantile( data.append(yy) if dask.is_dask_collection(yy): - data_vars[name] = (out_dims, da.stack(data, axis=0)) + data_vars[band] = (out_dims, da.stack(data, axis=0)) else: - data_vars[name] = (out_dims, np.stack(data, axis=0)) + data_vars[band] = (out_dims, np.stack(data, axis=0)) coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) coords['quantile'] = np.array(quantiles) diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py index 67bfe47b0..c266cfb38 100644 --- a/libs/algo/tests/test_percentile.py +++ b/libs/algo/tests/test_percentile.py @@ -1,4 +1,4 @@ -from odc.algo._percentile import np_percentile, xr_percentile +from odc.algo._percentile import np_percentile, xr_quantile_bands, xr_quantile import numpy as np import pytest import dask.array as da @@ -57,7 +57,7 @@ def test_np_percentile_bad_data(nodata): @pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) @pytest.mark.parametrize("use_dask", [False, True]) -def test_xr_percentile(nodata, use_dask): +def test_xr_quantile_bands(nodata, use_dask): band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) @@ -84,7 +84,40 @@ def test_xr_percentile(nodata, use_dask): data_vars = {"band_1": (("t", "y", "x"), band_1), "band_2": (("t", "y", "x"), band_2)} dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) - output = xr_percentile(dataset, [0.2, 0.6], nodata).compute() + output = xr_quantile_bands(dataset, [0.2, 0.6], nodata).compute() + + for key in output.keys(): + np.testing.assert_equal(output[key], true_results[key]) + + +@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) +@pytest.mark.parametrize("use_dask", [False, True]) +def test_xr_quantile(nodata, use_dask): + band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) + band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata)) + + band_1[np.random.random(size=band_1.shape) > 0.5] = nodata + band_2[np.random.random(size=band_1.shape) > 0.5] = nodata + + true_results = dict() + true_results["band_1"] = np.stack([np_percentile(band_1, 0.2, nodata), np_percentile(band_1, 0.6, nodata)], axis=0) + true_results["band_2"] = np.stack([np_percentile(band_2, 0.2, nodata), np_percentile(band_2, 0.6, nodata)], axis=0) + + if use_dask: + band_1 = da.from_array(band_1, chunks=(2, 20, 20)) + band_2 = da.from_array(band_2, chunks=(2, 20, 20)) + + attrs = {"test": "attrs"} + coords = { + "x": np.linspace(10, 20, band_1.shape[2]), + "y": np.linspace(0, 5, band_1.shape[1]), + "t": np.linspace(0, 5, band_1.shape[0]) + } + + data_vars = {"band_1": (("t", "y", "x"), band_1), "band_2": (("t", "y", "x"), band_2)} + + dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) + output = xr_quantile(dataset, [0.2, 0.6], nodata).compute() for key in output.keys(): np.testing.assert_equal(output[key], true_results[key]) \ No newline at end of file From af0e403a1f7e3f0944fb72c30f9d4bba66052c72 Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 9 Jul 2021 04:39:52 +0000 Subject: [PATCH 4/8] swap functions in stats --- libs/stats/odc/stats/_fc_percentiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/stats/odc/stats/_fc_percentiles.py b/libs/stats/odc/stats/_fc_percentiles.py index fd6e6b9c1..14ffdb5eb 100644 --- a/libs/stats/odc/stats/_fc_percentiles.py +++ b/libs/stats/odc/stats/_fc_percentiles.py @@ -10,7 +10,7 @@ from odc.stats.model import Task from odc.algo.io import load_with_native_transform from odc.algo import keep_good_only -from odc.algo._percentile import xr_percentile +from odc.algo._percentile import xr_quantile_bands from odc.algo._masking import _xr_fuse, _or_fuser, _first_valid_np, _fuse_or_np, _fuse_and_np from .model import StatsPluginInterface from . import _plugins @@ -97,7 +97,7 @@ def reduce(xx: xr.Dataset) -> xr.Dataset: xx = xx.drop_vars(["dry", "wet"]) xx = keep_good_only(xx, mask, nodata=NODATA) - yy = xr_percentile(xx, [0.1, 0.5, 0.9], nodata=NODATA) + yy = xr_quantile_bands(xx, [0.1, 0.5, 0.9], nodata=NODATA) is_ever_wet = _or_fuser(wet).squeeze(wet.dims[0], drop=True) band, *bands = yy.data_vars.keys() From d0491b79cd170bf00bd95021df0324c4edd19300 Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 9 Jul 2021 04:41:07 +0000 Subject: [PATCH 5/8] minor tweaks --- libs/algo/tests/test_percentile.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py index c266cfb38..a151fe270 100644 --- a/libs/algo/tests/test_percentile.py +++ b/libs/algo/tests/test_percentile.py @@ -86,8 +86,8 @@ def test_xr_quantile_bands(nodata, use_dask): dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) output = xr_quantile_bands(dataset, [0.2, 0.6], nodata).compute() - for key in output.keys(): - np.testing.assert_equal(output[key], true_results[key]) + for band in output.keys(): + np.testing.assert_equal(output[band], true_results[band]) @pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) @@ -119,5 +119,5 @@ def test_xr_quantile(nodata, use_dask): dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs) output = xr_quantile(dataset, [0.2, 0.6], nodata).compute() - for key in output.keys(): - np.testing.assert_equal(output[key], true_results[key]) \ No newline at end of file + for band in output.keys(): + np.testing.assert_equal(output[band], true_results[band]) \ No newline at end of file From 81de5d6b7425a86f6feee36f79a9f63c6405d44d Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Fri, 6 Aug 2021 11:16:29 +1000 Subject: [PATCH 6/8] Enable better import of xr_quantile --- libs/algo/odc/algo/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libs/algo/odc/algo/__init__.py b/libs/algo/odc/algo/__init__.py index 083aed8d3..0d8bb4d8d 100644 --- a/libs/algo/odc/algo/__init__.py +++ b/libs/algo/odc/algo/__init__.py @@ -62,6 +62,8 @@ from ._tiff import save_cog +from ._percentile import xr_quantile + __all__ = ( "apply_numexpr", "safe_div", @@ -106,4 +108,5 @@ "colorize", "xr_reproject", "save_cog", + "xr_quantile", ) From 58c283068de6a4e95c31bf619f8a8c486057d8ca Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Mon, 20 Sep 2021 13:58:10 +1000 Subject: [PATCH 7/8] fix attibutes --- libs/algo/odc/algo/_percentile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py index 09771b93d..45906a62c 100644 --- a/libs/algo/odc/algo/_percentile.py +++ b/libs/algo/odc/algo/_percentile.py @@ -77,7 +77,7 @@ def xr_quantile_bands( ) else: yy = np_percentile(xx_data, percentile=quantile, nodata=nodata) - data_vars[name] = (xx.dims[1:], yy) + data_vars[name] = xr.DataArray(yy, dims=xx.dims[1:], attrs=xx.attrs) coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:]) return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs) From 3fe5c61450eba8c2d47d4e2d9aef231c3c9c119c Mon Sep 17 00:00:00 2001 From: Kieran Ricardo Date: Tue, 21 Sep 2021 12:05:09 +1000 Subject: [PATCH 8/8] fix tests --- libs/algo/tests/test_percentile.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py index f8e6f3ce6..f01a736f7 100644 --- a/libs/algo/tests/test_percentile.py +++ b/libs/algo/tests/test_percentile.py @@ -124,8 +124,4 @@ def test_xr_quantile(nodata, use_dask): for band in output.keys(): np.testing.assert_equal(output[band], true_results[band]) - - assert output["band_1_pc_20"].attrs["test_attr"] == 1 - assert output["band_1_pc_60"].attrs["test_attr"] == 1 - assert output["band_2_pc_20"].attrs["test_attr"] == 2 - assert output["band_2_pc_20"].attrs["test_attr"] == 2 + \ No newline at end of file