From aa4b273a4b2e4abaa242a3226714ec6d9de19628 Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Fri, 25 Jun 2021 16:54:29 +1000
Subject: [PATCH 1/8] handle in-memory xrarry

---
 libs/algo/odc/algo/_percentile.py  | 25 +++++++++++++++----------
 libs/algo/tests/test_percentile.py | 10 ++++++----
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py
index 67e2b581c..69d9b28a5 100644
--- a/libs/algo/odc/algo/_percentile.py
+++ b/libs/algo/odc/algo/_percentile.py
@@ -4,6 +4,7 @@
 import numpy as np
 from ._masking import keep_good_np
 from dask.base import tokenize
+import dask
 from functools import partial
 
 
@@ -58,20 +59,24 @@ def xr_percentile(
     for band, xx in src.data_vars.items():
        
         xx_data = xx.data
-        if len(xx.chunks[0]) > 1:
-            xx_data = xx_data.rechunk({0: -1})
+
+        if dask.is_dask_collection(xx_data):
+            if len(xx.chunks[0]) > 1:
+                xx_data = xx_data.rechunk({0: -1})
         
         tk = tokenize(xx_data, percentiles, nodata)
         for percentile in percentiles:
             name = f"{band}_pc_{int(100 * percentile)}"
-            yy = da.map_blocks(
-                partial(np_percentile, percentile=percentile, nodata=nodata), 
-                xx_data, 
-                drop_axis=0, 
-                meta=np.array([], dtype=xx.dtype),
-                name=f"{name}-{tk}",
-            )
-            
+            if dask.is_dask_collection(xx_data):
+                yy = da.map_blocks(
+                    partial(np_percentile, percentile=percentile, nodata=nodata), 
+                    xx_data, 
+                    drop_axis=0, 
+                    meta=np.array([], dtype=xx.dtype),
+                    name=f"{name}-{tk}",
+                )
+            else:
+                yy = np_percentile(xx_data, percentile=percentile, nodata=nodata)
             data_vars[name] = (xx.dims[1:], yy)
 
     coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:])
diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py
index cfec7a616..67bfe47b0 100644
--- a/libs/algo/tests/test_percentile.py
+++ b/libs/algo/tests/test_percentile.py
@@ -55,8 +55,9 @@ def test_np_percentile_bad_data(nodata):
     np.testing.assert_equal(np_percentile(arr, 0.0, nodata), np.array([nodata, 3]))
 
 
-@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1]) #should do -1
-def test_xr_percentile(nodata):
+@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1])
+@pytest.mark.parametrize("use_dask", [False, True])
+def test_xr_percentile(nodata, use_dask):
     band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
     band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
 
@@ -69,8 +70,9 @@ def test_xr_percentile(nodata):
     true_results["band_1_pc_60"] = np_percentile(band_1, 0.6, nodata)
     true_results["band_2_pc_60"] = np_percentile(band_2, 0.6, nodata)
 
-    band_1 = da.from_array(band_1, chunks=(2, 20, 20))
-    band_2 = da.from_array(band_2, chunks=(2, 20, 20))
+    if use_dask:
+        band_1 = da.from_array(band_1, chunks=(2, 20, 20))
+        band_2 = da.from_array(band_2, chunks=(2, 20, 20))
 
     attrs = {"test": "attrs"}
     coords = {

From ffab635486ffdb6553a366a580b90d58e5a300a1 Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieran@Kierans-MBP-2.modem>
Date: Fri, 9 Jul 2021 14:20:49 +1000
Subject: [PATCH 2/8] dimension implementation

---
 libs/algo/odc/algo/_percentile.py | 72 +++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py
index 69d9b28a5..50e1ca5d1 100644
--- a/libs/algo/odc/algo/_percentile.py
+++ b/libs/algo/odc/algo/_percentile.py
@@ -35,9 +35,9 @@ def np_percentile(xx, percentile, nodata):
     return keep_good_np(xx, (valid_counts >= 3), nodata)
 
 
-def xr_percentile(
+def xr_quantile_bands(
     src: xr.Dataset,
-    percentiles: Sequence,
+    quantiles: Sequence,
     nodata,
 ) -> xr.Dataset:
 
@@ -50,7 +50,7 @@ def xr_percentile(
         float or integer with `nodata` values to indicate gaps in data.
         `nodata` must be the largest or smallest values in the dataset or NaN.
 
-    :param percentiles: A sequence of percentiles in the [0.0, 1.0] range
+    :param percentiles: A sequence of quantiles in the [0.0, 1.0] range
 
     :param nodata: The `nodata` value
     """
@@ -64,20 +64,76 @@ def xr_percentile(
             if len(xx.chunks[0]) > 1:
                 xx_data = xx_data.rechunk({0: -1})
         
-        tk = tokenize(xx_data, percentiles, nodata)
-        for percentile in percentiles:
-            name = f"{band}_pc_{int(100 * percentile)}"
+        tk = tokenize(xx_data, quantiles, nodata)
+        for quantile in quantiles:
+            name = f"{band}_pc_{int(100 * quantile)}"
             if dask.is_dask_collection(xx_data):
                 yy = da.map_blocks(
-                    partial(np_percentile, percentile=percentile, nodata=nodata), 
+                    partial(np_percentile, percentile=quantile, nodata=nodata), 
                     xx_data, 
                     drop_axis=0, 
                     meta=np.array([], dtype=xx.dtype),
                     name=f"{name}-{tk}",
                 )
             else:
-                yy = np_percentile(xx_data, percentile=percentile, nodata=nodata)
+                yy = np_percentile(xx_data, percentile=quantile, nodata=nodata)
             data_vars[name] = (xx.dims[1:], yy)
 
     coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:])
+    return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs)
+
+
+def xr_quantile(
+    src: xr.Dataset,
+    quantiles: Sequence,
+    nodata,
+) -> xr.Dataset:
+
+    """
+    Calculates the percentiles of the input data along the time dimension.
+
+    This approach is approximately 700x faster than the `numpy` and `xarray` nanpercentile functions.
+
+    :param src: xr.Dataset, bands can be either
+        float or integer with `nodata` values to indicate gaps in data.
+        `nodata` must be the largest or smallest values in the dataset or NaN.
+
+    :param percentiles: A sequence of quantiles in the [0.0, 1.0] range
+
+    :param nodata: The `nodata` value
+    """
+
+    data_vars = {}
+    for band, xx in src.data_vars.items():
+       
+        xx_data = xx.data
+        out_dims = ('quantile',) + xx.dims[1:]
+
+        if dask.is_dask_collection(xx_data):
+            if len(xx.chunks[0]) > 1:
+                xx_data = xx_data.rechunk({0: -1})
+        
+        tk = tokenize(xx_data, quantiles, nodata)
+        data = []
+        for quantile in quantiles:
+            name = f"{band}_pc_{int(100 * quantile)}"
+            if dask.is_dask_collection(xx_data):
+                yy = da.map_blocks(
+                    partial(np_percentile, percentile=quantile, nodata=nodata), 
+                    xx_data, 
+                    drop_axis=0, 
+                    meta=np.array([], dtype=xx.dtype),
+                    name=f"{name}-{tk}",
+                )
+            else:
+                yy = np_percentile(xx_data, percentile=quantile, nodata=nodata)
+            data.append(yy)
+
+        if dask.is_dask_collection(yy):
+            data_vars[name] = (out_dims, da.stack(data, axis=0))
+        else:
+            data_vars[name] = (out_dims, np.stack(data, axis=0))
+
+    coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:])
+    coords['quantile'] = np.array(quantiles)
     return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs)
\ No newline at end of file

From fdea86b7c74a43dac88be7bd7a44a54ee8891c0d Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Fri, 9 Jul 2021 04:38:09 +0000
Subject: [PATCH 3/8] add tests

---
 libs/algo/odc/algo/_percentile.py  |  4 +--
 libs/algo/tests/test_percentile.py | 39 +++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py
index 50e1ca5d1..9d64c2794 100644
--- a/libs/algo/odc/algo/_percentile.py
+++ b/libs/algo/odc/algo/_percentile.py
@@ -130,9 +130,9 @@ def xr_quantile(
             data.append(yy)
 
         if dask.is_dask_collection(yy):
-            data_vars[name] = (out_dims, da.stack(data, axis=0))
+            data_vars[band] = (out_dims, da.stack(data, axis=0))
         else:
-            data_vars[name] = (out_dims, np.stack(data, axis=0))
+            data_vars[band] = (out_dims, np.stack(data, axis=0))
 
     coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:])
     coords['quantile'] = np.array(quantiles)
diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py
index 67bfe47b0..c266cfb38 100644
--- a/libs/algo/tests/test_percentile.py
+++ b/libs/algo/tests/test_percentile.py
@@ -1,4 +1,4 @@
-from odc.algo._percentile import np_percentile, xr_percentile
+from odc.algo._percentile import np_percentile, xr_quantile_bands, xr_quantile
 import numpy as np
 import pytest
 import dask.array as da
@@ -57,7 +57,7 @@ def test_np_percentile_bad_data(nodata):
 
 @pytest.mark.parametrize("nodata", [255, 200, np.nan, -1])
 @pytest.mark.parametrize("use_dask", [False, True])
-def test_xr_percentile(nodata, use_dask):
+def test_xr_quantile_bands(nodata, use_dask):
     band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
     band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
 
@@ -84,7 +84,40 @@ def test_xr_percentile(nodata, use_dask):
     data_vars = {"band_1": (("t", "y", "x"), band_1), "band_2": (("t", "y", "x"), band_2)}
 
     dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)
-    output = xr_percentile(dataset, [0.2, 0.6], nodata).compute()
+    output = xr_quantile_bands(dataset, [0.2, 0.6], nodata).compute()
+
+    for key in output.keys():
+        np.testing.assert_equal(output[key], true_results[key])
+
+
+@pytest.mark.parametrize("nodata", [255, 200, np.nan, -1])
+@pytest.mark.parametrize("use_dask", [False, True])
+def test_xr_quantile(nodata, use_dask):
+    band_1 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
+    band_2 = np.random.randint(0, 100, size=(10, 100, 200)).astype(type(nodata))
+
+    band_1[np.random.random(size=band_1.shape) > 0.5] = nodata
+    band_2[np.random.random(size=band_1.shape) > 0.5] = nodata
+
+    true_results = dict()
+    true_results["band_1"] = np.stack([np_percentile(band_1, 0.2, nodata), np_percentile(band_1, 0.6, nodata)], axis=0)
+    true_results["band_2"] = np.stack([np_percentile(band_2, 0.2, nodata), np_percentile(band_2, 0.6, nodata)], axis=0)
+
+    if use_dask:
+        band_1 = da.from_array(band_1, chunks=(2, 20, 20))
+        band_2 = da.from_array(band_2, chunks=(2, 20, 20))
+
+    attrs = {"test": "attrs"}
+    coords = {
+        "x": np.linspace(10, 20, band_1.shape[2]), 
+        "y": np.linspace(0, 5, band_1.shape[1]), 
+        "t": np.linspace(0, 5, band_1.shape[0])
+    }
+
+    data_vars = {"band_1": (("t", "y", "x"), band_1), "band_2": (("t", "y", "x"), band_2)}
+
+    dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)
+    output = xr_quantile(dataset, [0.2, 0.6], nodata).compute()
 
     for key in output.keys():
         np.testing.assert_equal(output[key], true_results[key])
\ No newline at end of file

From af0e403a1f7e3f0944fb72c30f9d4bba66052c72 Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Fri, 9 Jul 2021 04:39:52 +0000
Subject: [PATCH 4/8] swap functions in stats

---
 libs/stats/odc/stats/_fc_percentiles.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/stats/odc/stats/_fc_percentiles.py b/libs/stats/odc/stats/_fc_percentiles.py
index fd6e6b9c1..14ffdb5eb 100644
--- a/libs/stats/odc/stats/_fc_percentiles.py
+++ b/libs/stats/odc/stats/_fc_percentiles.py
@@ -10,7 +10,7 @@
 from odc.stats.model import Task
 from odc.algo.io import load_with_native_transform
 from odc.algo import keep_good_only
-from odc.algo._percentile import xr_percentile
+from odc.algo._percentile import xr_quantile_bands
 from odc.algo._masking import _xr_fuse, _or_fuser, _first_valid_np, _fuse_or_np, _fuse_and_np
 from .model import StatsPluginInterface
 from . import _plugins
@@ -97,7 +97,7 @@ def reduce(xx: xr.Dataset) -> xr.Dataset:
         xx = xx.drop_vars(["dry", "wet"])
         xx = keep_good_only(xx, mask, nodata=NODATA)
 
-        yy = xr_percentile(xx, [0.1, 0.5, 0.9], nodata=NODATA)
+        yy = xr_quantile_bands(xx, [0.1, 0.5, 0.9], nodata=NODATA)
         is_ever_wet = _or_fuser(wet).squeeze(wet.dims[0], drop=True)
 
         band, *bands = yy.data_vars.keys()

From d0491b79cd170bf00bd95021df0324c4edd19300 Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Fri, 9 Jul 2021 04:41:07 +0000
Subject: [PATCH 5/8] minor tweaks

---
 libs/algo/tests/test_percentile.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py
index c266cfb38..a151fe270 100644
--- a/libs/algo/tests/test_percentile.py
+++ b/libs/algo/tests/test_percentile.py
@@ -86,8 +86,8 @@ def test_xr_quantile_bands(nodata, use_dask):
     dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)
     output = xr_quantile_bands(dataset, [0.2, 0.6], nodata).compute()
 
-    for key in output.keys():
-        np.testing.assert_equal(output[key], true_results[key])
+    for band in output.keys():
+        np.testing.assert_equal(output[band], true_results[band])
 
 
 @pytest.mark.parametrize("nodata", [255, 200, np.nan, -1])
@@ -119,5 +119,5 @@ def test_xr_quantile(nodata, use_dask):
     dataset = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attrs)
     output = xr_quantile(dataset, [0.2, 0.6], nodata).compute()
 
-    for key in output.keys():
-        np.testing.assert_equal(output[key], true_results[key])
\ No newline at end of file
+    for band in output.keys():
+        np.testing.assert_equal(output[band], true_results[band])
\ No newline at end of file

From 81de5d6b7425a86f6feee36f79a9f63c6405d44d Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Fri, 6 Aug 2021 11:16:29 +1000
Subject: [PATCH 6/8] Enable better import of xr_quantile

---
 libs/algo/odc/algo/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libs/algo/odc/algo/__init__.py b/libs/algo/odc/algo/__init__.py
index 083aed8d3..0d8bb4d8d 100644
--- a/libs/algo/odc/algo/__init__.py
+++ b/libs/algo/odc/algo/__init__.py
@@ -62,6 +62,8 @@
 
 from ._tiff import save_cog
 
+from ._percentile import xr_quantile
+
 __all__ = (
     "apply_numexpr",
     "safe_div",
@@ -106,4 +108,5 @@
     "colorize",
     "xr_reproject",
     "save_cog",
+    "xr_quantile",
 )

From 58c283068de6a4e95c31bf619f8a8c486057d8ca Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Mon, 20 Sep 2021 13:58:10 +1000
Subject: [PATCH 7/8] fix attibutes

---
 libs/algo/odc/algo/_percentile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/algo/odc/algo/_percentile.py b/libs/algo/odc/algo/_percentile.py
index 09771b93d..45906a62c 100644
--- a/libs/algo/odc/algo/_percentile.py
+++ b/libs/algo/odc/algo/_percentile.py
@@ -77,7 +77,7 @@ def xr_quantile_bands(
                 )
             else:
                 yy = np_percentile(xx_data, percentile=quantile, nodata=nodata)
-            data_vars[name] = (xx.dims[1:], yy)
+            data_vars[name] = xr.DataArray(yy, dims=xx.dims[1:], attrs=xx.attrs)
             
     coords = dict((dim, src.coords[dim]) for dim in xx.dims[1:])
     return xr.Dataset(data_vars=data_vars, coords=coords, attrs=src.attrs)

From 3fe5c61450eba8c2d47d4e2d9aef231c3c9c119c Mon Sep 17 00:00:00 2001
From: Kieran Ricardo <kieranricardo@hotmail.com>
Date: Tue, 21 Sep 2021 12:05:09 +1000
Subject: [PATCH 8/8] fix tests

---
 libs/algo/tests/test_percentile.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libs/algo/tests/test_percentile.py b/libs/algo/tests/test_percentile.py
index f8e6f3ce6..f01a736f7 100644
--- a/libs/algo/tests/test_percentile.py
+++ b/libs/algo/tests/test_percentile.py
@@ -124,8 +124,4 @@ def test_xr_quantile(nodata, use_dask):
 
     for band in output.keys():
         np.testing.assert_equal(output[band], true_results[band])
-    
-    assert output["band_1_pc_20"].attrs["test_attr"] == 1
-    assert output["band_1_pc_60"].attrs["test_attr"] == 1
-    assert output["band_2_pc_20"].attrs["test_attr"] == 2
-    assert output["band_2_pc_20"].attrs["test_attr"] == 2
+    
\ No newline at end of file