From 2a3aaab2d2dd8c70df2d668160cb0f09e93a6afe Mon Sep 17 00:00:00 2001
From: Adeel Hassan <ahassan@element84.com>
Date: Fri, 14 Jul 2023 12:23:53 -0400
Subject: [PATCH 1/5] refactor ensure_json_serializable()

- relocate to core.data.utils.misc
- allow serializing of Box objects
- add unit tests
- remove unused functions from core.utils.misc
---
 rastervision_core/rastervision/core/box.py    |  1 +
 .../rastervision/core/data/utils/misc.py      | 21 ++++++++++
 .../evaluation/classification_evaluation.py   | 22 +---------
 .../rastervision/core/utils/misc.py           | 37 -----------------
 tests/core/data/utils/test_misc.py            | 40 ++++++++++++++++++-
 5 files changed, 62 insertions(+), 59 deletions(-)

diff --git a/rastervision_core/rastervision/core/box.py b/rastervision_core/rastervision/core/box.py
index 785842dfc..90c65e37a 100644
--- a/rastervision_core/rastervision/core/box.py
+++ b/rastervision_core/rastervision/core/box.py
@@ -444,6 +444,7 @@ def get_windows(self,
         return windows
 
     def to_dict(self) -> Dict[str, int]:
+        """Convert to a dict with keys: xmin, ymin, xmax, ymax."""
         return {
             'xmin': self.xmin,
             'ymin': self.ymin,
diff --git a/rastervision_core/rastervision/core/data/utils/misc.py b/rastervision_core/rastervision/core/data/utils/misc.py
index e9305922b..c085071fa 100644
--- a/rastervision_core/rastervision/core/data/utils/misc.py
+++ b/rastervision_core/rastervision/core/data/utils/misc.py
@@ -219,3 +219,24 @@ def parse_array_slices_Nd(key: Union[tuple, slice],
     dim_slices[w_dim] = w_slice
 
     return window, dim_slices
+
+
+def ensure_json_serializable(obj: Any) -> Any:
+    """Recursively convert numpy types and Boxes to JSON-serializable ones."""
+    if obj is None or isinstance(obj, (str, int, bool)):
+        return obj
+    if isinstance(obj, dict):
+        return {k: ensure_json_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [ensure_json_serializable(o) for o in obj]
+    if isinstance(obj, np.ndarray):
+        return ensure_json_serializable(obj.tolist())
+    # np.bool_ and np.integer scalars are unwrapped to Python bool and int.
+    if isinstance(obj, (np.integer, np.bool_)):
+        return obj.item()
+    # NaN (Python or numpy) is mapped to None, i.e. JSON null.
+    if isinstance(obj, (float, np.floating)):
+        return None if np.isnan(obj) else float(obj)
+    if isinstance(obj, Box):
+        return ensure_json_serializable(obj.to_dict())
+    return obj
diff --git a/rastervision_core/rastervision/core/evaluation/classification_evaluation.py b/rastervision_core/rastervision/core/evaluation/classification_evaluation.py
index e97e7ad08..07fff5aaf 100644
--- a/rastervision_core/rastervision/core/evaluation/classification_evaluation.py
+++ b/rastervision_core/rastervision/core/evaluation/classification_evaluation.py
@@ -8,6 +8,7 @@
 import numpy as np
 
 from rastervision.pipeline.file_system import str_to_file
+from rastervision.core.data.utils import ensure_json_serializable
 
 if TYPE_CHECKING:
     from rastervision.core.evaluation import ClassEvaluationItem
@@ -151,24 +152,3 @@ def compute(self, ground_truth_labels, prediction_labels):
             prediction_labels: The predicted labels to evaluate.
         """
         pass
-
-
-def ensure_json_serializable(obj: Any) -> dict:
-    """Convert numpy types to JSON serializable equivalents."""
-    if obj is None or isinstance(obj, (str, int, bool)):
-        return obj
-    if isinstance(obj, dict):
-        return {k: ensure_json_serializable(v) for k, v in obj.items()}
-    if isinstance(obj, (list, tuple)):
-        return [ensure_json_serializable(o) for o in obj]
-    if isinstance(obj, np.ndarray):
-        return ensure_json_serializable(obj.tolist())
-    if isinstance(obj, np.integer):
-        return int(obj)
-    if isinstance(obj, float):
-        if np.isnan(obj):
-            return None
-        return float(obj)
-    if isinstance(obj, np.floating):
-        return float(obj)
-    return obj
diff --git a/rastervision_core/rastervision/core/utils/misc.py b/rastervision_core/rastervision/core/utils/misc.py
index 1d108ce95..7e9dafe6a 100644
--- a/rastervision_core/rastervision/core/utils/misc.py
+++ b/rastervision_core/rastervision/core/utils/misc.py
@@ -1,8 +1,5 @@
-import io
 from pydantic import confloat
 
-from PIL import Image
-import numpy as np
 import imageio
 import logging
 
@@ -13,37 +10,3 @@
 
 def save_img(im_array, output_path):
     imageio.imwrite(output_path, im_array)
-
-
-def numpy_to_png(array: np.ndarray) -> str:
-    """Get a PNG string from a Numpy array.
-
-    Args:
-         array: A Numpy array of shape (w, h, 3) or (w, h), where the
-               former is meant to become a three-channel image and the
-               latter a one-channel image.  The dtype of the array
-               should be uint8.
-
-    Returns:
-         str
-
-    """
-    im = Image.fromarray(array)
-    output = io.BytesIO()
-    im.save(output, 'png')
-    return output.getvalue()
-
-
-def png_to_numpy(png: str, dtype=np.uint8) -> np.ndarray:
-    """Get a Numpy array from a PNG string.
-
-    Args:
-         png: A str containing a PNG-formatted image.
-
-    Returns:
-         numpy.ndarray
-
-    """
-    incoming = io.BytesIO(png)
-    im = Image.open(incoming)
-    return np.array(im)
diff --git a/tests/core/data/utils/test_misc.py b/tests/core/data/utils/test_misc.py
index 198a8e79d..e2d0fb1c9 100644
--- a/tests/core/data/utils/test_misc.py
+++ b/tests/core/data/utils/test_misc.py
@@ -1,6 +1,9 @@
-from typing import Any, Tuple
+from typing import Any, Callable, Tuple
 import unittest
 from os.path import join
+import json
+
+import numpy as np
 
 from rastervision.pipeline.file_system.utils import get_tmp_dir, json_to_file
 from rastervision.core.box import Box
@@ -10,6 +13,7 @@
     ChipClassificationGeoJSONStore, ObjectDetectionLabelSource,
     ObjectDetectionGeoJSONStore, SemanticSegmentationLabelSource,
     SemanticSegmentationLabelStore)
+from rastervision.core.data.utils.misc import ensure_json_serializable
 from rastervision.core.data.utils.geojson import geoms_to_geojson
 from rastervision.core.data.utils.misc import (
     match_bboxes, parse_array_slices_2d, parse_array_slices_Nd)
@@ -222,5 +226,39 @@ def test_step(self):
         self.assertListEqual(dim_slices, [slice(0, 60, 2), slice(0, 40, 3)])
 
 
+class TestEnsureJsonSerializable(unittest.TestCase):
+    def assertNoError(self, fn: Callable, msg: str = ''):
+        try:
+            fn()
+        except Exception:
+            self.fail(msg)
+
+    def test_serializable(self):
+        objs = [None, 'str', 1, True, False]
+        for obj in objs:
+            self.assertEqual(ensure_json_serializable(obj), obj, msg=str(obj))
+
+    def test_numpy(self):
+        arr = np.ones(5, dtype=int)
+        self.assertListEqual(ensure_json_serializable(arr), [1] * 5)
+        self.assertIsInstance(ensure_json_serializable(arr[0]), int)
+        arr = np.ones(5, dtype=np.float32)
+        self.assertIsInstance(ensure_json_serializable(arr[0]), float)
+
+    def test_dict(self):
+        arr = {'a': np.ones(5, dtype=int)}
+        self.assertDictEqual(ensure_json_serializable(arr), dict(a=([1] * 5)))
+
+    def test_float_edge_cases(self):
+        d = dict(a=np.nan, b=np.inf, c=-np.inf)
+        d_serializable = ensure_json_serializable(d)
+        self.assertNoError(lambda: json.dumps(d_serializable))
+
+    def test_box(self):
+        box = Box(0, 1, 2, 3)
+        box_dict = dict(ymin=0, xmin=1, ymax=2, xmax=3)
+        self.assertDictEqual(ensure_json_serializable(box), box_dict)
+
+
 if __name__ == '__main__':
     unittest.main()

From 08e06e6f1fdbdd0b6ee389d68c64ebb88b09cebd Mon Sep 17 00:00:00 2001
From: Adeel Hassan <ahassan@element84.com>
Date: Fri, 14 Jul 2023 12:23:45 -0400
Subject: [PATCH 2/5] refactor RasterStats and StatsTransformer

- also add/update unit tests
---
 .../raster_transformer/stats_transformer.py   |  24 +-
 .../rastervision/core/raster_stats.py         | 309 ++++++++++++------
 .../test_stats_transformer.py                 |  56 +++-
 tests/core/data/test_scene.py                 |   4 +
 tests/core/test_raster_stats.py               | 133 ++++++++
 tests/core/test_stats_analyzer.py             |  34 +-
 6 files changed, 417 insertions(+), 143 deletions(-)
 create mode 100644 tests/core/test_raster_stats.py

diff --git a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
index e686f27c6..5e6194eda 100644
--- a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
+++ b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
@@ -3,6 +3,7 @@
 import numpy as np
 
 from rastervision.core.data.raster_transformer import RasterTransformer
+from rastervision.core.raster_stats import RasterStats
 
 if TYPE_CHECKING:
     from rastervision.core.data import RasterSource
@@ -94,7 +95,7 @@ def transform(self,
     @classmethod
     def from_raster_sources(cls,
                             raster_sources: List['RasterSource'],
-                            sample_prob: float = 0.1,
+                            sample_prob: Optional[float] = 0.1,
                             max_stds: float = 3.) -> 'StatsTransformer':
         """Create a StatsTransformer with stats from the given raster sources.
 
@@ -110,9 +111,24 @@ def from_raster_sources(cls,
         Returns:
             StatsTransformer: A StatsTransformer.
         """
-        from rastervision.core.raster_stats import RasterStats
         stats = RasterStats()
         stats.compute(raster_sources=raster_sources, sample_prob=sample_prob)
-        stats_transformer = StatsTransformer(
-            means=stats.means, stds=stats.stds, max_stds=max_stds)
+        stats_transformer = StatsTransformer.from_raster_stats(
+            stats, max_stds=max_stds)
         return stats_transformer
+
+    @classmethod
+    def from_stats_json(cls, uri: str, **kwargs) -> 'StatsTransformer':
+        """Build from a stats JSON file; kwargs go to from_raster_stats()."""
+        stats = RasterStats.load(uri)
+        return cls.from_raster_stats(stats, **kwargs)
+
+    @classmethod
+    def from_raster_stats(cls, stats: RasterStats,
+                          **kwargs) -> 'StatsTransformer':
+        """Build from stats' means and stds; kwargs go to the constructor."""
+        return cls(stats.means, stats.stds, **kwargs)
+
+    @property
+    def stats(self):
+        return RasterStats(self.means, self.stds)
diff --git a/rastervision_core/rastervision/core/raster_stats.py b/rastervision_core/rastervision/core/raster_stats.py
index 1c5aa9bcf..1fea35cbc 100644
--- a/rastervision_core/rastervision/core/raster_stats.py
+++ b/rastervision_core/rastervision/core/raster_stats.py
@@ -1,16 +1,157 @@
-from typing import TYPE_CHECKING, Iterator, Optional, Sequence
-import json
+from typing import (TYPE_CHECKING, Iterable, Iterator, Optional, Sequence,
+                    Tuple, Union)
 
 import numpy as np
 from tqdm.auto import tqdm
 
-from rastervision.pipeline.file_system import str_to_file, file_to_str
+from rastervision.pipeline.file_system import file_to_json, json_to_file
+from rastervision.core.data.utils import ensure_json_serializable
 
 if TYPE_CHECKING:
     from rastervision.core.box import Box
     from rastervision.core.data import RasterSource
 
 
+class RasterStats:
+    """Band-wise means and standard deviations."""
+
+    def __init__(self, means: Optional[np.ndarray] = None,
+                 stds: Optional[np.ndarray] = None,
+                 count: Optional[np.ndarray] = None):
+        """Store the stats, converting array-likes (e.g. lists) to arrays."""
+        self.means = None if means is None else np.asarray(means)
+        self.stds = None if stds is None else np.asarray(stds)
+        self.count = None if count is None else np.asarray(count)
+
+    @classmethod
+    def load(cls, stats_uri: str) -> 'RasterStats':
+        """Load means, stds and (optionally) count from a JSON file."""
+        stats_json = file_to_json(stats_uri)
+        if 'means' not in stats_json or 'stds' not in stats_json:
+            raise KeyError(f'"means" and/or "stds" missing in {stats_uri}.')
+        # cls(), not RasterStats(), so that subclasses load as themselves.
+        return cls(
+            means=stats_json['means'], stds=stats_json['stds'],
+            count=stats_json.get('count'))
+
+    def compute(self,
+                raster_sources: Sequence['RasterSource'],
+                sample_prob: Optional[float] = None,
+                chip_sz: int = 300,
+                stride: Optional[int] = None,
+                nodata_value: Optional[float] = 0) -> None:
+        """Compute the means and stds over all the raster_sources.
+
+        This ignores NODATA values if nodata_value is not None.
+
+        If sample_prob is set, chips are sampled randomly (with replacement)
+        from each scene, which speeds up the computation. The number of chips
+        per scene is sample_prob * (width * height / chip_sz^2), or 1,
+        whichever is greater; a scene with fewer pixels than one chip is read
+        whole. Otherwise, a sliding window over the entire scene is used.
+
+        Args:
+            raster_sources (Sequence['RasterSource']): List of RasterSources.
+            sample_prob (Optional[float]): Chip sampling probability. See
+                notes above. Defaults to None.
+            chip_sz (int): Chip height and width in pixels. Defaults to 300.
+            stride (Optional[int]): Sliding window stride. Only used if
+                sample_prob is None. If None, chip_sz is used.
+            nodata_value (Optional[float]): NODATA value. If set, these pixels
+                will be ignored when computing stats.
+
+        Raises:
+            ValueError: If no stats could be computed from the chips.
+        """
+        if sample_prob is None:
+            stride = chip_sz if stride is None else stride
+            chip_stream = sliding_chip_stream(
+                raster_sources, chip_sz, stride, nodata_value=nodata_value)
+        else:
+            chip_stream = random_chip_stream(
+                raster_sources, chip_sz, sample_prob,
+                nodata_value=nodata_value)
+
+        means, variances, count = self.compute_from_chips(
+            chip_stream,
+            running_mean=self.means,
+            running_var=self.vars,
+            running_count=self.count)
+        if means is None or variances is None:
+            raise ValueError('No valid chips found in raster sources to '
+                             'compute stats from. This may be because all '
+                             'sampled chips were entirely composed of NODATA '
+                             'pixels.')
+        self.means = means
+        self.stds = np.sqrt(variances)
+        self.count = count
+
+    def compute_from_chips(
+            self,
+            chips: Iterable[np.ndarray],
+            running_mean: Optional[np.ndarray] = None,
+            running_var: Optional[np.ndarray] = None,
+            running_count: Optional[np.ndarray] = None) -> Union[Tuple[
+                None, None, None], Tuple[np.ndarray, np.ndarray, np.ndarray]]:
+        """Compute running mean and var from chips in stream."""
+        with tqdm(chips, desc='Analyzing chips') as bar:
+            for chip in bar:
+                num_channels = chip.shape[-1]
+                # (..., H, W, C) --> (... * H * W, C)
+                pixels = chip.reshape(-1, num_channels)
+                stats = self.compute_from_pixels(pixels, running_mean,
+                                                 running_var, running_count)
+                running_mean, running_var, running_count = stats
+
+        return running_mean, running_var, running_count
+
+    def compute_from_pixels(self,
+                            pixels: np.ndarray,
+                            running_mean: Optional[np.ndarray] = None,
+                            running_var: Optional[np.ndarray] = None,
+                            running_count: Optional[np.ndarray] = None
+                            ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """Update running per-channel mean, var and count with new pixels.
+
+        NaN values are ignored. Raises a ValueError if only some of the
+        running stats are given. running_count is not modified in-place.
+        """
+        running_stats = (running_mean, running_var, running_count)
+        num_given = sum(s is not None for s in running_stats)
+        if num_given not in (0, len(running_stats)):
+            raise ValueError('Provide either none or all running stats.')
+
+        channel_means = np.nanmean(pixels, axis=0)
+        channel_vars = np.nanvar(pixels, axis=0)
+        channel_counts = np.sum(~np.isnan(pixels), axis=0)
+        if num_given == 0:
+            return channel_means, channel_vars, channel_counts
+        running_var = parallel_variance(channel_means, channel_counts,
+                                        channel_vars, running_mean,
+                                        running_count, running_var)
+        running_mean = parallel_mean(channel_means, channel_counts,
+                                     running_mean, running_count)
+        # Not +=, which would modify (or, for a list, extend) the input.
+        return running_mean, running_var, running_count + channel_counts
+
+    def to_dict(self) -> dict:
+        stats_dict = dict(means=self.means, stds=self.stds, count=self.count)
+        return stats_dict
+
+    def save(self, stats_uri: str) -> None:
+        """Save stats to file."""
+        assert self.means is not None and self.stds is not None
+        stats_dict = self.to_dict()
+        stats_dict = ensure_json_serializable(stats_dict)
+        json_to_file(stats_dict, stats_uri)
+
+    @property
+    def vars(self) -> Optional[np.ndarray]:
+        if self.stds is None:
+            return None
+        return self.stds**2
+
+
 def parallel_variance(mean_a, count_a, var_a, mean_b, count_b, var_b):
     """Compute the variance based on stats from two partitions of the data.
 
@@ -55,117 +196,67 @@ def parallel_mean(mean_a, count_a, mean_b, count_b):
     return mean
 
 
-class RasterStats():
-    def __init__(self):
-        self.means = None
-        self.stds = None
+def sliding_chip_stream(
+        raster_sources: Iterable['RasterSource'],
+        chip_sz: int,
+        stride: int,
+        nodata_value: Optional[float] = 0) -> Iterator[np.ndarray]:
+    """Get stream of chips using a sliding window."""
+    for raster_source in raster_sources:
+        windows = raster_source.extent.get_windows(chip_sz, stride)
+        for window in windows:
+            chip = get_chip(raster_source, window, nodata_value=nodata_value)
+            if chip is None:
+                continue
+            yield chip
 
-    def compute(self,
-                raster_sources: Sequence['RasterSource'],
-                sample_prob: Optional[float] = None,
-                chip_sz: int = 300,
-                nodata_value: Optional[float] = 0) -> None:
-        """Compute the mean and stds over all the raster_sources.
 
-        This ignores NODATA values if nodata_value is not None.
+def random_chip_stream(
+        raster_sources: Iterable['RasterSource'],
+        chip_sz: int,
+        sample_prob: float,
+        nodata_value: Optional[float] = 0) -> Iterator[np.ndarray]:
+    """Get random stream of chips."""
+    for raster_source in raster_sources:
+        extent = raster_source.extent
+        num_chips_to_sample = get_num_chips_to_sample(extent, chip_sz,
+                                                      sample_prob)
+        if num_chips_to_sample == 0:
+            windows = [extent]
+        else:
+            windows = [
+                extent.make_random_square(chip_sz)
+                for _ in range(num_chips_to_sample)
+            ]
+        for window in windows:
+            chip = get_chip(raster_source, window, nodata_value=nodata_value)
+            if chip is None:
+                continue
+            yield chip
 
-        If sample_prob is set, then a subset of each scene is used to compute
-        stats which speeds up the computation. Roughly speaking, if
-        sample_prob=0.5, then half the pixels in the scene will be used. More
-        precisely, the number of chips is equal to
-        sample_prob * (width * height / 300^2), or 1, whichever is greater.
-        Each chip is uniformly sampled from the scene with replacement.
-        Otherwise, it uses a sliding window over the entire scene to compute
-        stats.
 
-        Args:
-            raster_sources Sequence['RasterSource']: List of RasterSources.
-            sample_prob (Optional[float]): Pixel sampling probability. See
-                notes above. Defaults to None.
-            nodata_value (Optional[float]): NODATA value. If set, these pixels
-                will be ignored when computing stats.
-        """
-        stride = chip_sz
-
-        def get_chip(raster_source: 'RasterSource',
-                     window: 'Box') -> Optional[np.ndarray]:
-            """Return chip or None if all values are NODATA."""
-            chip = raster_source.get_raw_chip(window).astype(float)
-            # Convert shape from [h,w,c] to [c,h*w]
-            chip = chip.reshape(-1, chip.shape[-1])
-
-            if nodata_value is None:
-                return chip
-            else:
-                # Ignore NODATA values.
-                chip[chip == nodata_value] = np.nan
-                has_non_nan_pixels = np.any(~np.isnan(chip))
-                if has_non_nan_pixels:
-                    return chip
-                return None
-
-        def sliding_chip_stream() -> Iterator[np.ndarray]:
-            """Get stream of chips using a sliding window of size 300."""
-            for raster_source in raster_sources:
-                windows = raster_source.extent.get_windows(chip_sz, stride)
-                for window in windows:
-                    chip = get_chip(raster_source, window)
-                    if chip is not None:
-                        yield chip
-
-        def random_chip_stream() -> Iterator[np.ndarray]:
-            """Get random stream of chips."""
-            for raster_source in raster_sources:
-                extent = raster_source.extent
-                num_pixels = extent.area
-                num_chips = round(sample_prob * (num_pixels / (chip_sz**2)))
-                num_chips = max(1, num_chips)
-                for _ in range(num_chips):
-                    window = raster_source.extent.make_random_square(chip_sz)
-                    chip = get_chip(raster_source, window)
-                    if chip is not None:
-                        yield chip
-
-        # For each chip, compute the mean and var of that chip and then update the
-        # running mean and var.
-        count = 0
-        mean = None
-        var = None
-        chip_stream = (sliding_chip_stream()
-                       if sample_prob is None else random_chip_stream())
-        with tqdm(chip_stream, desc='Analyzing chips') as bar:
-            for chip in bar:
-                chip_means = np.nanmean(chip, axis=0)
-                chip_vars = np.nanvar(chip, axis=0)
-                chip_count = np.sum(chip.sum(axis=-1) != np.nan)
-
-                if mean is None or var is None:
-                    mean = np.zeros_like(chip_means)
-                    var = np.zeros_like(chip_vars)
+def get_chip(raster_source: 'RasterSource',
+             window: 'Box',
+             nodata_value: Optional[float] = 0) -> Optional[np.ndarray]:
+    """Return chip or None if all values are NODATA."""
+    chip = raster_source.get_raw_chip(window).astype(float)
 
-                var = parallel_variance(chip_means, chip_count, chip_vars,
-                                        mean, count, var)
-                mean = parallel_mean(chip_means, chip_count, mean, count)
-                count += chip_count
+    if nodata_value is None:
+        return chip
 
-        if mean is None or var is None:
-            raise ValueError(
-                'No chips found in raster sources to compute stats from.')
+    chip[chip == nodata_value] = np.nan
+    all_nan_pixels = np.all(np.isnan(chip))
+    if all_nan_pixels:
+        return None
+    return chip
 
-        self.means = mean
-        self.stds = np.sqrt(var)
 
-    def save(self, stats_uri: str) -> None:
-        # Ensure lists
-        means = list(self.means)
-        stds = list(self.stds)
-        stats = {'means': means, 'stds': stds}
-        str_to_file(json.dumps(stats), stats_uri)
-
-    @staticmethod
-    def load(stats_uri: str) -> None:
-        stats_json = json.loads(file_to_str(stats_uri))
-        stats = RasterStats()
-        stats.means = stats_json['means']
-        stats.stds = stats_json['stds']
-        return stats
+def get_num_chips_to_sample(extent: 'Box', chip_sz: int,
+                            sample_prob: float) -> int:
+    """Return 0 if a chip has more pixels than extent, else a count >= 1."""
+    chip_area = chip_sz**2
+    if chip_area > extent.area:
+        return 0
+    # sample_prob times the number of chip-sized areas the extent holds.
+    num_chips = round(sample_prob * (extent.area / chip_area))
+    return max(1, num_chips)
diff --git a/tests/core/data/raster_transformer/test_stats_transformer.py b/tests/core/data/raster_transformer/test_stats_transformer.py
index ad2b2fd52..865b50f6a 100644
--- a/tests/core/data/raster_transformer/test_stats_transformer.py
+++ b/tests/core/data/raster_transformer/test_stats_transformer.py
@@ -1,11 +1,11 @@
 import unittest
-import os
+from os.path import join
 
 import numpy as np
 
 from rastervision.pipeline.file_system import get_tmp_dir
 from rastervision.core.raster_stats import RasterStats
-from rastervision.core.data import StatsTransformerConfig
+from rastervision.core.data import StatsTransformer, StatsTransformerConfig
 
 
 class MockRVPipelineConfig:
@@ -33,24 +33,48 @@ def test_update_root(self):
         self.assertEqual(cfg.stats_uri,
                          '/path/to/bundle/analyze/stats/group1/stats.json')
 
+    def test_build(self):
+        stats = RasterStats(np.array([1, 2]), np.array([3, 4]))
+
+        with get_tmp_dir() as tmp_dir:
+            stats_uri = join(tmp_dir, 'stats.json')
+            stats.save(stats_uri)
+            tf = StatsTransformerConfig(stats_uri=stats_uri).build()
+            np.testing.assert_array_equal(tf.means, np.array([1, 2]))
+            np.testing.assert_array_equal(tf.stds, np.array([3, 4]))
+
 
 class TestStatsTransformer(unittest.TestCase):
-    def test_stats_transformer(self):
-        raster_stats = RasterStats()
-        raster_stats.means = list(np.ones((4, )))
-        raster_stats.stds = list(np.ones((4, )) * 2)
+    def test_transform(self):
+        # All values have z-score of 1, which translates to
+        # uint8 value of 170.
+        tf = StatsTransformer(np.ones((4, )), np.ones((4, )) * 2)
+        chip = np.ones((2, 2, 4)) * 3
+        out_chip = tf.transform(chip)
+        expected_out_chip = np.ones((2, 2, 4)) * 170
+        np.testing.assert_equal(out_chip, expected_out_chip)
+
+    def test_stats(self):
+        tf = StatsTransformer([1, 2], [3, 4])
+        stats = tf.stats
+        self.assertIsInstance(stats, RasterStats)
+        np.testing.assert_array_equal(stats.means, np.array([1, 2]))
+        np.testing.assert_array_equal(stats.stds, np.array([3, 4]))
+
+    def test_from_raster_stats(self):
+        stats = RasterStats(np.array([1, 2]), np.array([3, 4]))
+        tf = StatsTransformer.from_raster_stats(stats)
+        np.testing.assert_array_equal(tf.means, np.array([1, 2]))
+        np.testing.assert_array_equal(tf.stds, np.array([3, 4]))
 
+    def test_from_stats_json(self):
+        stats = RasterStats(np.array([1, 2]), np.array([3, 4]))
         with get_tmp_dir() as tmp_dir:
-            stats_uri = os.path.join(tmp_dir, 'stats.json')
-            raster_stats.save(stats_uri)
-
-            # All values have z-score of 1, which translates to
-            # uint8 value of 170.
-            transformer = StatsTransformerConfig(stats_uri=stats_uri).build()
-            chip = np.ones((2, 2, 4)) * 3
-            out_chip = transformer.transform(chip)
-            expected_out_chip = np.ones((2, 2, 4)) * 170
-            np.testing.assert_equal(out_chip, expected_out_chip)
+            stats_uri = join(tmp_dir, 'stats.json')
+            stats.save(stats_uri)
+            tf = StatsTransformer.from_stats_json(stats_uri)
+            np.testing.assert_array_equal(tf.means, np.array([1, 2]))
+            np.testing.assert_array_equal(tf.stds, np.array([3, 4]))
 
 
 if __name__ == '__main__':
diff --git a/tests/core/data/test_scene.py b/tests/core/data/test_scene.py
index 2f0d851da..a1373d0d3 100644
--- a/tests/core/data/test_scene.py
+++ b/tests/core/data/test_scene.py
@@ -42,3 +42,7 @@ def test_aoi_polygons(self):
         self.assertListEqual(scene.aoi_polygons, aoi_polygons[:2])
         self.assertListEqual(scene.aoi_polygons_bbox_coords,
                              aoi_polygons_bbox_coords)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/core/test_raster_stats.py b/tests/core/test_raster_stats.py
new file mode 100644
index 000000000..bd175a813
--- /dev/null
+++ b/tests/core/test_raster_stats.py
@@ -0,0 +1,133 @@
+import unittest
+from os.path import join
+
+import numpy as np
+from xarray import DataArray
+
+from rastervision.pipeline.file_system import get_tmp_dir
+from rastervision.core.box import Box
+from rastervision.core.data import IdentityCRSTransformer, XarraySource
+from rastervision.core.raster_stats import (
+    RasterStats, get_num_chips_to_sample, random_chip_stream,
+    sliding_chip_stream, get_chip, parallel_mean, parallel_variance)
+
+
+class TestRasterStats(unittest.TestCase):
+    def test_save_and_load(self):
+        stats = RasterStats(np.array([1, 2]), np.array([3, 4]))
+        with get_tmp_dir() as tmp_dir:
+            stats_uri = join(tmp_dir, 'stats.json')
+            stats.save(stats_uri)
+
+            stats2 = RasterStats.load(stats_uri)
+            np.testing.assert_array_equal(stats2.means, np.array([1, 2]))
+            np.testing.assert_array_equal(stats2.stds, np.array([3, 4]))
+
+    def test_no_valid_chips(self):
+        arr = np.zeros((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        stats = RasterStats()
+        args = dict(raster_sources=[rs], chip_sz=10, stride=10)
+        self.assertRaises(ValueError, lambda: stats.compute(**args))
+        args = dict(raster_sources=[rs], chip_sz=10, sample_prob=1)
+        self.assertRaises(ValueError, lambda: stats.compute(**args))
+
+    def test_compute_from_pixels_validation(self):
+        stats = RasterStats()
+        pixels = np.zeros((5, 3), dtype=np.uint8)
+        running_mean = np.zeros((3, ), dtype=np.uint8)
+        args = dict(pixels=pixels, running_mean=running_mean)
+        self.assertRaises(ValueError,
+                          lambda: stats.compute_from_pixels(**args))
+
+
+class TestUtils(unittest.TestCase):
+    def test_parallel_mean(self):
+        a = np.random.randint(0, 10, size=5)
+        b = np.random.randint(0, 10, size=10)
+        mean = parallel_mean(a.mean(), len(a), b.mean(), len(b))
+        # Floating point: the two results may differ by rounding error.
+        self.assertAlmostEqual(mean, np.concatenate((a, b)).mean())
+
+    def test_parallel_variance(self):
+        a = np.random.randint(0, 10, size=5)
+        b = np.random.randint(0, 10, size=10)
+        var = parallel_variance(
+            a.mean(), len(a), a.var(ddof=1), b.mean(), len(b), b.var(ddof=1))
+        expected_var = np.concatenate((a, b)).var(ddof=1)
+        self.assertAlmostEqual(var, expected_var)
+
+    def test_get_num_chips_to_sample(self):
+        n = get_num_chips_to_sample(
+            extent=Box(0, 0, 1, 1), chip_sz=10, sample_prob=0.1)
+        self.assertEqual(n, 0)
+        n = get_num_chips_to_sample(
+            extent=Box(0, 0, 100, 100), chip_sz=10, sample_prob=0.)
+        self.assertEqual(n, 1)
+        n = get_num_chips_to_sample(
+            extent=Box(0, 0, 100, 100), chip_sz=10, sample_prob=0.1)
+        self.assertEqual(n, 10)
+
+    def test_get_chip(self):
+        arr = np.zeros((20, 20, 4), dtype=np.uint8)
+        arr[:10, :10] = 1
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chip = get_chip(rs, Box(0, 0, 10, 10), nodata_value=0)
+        self.assertIsNotNone(chip)
+        chip = get_chip(rs, Box(9, 9, 20, 20), nodata_value=0)
+        self.assertIsNotNone(chip)
+        chip = get_chip(rs, Box(10, 10, 20, 20), nodata_value=0)
+        self.assertIsNone(chip)
+        chip = get_chip(rs, Box(10, 10, 20, 20), nodata_value=None)
+        self.assertIsNotNone(chip)
+
+    def test_sliding_chip_stream_normal(self):
+        arr = np.ones((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chips = list(sliding_chip_stream([rs], chip_sz=10, stride=10))
+        self.assertEqual(len(chips), 4)
+
+    def test_sliding_chip_stream_all_nodata(self):
+        arr = np.zeros((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chips = list(sliding_chip_stream([rs], chip_sz=10, stride=10))
+        self.assertEqual(len(chips), 0)
+
+    def test_random_chip_stream_normal(self):
+        arr = np.ones((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.5))
+        self.assertEqual(len(chips), 2)
+        chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.))
+        self.assertEqual(len(chips), 1)
+
+    def test_random_chip_stream_all_nodata(self):
+        arr = np.zeros((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.5))
+        self.assertEqual(len(chips), 0)
+
+    def test_random_chip_stream_extent_smaller_than_window(self):
+        arr = np.ones((20, 20, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        chips = list(random_chip_stream([rs], chip_sz=100, sample_prob=0.5))
+        self.assertEqual(len(chips), 1)
+        self.assertEqual(chips[0].shape, (20, 20, 4))
+
+    def test_random_chip_stream_window_overflows_extent(self):
+        arr = np.ones((20, 100, 4), dtype=np.uint8)
+        da = DataArray(arr, dims=['x', 'y', 'band'])
+        rs = XarraySource(da, IdentityCRSTransformer())
+        args = dict(raster_sources=[rs], chip_sz=40, sample_prob=0.5)
+        self.assertRaises(ValueError, lambda: list(random_chip_stream(**args)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/core/test_stats_analyzer.py b/tests/core/test_stats_analyzer.py
index 396558974..921889d0c 100644
--- a/tests/core/test_stats_analyzer.py
+++ b/tests/core/test_stats_analyzer.py
@@ -1,3 +1,4 @@
+from typing import Tuple
 import unittest
 from os.path import join
 
@@ -12,7 +13,8 @@
 chip_sz = 300
 
 
-def make_scene(i: int, is_random: bool = False) -> Scene:
+def make_scene(i: int, is_random: bool = False
+               ) -> Tuple[Scene, MockRasterSource, np.ndarray]:
     rs = MockRasterSource([0, 1, 2], 3)
     img = np.zeros((600, 600, 3))
     img[:, :, 0] = 1 + i
@@ -34,20 +36,25 @@ def tearDown(self):
 
     def _test(self, is_random=False):
         sample_prob = 0.5
-        scenes, raster_sources, imgs = zip(*[make_scene(i) for i in range(3)])
 
-        channel_vals = list(map(lambda x: np.expand_dims(x, axis=0), imgs))
-        channel_vals = np.concatenate(channel_vals, axis=0)
-        channel_vals = np.transpose(channel_vals, [3, 0, 1, 2])
-        channel_vals = np.reshape(channel_vals, (3, -1))
-        exp_means = np.nanmean(channel_vals, axis=1)
-        exp_stds = np.nanstd(channel_vals, axis=1)
+        scenes, raster_sources, imgs = zip(
+            *[make_scene(i, is_random=is_random) for i in range(3)])
+
+        imgs: np.ndarray = np.stack(imgs)
+        pixels = imgs.reshape(-1, 3)
+        exp_means = np.nanmean(pixels, axis=0)
+        exp_stds = np.nanstd(pixels, axis=0)
 
-        analyzer_cfg = StatsAnalyzerConfig(
-            output_uri=self.tmp_dir.name, sample_prob=None)
         if is_random:
             analyzer_cfg = StatsAnalyzerConfig(
-                output_uri=self.tmp_dir.name, sample_prob=sample_prob)
+                output_uri=self.tmp_dir.name,
+                chip_sz=chip_sz,
+                sample_prob=sample_prob)
+        else:
+            analyzer_cfg = StatsAnalyzerConfig(
+                output_uri=self.tmp_dir.name,
+                chip_sz=chip_sz,
+                sample_prob=None)
         analyzer = analyzer_cfg.build()
         analyzer.process(scenes, self.tmp_dir.name)
 
@@ -56,9 +63,8 @@ def _test(self, is_random=False):
         np.testing.assert_array_almost_equal(stats.stds, exp_stds, decimal=3)
         if is_random:
             for rs in raster_sources:
-                height, width = rs.extent.size
-                exp_num_chips = round(
-                    ((width * height) / (chip_sz**2)) * sample_prob)
+                area = rs.extent.area
+                exp_num_chips = round((area / (chip_sz**2)) * sample_prob)
                 self.assertEqual(rs.mock._get_chip.call_count, exp_num_chips)
 
     def test_random(self):

From 54c9a34748dc91d470a0e06bdfa81d749085b332 Mon Sep 17 00:00:00 2001
From: Adeel Hassan <ahassan@element84.com>
Date: Fri, 14 Jul 2023 11:27:02 -0400
Subject: [PATCH 3/5] add a helper func for __repr__'s

---
 rastervision_core/rastervision/core/box.py         | 14 ++++++--------
 .../data/raster_transformer/cast_transformer.py    |  3 ++-
 .../data/raster_transformer/stats_transformer.py   |  5 +++++
 .../rastervision/core/raster_stats.py              |  4 ++++
 .../rastervision/pipeline/utils.py                 |  9 +++++++++
 .../raster_transformer/test_cast_transformer.py    |  4 ++--
 tests/pipeline/test_utils.py                       |  9 ++++++++-
 7 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/rastervision_core/rastervision/core/box.py b/rastervision_core/rastervision/core/box.py
index 90c65e37a..c42bf9137 100644
--- a/rastervision_core/rastervision/core/box.py
+++ b/rastervision_core/rastervision/core/box.py
@@ -8,6 +8,8 @@
 from shapely.geometry import Polygon
 from rasterio.windows import Window as RioWindow
 
+from rastervision.pipeline.utils import repr_with_args
+
 NonNegInt = conint(ge=0)
 
 if TYPE_CHECKING:
@@ -118,11 +120,7 @@ def __getitem__(self, i):
         return self.tuple_format()[i]
 
     def __repr__(self) -> str:
-        arg_keys = ['ymin', 'xmin', 'ymax', 'xmax']
-        arg_vals = [getattr(self, k) for k in arg_keys]
-        arg_strs = [f'{k}={v}' for k, v in zip(arg_keys, arg_vals)]
-        arg_str = ', '.join(arg_strs)
-        return f'{type(self).__name__}({arg_str})'
+        return repr_with_args(self, **self.to_dict())
 
     def __hash__(self) -> int:
         return hash(self.tuple_format())
@@ -444,12 +442,12 @@ def get_windows(self,
         return windows
 
     def to_dict(self) -> Dict[str, int]:
-        """Convert to a dict with keys: xmin, ymin, xmax, ymax."""
+        """Convert to a dict with keys: ymin, xmin, ymax, xmax."""
         return {
-            'xmin': self.xmin,
             'ymin': self.ymin,
+            'xmin': self.xmin,
+            'ymax': self.ymax,
             'xmax': self.xmax,
-            'ymax': self.ymax
         }
 
     @classmethod
diff --git a/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py
index e3f9dcc9e..309bbcce8 100644
--- a/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py
+++ b/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py
@@ -2,6 +2,7 @@
 
 from rastervision.core.data.raster_transformer.raster_transformer \
     import RasterTransformer
+from rastervision.pipeline.utils import repr_with_args
 
 import numpy as np
 
@@ -18,7 +19,7 @@ def __init__(self, to_dtype: str):
         self.to_dtype = np.dtype(to_dtype)
 
     def __repr__(self):
-        return f'CastTransformer(to_dtype="{self.to_dtype}")'
+        return repr_with_args(self, to_dtype=str(self.to_dtype))
 
     def transform(self, chip: np.ndarray,
                   channel_order: Optional[list] = None) -> np.ndarray:
diff --git a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
index 5e6194eda..de47b63c5 100644
--- a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
+++ b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py
@@ -4,6 +4,7 @@
 
 from rastervision.core.data.raster_transformer import RasterTransformer
 from rastervision.core.raster_stats import RasterStats
+from rastervision.pipeline.utils import repr_with_args
 
 if TYPE_CHECKING:
     from rastervision.core.data import RasterSource
@@ -132,3 +133,7 @@ def from_raster_stats(cls, stats: RasterStats,
     @property
     def stats(self):
         return RasterStats(self.means, self.stds)
+
+    def __repr__(self) -> str:  # keys match the attribute names
+        return repr_with_args(
+            self, means=self.means, stds=self.stds, max_stds=self.max_stds)
diff --git a/rastervision_core/rastervision/core/raster_stats.py b/rastervision_core/rastervision/core/raster_stats.py
index 1fea35cbc..05e531927 100644
--- a/rastervision_core/rastervision/core/raster_stats.py
+++ b/rastervision_core/rastervision/core/raster_stats.py
@@ -4,6 +4,7 @@
 import numpy as np
 from tqdm.auto import tqdm
 
+from rastervision.pipeline.utils import repr_with_args
 from rastervision.pipeline.file_system import file_to_json, json_to_file
 from rastervision.core.data.utils import ensure_json_serializable
 
@@ -151,6 +152,9 @@ def vars(self) -> Optional[np.ndarray]:
             return None
         return self.stds**2
 
+    def __repr__(self) -> str:  # ClassName(k=v, ...) from to_dict() items
+        return repr_with_args(self, **self.to_dict())
+
 
 def parallel_variance(mean_a, count_a, var_a, mean_b, count_b, var_b):
     """Compute the variance based on stats from two partitions of the data.
diff --git a/rastervision_pipeline/rastervision/pipeline/utils.py b/rastervision_pipeline/rastervision/pipeline/utils.py
index 68ede8335..1ad6c64eb 100644
--- a/rastervision_pipeline/rastervision/pipeline/utils.py
+++ b/rastervision_pipeline/rastervision/pipeline/utils.py
@@ -1,3 +1,4 @@
+from typing import Any
 import atexit
 import logging
 from math import ceil
@@ -34,3 +35,11 @@ def split_into_groups(lst, num_groups):
     group_sz = max(int(ceil((len(lst)) / num_groups)), 1)
 
     return grouped(lst, group_sz)
+
+
+def repr_with_args(obj: Any, **kwargs) -> str:
+    """Return '<class name>(k1=v1, k2=v2, ...)' with repr()'d values."""
+    class_name = type(obj).__name__
+    pairs = (f'{key}={val!r}' for key, val in kwargs.items())
+    joined = ', '.join(pairs)
+    return f'{class_name}({joined})'
diff --git a/tests/core/data/raster_transformer/test_cast_transformer.py b/tests/core/data/raster_transformer/test_cast_transformer.py
index 853adf7b9..e2c49607d 100644
--- a/tests/core/data/raster_transformer/test_cast_transformer.py
+++ b/tests/core/data/raster_transformer/test_cast_transformer.py
@@ -11,13 +11,13 @@ def test_cast_transformer(self):
         tf = CastTransformerConfig(to_dtype='uint8').build()
         out_chip = tf.transform(in_chip)
         self.assertEqual(out_chip.dtype, np.uint8)
-        self.assertEqual(str(tf), 'CastTransformer(to_dtype="uint8")')
+        self.assertEqual(str(tf), "CastTransformer(to_dtype='uint8')")
 
         in_chip = np.empty((10, 10, 3), dtype=np.uint16)
         tf = CastTransformerConfig(to_dtype='float32').build()
         out_chip = tf.transform(in_chip)
         self.assertEqual(out_chip.dtype, np.float32)
-        self.assertEqual(str(tf), 'CastTransformer(to_dtype="float32")')
+        self.assertEqual(str(tf), "CastTransformer(to_dtype='float32')")
 
 
 if __name__ == '__main__':
diff --git a/tests/pipeline/test_utils.py b/tests/pipeline/test_utils.py
index 43790f6f1..a6364691a 100644
--- a/tests/pipeline/test_utils.py
+++ b/tests/pipeline/test_utils.py
@@ -1,6 +1,6 @@
 import unittest
 
-from rastervision.pipeline.utils import split_into_groups
+from rastervision.pipeline.utils import split_into_groups, repr_with_args
 
 
 class TestUtils(unittest.TestCase):
@@ -19,6 +19,13 @@ def test_split_into_groups(self):
         g4 = split_into_groups(lst, 3)
         self.assertEqual(g4, [[1, 2], [3, 4], [5, 6]])
 
+    def test_repr_with_args(self):
+        """Empty parens without kwargs; kwargs are rendered via repr()."""
+        self.assertEqual(repr_with_args(1), 'int()')
+
+        kwargs = dict(a=1, b='2')
+        self.assertEqual(repr_with_args(kwargs, **kwargs), "dict(a=1, b='2')")
+
 
 if __name__ == '__main__':
     unittest.main()

From 0266a60ceaa9c64e0b991a8845e06bfa97707ebe Mon Sep 17 00:00:00 2001
From: Adeel Hassan <ahassan@element84.com>
Date: Thu, 13 Jul 2023 13:35:52 -0400
Subject: [PATCH 4/5] remove pygeos dependency

No longer needed after shapely and geopandas version upgrades.
---
 rastervision_core/requirements.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rastervision_core/requirements.txt b/rastervision_core/requirements.txt
index 2df3d8c15..5e85a531e 100644
--- a/rastervision_core/requirements.txt
+++ b/rastervision_core/requirements.txt
@@ -1,8 +1,5 @@
 rastervision_pipeline==0.20.3-dev
 
-# These 3 should be updated together to ensure compatibility. Incompatibility
-# results in a warning about pygeos versions.
-pygeos==0.14
 shapely==2.0.1
 geopandas==0.13.2
 

From f4c795296a1e94294052d4334a63d53a4de6e93d Mon Sep 17 00:00:00 2001
From: Adeel Hassan <ahassan@element84.com>
Date: Thu, 13 Jul 2023 15:14:48 -0400
Subject: [PATCH 5/5] make scripts/unit_tests also generate report when using
 coverage

---
 scripts/unit_tests | 2 +-
 tests/README.md    | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/unit_tests b/scripts/unit_tests
index 6b2a002ab..59124676f 100755
--- a/scripts/unit_tests
+++ b/scripts/unit_tests
@@ -28,6 +28,6 @@ else
     if ! [ -x "$(command -v coverage)" ]; then
 	    python -m unittest discover -t "$SRC_DIR" tests -vf
     else
-	    coverage run -m unittest discover -t "$SRC_DIR" tests -vf
+	    coverage run -m unittest discover -t "$SRC_DIR" tests -vf && coverage html
     fi
 fi
diff --git a/tests/README.md b/tests/README.md
index 783ccd021..871469254 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -15,9 +15,11 @@ Every directory in the path of the test file _must_ have an `__init__.py` file f
 ## Running tests
 
 ### Run all unit tests
+Run all unit tests and generate a coverage report (if the `coverage` package is installed):
 ```sh
 scripts/unit_tests
 ```
+
 Or (from the repo root):
 ```sh
 python -m unittest discover -t . tests -vf