From 2a3aaab2d2dd8c70df2d668160cb0f09e93a6afe Mon Sep 17 00:00:00 2001 From: Adeel Hassan Date: Fri, 14 Jul 2023 12:23:53 -0400 Subject: [PATCH 1/5] refactor ensure_json_serializable() - relocate to core.data.utils.misc - allow serializing of Box objects - add unit tests - remove unused functions from core.utils.misc --- rastervision_core/rastervision/core/box.py | 1 + .../rastervision/core/data/utils/misc.py | 21 ++++++++++ .../evaluation/classification_evaluation.py | 22 +--------- .../rastervision/core/utils/misc.py | 37 ----------------- tests/core/data/utils/test_misc.py | 40 ++++++++++++++++++- 5 files changed, 62 insertions(+), 59 deletions(-) diff --git a/rastervision_core/rastervision/core/box.py b/rastervision_core/rastervision/core/box.py index 785842dfc..90c65e37a 100644 --- a/rastervision_core/rastervision/core/box.py +++ b/rastervision_core/rastervision/core/box.py @@ -444,6 +444,7 @@ def get_windows(self, return windows def to_dict(self) -> Dict[str, int]: + """Convert to a dict with keys: xmin, ymin, xmax, ymax.""" return { 'xmin': self.xmin, 'ymin': self.ymin, diff --git a/rastervision_core/rastervision/core/data/utils/misc.py b/rastervision_core/rastervision/core/data/utils/misc.py index e9305922b..c085071fa 100644 --- a/rastervision_core/rastervision/core/data/utils/misc.py +++ b/rastervision_core/rastervision/core/data/utils/misc.py @@ -219,3 +219,24 @@ def parse_array_slices_Nd(key: Union[tuple, slice], dim_slices[w_dim] = w_slice return window, dim_slices + + +def ensure_json_serializable(obj: Any) -> dict: + """Convert numpy types to JSON serializable equivalents.""" + if obj is None or isinstance(obj, (str, int, bool)): + return obj + if isinstance(obj, dict): + return {k: ensure_json_serializable(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [ensure_json_serializable(o) for o in obj] + if isinstance(obj, np.ndarray): + return ensure_json_serializable(obj.tolist()) + if isinstance(obj, np.integer): + return 
int(obj) + if isinstance(obj, (float, np.floating)): + if np.isnan(obj): + return None + return float(obj) + if isinstance(obj, Box): + return obj.to_dict() + return obj diff --git a/rastervision_core/rastervision/core/evaluation/classification_evaluation.py b/rastervision_core/rastervision/core/evaluation/classification_evaluation.py index e97e7ad08..07fff5aaf 100644 --- a/rastervision_core/rastervision/core/evaluation/classification_evaluation.py +++ b/rastervision_core/rastervision/core/evaluation/classification_evaluation.py @@ -8,6 +8,7 @@ import numpy as np from rastervision.pipeline.file_system import str_to_file +from rastervision.core.data.utils import ensure_json_serializable if TYPE_CHECKING: from rastervision.core.evaluation import ClassEvaluationItem @@ -151,24 +152,3 @@ def compute(self, ground_truth_labels, prediction_labels): prediction_labels: The predicted labels to evaluate. """ pass - - -def ensure_json_serializable(obj: Any) -> dict: - """Convert numpy types to JSON serializable equivalents.""" - if obj is None or isinstance(obj, (str, int, bool)): - return obj - if isinstance(obj, dict): - return {k: ensure_json_serializable(v) for k, v in obj.items()} - if isinstance(obj, (list, tuple)): - return [ensure_json_serializable(o) for o in obj] - if isinstance(obj, np.ndarray): - return ensure_json_serializable(obj.tolist()) - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, float): - if np.isnan(obj): - return None - return float(obj) - if isinstance(obj, np.floating): - return float(obj) - return obj diff --git a/rastervision_core/rastervision/core/utils/misc.py b/rastervision_core/rastervision/core/utils/misc.py index 1d108ce95..7e9dafe6a 100644 --- a/rastervision_core/rastervision/core/utils/misc.py +++ b/rastervision_core/rastervision/core/utils/misc.py @@ -1,8 +1,5 @@ -import io from pydantic import confloat -from PIL import Image -import numpy as np import imageio import logging @@ -13,37 +10,3 @@ def save_img(im_array, 
output_path): imageio.imwrite(output_path, im_array) - - -def numpy_to_png(array: np.ndarray) -> str: - """Get a PNG string from a Numpy array. - - Args: - array: A Numpy array of shape (w, h, 3) or (w, h), where the - former is meant to become a three-channel image and the - latter a one-channel image. The dtype of the array - should be uint8. - - Returns: - str - - """ - im = Image.fromarray(array) - output = io.BytesIO() - im.save(output, 'png') - return output.getvalue() - - -def png_to_numpy(png: str, dtype=np.uint8) -> np.ndarray: - """Get a Numpy array from a PNG string. - - Args: - png: A str containing a PNG-formatted image. - - Returns: - numpy.ndarray - - """ - incoming = io.BytesIO(png) - im = Image.open(incoming) - return np.array(im) diff --git a/tests/core/data/utils/test_misc.py b/tests/core/data/utils/test_misc.py index 198a8e79d..e2d0fb1c9 100644 --- a/tests/core/data/utils/test_misc.py +++ b/tests/core/data/utils/test_misc.py @@ -1,6 +1,9 @@ -from typing import Any, Tuple +from typing import Any, Callable, Tuple import unittest from os.path import join +import json + +import numpy as np from rastervision.pipeline.file_system.utils import get_tmp_dir, json_to_file from rastervision.core.box import Box @@ -10,6 +13,7 @@ ChipClassificationGeoJSONStore, ObjectDetectionLabelSource, ObjectDetectionGeoJSONStore, SemanticSegmentationLabelSource, SemanticSegmentationLabelStore) +from rastervision.core.data.utils.misc import ensure_json_serializable from rastervision.core.data.utils.geojson import geoms_to_geojson from rastervision.core.data.utils.misc import ( match_bboxes, parse_array_slices_2d, parse_array_slices_Nd) @@ -222,5 +226,39 @@ def test_step(self): self.assertListEqual(dim_slices, [slice(0, 60, 2), slice(0, 40, 3)]) +class TestEnsureJsonSerializable(unittest.TestCase): + def assertNoError(self, fn: Callable, msg: str = ''): + try: + fn() + except Exception: + self.fail(msg) + + def test_serializable(self): + objs = [None, 'str', 1, True, 
False] + for obj in objs: + self.assertEqual(ensure_json_serializable(obj), obj, msg=str(obj)) + + def test_numpy(self): + arr = np.ones(5, dtype=int) + self.assertListEqual(ensure_json_serializable(arr), [1] * 5) + self.assertIsInstance(ensure_json_serializable(arr[0]), int) + arr = np.ones(5, dtype=np.float32) + self.assertIsInstance(ensure_json_serializable(arr[0]), float) + + def test_dict(self): + arr = {'a': np.ones(5, dtype=int)} + self.assertDictEqual(ensure_json_serializable(arr), dict(a=([1] * 5))) + + def test_float_edge_cases(self): + d = dict(a=np.nan, b=np.inf, c=-np.inf) + d_serializable = ensure_json_serializable(d) + self.assertNoError(lambda: json.dumps(d_serializable)) + + def test_box(self): + box = Box(0, 1, 2, 3) + box_dict = dict(ymin=0, xmin=1, ymax=2, xmax=3) + self.assertDictEqual(ensure_json_serializable(box), box_dict) + + if __name__ == '__main__': unittest.main() From 08e06e6f1fdbdd0b6ee389d68c64ebb88b09cebd Mon Sep 17 00:00:00 2001 From: Adeel Hassan Date: Fri, 14 Jul 2023 12:23:45 -0400 Subject: [PATCH 2/5] refactor RasterStats and StatsTransformer - also add/update unit tests --- .../raster_transformer/stats_transformer.py | 24 +- .../rastervision/core/raster_stats.py | 309 ++++++++++++------ .../test_stats_transformer.py | 56 +++- tests/core/data/test_scene.py | 4 + tests/core/test_raster_stats.py | 133 ++++++++ tests/core/test_stats_analyzer.py | 34 +- 6 files changed, 417 insertions(+), 143 deletions(-) create mode 100644 tests/core/test_raster_stats.py diff --git a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py index e686f27c6..5e6194eda 100644 --- a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py +++ b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py @@ -3,6 +3,7 @@ import numpy as np from rastervision.core.data.raster_transformer import 
RasterTransformer +from rastervision.core.raster_stats import RasterStats if TYPE_CHECKING: from rastervision.core.data import RasterSource @@ -94,7 +95,7 @@ def transform(self, @classmethod def from_raster_sources(cls, raster_sources: List['RasterSource'], - sample_prob: float = 0.1, + sample_prob: Optional[float] = 0.1, max_stds: float = 3.) -> 'StatsTransformer': """Create a StatsTransformer with stats from the given raster sources. @@ -110,9 +111,24 @@ def from_raster_sources(cls, Returns: StatsTransformer: A StatsTransformer. """ - from rastervision.core.raster_stats import RasterStats stats = RasterStats() stats.compute(raster_sources=raster_sources, sample_prob=sample_prob) - stats_transformer = StatsTransformer( - means=stats.means, stds=stats.stds, max_stds=max_stds) + stats_transformer = StatsTransformer.from_raster_stats( + stats, max_stds=max_stds) return stats_transformer + + @classmethod + def from_stats_json(cls, uri: str, **kwargs) -> 'StatsTransformer': + stats = RasterStats.load(uri) + stats_transformer = StatsTransformer.from_raster_stats(stats, **kwargs) + return stats_transformer + + @classmethod + def from_raster_stats(cls, stats: RasterStats, + **kwargs) -> 'StatsTransformer': + stats_transformer = StatsTransformer(stats.means, stats.stds, **kwargs) + return stats_transformer + + @property + def stats(self): + return RasterStats(self.means, self.stds) diff --git a/rastervision_core/rastervision/core/raster_stats.py b/rastervision_core/rastervision/core/raster_stats.py index 1c5aa9bcf..1fea35cbc 100644 --- a/rastervision_core/rastervision/core/raster_stats.py +++ b/rastervision_core/rastervision/core/raster_stats.py @@ -1,16 +1,157 @@ -from typing import TYPE_CHECKING, Iterator, Optional, Sequence -import json +from typing import (TYPE_CHECKING, Iterable, Iterator, Optional, Sequence, + Tuple, Union) import numpy as np from tqdm.auto import tqdm -from rastervision.pipeline.file_system import str_to_file, file_to_str +from 
rastervision.pipeline.file_system import file_to_json, json_to_file +from rastervision.core.data.utils import ensure_json_serializable if TYPE_CHECKING: from rastervision.core.box import Box from rastervision.core.data import RasterSource +class RasterStats: + """Band-wise means and standard deviations.""" + + def __init__(self, + means: Optional[np.ndarray] = None, + stds: Optional[np.ndarray] = None, + count: Optional[np.ndarray] = None): + self.means = means + self.stds = stds + self.count = count + + @classmethod + def load(cls, stats_uri: str) -> 'RasterStats': + """Load stats from file.""" + stats_json = file_to_json(stats_uri) + assert 'means' in stats_json and 'stds' in stats_json + stats = RasterStats( + means=stats_json['means'], + stds=stats_json['stds'], + count=stats_json.get('count')) + return stats + + def compute(self, + raster_sources: Sequence['RasterSource'], + sample_prob: Optional[float] = None, + chip_sz: int = 300, + stride: Optional[int] = None, + nodata_value: Optional[float] = 0) -> None: + """Compute the mean and stds over all the raster_sources. + + This ignores NODATA values if nodata_value is not None. + + If sample_prob is set, then a subset of each scene is used to compute + stats which speeds up the computation. Roughly speaking, if + sample_prob=0.5, then half the pixels in the scene will be used. More + precisely, the number of chips is equal to + sample_prob * (width * height / 300^2), or 1, whichever is greater. + Each chip is uniformly sampled from the scene with replacement. + Otherwise, it uses a sliding window over the entire scene to compute + stats. + + Args: + raster_sources Sequence['RasterSource']: List of RasterSources. + sample_prob (Optional[float]): Pixel sampling probability. See + notes above. Defaults to None. + nodata_value (Optional[float]): NODATA value. If set, these pixels + will be ignored when computing stats. 
+ """ + if sample_prob is None: + if stride is None: + stride = chip_sz + chip_stream = sliding_chip_stream( + raster_sources, chip_sz, stride, nodata_value=nodata_value) + else: + chip_stream = random_chip_stream( + raster_sources, + chip_sz, + sample_prob, + nodata_value=nodata_value) + + means, vars, count = self.compute_from_chips( + chip_stream, + running_mean=self.means, + running_var=self.vars, + running_count=self.count) + if means is None or vars is None: + raise ValueError('No valid chips found in raster sources to ' + 'compute stats from. This may be because all ' + 'sampled chips were entirely composed of NODATA ' + 'pixels.') + self.means = means + self.stds = np.sqrt(vars) + self.count = count + + def compute_from_chips( + self, + chips: Iterable[np.ndarray], + running_mean: Optional[np.ndarray] = None, + running_var: Optional[np.ndarray] = None, + running_count: Optional[np.ndarray] = None) -> Union[Tuple[ + None, None, None], Tuple[np.ndarray, np.ndarray, np.ndarray]]: + """Compute running mean and var from chips in stream.""" + with tqdm(chips, desc='Analyzing chips') as bar: + for chip in bar: + num_channels = chip.shape[-1] + # (..., H, W, C) --> (... 
* H * W, C) + pixels = chip.reshape(-1, num_channels) + stats = self.compute_from_pixels(pixels, running_mean, + running_var, running_count) + running_mean, running_var, running_count = stats + + return running_mean, running_var, running_count + + def compute_from_pixels(self, + pixels: np.ndarray, + running_mean: Optional[np.ndarray] = None, + running_var: Optional[np.ndarray] = None, + running_count: Optional[np.ndarray] = None + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Update running mean and var from pixel values.""" + running_stats = [running_mean, running_var, running_count] + has_running_stats = any(s is not None for s in running_stats) + has_all_running_stats = all(s is not None for s in running_stats) + if has_running_stats and not has_all_running_stats: + raise ValueError('Provide either none or all running stats.') + + channel_means = np.nanmean(pixels, axis=0) + channel_vars = np.nanvar(pixels, axis=0) + channel_counts = np.sum(~np.isnan(pixels), axis=0) + + if not has_running_stats: + return channel_means, channel_vars, channel_counts + + running_var = parallel_variance(channel_means, channel_counts, + channel_vars, running_mean, + running_count, running_var) + running_mean = parallel_mean(channel_means, channel_counts, + running_mean, running_count) + running_count += channel_counts + + return running_mean, running_var, running_count + + def to_dict(self) -> dict: + stats_dict = dict(means=self.means, stds=self.stds, count=self.count) + return stats_dict + + def save(self, stats_uri: str) -> None: + """Save stats to file.""" + assert self.means is not None and self.stds is not None + stats_dict = self.to_dict() + stats_dict = ensure_json_serializable(stats_dict) + json_to_file(stats_dict, stats_uri) + + @property + def vars(self) -> Optional[np.ndarray]: + if self.stds is None: + return None + return self.stds**2 + + def parallel_variance(mean_a, count_a, var_a, mean_b, count_b, var_b): """Compute the variance based on stats from two 
partitions of the data. @@ -55,117 +196,67 @@ def parallel_mean(mean_a, count_a, mean_b, count_b): return mean -class RasterStats(): - def __init__(self): - self.means = None - self.stds = None +def sliding_chip_stream( + raster_sources: Iterable['RasterSource'], + chip_sz: int, + stride: int, + nodata_value: Optional[float] = 0) -> Iterator[np.ndarray]: + """Get stream of chips using a sliding window.""" + for raster_source in raster_sources: + windows = raster_source.extent.get_windows(chip_sz, stride) + for window in windows: + chip = get_chip(raster_source, window, nodata_value=nodata_value) + if chip is None: + continue + yield chip - def compute(self, - raster_sources: Sequence['RasterSource'], - sample_prob: Optional[float] = None, - chip_sz: int = 300, - nodata_value: Optional[float] = 0) -> None: - """Compute the mean and stds over all the raster_sources. - This ignores NODATA values if nodata_value is not None. +def random_chip_stream( + raster_sources: Iterable['RasterSource'], + chip_sz: int, + sample_prob: float, + nodata_value: Optional[float] = 0) -> Iterator[np.ndarray]: + """Get random stream of chips.""" + for raster_source in raster_sources: + extent = raster_source.extent + num_chips_to_sample = get_num_chips_to_sample(extent, chip_sz, + sample_prob) + if num_chips_to_sample == 0: + windows = [extent] + else: + windows = [ + extent.make_random_square(chip_sz) + for _ in range(num_chips_to_sample) + ] + for window in windows: + chip = get_chip(raster_source, window, nodata_value=nodata_value) + if chip is None: + continue + yield chip - If sample_prob is set, then a subset of each scene is used to compute - stats which speeds up the computation. Roughly speaking, if - sample_prob=0.5, then half the pixels in the scene will be used. More - precisely, the number of chips is equal to - sample_prob * (width * height / 300^2), or 1, whichever is greater. - Each chip is uniformly sampled from the scene with replacement. 
- Otherwise, it uses a sliding window over the entire scene to compute - stats. - Args: - raster_sources Sequence['RasterSource']: List of RasterSources. - sample_prob (Optional[float]): Pixel sampling probability. See - notes above. Defaults to None. - nodata_value (Optional[float]): NODATA value. If set, these pixels - will be ignored when computing stats. - """ - stride = chip_sz - - def get_chip(raster_source: 'RasterSource', - window: 'Box') -> Optional[np.ndarray]: - """Return chip or None if all values are NODATA.""" - chip = raster_source.get_raw_chip(window).astype(float) - # Convert shape from [h,w,c] to [c,h*w] - chip = chip.reshape(-1, chip.shape[-1]) - - if nodata_value is None: - return chip - else: - # Ignore NODATA values. - chip[chip == nodata_value] = np.nan - has_non_nan_pixels = np.any(~np.isnan(chip)) - if has_non_nan_pixels: - return chip - return None - - def sliding_chip_stream() -> Iterator[np.ndarray]: - """Get stream of chips using a sliding window of size 300.""" - for raster_source in raster_sources: - windows = raster_source.extent.get_windows(chip_sz, stride) - for window in windows: - chip = get_chip(raster_source, window) - if chip is not None: - yield chip - - def random_chip_stream() -> Iterator[np.ndarray]: - """Get random stream of chips.""" - for raster_source in raster_sources: - extent = raster_source.extent - num_pixels = extent.area - num_chips = round(sample_prob * (num_pixels / (chip_sz**2))) - num_chips = max(1, num_chips) - for _ in range(num_chips): - window = raster_source.extent.make_random_square(chip_sz) - chip = get_chip(raster_source, window) - if chip is not None: - yield chip - - # For each chip, compute the mean and var of that chip and then update the - # running mean and var. 
- count = 0 - mean = None - var = None - chip_stream = (sliding_chip_stream() - if sample_prob is None else random_chip_stream()) - with tqdm(chip_stream, desc='Analyzing chips') as bar: - for chip in bar: - chip_means = np.nanmean(chip, axis=0) - chip_vars = np.nanvar(chip, axis=0) - chip_count = np.sum(chip.sum(axis=-1) != np.nan) - - if mean is None or var is None: - mean = np.zeros_like(chip_means) - var = np.zeros_like(chip_vars) +def get_chip(raster_source: 'RasterSource', + window: 'Box', + nodata_value: Optional[float] = 0) -> Optional[np.ndarray]: + """Return chip or None if all values are NODATA.""" + chip = raster_source.get_raw_chip(window).astype(float) - var = parallel_variance(chip_means, chip_count, chip_vars, - mean, count, var) - mean = parallel_mean(chip_means, chip_count, mean, count) - count += chip_count + if nodata_value is None: + return chip - if mean is None or var is None: - raise ValueError( - 'No chips found in raster sources to compute stats from.') + chip[chip == nodata_value] = np.nan + all_nan_pixels = np.all(np.isnan(chip)) + if all_nan_pixels: + return None + return chip - self.means = mean - self.stds = np.sqrt(var) - def save(self, stats_uri: str) -> None: - # Ensure lists - means = list(self.means) - stds = list(self.stds) - stats = {'means': means, 'stds': stds} - str_to_file(json.dumps(stats), stats_uri) - - @staticmethod - def load(stats_uri: str) -> None: - stats_json = json.loads(file_to_str(stats_uri)) - stats = RasterStats() - stats.means = stats_json['means'] - stats.stds = stats_json['stds'] - return stats +def get_num_chips_to_sample(extent: 'Box', chip_sz: int, + sample_prob: float) -> int: + num_pixels_total = extent.area + num_pixels_per_chip = chip_sz**2 + if num_pixels_per_chip > num_pixels_total: + return 0 + num_chips_total = (num_pixels_total / num_pixels_per_chip) + num_chips_to_sample = round(sample_prob * num_chips_total) + return max(1, num_chips_to_sample) diff --git 
a/tests/core/data/raster_transformer/test_stats_transformer.py b/tests/core/data/raster_transformer/test_stats_transformer.py index ad2b2fd52..865b50f6a 100644 --- a/tests/core/data/raster_transformer/test_stats_transformer.py +++ b/tests/core/data/raster_transformer/test_stats_transformer.py @@ -1,11 +1,11 @@ import unittest -import os +from os.path import join import numpy as np from rastervision.pipeline.file_system import get_tmp_dir from rastervision.core.raster_stats import RasterStats -from rastervision.core.data import StatsTransformerConfig +from rastervision.core.data import StatsTransformer, StatsTransformerConfig class MockRVPipelineConfig: @@ -33,24 +33,48 @@ def test_update_root(self): self.assertEqual(cfg.stats_uri, '/path/to/bundle/analyze/stats/group1/stats.json') + def test_build(self): + stats = RasterStats(np.array([1, 2]), np.array([3, 4])) + + with get_tmp_dir() as tmp_dir: + stats_uri = join(tmp_dir, 'stats.json') + stats.save(stats_uri) + tf = StatsTransformerConfig(stats_uri=stats_uri).build() + np.testing.assert_array_equal(tf.means, np.array([1, 2])) + np.testing.assert_array_equal(tf.stds, np.array([3, 4])) + class TestStatsTransformer(unittest.TestCase): - def test_stats_transformer(self): - raster_stats = RasterStats() - raster_stats.means = list(np.ones((4, ))) - raster_stats.stds = list(np.ones((4, )) * 2) + def test_transform(self): + # All values have z-score of 1, which translates to + # uint8 value of 170. 
+ tf = StatsTransformer(np.ones((4, )), np.ones((4, )) * 2) + chip = np.ones((2, 2, 4)) * 3 + out_chip = tf.transform(chip) + expected_out_chip = np.ones((2, 2, 4)) * 170 + np.testing.assert_equal(out_chip, expected_out_chip) + + def test_stats(self): + tf = StatsTransformer([1, 2], [3, 4]) + stats = tf.stats + self.assertIsInstance(stats, RasterStats) + np.testing.assert_array_equal(stats.means, np.array([1, 2])) + np.testing.assert_array_equal(stats.stds, np.array([3, 4])) + + def test_from_raster_stats(self): + stats = RasterStats(np.array([1, 2]), np.array([3, 4])) + tf = StatsTransformer.from_raster_stats(stats) + np.testing.assert_array_equal(tf.means, np.array([1, 2])) + np.testing.assert_array_equal(tf.stds, np.array([3, 4])) + def test_from_stats_json(self): + stats = RasterStats(np.array([1, 2]), np.array([3, 4])) with get_tmp_dir() as tmp_dir: - stats_uri = os.path.join(tmp_dir, 'stats.json') - raster_stats.save(stats_uri) - - # All values have z-score of 1, which translates to - # uint8 value of 170. 
- transformer = StatsTransformerConfig(stats_uri=stats_uri).build() - chip = np.ones((2, 2, 4)) * 3 - out_chip = transformer.transform(chip) - expected_out_chip = np.ones((2, 2, 4)) * 170 - np.testing.assert_equal(out_chip, expected_out_chip) + stats_uri = join(tmp_dir, 'stats.json') + stats.save(stats_uri) + tf = StatsTransformer.from_stats_json(stats_uri) + np.testing.assert_array_equal(tf.means, np.array([1, 2])) + np.testing.assert_array_equal(tf.stds, np.array([3, 4])) if __name__ == '__main__': diff --git a/tests/core/data/test_scene.py b/tests/core/data/test_scene.py index 2f0d851da..a1373d0d3 100644 --- a/tests/core/data/test_scene.py +++ b/tests/core/data/test_scene.py @@ -42,3 +42,7 @@ def test_aoi_polygons(self): self.assertListEqual(scene.aoi_polygons, aoi_polygons[:2]) self.assertListEqual(scene.aoi_polygons_bbox_coords, aoi_polygons_bbox_coords) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/core/test_raster_stats.py b/tests/core/test_raster_stats.py new file mode 100644 index 000000000..bd175a813 --- /dev/null +++ b/tests/core/test_raster_stats.py @@ -0,0 +1,133 @@ +import unittest +from os.path import join + +import numpy as np +from xarray import DataArray + +from rastervision.pipeline.file_system import get_tmp_dir +from rastervision.core.box import Box +from rastervision.core.data import IdentityCRSTransformer, XarraySource +from rastervision.core.raster_stats import ( + RasterStats, get_num_chips_to_sample, random_chip_stream, + sliding_chip_stream, get_chip, parallel_mean, parallel_variance) + + +class TestRasterStats(unittest.TestCase): + def test_save_and_load(self): + stats = RasterStats(np.array([1, 2]), np.array([3, 4])) + with get_tmp_dir() as tmp_dir: + stats_uri = join(tmp_dir, 'stats.json') + stats.save(stats_uri) + + stats2 = RasterStats.load(stats_uri) + np.testing.assert_array_equal(stats2.means, np.array([1, 2])) + np.testing.assert_array_equal(stats2.stds, np.array([3, 4])) + + def test_no_valid_chips(self): 
+ arr = np.zeros((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + stats = RasterStats() + args = dict(raster_sources=[rs], chip_sz=10, stride=10) + self.assertRaises(ValueError, lambda: stats.compute(**args)) + args = dict(raster_sources=[rs], chip_sz=10, sample_prob=1) + self.assertRaises(ValueError, lambda: stats.compute(**args)) + + def test_compute_from_pixels_validation(self): + stats = RasterStats() + pixels = np.zeros((5, 3), dtype=np.uint8) + running_mean = np.zeros((3, ), dtype=np.uint8) + args = dict(pixels=pixels, running_mean=running_mean) + self.assertRaises(ValueError, + lambda: stats.compute_from_pixels(**args)) + + +class TestUtils(unittest.TestCase): + def test_parallel_mean(self): + a = np.random.randint(0, 10, size=5) + b = np.random.randint(0, 10, size=10) + mean = parallel_mean(a.mean(), len(a), b.mean(), len(b)) + expected_mean = np.concatenate((a, b)).mean() + self.assertEqual(mean, expected_mean) + + def test_parallel_variance(self): + a = np.random.randint(0, 10, size=5) + b = np.random.randint(0, 10, size=10) + var = parallel_variance( + a.mean(), len(a), a.var(ddof=1), b.mean(), len(b), b.var(ddof=1)) + expected_var = np.concatenate((a, b)).var(ddof=1) + self.assertAlmostEqual(var, expected_var) + + def test_get_num_chips_to_sample(self): + n = get_num_chips_to_sample( + extent=Box(0, 0, 1, 1), chip_sz=10, sample_prob=0.1) + self.assertEqual(n, 0) + n = get_num_chips_to_sample( + extent=Box(0, 0, 100, 100), chip_sz=10, sample_prob=0.) 
+ self.assertEqual(n, 1) + n = get_num_chips_to_sample( + extent=Box(0, 0, 100, 100), chip_sz=10, sample_prob=0.1) + self.assertEqual(n, 10) + + def test_get_chip(self): + arr = np.zeros((20, 20, 4), dtype=np.uint8) + arr[:10, :10] = 1 + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chip = get_chip(rs, Box(0, 0, 10, 10), nodata_value=0) + self.assertIsNotNone(chip) + chip = get_chip(rs, Box(9, 9, 20, 20), nodata_value=0) + self.assertIsNotNone(chip) + chip = get_chip(rs, Box(10, 10, 20, 20), nodata_value=0) + self.assertIsNone(chip) + chip = get_chip(rs, Box(10, 10, 20, 20), nodata_value=None) + self.assertIsNotNone(chip) + + def test_sliding_chip_stream_normal(self): + arr = np.ones((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chips = list(sliding_chip_stream([rs], chip_sz=10, stride=10)) + self.assertEqual(len(chips), 4) + + def test_sliding_chip_stream_all_nodata(self): + arr = np.zeros((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chips = list(sliding_chip_stream([rs], chip_sz=10, stride=10)) + self.assertEqual(len(chips), 0) + + def test_random_chip_stream_normal(self): + arr = np.ones((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.5)) + self.assertEqual(len(chips), 2) + chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.)) + self.assertEqual(len(chips), 1) + + def test_random_chip_stream_all_nodata(self): + arr = np.zeros((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chips = list(random_chip_stream([rs], chip_sz=10, sample_prob=0.5)) + self.assertEqual(len(chips), 0) + + def 
test_random_chip_stream_extent_smaller_than_window(self): + arr = np.ones((20, 20, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + chips = list(random_chip_stream([rs], chip_sz=100, sample_prob=0.5)) + self.assertEqual(len(chips), 1) + self.assertEqual(chips[0].shape, (20, 20, 4)) + + def test_random_chip_stream_window_overflows_extent(self): + arr = np.ones((20, 100, 4), dtype=np.uint8) + da = DataArray(arr, dims=['x', 'y', 'band']) + rs = XarraySource(da, IdentityCRSTransformer()) + args = dict(raster_sources=[rs], chip_sz=40, sample_prob=0.5) + self.assertRaises(ValueError, lambda: list(random_chip_stream(**args))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/core/test_stats_analyzer.py b/tests/core/test_stats_analyzer.py index 396558974..921889d0c 100644 --- a/tests/core/test_stats_analyzer.py +++ b/tests/core/test_stats_analyzer.py @@ -1,3 +1,4 @@ +from typing import Tuple import unittest from os.path import join @@ -12,7 +13,8 @@ chip_sz = 300 -def make_scene(i: int, is_random: bool = False) -> Scene: +def make_scene(i: int, is_random: bool = False + ) -> Tuple[Scene, MockRasterSource, np.ndarray]: rs = MockRasterSource([0, 1, 2], 3) img = np.zeros((600, 600, 3)) img[:, :, 0] = 1 + i @@ -34,20 +36,25 @@ def tearDown(self): def _test(self, is_random=False): sample_prob = 0.5 - scenes, raster_sources, imgs = zip(*[make_scene(i) for i in range(3)]) - channel_vals = list(map(lambda x: np.expand_dims(x, axis=0), imgs)) - channel_vals = np.concatenate(channel_vals, axis=0) - channel_vals = np.transpose(channel_vals, [3, 0, 1, 2]) - channel_vals = np.reshape(channel_vals, (3, -1)) - exp_means = np.nanmean(channel_vals, axis=1) - exp_stds = np.nanstd(channel_vals, axis=1) + scenes, raster_sources, imgs = zip( + *[make_scene(i, is_random=is_random) for i in range(3)]) + + imgs: np.ndarray = np.stack(imgs) + pixels = imgs.reshape(-1, 3) + exp_means = np.nanmean(pixels, 
axis=0) + exp_stds = np.nanstd(pixels, axis=0) - analyzer_cfg = StatsAnalyzerConfig( - output_uri=self.tmp_dir.name, sample_prob=None) if is_random: analyzer_cfg = StatsAnalyzerConfig( - output_uri=self.tmp_dir.name, sample_prob=sample_prob) + output_uri=self.tmp_dir.name, + chip_sz=chip_sz, + sample_prob=sample_prob) + else: + analyzer_cfg = StatsAnalyzerConfig( + output_uri=self.tmp_dir.name, + chip_sz=chip_sz, + sample_prob=None) analyzer = analyzer_cfg.build() analyzer.process(scenes, self.tmp_dir.name) @@ -56,9 +63,8 @@ def _test(self, is_random=False): np.testing.assert_array_almost_equal(stats.stds, exp_stds, decimal=3) if is_random: for rs in raster_sources: - height, width = rs.extent.size - exp_num_chips = round( - ((width * height) / (chip_sz**2)) * sample_prob) + area = rs.extent.area + exp_num_chips = round((area / (chip_sz**2)) * sample_prob) self.assertEqual(rs.mock._get_chip.call_count, exp_num_chips) def test_random(self): From 54c9a34748dc91d470a0e06bdfa81d749085b332 Mon Sep 17 00:00:00 2001 From: Adeel Hassan Date: Fri, 14 Jul 2023 11:27:02 -0400 Subject: [PATCH 3/5] add a helper func for __repr__'s --- rastervision_core/rastervision/core/box.py | 14 ++++++-------- .../data/raster_transformer/cast_transformer.py | 3 ++- .../data/raster_transformer/stats_transformer.py | 5 +++++ .../rastervision/core/raster_stats.py | 4 ++++ .../rastervision/pipeline/utils.py | 9 +++++++++ .../raster_transformer/test_cast_transformer.py | 4 ++-- tests/pipeline/test_utils.py | 9 ++++++++- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/rastervision_core/rastervision/core/box.py b/rastervision_core/rastervision/core/box.py index 90c65e37a..c42bf9137 100644 --- a/rastervision_core/rastervision/core/box.py +++ b/rastervision_core/rastervision/core/box.py @@ -8,6 +8,8 @@ from shapely.geometry import Polygon from rasterio.windows import Window as RioWindow +from rastervision.pipeline.utils import repr_with_args + NonNegInt = conint(ge=0) if 
TYPE_CHECKING: @@ -118,11 +120,7 @@ def __getitem__(self, i): return self.tuple_format()[i] def __repr__(self) -> str: - arg_keys = ['ymin', 'xmin', 'ymax', 'xmax'] - arg_vals = [getattr(self, k) for k in arg_keys] - arg_strs = [f'{k}={v}' for k, v in zip(arg_keys, arg_vals)] - arg_str = ', '.join(arg_strs) - return f'{type(self).__name__}({arg_str})' + return repr_with_args(self, **self.to_dict()) def __hash__(self) -> int: return hash(self.tuple_format()) @@ -444,12 +442,12 @@ def get_windows(self, return windows def to_dict(self) -> Dict[str, int]: - """Convert to a dict with keys: xmin, ymin, xmax, ymax.""" + """Convert to a dict with keys: ymin, xmin, ymax, xmax.""" return { - 'xmin': self.xmin, 'ymin': self.ymin, + 'xmin': self.xmin, + 'ymax': self.ymax, 'xmax': self.xmax, - 'ymax': self.ymax } @classmethod diff --git a/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py index e3f9dcc9e..309bbcce8 100644 --- a/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py +++ b/rastervision_core/rastervision/core/data/raster_transformer/cast_transformer.py @@ -2,6 +2,7 @@ from rastervision.core.data.raster_transformer.raster_transformer \ import RasterTransformer +from rastervision.pipeline.utils import repr_with_args import numpy as np @@ -18,7 +19,7 @@ def __init__(self, to_dtype: str): self.to_dtype = np.dtype(to_dtype) def __repr__(self): - return f'CastTransformer(to_dtype="{self.to_dtype}")' + return repr_with_args(self, to_dtype=str(self.to_dtype)) def transform(self, chip: np.ndarray, channel_order: Optional[list] = None) -> np.ndarray: diff --git a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py index 5e6194eda..de47b63c5 100644 --- 
a/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py +++ b/rastervision_core/rastervision/core/data/raster_transformer/stats_transformer.py @@ -4,6 +4,7 @@ from rastervision.core.data.raster_transformer import RasterTransformer from rastervision.core.raster_stats import RasterStats +from rastervision.pipeline.utils import repr_with_args if TYPE_CHECKING: from rastervision.core.data import RasterSource @@ -132,3 +133,7 @@ def from_raster_stats(cls, stats: RasterStats, @property def stats(self): return RasterStats(self.means, self.stds) + + def __repr__(self) -> str: + return repr_with_args( + self, means=self.means, std=self.stds, max_stds=self.max_stds) diff --git a/rastervision_core/rastervision/core/raster_stats.py b/rastervision_core/rastervision/core/raster_stats.py index 1fea35cbc..05e531927 100644 --- a/rastervision_core/rastervision/core/raster_stats.py +++ b/rastervision_core/rastervision/core/raster_stats.py @@ -4,6 +4,7 @@ import numpy as np from tqdm.auto import tqdm +from rastervision.pipeline.utils import repr_with_args from rastervision.pipeline.file_system import file_to_json, json_to_file from rastervision.core.data.utils import ensure_json_serializable @@ -151,6 +152,9 @@ def vars(self) -> Optional[np.ndarray]: return None return self.stds**2 + def __repr__(self) -> str: + return repr_with_args(self, **self.to_dict()) + def parallel_variance(mean_a, count_a, var_a, mean_b, count_b, var_b): """Compute the variance based on stats from two partitions of the data. 
diff --git a/rastervision_pipeline/rastervision/pipeline/utils.py b/rastervision_pipeline/rastervision/pipeline/utils.py index 68ede8335..1ad6c64eb 100644 --- a/rastervision_pipeline/rastervision/pipeline/utils.py +++ b/rastervision_pipeline/rastervision/pipeline/utils.py @@ -1,3 +1,4 @@ +from typing import Any import atexit import logging from math import ceil @@ -34,3 +35,11 @@ def split_into_groups(lst, num_groups): group_sz = max(int(ceil((len(lst)) / num_groups)), 1) return grouped(lst, group_sz) + + +def repr_with_args(obj: Any, **kwargs) -> str: + """Builds a string of the form: ClassName(k1=v1, k2=v2, ...).""" + cls = type(obj).__name__ + arg_strs = [f'{k}={v!r}' for k, v in kwargs.items()] + arg_str = ', '.join(arg_strs) + return f'{cls}({arg_str})' diff --git a/tests/core/data/raster_transformer/test_cast_transformer.py b/tests/core/data/raster_transformer/test_cast_transformer.py index 853adf7b9..e2c49607d 100644 --- a/tests/core/data/raster_transformer/test_cast_transformer.py +++ b/tests/core/data/raster_transformer/test_cast_transformer.py @@ -11,13 +11,13 @@ def test_cast_transformer(self): tf = CastTransformerConfig(to_dtype='uint8').build() out_chip = tf.transform(in_chip) self.assertEqual(out_chip.dtype, np.uint8) - self.assertEqual(str(tf), 'CastTransformer(to_dtype="uint8")') + self.assertEqual(str(tf), "CastTransformer(to_dtype='uint8')") in_chip = np.empty((10, 10, 3), dtype=np.uint16) tf = CastTransformerConfig(to_dtype='float32').build() out_chip = tf.transform(in_chip) self.assertEqual(out_chip.dtype, np.float32) - self.assertEqual(str(tf), 'CastTransformer(to_dtype="float32")') + self.assertEqual(str(tf), "CastTransformer(to_dtype='float32')") if __name__ == '__main__': diff --git a/tests/pipeline/test_utils.py b/tests/pipeline/test_utils.py index 43790f6f1..a6364691a 100644 --- a/tests/pipeline/test_utils.py +++ b/tests/pipeline/test_utils.py @@ -1,6 +1,6 @@ import unittest -from rastervision.pipeline.utils import split_into_groups +from 
rastervision.pipeline.utils import split_into_groups, repr_with_args class TestUtils(unittest.TestCase): @@ -19,6 +19,13 @@ def test_split_into_groups(self): g4 = split_into_groups(lst, 3) self.assertEqual(g4, [[1, 2], [3, 4], [5, 6]]) + def test_repr_with_args(self): + obj = 1 + self.assertEqual(repr_with_args(obj), 'int()') + + obj = dict(a=1, b='2') + self.assertEqual(repr_with_args(obj, **obj), "dict(a=1, b='2')") + if __name__ == '__main__': unittest.main() From 0266a60ceaa9c64e0b991a8845e06bfa97707ebe Mon Sep 17 00:00:00 2001 From: Adeel Hassan Date: Thu, 13 Jul 2023 13:35:52 -0400 Subject: [PATCH 4/5] remove pygeos dependency No longer needed after shapely and geopandas version upgrades. --- rastervision_core/requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/rastervision_core/requirements.txt b/rastervision_core/requirements.txt index 2df3d8c15..5e85a531e 100644 --- a/rastervision_core/requirements.txt +++ b/rastervision_core/requirements.txt @@ -1,8 +1,5 @@ rastervision_pipeline==0.20.3-dev -# These 3 should be updated together to ensure compatibility. Incompatibility -# results in a warning about pygeos versions. -pygeos==0.14 shapely==2.0.1 geopandas==0.13.2 From f4c795296a1e94294052d4334a63d53a4de6e93d Mon Sep 17 00:00:00 2001 From: Adeel Hassan Date: Thu, 13 Jul 2023 15:14:48 -0400 Subject: [PATCH 5/5] make scripts/unit_test also generate report when using coverage --- scripts/unit_tests | 2 +- tests/README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/unit_tests b/scripts/unit_tests index 6b2a002ab..59124676f 100755 --- a/scripts/unit_tests +++ b/scripts/unit_tests @@ -28,6 +28,6 @@ else if ! 
[ -x "$(command -v coverage)" ]; then python -m unittest discover -t "$SRC_DIR" tests -vf else - coverage run -m unittest discover -t "$SRC_DIR" tests -vf + coverage run -m unittest discover -t "$SRC_DIR" tests -vf && coverage html fi fi diff --git a/tests/README.md b/tests/README.md index 783ccd021..871469254 100644 --- a/tests/README.md +++ b/tests/README.md @@ -15,9 +15,11 @@ Every directory in the path of the test file _must_ have an `__init__.py` file f ## Running tests ### Run all unit tests +Run all unit tests and generate a coverage report (if the `coverage` package is installed): ```sh scripts/unit_tests ``` + Or (from the repo root): ```sh python -m unittest discover -t . tests -vf