Skip to content

Commit

Permalink
DAS-1891 - Aggregated time dimensions only include input values.
Browse files Browse the repository at this point in the history
  • Loading branch information
owenlittlejohns authored Aug 1, 2023
1 parent 82d8a59 commit de47e5d
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 28 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## v1.2.0
### 2023-07-28

* DAS-1891 - Update temporal aggregation to return output temporal dimensions
only containing values that map to values in input granules, rather than
producing a regular grid.

## v1.1.1
### 2023-02-17

Expand Down
4 changes: 2 additions & 2 deletions docs/NetCDF-to-Zarr-Example-Usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@
"\n",
"**Figure 2:** Left: A gridded science variable as represented in six separate NetCDF-4 input GPM/IMERG granules. These have dimensions (1, 3600, 1800). Right: The stacked variable as saved in the output Zarr store. This has dimensions (6, 3600, 1800).\n",
"\n",
"The temporal aggregation will always produce a regular grid. If there are gaps in data coverage, then the temporal dimension will include a value for the missing data, but that slice will be populated with fill values.\n",
"The temporal aggregation will produce an output temporal dimension that only includes the temporal dimension values of the input granules. If the input time values are not all evenly spaced, potentially due to a missing granule, then the output temporal dimension will have gaps, and be irregular.\n",
"\n",
"### The temporal aggregation request:\n",
"\n",
Expand Down Expand Up @@ -403,7 +403,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.9.12"
}
},
"nbformat": 4,
Expand Down
5 changes: 3 additions & 2 deletions harmony_netcdf_to_zarr/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,8 @@ class method instantiates a dataset that uses the `ProcessSynchronizer`
shape=aggregated_shape,
chunks=tuple(chunks),
dtype=netcdf_variable.dtype,
fill_value=fill_value)
fill_value=fill_value
)

if resolved_variable_name not in aggregated_dimensions:
# For a non-aggregated dimension, insert input granule data
Expand Down Expand Up @@ -564,7 +565,7 @@ def compute_chunksize(shape: Union[tuple, list],
the regenerated new zarr chunks
"""
# convert compressed_chunksize_byte to integer if it's a str
if type(compressed_chunksize_byte) == str:
if isinstance(compressed_chunksize_byte, str):
try:
(value, unit) = findall(
r'^\s*([\d.]+)\s*(Ki|Mi|Gi)\s*$', compressed_chunksize_byte
Expand Down
29 changes: 23 additions & 6 deletions harmony_netcdf_to_zarr/mosaic_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def __init__(self, input_paths: List[str]):
self._map_input_dimensions()

if len(self.input_paths) > 1:
# Only calculate regular, aggregated dimensions for multiple inputs
# Only calculate aggregated dimensions for multiple inputs
self._aggregate_output_dimensions()

def _map_input_dimensions(self):
Expand Down Expand Up @@ -263,8 +263,22 @@ def _get_temporal_output_dimension(self,
dimension_name: str) -> DimensionInformation:
""" Find the units metadata attribute for the input granule with the
earliest epoch. Apply this epoch to the temporal data in all
granules, to place them with respect to a common epoch. Then use
generate an output dimension grid.
granules, to place them with respect to a common epoch.
This method now only returns an aggregated array of input values.
Previously, it would calculate a regular grid that had evenly
spaced pixels, and contained all input values. Several collections
have monthly granules that produce grids with hourly or finer
resolution, and so caused significant performance issues and
created very sparsely populated Zarr stores.
To reimplement uniform gridding, replace the call to
`self._get_dimension_bounds` and the return statement with:
```
return self._get_output_dimension(dimension_name, all_input_values,
output_dimension_units)
```
"""
dimension_units = [dimension_input.units
Expand All @@ -281,9 +295,12 @@ def _get_temporal_output_dimension(self,
dtype=list(dimension_inputs.values())[0].get_values().dtype
)
)

return self._get_output_dimension(dimension_name, all_input_values,
output_dimension_units)
bounds_path, bounds_values = self._get_dimension_bounds(
dimension_name, all_input_values
)
return DimensionInformation(dimension_name, all_input_values,
output_dimension_units, bounds_path,
bounds_values)

def _get_output_dimension(self, dimension_name: str,
input_dimension_values: np.ndarray,
Expand Down
93 changes: 76 additions & 17 deletions tests/unit/test_mosaic_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,9 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):
* Continuous granules (e.g., all output dimension values map to an
input value).
* Discontinous granules (e.g., there is intervening space in the
output temporal dimension).
* Discontinuous granules (e.g., there is intervening space in the
output temporal dimension). This will have gaps in the output
temporal dimension.
"""
merra_time_values = np.linspace(0, 1380, 24)
Expand Down Expand Up @@ -358,8 +359,13 @@ def test_dimensions_mapping_output_merra(self, mock_dataset):
# Check the output time has correct values and units.
self.assertEqual(merra_mapping.output_dimensions['/time'].units,
'minutes since 2020-01-03T00:30:00')

# Expected time values are 24 consecutive hours, then a gap of 24
# hours, before another 24 consecutive hourly values.
expected_time_values = np.append(np.linspace(0, 23 * 60, 24),
np.linspace(48 * 60, 71 * 60, 24))
assert_array_equal(merra_mapping.output_dimensions['/time'].values,
np.linspace(0, 4260, 72)) # 72 values of consecutive hours
expected_time_values)

# Check none of the output dimensions have bounds information, as
# none of the inputs did.
Expand All @@ -385,21 +391,23 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
* Continuous granules (e.g., all output dimension values map to an
input value).
* Discontinous granules (e.g., there is intervening space in the
output temporal dimension).
output temporal dimension). This test will now assume those gaps
are persisted as the service will no longer attempt to create a
regular grid.
"""
expected_output_time_values = np.linspace(0, 432000, 6) # Daily data
continuous_time_values = np.linspace(0, 432000, 6) # Daily data
dataset_one = self.generate_netcdf_input(
'gpm_one.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[0]]), self.temporal_units
np.array([continuous_time_values[0]]), self.temporal_units
)
dataset_two = self.generate_netcdf_input(
'gpm_two.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[2]]), self.temporal_units
np.array([continuous_time_values[2]]), self.temporal_units
)
dataset_three = self.generate_netcdf_input(
'gpm_three.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[5]]), self.temporal_units
np.array([continuous_time_values[5]]), self.temporal_units
)

mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
Expand Down Expand Up @@ -438,10 +446,15 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
"""

# Check the output time has correct values and units.
expected_discontinuous_time_values = np.array([
continuous_time_values[0],
continuous_time_values[2],
continuous_time_values[5]
])
self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
self.temporal_units)
assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
expected_output_time_values)
expected_discontinuous_time_values)

# Check none of the output dimensions have bounds information, as
# none of the inputs did.
Expand All @@ -454,6 +467,55 @@ def test_dimensions_mapping_output_gpm(self, mock_dataset):
self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)

@patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
def test_dimensions_mapping_unordered_granules(self, mock_dataset):
""" Test that the `DimensionsMapping.output_dimensions` mapping is
correctly instantiated from known input data. This specific test
targets data like GPM/IMERG, where the spatial dimensions are the
same in each granule, the temporal dimension epochs are the same,
but the temporal dimension values vary between granules.
This specific test ensures that the output temporal dimension will
be correctly ordered, even if the input granules are not. This is
achieved by the behaviour of `numpy.unique`.
"""
expected_output_time_values = np.linspace(0, 172800, 3) # Daily data
dataset_one = self.generate_netcdf_input(
'gpm_one.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[0]]), self.temporal_units
)
dataset_two = self.generate_netcdf_input(
'gpm_two.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[1]]), self.temporal_units
)
dataset_three = self.generate_netcdf_input(
'gpm_three.nc4', self.lat_data, self.lon_data,
np.array([expected_output_time_values[2]]), self.temporal_units
)

mock_dataset.side_effect = [dataset_one, dataset_two, dataset_three]
gpm_mapping = DimensionsMapping(['gpm_three.nc4', 'gpm_one.nc4',
'gpm_two.nc4'])

# Check the expected dimensions are in the output mapping.
# Note: aggregation of non-temporal dimensions has been disabled
# as the Swath Projector can have values with slight rounding
# errors in their output grid dimensions.
self.assertSetEqual(set(gpm_mapping.output_dimensions.keys()),
{'/time'})

# Check the output time has correct values and units.
self.assertEqual(gpm_mapping.output_dimensions['/time'].units,
self.temporal_units)
assert_array_equal(gpm_mapping.output_dimensions['/time'].values,
expected_output_time_values)

# Check none of the output dimensions have bounds information, as
# none of the inputs did.
self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_values)
self.assertIsNone(gpm_mapping.output_dimensions['/time'].bounds_path)

@patch('harmony_netcdf_to_zarr.mosaic_utilities.Dataset')
def test_dimensions_mapping_output_spatial(self, mock_dataset):
""" Test that the `DimensionsMapping.output_dimensions` mapping is
Expand Down Expand Up @@ -538,9 +600,9 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
* All output dimension values map to an input dimension value
and therefore all output bounds values can be copied from the
input data
* There are output dimension values that do not map to input
dimension values (due to gaps in coverage) and the corresponding
bounds values for those gaps must be calculated.
* The inputs are discontinuous, and so the outputs will also be
discontinuous. (Note, previously, the gaps would be filled to
form a regularly sampled dimension)
"""
dimension_data_one = np.linspace(0, 2, 3)
Expand Down Expand Up @@ -591,7 +653,7 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
if dataset.isopen():
dataset.close()

with self.subTest('Some output dimension values are in coverage gaps'):
with self.subTest('Discontinuous input granules'):
dataset_one = self.generate_netcdf_with_bounds('bounds_three.nc4',
'dim',
dimension_data_one,
Expand All @@ -617,15 +679,12 @@ def test_dimensions_mapping_bounds(self, mock_dataset):
[2.5, 3.5],
[3.5, 4.5],
[4.5, 5.5],
[5.5, 6.5],
[6.5, 7.5],
[7.5, 8.5],
[8.5, 9.5],
[9.5, 10.5],
[10.5, 11.5]])

assert_array_equal(mapping.output_dimensions['/dim'].values,
np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))
np.array([0, 1, 2, 3, 4, 5, 9, 10, 11]))
self.assertEqual(mapping.output_dimensions['/dim'].bounds_path,
'/dim_bnds')
assert_array_equal(mapping.output_dimensions['/dim'].bounds_values,
Expand Down
2 changes: 1 addition & 1 deletion version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1.1
1.2.0

0 comments on commit de47e5d

Please sign in to comment.