Commit 0e6a281

Author: Stepheny Perez

Merge pull request #47 from podaac/release/1.3.0

Release/1.3.0

2 parents: cff60fb + 57390d9; commit 0e6a281

15 files changed: +709, -447 lines

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -12,6 +12,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 ### Security

+## [1.3.0]
+### Added
+- [issues/27](https://github.com/podaac/l2ss-py/issues/27): Xarray is unable to handle variables with duplicate dimensions. Added module dimension_cleanup.py to handle variables that may have duplicate dimensions. The new remove_duplicate_dims() method creates a copy of the duplicated dimension under a new name, so the variable no longer repeats a dimension yet keeps the same shape and values.
+- [issues/24](https://github.com/podaac/l2ss-py/issues/24): Added support for time as lines
+### Changed
+- [issues/36](https://github.com/podaac/l2ss-py/issues/36): Empty datasets now maintain attributes, variables, and dimensions, with each variable containing a single masked data point.
+### Deprecated
+### Removed
+### Fixed
+- [issues/34](https://github.com/podaac/l2ss-py/issues/34): Fixed a bug that prevented variable subsetting in OCO3 files. The fix passes the variable list into the subset_with_bbox method.
+### Security
+
 ## [1.2.0]
 ### Added
 ### Changed
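
For context on the issues/27 entry above, a minimal sketch of the failure mode (the file, variable, and dimension names here are hypothetical): netCDF itself permits a variable to list the same dimension twice, but xarray cannot represent that.

import netCDF4
import xarray as xr

# Write a tiny file whose variable repeats a dimension, e.g. a covariance
# matrix over the same coordinate.
with netCDF4.Dataset('dup_dims.nc', 'w') as nc:
    nc.createDimension('delta_time', 3)
    cov = nc.createVariable('cov', 'f4', ('delta_time', 'delta_time'),
                            fill_value=-9999.0)
    cov[:] = 0.0

# Depending on the xarray version, this raises on open or on the first
# operation that touches the duplicated dimension.
try:
    print(xr.open_dataset('dup_dims.nc')['cov'].dims)
except Exception as err:
    print('xarray could not handle the duplicate dimension:', err)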

cmr/ops_associations.txt

Lines changed: 5 additions & 0 deletions
@@ -26,3 +26,8 @@ C2036882072-POCLOUD
 C2036879048-POCLOUD
 C2036878029-POCLOUD
 C2036880717-POCLOUD
+C2075141559-POCLOUD
+C2183155461-POCLOUD
+C2158344213-POCLOUD
+C2205618215-POCLOUD
+C2205618339-POCLOUD

cmr/uat_associations.txt

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,3 @@
-C1234208436-POCLOUD
-C1234208437-POCLOUD
 C1234208438-POCLOUD
 C1234724470-POCLOUD
 C1234724471-POCLOUD
@@ -19,3 +17,5 @@ C1238538233-POCLOUD
 C1238538232-POCLOUD
 C1238538241-POCLOUD
 C1238658052-POCLOUD
+C1240739577-POCLOUD
+C1240739709-POCLOUD

podaac/subsetter/dimension_cleanup.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# This software may be subject to U.S. export control laws. By accepting
+# this software, the user agrees to comply with all applicable U.S. export
+# laws and regulations. User has the responsibility to obtain export
+# licenses, or other export authority as may be required before exporting
+# such information to foreign countries or providing access to foreign
+# persons.
+
+"""
+======================
+dimension_cleanup.py
+======================
+Functions which improve upon existing netCDF4 library functions
+"""
+
+import collections
+
+
+def remove_duplicate_dims(nc_dataset):
+    """
+    xarray cannot read netCDF4 datasets with duplicate dimensions.
+    This function goes through a dataset to catch any variables with duplicate
+    dimensions, creates an exact copy of the duplicated dimension under a new
+    name, and resets the variable's dimensions so none repeat. The old variable
+    is deleted, and the new variable is renamed to the original name.
+    """
+    dup_vars = {}
+    for var_name, var in nc_dataset.variables.items():
+        dim_list = list(var.dimensions)
+        if len(set(dim_list)) != len(dim_list):  # true if var.dimensions has a duplicate
+            dup_vars[var_name] = var  # collect variables that have duplicate dimensions
+    for dup_var_name, dup_var in dup_vars.items():
+        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicate dims
+        # Get the dimension that is duplicated
+        dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
+        dim_dup_new = dim_dup + '_1'
+
+        var_name_new = dup_var_name + '_1'
+
+        # Create a new dimension by copying from the duplicated dimension
+
+        data = {}
+        fill_value = dup_var._FillValue  # pylint: disable=W0212
+        nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
+        data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
+                                                      (dim_dup_new,), fill_value=fill_value)
+
+        # Copy attributes and values from the original dimension variable
+        for ncattr in nc_dataset.variables[dim_dup].ncattrs():
+            if ncattr != '_FillValue':
+                data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
+        data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
+
+        # Replace the last dimension with the new one (assumes the duplicate occupies the last position)
+        new_dim_list = dim_list[:-1]
+        new_dim_list.extend([dim_dup_new])
+
+        # createVariable with new dimensions
+        data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
+
+        # Copy attributes and data, then swap the new variable in under the original name
+        for attrname in dup_var.ncattrs():
+            if attrname != '_FillValue':
+                data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
+        data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
+        del nc_dataset.variables[dup_var_name]
+        nc_dataset.renameVariable(var_name_new, dup_var_name)
+
+    return nc_dataset
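
A hedged usage sketch for the new module (the file name and 'r+' mode are assumptions; it reuses the dup_dims.nc file from the earlier sketch): run the cleanup on a writable netCDF4 dataset before wrapping it for xarray, mirroring the call subset.py makes below.

import netCDF4
import xarray as xr

from podaac.subsetter import dimension_cleanup as dc

# Open writable so remove_duplicate_dims() can add the copied dimension
# and swap variables in place.
nc_dataset = netCDF4.Dataset('dup_dims.nc', 'r+')
nc_dataset = dc.remove_duplicate_dims(nc_dataset)

# The variable's dimensions are now ('delta_time', 'delta_time_1') rather
# than a repeated 'delta_time', so xarray can wrap the dataset.
ds = xr.open_dataset(xr.backends.NetCDF4DataStore(nc_dataset))
print(ds['cov'].dims)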

podaac/subsetter/subset.py

Lines changed: 32 additions & 9 deletions
@@ -35,6 +35,7 @@
 from shapely.ops import transform

 from podaac.subsetter import xarray_enhancements as xre
+from podaac.subsetter import dimension_cleanup as dc

 GROUP_DELIM = '__'
 SERVICE_NAME = 'l2ss-py'
@@ -484,12 +485,18 @@ def get_time_variable_name(dataset, lat_var):
         # per lat var)
         return time_vars[0]

-    for var_name in list(dataset.dims.keys()):
+    # Filter variables with 'time' in the name to avoid extra work
+    time_vars = list(filter(lambda var_name: 'time' in var_name, dataset.dims.keys()))
+
+    for var_name in time_vars:
         if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
     for var_name in list(dataset.data_vars.keys()):
         if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
+    for var_name in list(dataset.data_vars.keys()):
+        if 'time' in var_name and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
+            return var_name
     raise ValueError('Unable to determine time variable')

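The new third loop backs the "time as lines" support from issues/24: it accepts a time variable that shares only its first dimension with the latitude variable, where the earlier loops demand identical dimensions. A hedged illustration with hypothetical names:

import numpy as np
import xarray as xr

# lat varies along lines and pixels; time is stored once per line.
ds = xr.Dataset({
    'lat': (('num_lines', 'num_pixels'), np.zeros((3, 4))),
    'line_time': (('num_lines',), np.arange(3).astype('datetime64[s]')),
})

# The first two rules fail: ('num_lines',) != ('num_lines', 'num_pixels').
# The new rule matches because the first dim of 'line_time' ('num_lines')
# is one of the latitude dimensions.
assert ds['line_time'].squeeze().dims[0] in ds['lat'].squeeze().dims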
@@ -566,7 +573,8 @@ def translate_timestamp(str_timestamp):
         '%Y-%m-%dT%H:%M:%SZ',
         '%Y-%m-%dT%H:%M:%S%Z',
         '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f%Z'
+        '%Y-%m-%dT%H:%M:%S.%f%Z',
+        '%Y-%m-%d %H:%M:%S',
     ]

     for timestamp_format in allowed_ts_formats:
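
The added format lets space-separated timestamps through the same strptime loop; a quick sketch of what now parses, assuming the surrounding try/except structure:

from datetime import datetime

allowed_ts_formats = ['%Y-%m-%dT%H:%M:%S.%f%Z', '%Y-%m-%d %H:%M:%S']

for timestamp_format in allowed_ts_formats:
    try:
        # '2021-06-15 12:30:00' matched none of the formats before this change.
        print(datetime.strptime('2021-06-15 12:30:00', timestamp_format))
        break
    except ValueError:
        continue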
@@ -672,7 +680,7 @@ def build_cond(str_timestamp, compare):
     return temporal_cond


-def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, bbox=None, cut=True,
+def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, variables=None, bbox=None, cut=True,
                      min_time=None, max_time=None):
     """
     Subset an xarray Dataset using a spatial bounding box.
@@ -708,16 +716,23 @@ def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, bbox
     if lon_bounds[0] > lon_bounds[1]:
         oper = operator.or_

+    lat_var_prefix = [f'{GROUP_DELIM}{GROUP_DELIM.join(x.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])}' for x in lat_var_names]
     datasets = []
     for lat_var_name, lon_var_name, time_var_name in zip(
-        lat_var_names, lon_var_names, time_var_names
+            lat_var_names, lon_var_names, time_var_names
     ):
         if GROUP_DELIM in lat_var_name:
             var_prefix = GROUP_DELIM.join(lat_var_name.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])
             group_vars = [
                 var for var in dataset.data_vars.keys()
                 if var.startswith(f'{GROUP_DELIM}{var_prefix}')
             ]
+            if variables:
+                group_vars.extend([
+                    var for var in dataset.data_vars.keys()
+                    if var in variables and var not in group_vars and not var.startswith(tuple(lat_var_prefix))
+                ])
+
         else:
             group_vars = list(dataset.keys())

@@ -956,13 +971,16 @@ def _rename_variables(dataset, base_dataset):
             encoded_var = cf_dt_coder.encode(dataset.variables[var_name])
             variable = encoded_var

+        var_attrs = variable.attrs
+        fill_value = var_attrs.get('_FillValue')
+        var_attrs.pop('_FillValue', None)
+
         if variable.dtype == object:
-            var_group.createVariable(new_var_name, 'S1', var_dims)
+            var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value)
         else:
-            var_group.createVariable(new_var_name, variable.dtype, var_dims)
+            var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value)

         # Copy attributes
-        var_attrs = variable.attrs
         var_group.variables[new_var_name].setncatts(var_attrs)

         # Copy data
@@ -1015,6 +1033,8 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
     if has_groups:
         nc_dataset = transform_grouped_dataset(nc_dataset, file_to_subset)

+    nc_dataset = dc.remove_duplicate_dims(nc_dataset)
+
     if variables:
         variables = [x.replace('/', GROUP_DELIM) for x in variables]

@@ -1046,10 +1066,10 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
     if variables:
         # Drop variables that aren't explicitly requested, except lat_var_name and
         # lon_var_name which are needed for subsetting
-        variables = [variable.upper() for variable in variables]
+        variables_upper = [variable.upper() for variable in variables]
         vars_to_drop = [
             var_name for var_name, var in dataset.data_vars.items()
-            if var_name.upper() not in variables
+            if var_name.upper() not in variables_upper
             and var_name not in lat_var_names
             and var_name not in lon_var_names
             and var_name not in time_var_names
@@ -1062,6 +1082,7 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
             lat_var_names=lat_var_names,
             lon_var_names=lon_var_names,
             time_var_names=time_var_names,
+            variables=variables,
             bbox=bbox,
             cut=cut,
             min_time=min_time,
@@ -1103,6 +1124,8 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
         for var in dataset.data_vars:
             if var not in encoding:
                 encoding[var] = compression
+            if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
+                dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')

     dataset.load().to_netcdf(output_file, 'w', encoding=encoding)

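Taken together with the variables_upper fix above, this is the issues/34 repair: the requested variable list now survives into subset_with_bbox. A hedged call sketch (the file names, variable paths, and bbox layout comment are assumptions, not taken from this diff):

from podaac.subsetter import subset

subset.subset(
    file_to_subset='oco3_granule.nc4',      # hypothetical input granule
    bbox=[[-120, -110], [30, 40]],          # [[lon_min, lon_max], [lat_min, lat_max]]
    output_file='oco3_subsetted.nc4',       # hypothetical output path
    variables=['/xco2', '/Sounding/operation_mode'],  # hypothetical variable paths
)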
podaac/subsetter/xarray_enhancements.py

Lines changed: 43 additions & 18 deletions
@@ -20,6 +20,7 @@
 """

 import logging
+
 import numpy as np
 import xarray as xr
@@ -112,13 +113,15 @@ def copy_empty_dataset(dataset):
     xarray.Dataset
         The new dataset which has no data.
     """
-    empty_dataset = xr.Dataset()
-    for variable_name, variable in dataset.data_vars.items():
-        empty_dataset[variable_name] = []
-        empty_dataset[variable_name].attrs = variable.attrs
-    # Copy global metadata
-    empty_dataset.attrs = dataset.attrs
-    return empty_dataset
+    # Create a dict object where each key is a variable in the dataset and the value is an
+    # array initialized to the fill value for that variable or NaN if there is no fill value
+    # attribute for the variable
+    empty_data = {k: np.full(v.shape, dataset.variables[k].attrs.get('_FillValue', np.nan)) for k, v in
+                  dataset.items()}
+
+    # Create a copy of the dataset filled with the empty data. Then select the first index along each
+    # dimension and return the result
+    return dataset.copy(data=empty_data).isel({dim: slice(0, 1, 1) for dim in dataset.dims})


 def cast_type(var, var_type):
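
The effect of the rewritten copy_empty_dataset is what the issues/36 changelog entry describes: dimensions, variables, and attributes survive, with a single masked (fill-value) point per variable. A sketch of the same two-step recipe on a toy dataset (names hypothetical):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'sst': (('num_lines', 'num_pixels'), np.ones((5, 4)),
             {'_FillValue': -9999.0, 'units': 'K'})},
    attrs={'title': 'demo granule'},
)

empty_data = {k: np.full(v.shape, ds.variables[k].attrs.get('_FillValue', np.nan))
              for k, v in ds.items()}
empty = ds.copy(data=empty_data).isel({dim: slice(0, 1, 1) for dim in ds.dims})

print(empty['sst'].shape)   # (1, 1): one point per dimension
print(empty['sst'].values)  # [[-9999.]]: the fill value, i.e. masked
print(empty['sst'].attrs['units'], empty.attrs['title'])  # attributes kept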
@@ -176,6 +179,14 @@ def where(dataset, cond, cut):
     if not all(len(value) > 0 for value in indexers.values()):
         return copy_empty_dataset(dataset)

+    # This will be true if any variables in the dataset have a partial
+    # overlap with the coordinate dims. If so, the cond should be
+    # applied per-variable rather than to the entire dataset.
+    partial_dim_in_in_vars = np.any(
+        [len(set(indexers.keys()).intersection(var.dims)) > 0 and len(
+            indexers.keys() - var.dims) > 0 for _, var in dataset.variables.items()]
+    )
+
     indexed_cond = cond.isel(**indexers)
     indexed_ds = dataset.isel(**indexers)
     new_dataset = indexed_ds.where(indexed_cond)
@@ -185,6 +196,20 @@ def where(dataset, cond, cut):
         original_type = indexed_ds[variable_name].dtype
         new_type = variable.dtype

+        indexed_var = indexed_ds[variable_name]
+
+        if partial_dim_in_in_vars and (indexers.keys() - dataset[variable_name].dims) and set(
+                indexers.keys()).intersection(dataset[variable_name].dims):
+            missing_dim = (indexers.keys() - dataset[variable_name].dims).pop()  # Assume only 1
+            var_indexers = {
+                dim_name: dim_value for dim_name, dim_value in indexers.items()
+                if dim_name in dataset[variable_name].dims
+            }
+            var_cond = cond.sel({missing_dim: 1}).isel(**var_indexers)
+            indexed_var = dataset[variable_name].isel(**var_indexers)
+            new_dataset[variable_name] = indexed_var.where(var_cond)
+            variable = new_dataset[variable_name]
+
         # Check if variable has no _FillValue. If so, use original data
         if '_FillValue' not in variable.attrs:

@@ -197,24 +222,24 @@ def where(dataset, cond, cut):
             # variable has more than one dimension, copy the entire
             # variable over, otherwise use a NaN mask to copy over the
             # relevant values.
-            if len(variable.shape) > 1:
-                new_dataset[variable_name] = indexed_ds[variable_name]
-            else:
-                nan_mask = np.isnan(variable.data)
-                if nan_mask.any():
-                    variable.data[nan_mask] = indexed_ds[variable_name][nan_mask]
-
-            new_dataset[variable_name].attrs = indexed_ds[variable_name].attrs
-            variable.attrs = indexed_ds[variable_name].attrs
+            new_dataset[variable_name] = indexed_var
+
+            new_dataset[variable_name].attrs = indexed_var.attrs
+            variable.attrs = indexed_var.attrs
             new_dataset[variable_name].encoding['_FillValue'] = None
             variable.encoding['_FillValue'] = None

         else:
             # Manually replace nans with FillValue
-            variable.data[np.isnan(variable.data)] = variable.attrs.get("_FillValue")
+            # If variable represents time, cast _FillValue to datetime
+            fill_value = new_dataset[variable_name].attrs.get('_FillValue')
+
+            if np.issubdtype(new_dataset[variable_name].dtype, np.dtype(np.datetime64)):
+                fill_value = np.datetime64('nat')
+            new_dataset[variable_name] = new_dataset[variable_name].fillna(fill_value)

         if original_type != new_type:
-            new_dataset[variable_name] = xr.apply_ufunc(cast_type, variable,
+            new_dataset[variable_name] = xr.apply_ufunc(cast_type, new_dataset[variable_name],
                                                         str(original_type), dask='allowed',
                                                         keep_attrs=True)

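Finally, a usage sketch for the updated where() on toy data (hypothetical names; exact output depends on the indexer logic above): points failing the condition become fill values, and cut=True first slices away rows and columns that fall entirely outside the condition.

import numpy as np
import xarray as xr

from podaac.subsetter import xarray_enhancements as xre

lat = xr.DataArray(np.array([[10., 20.], [30., 40.]]),
                   dims=('num_lines', 'num_pixels'))
science = xr.DataArray(np.arange(4.).reshape(2, 2),
                       dims=('num_lines', 'num_pixels'),
                       attrs={'_FillValue': -9999.0})
ds = xr.Dataset({'lat': lat, 'science': science})

# Keep points with lat >= 25; only the second line survives the cut.
subset_ds = xre.where(ds, ds.lat >= 25., cut=True)
print(subset_ds['science'].values)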