Commit 0e6a281

Author: Stepheny Perez

Merge pull request #47 from podaac/release/1.3.0

Release/1.3.0

2 parents: cff60fb + 57390d9; commit 0e6a281

15 files changed: +709, -447 lines

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
@@ -12,6 +12,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 ### Security

+## [1.3.0]
+### Added
+- [issues/27](https://github.com/podaac/l2ss-py/issues/27): Xarray is unable to handle variables with duplicate dimensions. Added module dimension_cleanup.py to handle variables that may have duplicate dimensions. The new remove_duplicate_dims() method creates a copy of the duplicated dimension under a new name, so the variable no longer repeats a dimension yet keeps the same shape and values.
+- [issues/24](https://github.com/podaac/l2ss-py/issues/24): Added support for time as lines
+### Changed
+- [issues/36](https://github.com/podaac/l2ss-py/issues/36): Empty datasets now maintain attributes, variables, and dimensions, with each variable containing a single masked data point.
+### Deprecated
+### Removed
+### Fixed
+- [issues/34](https://github.com/podaac/l2ss-py/issues/34): Fixed a bug that prevented variable subsetting in OCO3 files. The fix passes the variable list into the subset_with_bbox method.
+### Security
+
 ## [1.2.0]
 ### Added
 ### Changed
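
For context on the issues/27 entry above, a minimal sketch of the failure mode (the file, variable, and dimension names here are hypothetical): netCDF itself permits a variable to list the same dimension twice, but xarray cannot represent that.

import netCDF4
import xarray as xr

# Write a tiny file whose variable repeats a dimension, e.g. a covariance
# matrix over the same coordinate.
with netCDF4.Dataset('dup_dims.nc', 'w') as nc:
    nc.createDimension('delta_time', 3)
    cov = nc.createVariable('cov', 'f4', ('delta_time', 'delta_time'),
                            fill_value=-9999.0)
    cov[:] = 0.0

# Depending on the xarray version, this raises on open or on the first
# operation that touches the duplicated dimension.
try:
    print(xr.open_dataset('dup_dims.nc')['cov'].dims)
except Exception as err:
    print('xarray could not handle the duplicate dimension:', err)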

cmr/ops_associations.txt

Lines changed: 5 additions & 0 deletions
@@ -26,3 +26,8 @@ C2036882072-POCLOUD
 C2036879048-POCLOUD
 C2036878029-POCLOUD
 C2036880717-POCLOUD
+C2075141559-POCLOUD
+C2183155461-POCLOUD
+C2158344213-POCLOUD
+C2205618215-POCLOUD
+C2205618339-POCLOUD

cmr/uat_associations.txt

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,3 @@
-C1234208436-POCLOUD
-C1234208437-POCLOUD
 C1234208438-POCLOUD
 C1234724470-POCLOUD
 C1234724471-POCLOUD
@@ -19,3 +17,5 @@ C1238538233-POCLOUD
 C1238538232-POCLOUD
 C1238538241-POCLOUD
 C1238658052-POCLOUD
+C1240739577-POCLOUD
+C1240739709-POCLOUD

podaac/subsetter/dimension_cleanup.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# This software may be subject to U.S. export control laws. By accepting
+# this software, the user agrees to comply with all applicable U.S. export
+# laws and regulations. User has the responsibility to obtain export
+# licenses, or other export authority as may be required before exporting
+# such information to foreign countries or providing access to foreign
+# persons.
+
+"""
+======================
+dimension_cleanup.py
+======================
+Functions which improve upon existing netCDF4 library functions
+"""
+
+import collections
+
+
+def remove_duplicate_dims(nc_dataset):
+    """
+    xarray cannot read netCDF4 datasets with duplicate dimensions.
+    This function goes through a dataset to catch any variables with duplicate
+    dimensions, creates an exact copy of the duplicated dimension under a new
+    name, and resets the variable's dimensions so none repeat. The old variable
+    is deleted, and the new variable is renamed to the original name.
+    """
+    dup_vars = {}
+    for var_name, var in nc_dataset.variables.items():
+        dim_list = list(var.dimensions)
+        if len(set(dim_list)) != len(dim_list):  # true if var.dimensions has a duplicate
+            dup_vars[var_name] = var  # collect variables that have duplicate dimensions
+    for dup_var_name, dup_var in dup_vars.items():
+        dim_list = list(dup_var.dimensions)  # original dimensions of the variable with duplicate dims
+        # Get the dimension that is duplicated
+        dim_dup = [item for item, count in collections.Counter(dim_list).items() if count > 1][0]
+        dim_dup_new = dim_dup + '_1'
+
+        var_name_new = dup_var_name + '_1'
+
+        # Create a new dimension by copying from the duplicated dimension
+
+        data = {}
+        fill_value = dup_var._FillValue  # pylint: disable=W0212
+        nc_dataset.createDimension(dim_dup_new, nc_dataset.variables[dim_dup].size)
+        data[dim_dup_new] = nc_dataset.createVariable(dim_dup_new, nc_dataset.variables[dim_dup].dtype,
+                                                      (dim_dup_new,), fill_value=fill_value)
+
+        # Copy attributes and values from the original dimension variable
+        for ncattr in nc_dataset.variables[dim_dup].ncattrs():
+            if ncattr != '_FillValue':
+                data[dim_dup_new].setncattr(ncattr, nc_dataset.variables[dim_dup].getncattr(ncattr))
+        data[dim_dup_new][:] = nc_dataset.variables[dim_dup][:]
+
+        # Replace the last dimension with the new one (assumes the duplicate occupies the last position)
+        new_dim_list = dim_list[:-1]
+        new_dim_list.extend([dim_dup_new])
+
+        # createVariable with new dimensions
+        data[var_name_new] = nc_dataset.createVariable(var_name_new, str(dup_var[:].dtype), tuple(new_dim_list), fill_value=fill_value)
+
+        # Copy attributes and data, then swap the new variable in under the original name
+        for attrname in dup_var.ncattrs():
+            if attrname != '_FillValue':
+                data[var_name_new].setncattr(attrname, nc_dataset.variables[dup_var_name].getncattr(attrname))
+        data[var_name_new][:] = nc_dataset.variables[dup_var_name][:]
+        del nc_dataset.variables[dup_var_name]
+        nc_dataset.renameVariable(var_name_new, dup_var_name)
+
+    return nc_dataset
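
A hedged usage sketch for the new module (the file name and 'r+' mode are assumptions; it reuses the dup_dims.nc file from the earlier sketch): run the cleanup on a writable netCDF4 dataset before wrapping it for xarray, mirroring the call subset.py makes below.

import netCDF4
import xarray as xr

from podaac.subsetter import dimension_cleanup as dc

# Open writable so remove_duplicate_dims() can add the copied dimension
# and swap variables in place.
nc_dataset = netCDF4.Dataset('dup_dims.nc', 'r+')
nc_dataset = dc.remove_duplicate_dims(nc_dataset)

# The variable's dimensions are now ('delta_time', 'delta_time_1') rather
# than a repeated 'delta_time', so xarray can wrap the dataset.
ds = xr.open_dataset(xr.backends.NetCDF4DataStore(nc_dataset))
print(ds['cov'].dims)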

podaac/subsetter/subset.py

Lines changed: 32 additions & 9 deletions
@@ -35,6 +35,7 @@
 from shapely.ops import transform

 from podaac.subsetter import xarray_enhancements as xre
+from podaac.subsetter import dimension_cleanup as dc

 GROUP_DELIM = '__'
 SERVICE_NAME = 'l2ss-py'
@@ -484,12 +485,18 @@ def get_time_variable_name(dataset, lat_var):
         # per lat var)
         return time_vars[0]

-    for var_name in list(dataset.dims.keys()):
+    # Filter variables with 'time' in the name to avoid extra work
+    time_vars = list(filter(lambda var_name: 'time' in var_name, dataset.dims.keys()))
+
+    for var_name in time_vars:
         if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
     for var_name in list(dataset.data_vars.keys()):
         if "time" in var_name and dataset[var_name].squeeze().dims == lat_var.squeeze().dims:
             return var_name
+    for var_name in list(dataset.data_vars.keys()):
+        if 'time' in var_name and dataset[var_name].squeeze().dims[0] in lat_var.squeeze().dims:
+            return var_name
     raise ValueError('Unable to determine time variable')

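The new third loop backs the "time as lines" support from issues/24: it accepts a time variable that shares only its first dimension with the latitude variable, where the earlier loops demand identical dimensions. A hedged illustration with hypothetical names:

import numpy as np
import xarray as xr

# lat varies along lines and pixels; time is stored once per line.
ds = xr.Dataset({
    'lat': (('num_lines', 'num_pixels'), np.zeros((3, 4))),
    'line_time': (('num_lines',), np.arange(3).astype('datetime64[s]')),
})

# The first two rules fail: ('num_lines',) != ('num_lines', 'num_pixels').
# The new rule matches because the first dim of 'line_time' ('num_lines')
# is one of the latitude dimensions.
assert ds['line_time'].squeeze().dims[0] in ds['lat'].squeeze().dims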
@@ -566,7 +573,8 @@ def translate_timestamp(str_timestamp):
         '%Y-%m-%dT%H:%M:%SZ',
         '%Y-%m-%dT%H:%M:%S%Z',
         '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f%Z'
+        '%Y-%m-%dT%H:%M:%S.%f%Z',
+        '%Y-%m-%d %H:%M:%S',
     ]

     for timestamp_format in allowed_ts_formats:
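
The added format lets space-separated timestamps through the same strptime loop; a quick sketch of what now parses, assuming the surrounding try/except structure:

from datetime import datetime

allowed_ts_formats = ['%Y-%m-%dT%H:%M:%S.%f%Z', '%Y-%m-%d %H:%M:%S']

for timestamp_format in allowed_ts_formats:
    try:
        # '2021-06-15 12:30:00' matched none of the formats before this change.
        print(datetime.strptime('2021-06-15 12:30:00', timestamp_format))
        break
    except ValueError:
        continue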
@@ -672,7 +680,7 @@ def build_cond(str_timestamp, compare):
     return temporal_cond


-def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, bbox=None, cut=True,
+def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, variables=None, bbox=None, cut=True,
                      min_time=None, max_time=None):
     """
     Subset an xarray Dataset using a spatial bounding box.
@@ -708,16 +716,23 @@ def subset_with_bbox(dataset, lat_var_names, lon_var_names, time_var_names, bbox
     if lon_bounds[0] > lon_bounds[1]:
         oper = operator.or_

+    lat_var_prefix = [f'{GROUP_DELIM}{GROUP_DELIM.join(x.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])}' for x in lat_var_names]
     datasets = []
     for lat_var_name, lon_var_name, time_var_name in zip(
-        lat_var_names, lon_var_names, time_var_names
+            lat_var_names, lon_var_names, time_var_names
     ):
         if GROUP_DELIM in lat_var_name:
             var_prefix = GROUP_DELIM.join(lat_var_name.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1])
             group_vars = [
                 var for var in dataset.data_vars.keys()
                 if var.startswith(f'{GROUP_DELIM}{var_prefix}')
             ]
+            if variables:
+                group_vars.extend([
+                    var for var in dataset.data_vars.keys()
+                    if var in variables and var not in group_vars and not var.startswith(tuple(lat_var_prefix))
+                ])
+
         else:
             group_vars = list(dataset.keys())

@@ -956,13 +971,16 @@ def _rename_variables(dataset, base_dataset):
             encoded_var = cf_dt_coder.encode(dataset.variables[var_name])
             variable = encoded_var

+        var_attrs = variable.attrs
+        fill_value = var_attrs.get('_FillValue')
+        var_attrs.pop('_FillValue', None)
+
         if variable.dtype == object:
-            var_group.createVariable(new_var_name, 'S1', var_dims)
+            var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value)
         else:
-            var_group.createVariable(new_var_name, variable.dtype, var_dims)
+            var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value)

         # Copy attributes
-        var_attrs = variable.attrs
         var_group.variables[new_var_name].setncatts(var_attrs)

         # Copy data
@@ -1015,6 +1033,8 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
     if has_groups:
         nc_dataset = transform_grouped_dataset(nc_dataset, file_to_subset)

+    nc_dataset = dc.remove_duplicate_dims(nc_dataset)
+
     if variables:
         variables = [x.replace('/', GROUP_DELIM) for x in variables]

@@ -1046,10 +1066,10 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
     if variables:
         # Drop variables that aren't explicitly requested, except lat_var_name and
         # lon_var_name which are needed for subsetting
-        variables = [variable.upper() for variable in variables]
+        variables_upper = [variable.upper() for variable in variables]
         vars_to_drop = [
             var_name for var_name, var in dataset.data_vars.items()
-            if var_name.upper() not in variables
+            if var_name.upper() not in variables_upper
             and var_name not in lat_var_names
             and var_name not in lon_var_names
             and var_name not in time_var_names
@@ -1062,6 +1082,7 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
             lat_var_names=lat_var_names,
             lon_var_names=lon_var_names,
             time_var_names=time_var_names,
+            variables=variables,
             bbox=bbox,
             cut=cut,
             min_time=min_time,
@@ -1103,6 +1124,8 @@ def subset(file_to_subset, bbox, output_file, variables=None,  # pylint: disable
         for var in dataset.data_vars:
             if var not in encoding:
                 encoding[var] = compression
+            if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
+                dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')

     dataset.load().to_netcdf(output_file, 'w', encoding=encoding)

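Taken together with the variables_upper fix above, this is the issues/34 repair: the requested variable list now survives into subset_with_bbox. A hedged call sketch (the file names, variable paths, and bbox layout comment are assumptions, not taken from this diff):

from podaac.subsetter import subset

subset.subset(
    file_to_subset='oco3_granule.nc4',      # hypothetical input granule
    bbox=[[-120, -110], [30, 40]],          # [[lon_min, lon_max], [lat_min, lat_max]]
    output_file='oco3_subsetted.nc4',       # hypothetical output path
    variables=['/xco2', '/Sounding/operation_mode'],  # hypothetical variable paths
)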
podaac/subsetter/xarray_enhancements.py

Lines changed: 43 additions & 18 deletions
@@ -20,6 +20,7 @@
 """

 import logging
+
 import numpy as np
 import xarray as xr
@@ -112,13 +113,15 @@ def copy_empty_dataset(dataset):
     xarray.Dataset
         The new dataset which has no data.
     """
-    empty_dataset = xr.Dataset()
-    for variable_name, variable in dataset.data_vars.items():
-        empty_dataset[variable_name] = []
-        empty_dataset[variable_name].attrs = variable.attrs
-    # Copy global metadata
-    empty_dataset.attrs = dataset.attrs
-    return empty_dataset
+    # Create a dict object where each key is a variable in the dataset and the value is an
+    # array initialized to the fill value for that variable or NaN if there is no fill value
+    # attribute for the variable
+    empty_data = {k: np.full(v.shape, dataset.variables[k].attrs.get('_FillValue', np.nan)) for k, v in
+                  dataset.items()}
+
+    # Create a copy of the dataset filled with the empty data. Then select the first index along each
+    # dimension and return the result
+    return dataset.copy(data=empty_data).isel({dim: slice(0, 1, 1) for dim in dataset.dims})


 def cast_type(var, var_type):
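
The effect of the rewritten copy_empty_dataset is what the issues/36 changelog entry describes: dimensions, variables, and attributes survive, with a single masked (fill-value) point per variable. A sketch of the same two-step recipe on a toy dataset (names hypothetical):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'sst': (('num_lines', 'num_pixels'), np.ones((5, 4)),
             {'_FillValue': -9999.0, 'units': 'K'})},
    attrs={'title': 'demo granule'},
)

empty_data = {k: np.full(v.shape, ds.variables[k].attrs.get('_FillValue', np.nan))
              for k, v in ds.items()}
empty = ds.copy(data=empty_data).isel({dim: slice(0, 1, 1) for dim in ds.dims})

print(empty['sst'].shape)   # (1, 1): one point per dimension
print(empty['sst'].values)  # [[-9999.]]: the fill value, i.e. masked
print(empty['sst'].attrs['units'], empty.attrs['title'])  # attributes kept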
@@ -176,6 +179,14 @@ def where(dataset, cond, cut):
     if not all(len(value) > 0 for value in indexers.values()):
         return copy_empty_dataset(dataset)

+    # This will be true if any variables in the dataset have a partial
+    # overlap with the coordinate dims. If so, the cond should be
+    # applied per-variable rather than to the entire dataset.
+    partial_dim_in_in_vars = np.any(
+        [len(set(indexers.keys()).intersection(var.dims)) > 0 and len(
+            indexers.keys() - var.dims) > 0 for _, var in dataset.variables.items()]
+    )
+
     indexed_cond = cond.isel(**indexers)
     indexed_ds = dataset.isel(**indexers)
     new_dataset = indexed_ds.where(indexed_cond)
@@ -185,6 +196,20 @@ def where(dataset, cond, cut):
         original_type = indexed_ds[variable_name].dtype
         new_type = variable.dtype

+        indexed_var = indexed_ds[variable_name]
+
+        if partial_dim_in_in_vars and (indexers.keys() - dataset[variable_name].dims) and set(
+                indexers.keys()).intersection(dataset[variable_name].dims):
+            missing_dim = (indexers.keys() - dataset[variable_name].dims).pop()  # Assume only 1
+            var_indexers = {
+                dim_name: dim_value for dim_name, dim_value in indexers.items()
+                if dim_name in dataset[variable_name].dims
+            }
+            var_cond = cond.sel({missing_dim: 1}).isel(**var_indexers)
+            indexed_var = dataset[variable_name].isel(**var_indexers)
+            new_dataset[variable_name] = indexed_var.where(var_cond)
+            variable = new_dataset[variable_name]
+
         # Check if variable has no _FillValue. If so, use original data
         if '_FillValue' not in variable.attrs:

@@ -197,24 +222,24 @@ def where(dataset, cond, cut):
             # variable has more than one dimension, copy the entire
             # variable over, otherwise use a NaN mask to copy over the
             # relevant values.
-            if len(variable.shape) > 1:
-                new_dataset[variable_name] = indexed_ds[variable_name]
-            else:
-                nan_mask = np.isnan(variable.data)
-                if nan_mask.any():
-                    variable.data[nan_mask] = indexed_ds[variable_name][nan_mask]
-
-            new_dataset[variable_name].attrs = indexed_ds[variable_name].attrs
-            variable.attrs = indexed_ds[variable_name].attrs
+            new_dataset[variable_name] = indexed_var
+
+            new_dataset[variable_name].attrs = indexed_var.attrs
+            variable.attrs = indexed_var.attrs
             new_dataset[variable_name].encoding['_FillValue'] = None
             variable.encoding['_FillValue'] = None

         else:
             # Manually replace nans with FillValue
-            variable.data[np.isnan(variable.data)] = variable.attrs.get("_FillValue")
+            # If variable represents time, cast _FillValue to datetime
+            fill_value = new_dataset[variable_name].attrs.get('_FillValue')
+
+            if np.issubdtype(new_dataset[variable_name].dtype, np.dtype(np.datetime64)):
+                fill_value = np.datetime64('nat')
+            new_dataset[variable_name] = new_dataset[variable_name].fillna(fill_value)

         if original_type != new_type:
-            new_dataset[variable_name] = xr.apply_ufunc(cast_type, variable,
+            new_dataset[variable_name] = xr.apply_ufunc(cast_type, new_dataset[variable_name],
                                                         str(original_type), dask='allowed',
                                                         keep_attrs=True)

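Finally, a usage sketch for the updated where() on toy data (hypothetical names; exact output depends on the indexer logic above): points failing the condition become fill values, and cut=True first slices away rows and columns that fall entirely outside the condition.

import numpy as np
import xarray as xr

from podaac.subsetter import xarray_enhancements as xre

lat = xr.DataArray(np.array([[10., 20.], [30., 40.]]),
                   dims=('num_lines', 'num_pixels'))
science = xr.DataArray(np.arange(4.).reshape(2, 2),
                       dims=('num_lines', 'num_pixels'),
                       attrs={'_FillValue': -9999.0})
ds = xr.Dataset({'lat': lat, 'science': science})

# Keep points with lat >= 25; only the second line survives the cut.
subset_ds = xre.where(ds, ds.lat >= 25., cut=True)
print(subset_ds['science'].values)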