diff --git a/Changelog.rst b/Changelog.rst index ca8a6cc93d..f962181f7b 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,8 +1,15 @@ -version 3.14.2 +version 3.15.0 -------------- **2023-04-??** +* Re-introduction of CFA-netCDF functionality for CFA-0.6 + (https://github.com/NCAS-CMS/cf-python/issues/451, + https://github.com/NCAS-CMS/cf-python/issues/475, + https://github.com/NCAS-CMS/cf-python/issues/637) +* New function: `cf.CFA` +* New method: `cf.Data.get_cfa_write` +* New method: `cf.Data.set_cfa_write` * Fix excessive memory use arising from `cf.Field.regrids` and `cf.Field.regridc` (https://github.com/NCAS-CMS/cf-python/issues/623) @@ -10,6 +17,7 @@ version 3.14.2 runlength encoded (https://github.com/NCAS-CMS/cf-python/issues/621) * Removed benign UserWarning from `cf.Field.percentile` (https://github.com/NCAS-CMS/cf-python/issues/619) +* Changed dependency: ``1.10.1.0<=cfdm<1.10.2.0`` ---- @@ -49,6 +57,8 @@ version 3.14.0 https://github.com/NCAS-CMS/cf-python/issues/428) * Backwards incompatible API changes to facilitate the use of Dask (https://github.com/NCAS-CMS/cf-python/issues/579) +* Removal of CFA-0.4 functionality (CFA-0.6 will introduced at a later + version). * New method: `cf.Field.get_original_filenames` (https://github.com/NCAS-CMS/cf-python/issues/448) * New method: `cf.Field.to_dask_array` diff --git a/cf/__init__.py b/cf/__init__.py index 7e76d73cf8..ef037e2d0b 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -97,6 +97,7 @@ raise ImportError(_error0 + str(error1)) __cf_version__ = cfdm.core.__cf_version__ +__cfa_version__ = "0.6.2" from packaging.version import Version import importlib.util @@ -188,8 +189,8 @@ ) # Check the version of cfdm -_minimum_vn = "1.10.0.3" -_maximum_vn = "1.10.1.0" +_minimum_vn = "1.10.1.0" +_maximum_vn = "1.10.2.0" _cfdm_version = Version(cfdm.__version__) if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): raise RuntimeError( @@ -198,7 +199,7 @@ ) # Check the version of dask -_minimum_vn = "2022.12.1" +_minimum_vn = "2022.02.1" if Version(dask.__version__) < Version(_minimum_vn): raise RuntimeError( f"Bad dask version: cf requires dask>={_minimum_vn}. " @@ -262,7 +263,7 @@ ) from .data.fragment import ( - MissingFragmentArray, + FullFragmentArray, NetCDFFragmentArray, UMFragmentArray, ) diff --git a/cf/aggregate.py b/cf/aggregate.py index 0e9dd1a1f2..9b17100214 100644 --- a/cf/aggregate.py +++ b/cf/aggregate.py @@ -2,19 +2,19 @@ from collections import namedtuple from operator import itemgetter +import numpy as np from cfdm import is_log_level_debug, is_log_level_detail, is_log_level_info -from numpy import argsort as numpy_argsort -from numpy import dtype as numpy_dtype -from numpy import sort as numpy_sort from .auxiliarycoordinate import AuxiliaryCoordinate -from .data.data import Data +from .data import Data +from .data.array import FullArray from .decorators import ( _manage_log_level_via_verbose_attr, _manage_log_level_via_verbosity, _reset_log_emergence_level, ) from .domainaxis import DomainAxis +from .fieldancillary import FieldAncillary from .fieldlist import FieldList from .functions import _DEPRECATION_ERROR_FUNCTION_KWARGS, _numpy_allclose from .functions import atol as cf_atol @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -_dtype_float = numpy_dtype(float) +_dtype_float = np.dtype(float) # # -------------------------------------------------------------------- # # Global properties, as defined in Appendix A of the CF conventions. 
@@ -134,10 +134,11 @@ def __init__( equal=None, exist=None, ignore=None, - dimension=(), + dimension=None, relaxed_identities=False, ncvar_identities=False, field_identity=None, + field_ancillaries=(), copy=True, ): """**initialisation** @@ -207,6 +208,11 @@ def __init__( coordinate whose datum is the property's value and the property itself is deleted from that field. + field_ancillaries: (sequence of) `str`, optional + See `cf.aggregate` for details. + + .. versionadded:: TODOCFAVER + copy: `bool` optional If False then do not copy fields prior to aggregation. Setting this option to False may change input fields in @@ -289,41 +295,22 @@ def __init__( "no identity; consider setting " "relaxed_identities" ) return - # elif not self.has_data: - # self.message = "{} has no data".format(f.__class__.__name__) - # return # ------------------------------------------------------------ # Promote selected properties to 1-d, size 1 auxiliary - # coordinates + # coordinates with new independent domain axes # ------------------------------------------------------------ - _copy = copy - for prop in dimension: - value = f.get_property(prop, None) - if value is None: - continue - - aux_coord = AuxiliaryCoordinate( - properties={"long_name": prop}, - data=Data([value], units=""), - copy=False, - ) - aux_coord.nc_set_variable(prop) - aux_coord.id = prop - - if _copy: - # Copy the field, as we're about to change it. - f = f.copy() - self.field = f - _copy = False - - axis = f.set_construct(DomainAxis(1)) - f.set_construct(aux_coord, axes=[axis], copy=False) + if dimension: + f = self.promote_to_auxiliary_coordinate(dimension) - f.del_property(prop) + # ------------------------------------------------------------ + # Promote selected properties to field ancillaries that span + # the same domain axes as the field + # ------------------------------------------------------------ + if field_ancillaries: + f = self.promote_to_field_ancillary(field_ancillaries) - if dimension: - construct_axes = f.constructs.data_axes() + construct_axes = f.constructs.data_axes() self.units = self.canonical_units( f, self.identity, relaxed_units=relaxed_units @@ -400,7 +387,6 @@ def __init__( "coordrefs": self.find_coordrefs(axis), } ) - # 'size' : None}) # Find the 1-d auxiliary coordinates which span this axis aux_coords = { @@ -546,10 +532,10 @@ def __init__( # Field ancillaries # ------------------------------------------------------------ self.field_anc = {} - field_ancillaries = f.constructs.filter_by_type( + field_ancs = f.constructs.filter_by_type( "field_ancillary", todict=True ) - for key, field_anc in field_ancillaries.items(): + for key, field_anc in field_ancs.items(): # Find this field ancillary's identity identity = self.field_ancillary_has_identity_and_data(field_anc) if identity is None: @@ -1395,6 +1381,117 @@ def find_coordrefs(self, key): return tuple(sorted(names)) + def promote_to_auxiliary_coordinate(self, properties): + """Promote properties to auxiliary coordinate constructs. + + Each property is converted to a 1-d auxilliary coordinate + construct that spans a new independent size 1 domain axis of + the field, and the property is deleted. + + ... versionadded:: TODOCFAVER + + :Parameters: + + properties: sequence of `str` + The names of the properties to be promoted. + + :Returns: + + `Field` or `Domain` + The field or domain with the new auxiliary coordinate + constructs. 
+ + """ + f = self.field + + copy = True + for prop in properties: + value = f.get_property(prop, None) + if value is None: + continue + + aux_coord = AuxiliaryCoordinate( + properties={"long_name": prop}, + data=Data([value]), + copy=False, + ) + aux_coord.nc_set_variable(prop) + aux_coord.id = prop + + if copy: + # Copy the field as we're about to change it + f = f.copy() + copy = False + + axis = f.set_construct(DomainAxis(1)) + f.set_construct(aux_coord, axes=[axis], copy=False) + f.del_property(prop) + + self.field = f + return f + + def promote_to_field_ancillary(self, properties): + """Promote properties to field ancillary constructs. + + For each input field, each property is converted to a field + ancillary construct that spans the entire domain, with the + constant value of the property. + + The `Data` of any new field ancillary construct is marked + as a CFA term, meaning that it will only be written to disk if + the parent field construct is written as a CFA aggregation + variable, and in that case the field ancillary is written as a + non-standard CFA aggregation instruction variable, rather than + a CF-netCDF ancillary variable. + + If a domain construct is being aggregated then it is always + returned unchanged. + + ... versionadded:: TODOCFAVER + + :Parameters: + + properties: sequnce of `str` + The names of the properties to be promoted. + + :Returns: + + `Field` or `Domain` + The field or domain with the new field ancillary + constructs. + + """ + f = self.field + if f.construct_type != "field": + return f + + copy = True + for prop in properties: + value = f.get_property(prop, None) + if value is None: + continue + + data = Data( + FullArray(value, shape=f.shape, dtype=np.array(value).dtype) + ) + data._cfa_set_term(True) + + field_anc = FieldAncillary( + data=data, properties={"long_name": prop}, copy=False + ) + field_anc.id = prop + + if copy: + # Copy the field as we're about to change it + f = f.copy() + copy = False + + f.set_construct(field_anc, axes=f.get_data_axes(), copy=False) + f.del_property(prop) + + self.field = f + return f + @_manage_log_level_via_verbosity def aggregate( @@ -1423,6 +1520,7 @@ def aggregate( no_overlap=False, shared_nc_domain=False, field_identity=None, + field_ancillaries=None, info=False, ): """Aggregate field constructs into as few field constructs as @@ -1649,6 +1747,16 @@ def aggregate( numbers. The default value is set by the `cf.rtol` function. + field_ancillaries: (sequence of) `str`, optional + Create new field ancillary constructs for each input field + which has one or more of the given properties. For each + input field, each property is converted to a field + ancillary construct that spans the entire domain, with the + constant value of the property, and the property itself is + deleted. + + .. versionadded:: TODOCFAVER + no_overlap: Use the *overlap* parameter instead. 
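As a minimal usage sketch of the new *field_ancillaries* keyword described above (the file pattern ``run_*.nc`` and the ``experiment_id`` property are hypothetical; `cf.read`, `cf.aggregate` and `filter_by_type` are existing API):

>>> import cf
>>> fields = cf.read('run_*.nc')  # hypothetical input files
>>> out = cf.aggregate(fields, field_ancillaries='experiment_id')
>>> ancs = out[0].constructs.filter_by_type('field_ancillary')  # the promoted property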
@@ -1705,6 +1813,7 @@ def aggregate( "\ninfo=2 maps to verbose=3" "\ninfo=3 maps to verbose=-1", version="3.5.0", + removed_at="4.0.0", ) # pragma: no cover # Initialise the cache for coordinate and cell measure hashes, @@ -1738,6 +1847,9 @@ def aggregate( if isinstance(dimension, str): dimension = (dimension,) + if isinstance(field_ancillaries, str): + field_ancillaries = (field_ancillaries,) + if exist_all and equal_all: raise ValueError( "Only one of 'exist_all' and 'equal_all' can be True, since " @@ -1808,6 +1920,7 @@ def aggregate( ncvar_identities=ncvar_identities, field_identity=field_identity, respect_valid=respect_valid, + field_ancillaries=field_ancillaries, copy=copy, ) @@ -2220,7 +2333,7 @@ def _create_hash_and_first_values( # ... or which doesn't have a dimension coordinate but # does have one or more 1-d auxiliary coordinates aux = m_axis_identity["keys"][0] - sort_indices = numpy_argsort(field.constructs[aux].array) + sort_indices = np.argsort(field.constructs[aux].array) m_sort_keys[axis] = aux null_sort = False @@ -2662,8 +2775,8 @@ def _get_hfl( if create_flb: # Record the bounds of the first and last (sorted) cells - first = numpy_sort(array[0, ...]) - last = numpy_sort(array[-1, ...]) + first = np.sort(array[0, ...]) + last = np.sort(array[-1, ...]) hfl_cache.flb[key] = (first, last) if first_and_last_values or first_and_last_bounds: diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 5a4a96e1ab..a5f3035971 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -83,13 +83,16 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): def initialise_CFANetCDFArray( self, filename=None, - ncvar=None, - group=None, + address=None, dtype=None, mask=True, units=False, calendar=False, instructions=None, + substitutions=None, + term=None, + x=None, + **kwargs, ): """Return a `CFANetCDFArray` instance. @@ -97,9 +100,7 @@ def initialise_CFANetCDFArray( filename: `str` - ncvar: `str` - - group: `None` or sequence of str` + address: (sequence of) `str` or `int` dytpe: `numpy.dtype` @@ -111,6 +112,15 @@ def initialise_CFANetCDFArray( instructions: `str`, optional + substitutions: `dict`, optional + + term: `str`, optional + + x: `dict`, optional + + kwargs: optional + Ignored. + :Returns: `CFANetCDFArray` @@ -119,13 +129,15 @@ def initialise_CFANetCDFArray( cls = self.get_class("CFANetCDFArray") return cls( filename=filename, - ncvar=ncvar, - group=group, + address=address, dtype=dtype, mask=mask, units=units, calendar=calendar, instructions=instructions, + substitutions=substitutions, + term=term, + x=x, ) diff --git a/cf/data/array/abstract/filearray.py b/cf/data/array/abstract/filearray.py index 2f48b709fa..750a7f8f31 100644 --- a/cf/data/array/abstract/filearray.py +++ b/cf/data/array/abstract/filearray.py @@ -69,25 +69,10 @@ def get_address(self): """ raise NotImplementedError( - f"Must implement {self.__class__.__name__}.get_address" + f"Must implement {self.__class__.__name__}.get_address " + "in subclasses" ) # pragma: no cover - def get_filename(self): - """Return the name of the file containing the array. - - :Returns: - - `str` or `None` - The filename, or `None` if there isn't one. 
- - **Examples** - - >>> a.get_filename() - 'file.nc' - - """ - return self._get_component("filename", None) - def open(self): """Returns an open dataset containing the data array.""" raise NotImplementedError( diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/cfanetcdfarray.py index bbf5e3f8a0..bef6fbffb3 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/cfanetcdfarray.py @@ -1,14 +1,20 @@ from copy import deepcopy +from functools import partial from itertools import accumulate, product -from ...functions import abspath -from ..fragment import ( - MissingFragmentArray, - NetCDFFragmentArray, - UMFragmentArray, -) +import numpy as np + +from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray +from ..utils import chunk_locations, chunk_positions from .netcdfarray import NetCDFArray +# Store fragment array classes. +_FragmentArray = { + "nc": NetCDFFragmentArray, + "um": UMFragmentArray, + "full": FullFragmentArray, +} + class CFANetCDFArray(NetCDFArray): """A CFA aggregated array stored in a netCDF file. @@ -17,75 +23,37 @@ class CFANetCDFArray(NetCDFArray): """ - def __new__(cls, *args, **kwargs): - """Store fragment array classes. - - .. versionadded:: 3.14.0 - - """ - instance = super().__new__(cls) - instance._FragmentArray = { - "nc": NetCDFFragmentArray, - "um": UMFragmentArray, - None: MissingFragmentArray, - } - return instance - def __init__( self, filename=None, - ncvar=None, - varid=None, - group=None, + address=None, dtype=None, mask=True, units=False, calendar=False, instructions=None, + substitutions=None, + term=None, source=None, copy=True, + x=None, ): """**Initialisation** :Parameters: - filename: `str` - The name of the netCDF file containing the array. - - ncvar: `str`, optional - The name of the netCDF variable containing the - array. Required unless *varid* is set. + filename: (sequence of) `str`, optional + The name of the CFA-netCDF file containing the + array. If a sequence then it must contain one element. - varid: `int`, optional - The UNIDATA netCDF interface ID of the variable - containing the array. Required if *ncvar* is not set, - ignored if *ncvar* is set. - - group: `None` or sequence of `str`, optional - Specify the netCDF4 group to which the netCDF variable - belongs. By default, or if *group* is `None` or an - empty sequence, it assumed to be in the root - group. The last element in the sequence is the name of - the group in which the variable lies, with other - elements naming any parent groups (excluding the root - group). - - *Parameter example:* - To specify that a variable is in the root group: - ``group=()`` or ``group=None`` - - *Parameter example:* - To specify that a variable is in the group '/forecasts': - ``group=['forecasts']`` - - *Parameter example:* - To specify that a variable is in the group - '/forecasts/model2': ``group=['forecasts', 'model2']`` + address: (sequence of) `str`, optional + The name of the CFA-netCDF aggregation variable for the + array. If a sequence then it must contain one element. dtype: `numpy.dtype` - The data type of the array in the netCDF file. May be - `None` if the numpy data-type is not known (which can be - the case for netCDF string types, for example). + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for netCDF string types, for example). mask: `bool` If True (the default) then mask by convention when @@ -104,15 +72,39 @@ def __init__( The calendar of the aggregated data. 
Set to `None` to indicate the CF default calendar, if applicable. + instructions: `str`, optional + The ``aggregated_data`` attribute value as found on + the CFA netCDF variable. If set then this will be used + to improve the performance of `__dask_tokenize__`. + + substitutions: `dict`, optional + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + .. versionadded:: TODOCFAVER + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. If set then *address* must be the name of the + term's CFA-netCDF aggregation instruction variable, + which must be defined on the fragment dimensions and + no others. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + *Parameter example:* + ``address='cfa_tracking_id', term='tracking_id'`` + + .. versionadded:: TODOCFAVER + {{init source: optional}} {{init copy: `bool`, optional}} - instructions: `str`, optional - The ``aggregated_data`` attribute value found on the - CFA netCDF variable. If set then this will be used by - `__dask_tokenize__` to improve performance. - """ if source is not None: super().__init__(source=source, copy=copy) @@ -131,27 +123,97 @@ def __init__( aggregated_data = source.get_aggregated_data(copy=False) except AttributeError: aggregated_data = {} - elif filename is not None: - from CFAPython import CFAFileFormat - from CFAPython.CFADataset import CFADataset - from CFAPython.CFAExceptions import CFAException - from dask import compute, delayed - cfa = CFADataset(filename, CFAFileFormat.CFANetCDF, "r") try: - var = cfa.getVar(ncvar) - except CFAException: - raise ValueError( - f"CFA variable {ncvar} not found in file {filename}" - ) + substitutions = source.get_substitutions() + except AttributeError: + substitutions = None - shape = tuple([d.len for d in var.getDims()]) + try: + term = source.get_term() + except AttributeError: + term = None + + elif filename is not None: + aggregated_data = {} + + location = x["location"] + ndim = location.shape[0] + + chunks = [i.compressed().tolist() for i in location] + shape = [sum(c) for c in chunks] + positions = chunk_positions(chunks) + locations = chunk_locations(chunks) + + if term is not None: + # -------------------------------------------------------- + # This fragment contains a constant value, not file + # locations. 
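A small sketch of the substitution rule documented above, mirroring the ``str.replace`` loop applied to the fragment file names later in this hunk (the fragment file name below is made up):

>>> substitutions = {'${base}': '/new/data/path'}
>>> filename = '${base}/fragment_2001.nc'
>>> for base, sub in substitutions.items():
...     filename = filename.replace(base, sub)
...
>>> filename
'/new/data/path/fragment_2001.nc'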
+ # -------------------------------------------------------- + term = x[term] + fragment_shape = term.shape + aggregated_data = { + frag_loc: { + "location": loc, + "fill_value": term[frag_loc].item(), + "format": "full", + } + for frag_loc, loc in zip(positions, locations) + } + else: + a = x["address"] + f = x["file"] + fmt = x["format"] + + extra_dimension = f.ndim > ndim + if extra_dimension: + # There is an extra non-fragment dimension + fragment_shape = f.shape[:-1] + else: + fragment_shape = f.shape + + if not a.ndim: + a = np.full(f.shape, a, dtype=a.dtype) + + if not fmt.ndim: + fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) + + if extra_dimension: + aggregated_data = { + frag_loc: { + "location": loc, + "filename": f[frag_loc].tolist(), + "address": a[frag_loc].tolist(), + "format": fmt[frag_loc].item(), + } + for frag_loc, loc in zip(positions, locations) + } + else: + aggregated_data = { + frag_loc: { + "location": loc, + "filename": (f[frag_loc].item(),), + "address": (a[frag_loc].item(),), + "format": fmt[frag_loc].item(), + } + for frag_loc, loc in zip(positions, locations) + } + + # Apply string substitutions to the fragment filenames + if substitutions: + for value in aggregated_data.values(): + filenames2 = [] + for filename in value["filename"]: + for base, sub in substitutions.items(): + filename = filename.replace(base, sub) + + filenames2.append(filename) + + value["filename"] = filenames2 super().__init__( filename=filename, - ncvar=ncvar, - varid=varid, - group=group, + address=address, shape=shape, dtype=dtype, mask=mask, @@ -159,32 +221,10 @@ def __init__( calendar=calendar, copy=copy, ) - - fragment_shape = tuple(var.getFragDef()) - - # Note: It is an as-yet-untested hypothesis that creating - # the 'aggregated_data' dictionary for massive - # aggretations (e.g. with O(10e6) fragments) will be - # slow, hence the parallelisation of the process - # with delayed + compute, and that the - # parallelisation overheads won't be noticeable for - # small aggregations (e.g. O(10) fragments). - aggregated_data = {} - set_fragment = self._set_fragment - compute( - *[ - delayed(set_fragment(var, loc, aggregated_data, filename)) - for loc in product(*[range(i) for i in fragment_shape]) - ] - ) - - del cfa else: super().__init__( filename=filename, - ncvar=ncvar, - varid=varid, - group=group, + address=address, dtype=dtype, mask=mask, units=units, @@ -195,10 +235,17 @@ def __init__( fragment_shape = None aggregated_data = None instructions = None + term = None self._set_component("fragment_shape", fragment_shape, copy=False) self._set_component("aggregated_data", aggregated_data, copy=False) self._set_component("instructions", instructions, copy=False) + self._set_component("term", term, copy=False) + + if substitutions is not None: + self._set_component( + "substitutions", substitutions.copy(), copy=False + ) def __dask_tokenize__(self): """Used by `dask.base.tokenize`. @@ -206,72 +253,131 @@ def __dask_tokenize__(self): .. 
versionadded:: 3.14.0 """ + out = super().__dask_tokenize__() aggregated_data = self._get_component("instructions", None) if aggregated_data is None: aggregated_data = self.get_aggregated_data(copy=False) - return ( - self.__class__.__name__, - abspath(self.get_filename()), - self.get_ncvar(), - self.get_group(), - aggregated_data, - ) + return out + (aggregated_data,) def __getitem__(self, indices): """x.__getitem__(indices) <==> x[indices]""" return NotImplemented # pragma: no cover - def _set_fragment(self, var, frag_loc, aggregated_data, cfa_filename): - """Create a new key/value pair in the *aggregated_data* - dictionary. + def get_aggregated_data(self, copy=True): + """Get the aggregation data dictionary. - The *aggregated_data* dictionary contains the definitions of - the fragments and the instructions on how to aggregate them, - and is updated in-place. + The aggregation data dictionary contains the definitions of + the fragments and the instructions on how to aggregate them. + The keys are indices of the CFA fragment dimensions, + e.g. ``(1, 0, 0 ,0)``. .. versionadded:: 3.14.0 :Parameters: - var: `CFAPython.CFAVariable.CFAVariable` - The CFA aggregation variable. + copy: `bool`, optional + Whether or not to return a copy of the aggregation + dictionary. By default a deep copy is returned. + + .. warning:: If False then changing the returned + dictionary in-place will change the + aggregation dictionary stored in the + {{class}} instance, **as well as in any + copies of it**. + + :Returns: + + `dict` + The aggregation data dictionary. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_shape() + (2, 1, 1, 1) + >>> a.get_aggregated_data() + {(0, 0, 0, 0): {'file': 'January-June.nc', + 'address': 'temp', + 'format': 'nc', + 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, + (1, 0, 0, 0): {'file': 'July-December.nc', + 'address': 'temp', + 'format': 'nc', + 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} + + """ + aggregated_data = self._get_component("aggregated_data") + if copy: + aggregated_data = deepcopy(aggregated_data) - frag_loc: `tuple` of `int` - The new key, that must be index of the CFA fragment - dimensions, e.g. ``(1, 0, 0, 0)``. + return aggregated_data - aggregated_data: `dict` - The aggregated data dictionary to be updated in-place. + def get_fragmented_dimensions(self): + """Get the positions of dimensions that have two or more fragments. + + .. versionadded:: 3.14.0 :Returns: - `None` + `list` + The dimension positions. + + **Examples** + + >>> a.get_fragment_shape() + (20, 1, 40, 1) + >>> a.get_fragmented_dimensions() + [0, 2] + + >>> a.get_fragment_shape() + (1, 1, 1) + >>> a.get_fragmented_dimensions() + [] """ - fragment = var.getFrag(frag_loc=frag_loc) + return [ + i for i, size in enumerate(self.get_fragment_shape()) if size > 1 + ] - filename = fragment.file - fmt = fragment.format - address = fragment.address + def get_fragment_shape(self): + """Get the sizes of the fragment dimensions. - if address is not None: - if filename is None: - # This fragment is in the CFA-netCDF file - filename = cfa_filename - fmt = "nc" - else: - # This fragment is in its own file - filename = abspath(fragment.file) + The fragment dimension sizes are given in the same order as + the aggregated dimension sizes given by `shape`. + + .. versionadded:: 3.14.0 + + :Returns: + + `tuple` + The shape of the fragment dimensions. 
+ + """ + return self._get_component("fragment_shape") - aggregated_data[frag_loc] = { - "file": filename, - "address": address, - "format": fmt, - "location": fragment.location, - } + def get_term(self, default=ValueError()): + """The CFA aggregation instruction term for the data, if set. - def _subarray_shapes(self, shapes): + .. versionadded:: TODOCFAVER + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + term has not been set. If set to an `Exception` + instance then it will be raised instead. + + :Returns: + + `str` + The CFA aggregation instruction term name. + + """ + return self._get_component("term", default=default) + + def subarray_shapes(self, shapes): """Create the subarray shapes. .. versionadded:: 3.14.0 @@ -374,7 +480,7 @@ def _subarray_shapes(self, shapes): return normalize_chunks(chunks, shape=shape, dtype=self.dtype) - def _subarrays(self, subarray_shapes): + def subarrays(self, subarray_shapes): """Return descriptors for every subarray. .. versionadded:: 3.14.0 @@ -522,125 +628,6 @@ def _subarrays(self, subarray_shapes): product(*f_shapes), ) - def get_aggregated_data(self, copy=True): - """Get the aggregation data dictionary. - - The aggregation data dictionary contains the definitions of - the fragments and the instructions on how to aggregate them. - The keys are indices of the CFA fragment dimensions, - e.g. ``(1, 0, 0 ,0)``. - - .. versionadded:: 3.14.0 - - :Parameters: - - copy: `bool`, optional - Whether or not to return a copy of the aggregation - dictionary. By default a deep copy is returned. - - .. warning:: If False then changing the returned - dictionary in-place will change the - aggregation dictionary stored in the - {{class}} instance, **as well as in any - copies of it**. - - :Returns: - - `dict` - The aggregation data dictionary. - - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a.get_fragment_shape() - (2, 1, 1, 1) - >>> a.get_aggregated_data() - {(0, 0, 0, 0): {'file': 'January-June.nc', - 'address': 'temp', - 'format': 'nc', - 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, - (1, 0, 0, 0): {'file': 'July-December.nc', - 'address': 'temp', - 'format': 'nc', - 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} - - """ - aggregated_data = self._get_component("aggregated_data") - if copy: - aggregated_data = deepcopy(aggregated_data) - - return aggregated_data - - def get_FragmentArray(self, fragment_format): - """Return a Fragment class. - - .. versionadded:: 3.14.0 - - :Parameters: - - fragment_format: `str` - The dataset format of the fragment. Either ``'nc'``, - ``'um'``, or `None`. - - :Returns: - - `FragmentArray` - The class for representing a fragment array of the - given format. - - """ - try: - return self._FragmentArray[fragment_format] - except KeyError: - raise ValueError( - "Can't get FragmentArray class for unknown " - f"fragment dataset format: {fragment_format!r}" - ) - - def get_fragmented_dimensions(self): - """Get the positions dimension that have two or more fragments. - - .. versionadded:: 3.14.0 - - :Returns: - - `list` - The dimension positions. - - **Examples** - - >>> a.get_fragment_shape() - (20, 1, 40, 1) - >>> a.get_fragmented_dimensions() - [0, 2] - - >>> a.get_fragment_shape() - (1, 1, 1) - >>> a.get_fragmented_dimensions() - [] - - """ - return [ - i for i, size in enumerate(self.get_fragment_shape()) if size > 1 - ] - - def get_fragment_shape(self): - """Get the sizes of the fragment dimensions. 
- - The fragment dimension sizes are given in the same order as - the aggregated dimension sizes given by `shape` - - .. versionadded:: 3.14.0 - - :Returns: - - `tuple` - The shape of the fragment dimensions. - - """ - return self._get_component("fragment_shape") - def to_dask_array(self, chunks="auto"): """Create a dask array with `FragmentArray` chunks. @@ -675,10 +662,13 @@ def to_dask_array(self, chunks="auto"): aggregated_data = self.get_aggregated_data(copy=False) # Set the chunk sizes for the dask array - chunks = self._subarray_shapes(chunks) + chunks = self.subarray_shapes(chunks) - # Create a FragmentArray for each chunk - get_FragmentArray = self.get_FragmentArray + if self.get_mask(): + fragment_arrays = _FragmentArray + else: + fragment_arrays = _FragmentArray.copy() + fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) dsk = {} for ( @@ -688,29 +678,35 @@ def to_dask_array(self, chunks="auto"): chunk_location, fragment_location, fragment_shape, - ) in zip(*self._subarrays(chunks)): - d = aggregated_data[fragment_location] + ) in zip(*self.subarrays(chunks)): + kwargs = aggregated_data[fragment_location].copy() + kwargs.pop("location", None) - FragmentArray = get_FragmentArray(d["format"]) + fragment_format = kwargs.pop("format", None) + try: + FragmentArray = fragment_arrays[fragment_format] + except KeyError: + raise ValueError( + "Can't get FragmentArray class for unknown " + f"fragment dataset format: {fragment_format!r}" + ) - fragment_array = FragmentArray( - filename=d["file"], - address=d["address"], + fragment = FragmentArray( dtype=dtype, shape=fragment_shape, aggregated_units=units, aggregated_calendar=calendar, + **kwargs, ) - key = f"{fragment_array.__class__.__name__}-{tokenize(fragment_array)}" - dsk[key] = fragment_array - + key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" + dsk[key] = fragment dsk[name + chunk_location] = ( getter, key, f_indices, False, - False, + getattr(fragment, "_lock", False), ) # Return the dask array diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index ee1a73f658..9cf58ce577 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -2,12 +2,16 @@ from .abstract import Array +_FULLARRAY_HANDLED_FUNCTIONS = {} + class FullArray(Array): """A array filled with a given value. The array may be empty or all missing values. + .. versionadded:: 3.14.0 + """ def __init__( @@ -56,7 +60,7 @@ def __init__( if source is not None: try: - fill_value = source._get_component("fill_value", None) + fill_value = source._get_component("full_value", None) except AttributeError: fill_value = None @@ -80,12 +84,50 @@ def __init__( except AttributeError: calendar = None - self._set_component("fill_value", fill_value, copy=False) + self._set_component("full_value", fill_value, copy=False) self._set_component("dtype", dtype, copy=False) self._set_component("shape", shape, copy=False) self._set_component("units", units, copy=False) self._set_component("calendar", calendar, copy=False) + def __array__(self, *dtype): + """The numpy array interface. + + .. versionadded:: TODOCFAVER + + :Parameters: + + dtype: optional + Typecode or data-type to which the array is cast. + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + """ + array = self[...] + if not dtype: + return array + else: + return array.astype(dtype[0], copy=False) + + def __array_function__(self, func, types, args, kwargs): + """The `numpy` `__array_function__` protocol. + + .. 
versionadded:: TODOCFAVER + + """ + if func not in _FULLARRAY_HANDLED_FUNCTIONS: + return NotImplemented + + # Note: This allows subclasses that don't override + # __array_function__ to handle FullArray objects + if not all(issubclass(t, self.__class__) for t in types): + return NotImplemented + + return _FULLARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) + def __getitem__(self, indices): """x.__getitem__(indices) <==> x[indices] @@ -102,7 +144,7 @@ def __getitem__(self, indices): array_shape = self.shape else: array_shape = [] - for i, size, i in zip(indices, self.shape): + for i, size in zip(indices, self.shape): if not isinstance(i, slice): continue @@ -117,7 +159,7 @@ def __getitem__(self, indices): apply_indices = True array_shape = self.shape - fill_value = self.get_fill_value() + fill_value = self.get_full_value() if fill_value is np.ma.masked: array = np.ma.masked_all(array_shape, dtype=self.dtype) elif fill_value is not None: @@ -148,42 +190,12 @@ def __str__(self): x.__str__() <==> str(x) """ - fill_value = self.get_fill_value() + fill_value = self.get_full_value() if fill_value is None: return "Uninitialised" return f"Filled with {fill_value!r}" - def _set_units(self): - """The units and calendar properties. - - These are the values set during initialisation, defaulting to - `None` if either was not set at that time. - - .. versionadded:: 3.14.0 - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. - - """ - # TODOCFA: Consider moving _set_units to cfdm.Array, or some - # other common ancestor so that this, and other, - # subclasses can access it. - units = self.get_units(False) - if units is False: - units = None - self._set_component("units", units, copy=False) - - calendar = self.get_calendar(False) - if calendar is False: - calendar = None - self._set_component("calendar", calendar, copy=False) - - return units, calendar - @property def dtype(self): """Data-type of the data elements.""" @@ -194,14 +206,86 @@ def shape(self): """Tuple of array dimension sizes.""" return self._get_component("shape") - def get_fill_value(self): + def get_full_value(self, default=AttributeError()): """Return the data array fill value. .. versionadded:: 3.14.0 + .. seealso:: `set_full_value` + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + fill value has not been set. If set to an `Exception` + instance then it will be raised instead. + :Returns: The fill value. """ - return self._get_component("fill_value", None) + return self._get_component("full_value", default=default) + + def set_full_value(self, fill_value): + """Set the data array fill value. + + .. versionadded:: 3.14.0 + + .. seealso:: `get_full_value` + + :Parameters: + + fill_value : scalar, optional + The fill value for the array. May be set to + `cf.masked` or `np.ma.masked`. + + :Returns: + + `None` + + """ + self._set_component("full_value", fill_value, copy=False) + + +def fullarray_implements(numpy_function): + """Register an __array_function__ implementation for FullArray objects. + + .. versionadded:: TODOCFAVER + + """ + + def decorator(func): + _FULLARRAY_HANDLED_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@fullarray_implements(np.unique) +def unique( + a, return_index=False, return_inverse=False, return_counts=False, axis=None +): + """Version of `np.unique` that is optimised for `FullArray` objects. + + .. 
versionadded:: TODOCFAVER + + """ + if return_index or return_inverse or return_counts or axis is not None: + # Fall back to the slow unique. (I'm sure we could probably do + # something more clever here, but there is no use case at + # present.) + return np.unique( + a[...], + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + ) + + # Fast unique based on the full value + x = a.get_full_value() + if x is np.ma.masked: + return np.ma.masked_all((1,), dtype=a.dtype) + + return np.array([x], dtype=a.dtype) diff --git a/cf/data/array/gatheredarray.py b/cf/data/array/gatheredarray.py index ab777738ab..f8e66117aa 100644 --- a/cf/data/array/gatheredarray.py +++ b/cf/data/array/gatheredarray.py @@ -111,13 +111,7 @@ def to_dask_array(self, chunks="auto"): key = f"{subarray_name}-{tokenize(subarray)}" dsk[key] = subarray - dsk[name + chunk_location] = ( - getter, - key, - Ellipsis, - False, - False, - ) + dsk[name + chunk_location] = (getter, key, Ellipsis, False, False) # Return the dask array return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) diff --git a/cf/data/array/mixin/compressedarraymixin.py b/cf/data/array/mixin/compressedarraymixin.py index 48c731e166..456d2cd919 100644 --- a/cf/data/array/mixin/compressedarraymixin.py +++ b/cf/data/array/mixin/compressedarraymixin.py @@ -26,6 +26,11 @@ def _lock_file_read(self, array): couldn't be ascertained how to form the `dask` array. """ + try: + return array.to_dask_array() + except AttributeError: + pass + try: chunks = array.chunks except AttributeError: @@ -37,7 +42,7 @@ def _lock_file_read(self, array): pass try: - array.get_filename() + array.get_filenames() except AttributeError: pass else: diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 03c0afd283..5aa6f493af 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -1,5 +1,10 @@ +from os import sep +from os.path import basename, dirname, join + import numpy as np +from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath + class FileArrayMixin: """Mixin class for an array stored in a file. @@ -8,6 +13,19 @@ class FileArrayMixin: """ + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: TODOCFAVER + + """ + return ( + self.__class__, + self.shape, + self.get_filenames(), + self.get_addresses(), + ) + @property def _dask_meta(self): """The metadata for the containing dask array. @@ -21,3 +39,197 @@ def _dask_meta(self): """ return np.array((), dtype=self.dtype) + + @property + def filename(self): + """The name of the file containing the array. + + Deprecated at version 3.14.0. Use method `get_filename` instead. + + """ + _DEPRECATION_ERROR_ATTRIBUTE( + self, + "filename", + message="Use method 'get_filename' instead.", + version="3.14.0", + removed_at="5.0.0", + ) # pragma: no cover + + def del_file_location(self, location): + """Remove reference to files in the given location. + + .. versionadded:: TODOCFAVER + + :Parameters: + + location: `str` + The file location to remove. + + :Returns: + + `{{class}}` + A new {{class}} with reference to files in *location* + removed. 
+ + **Examples** + + >>> a.get_filenames() + ('/data1/file1', '/data2/file2') + >>> a.get_addresses() + ('tas1', 'tas2') + >>> b = a.del_file_location('/data1') + >>> b = get_filenames() + ('/data2/file2',) + >>> b.get_addresses() + ('tas2',) + + >>> a.get_filenames() + ('/data1/file1', '/data2/file1', '/data2/file2') + >>> a.get_addresses() + ('tas1', 'tas1', 'tas2') + >>> b = a.del_file_location('/data2') + >>> b.get_filenames() + ('/data1/file1',) + >>> b.get_addresses() + ('tas1',) + + """ + location = abspath(location).rstrip(sep) + + new_filenames = [] + new_addresses = [] + for filename, address in zip( + self.get_filenames(), self.get_addresses() + ): + if dirname(filename) != location: + new_filenames.append(filename) + new_addresses.append(address) + + if not new_filenames: + raise ValueError( + "Can't delete a file location when it results in there " + "being no files" + ) + + a = self.copy() + a._set_component("filename", tuple(new_filenames), copy=False) + a._set_component("address", tuple(new_addresses), copy=False) + return a + + def file_locations(self): + """The locations of the files, any of which may contain the data. + + .. versionadded:: TODOCFAVER + + :Returns: + + `tuple` + The file locations, one for each file, as absolute + paths with no trailing separate pathname component + separator. + + **Examples** + + >>> a.get_filenames() + ('/data1/file1',) + >>> a.file_locations() + ('/data1,) + + >>> a.get_filenames() + ('/data1/file1', '/data2/file2') + >>> a.file_locations() + ('/data1', '/data2') + + >>> a.get_filenames() + ('/data1/file1', '/data2/file2', '/data1/file2') + >>> a.file_locations() + ('/data1', '/data2', '/data1') + + """ + return tuple(map(dirname, self.get_filenames())) + + def add_file_location(self, location): + """Add a new file location. + + All existing files are additionally referenced from the given + location. + + .. versionadded:: TODOCFAVER + + :Parameters: + + location: `str` + The new location. + + :Returns: + + `{{class}}` + A new {{class}} with all previous files additionally + referenced from *location*. + + **Examples** + + >>> a.get_filenames() + ('/data1/file1',) + >>> a.get_addresses() + ('tas',) + >>> b = a.add_file_location('/home') + >>> b.get_filenames() + ('/data1/file1', '/home/file1') + >>> b.get_addresses() + ('tas', 'tas') + + >>> a.get_filenames() + ('/data1/file1', '/data2/file2',) + >>> a.get_addresses() + ('tas', 'tas') + >>> b = a.add_file_location('/home/') + >>> b = get_filenames() + ('/data1/file1', '/data2/file2', '/home/file1', '/home/file2') + >>> b.get_addresses() + ('tas', 'tas', 'tas', 'tas') + + >>> a.get_filenames() + ('/data1/file1', '/data2/file1',) + >>> a.get_addresses() + ('tas1', 'tas2') + >>> b = a.add_file_location('/home/') + >>> b.get_filenames() + ('/data1/file1', '/data2/file1', '/home/file1') + >>> b.get_addresses() + ('tas1', 'tas2', 'tas1') + + >>> a.get_filenames() + ('/data1/file1', '/data2/file1',) + >>> a.get_addresses() + ('tas1', 'tas2') + >>> b = a.add_file_location('/data1') + >>> b.get_filenames() + ('/data1/file1', '/data2/file1') + >>> b.get_addresses() + ('tas1', 'tas2') + + """ + location = abspath(location).rstrip(sep) + + filenames = self.get_filenames() + addresses = self.get_addresses() + + # Note: It is assumed that each existing file name is either + # an absolute path or a fully qualified URI. 
+ new_filenames = list(filenames) + new_addresses = list(addresses) + for filename, address in zip(filenames, addresses): + new_filename = join(location, basename(filename)) + if new_filename not in new_filenames: + new_filenames.append(new_filename) + new_addresses.append(address) + + a = self.copy() + a._set_component("filename", tuple(new_filenames), copy=False) + a._set_component( + "address", + tuple(new_addresses), + copy=False, + ) + return a diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index d6042909f9..e602cec16f 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -2,15 +2,23 @@ from dask.utils import SerializableLock from ...mixin_container import Container -from .mixin import FileArrayMixin +from .mixin import ArrayMixin, FileArrayMixin # Global lock for netCDF file access _lock = SerializableLock() -class NetCDFArray(FileArrayMixin, Container, cfdm.NetCDFArray): +class NetCDFArray(FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): """An array stored in a netCDF file.""" + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: TODOCFAVER + + """ + return super().__dask_tokenize__() + (self.get_mask(),) + def __repr__(self): """Called by the `repr` built-in function. @@ -20,21 +28,16 @@ def __repr__(self): return super().__repr__().replace("<", ">> a.get_format() + 'um' + + """ + return "um" + def get_word_size(self): """Word size in bytes. @@ -656,26 +670,17 @@ def open(self): :Returns: - `umfile_lib.File` + `umfile_lib.File`, `int` **Examples** >>> f.open() + (, 44567) """ - try: - f = File( - path=self.get_filename(), - byte_ordering=self.get_byte_ordering(), - word_size=self.get_word_size(), - fmt=self.get_fmt(), - ) - except Exception as error: - try: - f.close_fd() - except Exception: - pass - - raise Exception(error) - else: - return f + return super().open( + File, + byte_ordering=self.get_byte_ordering(), + word_size=self.get_word_size(), + fmt=self.get_fmt(), + ) diff --git a/cf/data/creation.py b/cf/data/creation.py index 0db52403c0..0f2ebb94e7 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -33,8 +33,8 @@ def to_dask(array, chunks, **from_array_options): Keyword arguments to be passed to `dask.array.from_array`. If *from_array_options* has no ``'lock'`` key then the - `lock` keyword is set to the `_dask_lock` attribute of - *array* or, if there is no such attribute, `False`. + `lock` keyword is set to the `_lock` attribute of *array* + or, if there is no such attribute, `False`. 
If *from_array_options* has no ``'meta'`` key then the `meta` keyword is set to the `_dask_meta` attribute of @@ -76,7 +76,7 @@ def to_dask(array, chunks, **from_array_options): array = np.asanyarray(array) kwargs = from_array_options - kwargs.setdefault("lock", getattr(array, "_dask_lock", False)) + kwargs.setdefault("lock", getattr(array, "_lock", False)) kwargs.setdefault("meta", getattr(array, "_dask_meta", None)) try: diff --git a/cf/data/data.py b/cf/data/data.py index f5c999868a..7af57ddfee 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -5,6 +5,7 @@ from itertools import product from numbers import Integral from operator import mul +from os import sep import cfdm import cftime @@ -13,7 +14,7 @@ from dask import compute, delayed # noqa: F401 from dask.array import Array from dask.array.core import normalize_chunks -from dask.base import is_dask_collection, tokenize +from dask.base import collections_to_dsk, is_dask_collection, tokenize from dask.highlevelgraph import HighLevelGraph from dask.optimization import cull @@ -29,13 +30,14 @@ from ..functions import ( _DEPRECATION_ERROR_KWARGS, _section, + abspath, atol, default_netCDF_fillvals, free_memory, parse_indices, rtol, ) -from ..mixin_container import Container +from ..mixin2 import CFANetCDF, Container from ..units import Units from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask @@ -93,10 +95,11 @@ _NONE = 0 # = 0b0000 _ARRAY = 1 # = 0b0001 _CACHE = 2 # = 0b0010 +_CFA = 4 # = 0b0100 _ALL = 15 # = 0b1111 -class Data(DataClassDeprecationsMixin, Container, cfdm.Data): +class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): """An N-dimensional data array with units and masked values. * Contains an N-dimensional, indexable and broadcastable array with @@ -416,25 +419,23 @@ def __init__( except AttributeError: compressed = "" - if compressed: - if init_options.get("from_array"): - raise ValueError( - "Can't define 'from_array' initialisation options " - "for compressed input arrays" - ) + if compressed and init_options.get("from_array"): + raise ValueError( + "Can't define 'from_array' initialisation options " + "for compressed input arrays" + ) - # Bring the compressed data into memory without - # decompressing it - if to_memory: - try: - array = array.to_memory() - except AttributeError: - pass + if to_memory: + try: + array = array.to_memory() + except AttributeError: + pass - if self._is_abstract_Array_subclass(array): - # Save the input array in case it's useful later. For - # compressed input arrays this will contain extra information, - # such as a count or index variable. + try: + array.get_filenames() + except AttributeError: + pass + else: self._set_Array(array) # Cast the input data as a dask array @@ -452,6 +453,7 @@ def __init__( dt = True first_value = None + if not dt and array.dtype.kind == "O": kwargs = init_options.get("first_non_missing_value", {}) first_value = first_non_missing_value(array, **kwargs) @@ -621,20 +623,6 @@ def _rtol(self): """Return the current value of the `cf.rtol` function.""" return rtol().value - def _is_abstract_Array_subclass(self, array): - """Whether or not an array is a type of abstract Array. 
- - :Parameters: - - array: - - :Returns: - - `bool` - - """ - return isinstance(array, cfdm.Array) - def __data__(self): """Returns a new reference to self.""" return self @@ -1248,6 +1236,43 @@ def __keepdims_indexing__(self): def __keepdims_indexing__(self, value): self._custom["__keepdims_indexing__"] = bool(value) + def _cfa_del_write(self): + """Set the CFA write status of the data to `False`. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_write`, `_cfa_set_write` + + :Returns: + + `bool` + The CFA status prior to deletion. + + """ + return self._custom.pop("cfa_write", False) + + def _cfa_set_term(self, value): + """Set the CFA aggregation instruction term status. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_term`, `cfa_set_term` + + :Parameters: + + status: `bool` + The new CFA aggregation instruction term status. + + :Returns: + + `None` + + """ + if not value: + self._custom.pop("cfa_term", None) + + self._custom["cfa_term"] = bool(value) + def _clear_after_dask_update(self, clear=_ALL): """Remove components invalidated by updating the `dask` array. @@ -1257,16 +1282,17 @@ def _clear_after_dask_update(self, clear=_ALL): .. versionadded:: 3.14.0 - .. seealso:: `_del_Array`, `_del_cached_elements`, `_set_dask` + .. seealso:: `_del_Array`, `_del_cached_elements`, + `_cfa_del_write`, `_set_dask` :Parameters: clear: `int`, optional Specify which components should be removed. Which components are removed is determined by sequentially - combining *clear* with the ``_ARRAY`` and ``_CACHE`` - integer-valued contants, using the bitwise AND - operator: + combining *clear* with the ``_ARRAY``, ``_CACHE`` and + ``_CFA`` integer-valued contants, using the bitwise + AND operator: * If ``clear & _ARRAY`` is non-zero then a source array is deleted. @@ -1274,6 +1300,12 @@ def _clear_after_dask_update(self, clear=_ALL): * If ``clear & _CACHE`` is non-zero then cached element values are deleted. + * If ``clear & _CFA`` is non-zero then the CFA write + status is set to `False`. + + * If ``clear`` is non-zero then the CFA term status is + set to `False`. + By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being removed. @@ -1287,7 +1319,7 @@ def _clear_after_dask_update(self, clear=_ALL): element values will be kept but all other components will be removed. - .. versionadded:: 3.14.1 + .. versionadded:: TODOCFAVER :Returns: @@ -1305,6 +1337,10 @@ def _clear_after_dask_update(self, clear=_ALL): # Delete cached element values self._del_cached_elements() + if clear & _CFA: + # Set the CFA write status to False + self._cfa_del_write() + def _set_dask(self, array, copy=False, clear=_ALL): """Set the dask array. @@ -1377,9 +1413,9 @@ def _del_dask(self, default=ValueError(), clear=_ALL): Specify which components should be removed. By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being removed. See - `_clear_after_dask_update` for details. - If there is no dask array then no components are - removed, regardless of the value of *clear*. + `_clear_after_dask_update` for details. If there is + no dask array then no components are removed, + regardless of the value of *clear*. :Returns: @@ -1400,6 +1436,7 @@ def _del_dask(self, default=ValueError(), clear=_ALL): Traceback (most recent call last): ... 
RuntimeError: No dask array + """ try: out = self._custom.pop("dask") @@ -1498,6 +1535,30 @@ def _set_cached_elements(self, elements): self._custom["cached_elements"] = cache + def _cfa_set_write(self, status): + """Set the CFA write status of the data. + + If and only if the CFA write status is True then it may be + possible to write the data as an aggregation variable to a + CFA-netCDF file. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_write`, `cfa_set_write`, + `_cfa_del_write`, `cf.read`, `cf.write`, + + :Parameters: + + status: `bool` + The new CFA write status. + + :Returns: + + `None` + + """ + self._custom["cfa_write"] = bool(status) + @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): """Calculate the n-th discrete difference along the given axis. @@ -2399,6 +2460,110 @@ def ceil(self, inplace=False, i=False): d._set_dask(da.ceil(dx)) return d + def cfa_get_term(self): + """The CFA aggregation instruction term status. + + If True then the data represents that of a non-standard CFA + aggregation instruction variable. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_set_term` + + :Returns: + + `bool` + + **Examples** + + >>> d = cf.Data([1, 2]) + >>> d.cfa_get_term() + False + + """ + return bool(self._custom.get("cfa_term", False)) + + def cfa_get_write(self): + """The CFA write status of the data. + + If and only if the CFA write status is True then it may be + possible to write the data as an aggregation variable to a + CFA-netCDF file. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_set_write`, `cf.read`, `cf.write` + + :Returns: + + `bool` + + **Examples** + + >>> d = cf.Data([1, 2]) + >>> d.cfa_get_write() + False + + """ + return bool(self._custom.get("cfa_write", False)) + + def cfa_set_term(self, status): + """Set the CFA aggregation instruction term status. + + If True then the data represents that of a non-standard CFA + aggregation instruction variable. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_term` + + :Parameters: + + status: `bool` + The new CFA aggregation instruction term status. + + :Returns: + + `None` + + """ + if status: + raise ValueError( + "'cfa_set_term' only allows the CFA aggregation instruction " + "term write status to be set to False" + ) + + self._custom.pop("cfa_term", False) + + def cfa_set_write(self, status): + """Set the CFA write status of the data. + + If and only if the CFA write status is True then it may be + possible to write the data as an aggregation variable to a + CFA-netCDF file. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_write`, `cf.read`, `cf.write` + + :Parameters: + + status: `bool` + The new CFA write status. + + :Returns: + + `None` + + """ + if status: + raise ValueError( + "'cfa_set_write' only allows the CFA write status to be " + "set to False" + ) + + self._cfa_del_write() + def compute(self): # noqa: F811 """A numpy view the data. @@ -3510,8 +3675,6 @@ def _regrid( The regridded data. """ - from dask import delayed - from .dask_regrid import regrid, regrid_weights shape = self.shape @@ -3713,8 +3876,33 @@ def concatenate(cls, data, axis=0, cull_graph=True, relaxed_units=False): dxs = [d.to_dask_array() for d in processed_data] dx = da.concatenate(dxs, axis=axis) - # Set the new dask array, retaining the cached elements ... - data0._set_dask(dx, clear=_ALL) + # Set the CFA write status + # + # Assume at first that all input data instances have True + # status, but ... 
+ cfa = _CFA + for d in processed_data: + if not d.cfa_get_write(): + # ... the CFA write status is False when any input + # data instance has False status ... + cfa = _NONE + break + + if cfa != _NONE: + non_concat_axis_chunks0 = list(processed_data[0].chunks) + non_concat_axis_chunks0.pop(axis) + for d in processed_data[1:]: + non_concat_axis_chunks = list(d.chunks) + non_concat_axis_chunks.pop(axis) + if non_concat_axis_chunks != non_concat_axis_chunks0: + # ... the CFA write status is False when any two + # input data instances have different chunk + # patterns for the non-concatenated axes. + cfa = _NONE + break + + # Set the new dask array + data0._set_dask(dx, clear=_ALL ^ cfa) # Set the appropriate cached elements cached_elements = {} @@ -3725,12 +3913,37 @@ def concatenate(cls, data, axis=0, cull_graph=True, relaxed_units=False): data0._set_cached_elements(cached_elements) - # Manage cyclicity of axes: if join axis was cyclic, it is no longer + # Set the CFA-netCDF aggregated data instructions and file + # name substitutions by combining them from all of the input + # data instances, giving precedence to those towards the left + # hand side of the input list. + if data0.cfa_get_write(): + aggregated_data = {} + substitutions = {} + for d in processed_data[::-1]: + aggregated_data.update(d.cfa_get_aggregated_data()) + substitutions.update(d.cfa_file_substitutions()) + + if aggregated_data: + data0.cfa_set_aggregated_data(aggregated_data) + + if substitutions: + data0.cfa_update_file_substitutions(substitutions) + + # Set the CFA aggregation instruction term status + if data0.cfa_get_term(): + for d in processed_data[1:]: + if not d.cfa_get_term(): + data0.cfa_set_term(False) + break + + # Manage cyclicity of axes: if join axis was cyclic, it is no + # longer. axis = data0._parse_axes(axis)[0] if axis in data0.cyclic(): logger.warning( f"Concatenating along a cyclic axis ({axis}) therefore the " - f"axis has been set as non-cyclic in the output." + "axis has been set as non-cyclic in the output." ) data0.cyclic(axes=axis, iscyclic=False) @@ -4365,7 +4578,9 @@ def Units(self, value): partial(cf_units, from_units=old_units, to_units=value), dtype=dtype, ) - self._set_dask(dx) + + # Setting equivalent units doesn't affect the CFA write status + self._set_dask(dx, clear=_ALL ^ _CFA) self._Units = value @@ -5872,32 +6087,20 @@ def convert_reference_time( return d - def get_data(self, default=ValueError(), _units=None, _fill_value=None): - """Returns the data. - - .. versionadded:: 3.0.0 - - :Returns: - - `Data` - - """ - return self - def get_filenames(self): """The names of files containing parts of the data array. - Returns the names of any files that are required to deliver - the computed data array. This list may contain fewer names - than the collection of file names that defined the data when - it was first instantiated, as could be the case after the data - has been subspaced. + Returns the names of any files that may be required to deliver + the computed data array. This set may contain fewer names than + the collection of file names that defined the data when it was + first instantiated, as could be the case after the data has + been subspaced. **Implementation** A `dask` chunk that contributes to the computed array is assumed to reference data within a file if that chunk's array - object has a callable `get_filename` method, the output of + object has a callable `get_filenames` method, the output of which is added to the returned `set`. 
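The ``clear=_ALL ^ cfa`` and ``clear=_ALL ^ _CFA`` calls above rely on the bit-flag constants added near the top of ``data.py`` (``_NONE = 0``, ``_ARRAY = 1``, ``_CACHE = 2``, ``_CFA = 4``, ``_ALL = 15``). A quick sketch of the arithmetic using those literal values:

>>> _NONE, _ARRAY, _CACHE, _CFA, _ALL = 0, 1, 2, 4, 15
>>> clear = _ALL ^ _CFA  # clear everything except the CFA write status
>>> bool(clear & _CFA)
False
>>> bool(clear & _ARRAY), bool(clear & _CACHE)
(True, True)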
:Returns: @@ -5943,13 +6146,10 @@ def get_filenames(self): {'file_A.nc'} """ - from dask.base import collections_to_dsk - out = set() - dsk = collections_to_dsk((self.to_dask_array(),), optimize_graph=True) - for a in dsk.values(): + for a in self.todict().values(): try: - out.add(a.get_filename()) + out.update(a.get_filenames()) except AttributeError: pass @@ -6050,6 +6250,55 @@ def set_calendar(self, calendar): """ self.Units = Units(self.get_units(default=None), calendar) + def add_file_location(self, location): + """Add a new file location in-place. + + All data definitions that reference files are additionally + referenced from the given location. + + .. versionadded:: TODOCFAVER + + .. seealso:: `del_file_location`, `file_locations` + + :Parameters: + + location: `str` + The new location. + + :Returns: + + `str` + The new location as an absolute path with no trailing + separate pathname component separator. + + **Examples** + + >>> d.add_file_location('/data/model/') + '/data/model' + + """ + location = abspath(location).rstrip(sep) + + updated = False + dsk = self.todict() + for key, a in dsk.items(): + try: + dsk[key] = a.add_file_location(location) + except AttributeError: + # This chunk doesn't contain a file array + continue + + # This chunk contains a file array and the dask graph has + # been updated + updated = True + + if updated: + dx = self.to_dask_array() + dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) + self._set_dask(dx, clear=_NONE) + + return location + def set_units(self, value): """Set the units. @@ -7627,7 +7876,7 @@ def insert_dimension(self, position=0, inplace=False): **Examples** """ - # TODODASKAPI bring back expand_dime alias (or rather alias this to + # TODODASKAPI bring back expand_dims alias (or rather alias this to # that) d = _inplace_enabled_define_and_cleanup(self) @@ -7650,8 +7899,9 @@ def insert_dimension(self, position=0, inplace=False): dx = d.to_dask_array() dx = dx.reshape(shape) - # Inserting a dimension doesn't affect the cached elements - d._set_dask(dx, clear=_ALL ^ _CACHE) + # Inserting a dimension doesn't affect the cached elements nor + # the CFA write status + d._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) # Expand _axes axis = new_axis_identifier(d._axes) @@ -8137,6 +8387,39 @@ def soften_mask(self): self._set_dask(dx, clear=_NONE) self.hardmask = False + def file_locations(self): + """The locations of files containing parts of the data. + + Returns the locations of any files that may be required to + deliver the computed data array. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `del_file_location` + + :Returns: + + `set` + The unique file locations as absolute paths with no + trailing separate pathname component separator. + + **Examples** + + >>> d.file_locations() + {'/home/data1', 'file:///data2'} + + """ + out = set() + + for key, a in self.todict().items(): + try: + out.update(a.file_locations()) + except AttributeError: + # This chunk doesn't contain a file array + pass + + return out + @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with a fill value. @@ -8688,6 +8971,46 @@ def change_calendar(self, calendar, inplace=False, i=False): return d + def chunk_indices(self): + """Return indices that define each dask compute chunk. + + .. versionadded:: TODOCFAVER + + .. seealso:: `chunks` + + :Returns: + + `itertools.product` + An iterator over tuples of indices of the data array. 
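As a usage-oriented aside (a sketch, not part of the patch): each tuple yielded by the new `chunk_indices` method can be used directly to subspace the data, giving the portion held by the corresponding dask compute chunk. The data values below mirror the docstring example that follows.

import numpy as np
import cf

d = cf.Data(np.arange(405).reshape(3, 9, 15),
            chunks=((1, 2), (9,), (4, 5, 6)))

# One subspace per dask compute chunk
for index in d.chunk_indices():
    part = d[index]
    print(part.shape)   # (1, 9, 4), (1, 9, 5), (1, 9, 6), (2, 9, 4), ...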
+ + **Examples** + + >>> d = cf.Data(np.arange(405).reshape(3, 9, 15), + ... chunks=((1, 2), (9,), (4, 5, 6))) + >>> d.npartitions + 6 + >>> for index in d.chunk_indices(): + ... print(index) + ... + (slice(0, 1, None), slice(0, 9, None), slice(0, 4, None)) + (slice(0, 1, None), slice(0, 9, None), slice(4, 9, None)) + (slice(0, 1, None), slice(0, 9, None), slice(9, 15, None)) + (slice(1, 3, None), slice(0, 9, None), slice(0, 4, None)) + (slice(1, 3, None), slice(0, 9, None), slice(4, 9, None)) + (slice(1, 3, None), slice(0, 9, None), slice(9, 15, None)) + + """ + from dask.utils import cached_cumsum + + chunks = self.chunks + + cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] + indices = [ + [slice(s, s + dim) for s, dim in zip(starts, shapes)] + for starts, shapes in zip(cumdims, chunks) + ] + return product(*indices) + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def override_units(self, units, inplace=False, i=False): @@ -9018,9 +9341,8 @@ def del_calendar(self, default=ValueError()): default: optional Return the value of the *default* parameter if the - calendar has not been set. - - {{default Exception}} + calendar has not been set. If set to an `Exception` + instance then it will be raised instead. :Returns: @@ -9058,6 +9380,55 @@ def del_calendar(self, default=ValueError()): self.override_calendar(None, inplace=True) return calendar + def del_file_location(self, location): + """Remove a file location in-place. + + All data definitions that reference files will have references + to files in the given location removed from them. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `file_locations` + + :Parameters: + + location: `str` + The file location to remove. + + :Returns: + + `str` + The removed location as an absolute path with no + trailing separate pathname component separator. + + **Examples** + + >>> d.del_file_location('/data/model/') + '/data/model' + + """ + location = abspath(location).rstrip(sep) + + updated = False + dsk = self.todict() + for key, a in dsk.items(): + try: + dsk[key] = a.del_file_location(location) + except AttributeError: + # This chunk doesn't contain a file array + continue + + # This chunk contains a file array and the dask graph has + # been updated + updated = True + + if updated: + dx = self.to_dask_array() + dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) + self._set_dask(dx, clear=_NONE) + + return location + def del_units(self, default=ValueError()): """Delete the units. @@ -9067,10 +9438,9 @@ def del_units(self, default=ValueError()): :Parameters: default: optional - Return the value of the *default* parameter if the units - has not been set. - - {{default Exception}} + Return the value of the *default* parameter if the + units has not been set. If set to an `Exception` + instance then it will be raised instead. :Returns: @@ -10624,31 +10994,32 @@ def squeeze(self, axes=None, inplace=False, i=False): shape = d.shape if axes is None: - axes = [i for i, n in enumerate(shape) if n == 1] + iaxes = tuple([i for i, n in enumerate(shape) if n == 1]) else: - axes = d._parse_axes(axes) + iaxes = d._parse_axes(axes) # Check the squeeze axes - for i in axes: + for i in iaxes: if shape[i] > 1: raise ValueError( f"Can't squeeze {d.__class__.__name__}: " f"Can't remove axis of size {shape[i]}" ) - if not axes: + if not iaxes: + # Short circuit if the squeeze is a null operation return d # Still here? 
Then the data array is not scalar and at least # one size 1 axis needs squeezing. dx = d.to_dask_array() - dx = dx.squeeze(axis=tuple(axes)) + dx = dx.squeeze(axis=iaxes) # Squeezing a dimension doesn't affect the cached elements d._set_dask(dx, clear=_ALL ^ _CACHE) # Remove the squeezed axes names - d._axes = [axis for i, axis in enumerate(d._axes) if i not in axes] + d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] return d @@ -10713,6 +11084,53 @@ def tan(self, inplace=False, i=False): return d + def todict(self, optimize_graph=True): + """Return a dictionary of the dask graph key/value pairs. + + .. versionadded:: TODOCFAVER + + .. seealso:: `to_dask_array`, `tolist` + + :Parameters: + + `optimize_graph`: `bool` + If True, the default, then prior to being converted to + a dictionary, the graph is optimised to remove unused + chunks. Note that optimising the graph can add a + considerable performance overhead. + + :Returns: + + `dict` + The dictionary of the dask graph key/value pairs. + + **Examples** + + >>> d = cf.Data([1, 2, 3, 4], chunks=2) + >>> d.todict() + {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), + ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4])} + >>> e = d[0] + >>> e.todict() + {('getitem-153fd24082bc067cf438a0e213b41ce6', + 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', + 0), (slice(0, 1, 1),)), + ('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2])} + >>> e.todict(optimize_graph=False) + {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), + ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4]), + ('getitem-153fd24082bc067cf438a0e213b41ce6', + 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', + 0), (slice(0, 1, 1),))} + + """ + dx = self.to_dask_array() + + if optimize_graph: + return collections_to_dsk((dx,), optimize_graph=True) + + return dict(collections_to_dsk((dx,), optimize_graph=False)) + def tolist(self): """Return the data as a scalar or (nested) list. @@ -10722,6 +11140,8 @@ def tolist(self): If ``N`` is 0 then, since the depth of the nested list is 0, it will not be a list at all, but a simple Python scalar. + .. sealso:: `todict` + :Returns: `list` or scalar @@ -10801,8 +11221,6 @@ def transpose(self, axes=None, inplace=False, i=False): ndim = d.ndim if axes is None: - if ndim <= 1: - return d iaxes = tuple(range(ndim - 1, -1, -1)) else: iaxes = d._parse_axes(axes) @@ -10811,9 +11229,10 @@ def transpose(self, axes=None, inplace=False, i=False): # Short circuit if the transpose is a null operation return d - # Note: _axes attribute is still important/utilised post-Daskification - # because e.g. axes labelled as cyclic by the _cyclic attribute use it - # to determine their position (see #discussion_r694096462 on PR #247). + # Note: The _axes attribute is important because e.g. axes + # labelled as cyclic by the _cyclic attribute use it to + # determine their position (see #discussion_r694096462 + # on PR #247). 
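For orientation on the new `todict` method, a hedged sketch (illustrative names only, not the library's own code) of the graph-editing pattern that `add_file_location` and `del_file_location` follow elsewhere in this change: export the optimised graph as a dictionary, edit the chunks that reference files, and rebuild the dask array from the edited graph.

import dask.array as da

def edit_file_chunks(d, edit):
    # Sketch: apply 'edit' (e.g. lambda a: a.add_file_location('/new'))
    # to every file-backed chunk array of the cf.Data object 'd'
    dsk = d.todict()            # optimised dask graph as a dictionary
    updated = False
    for key, a in dsk.items():
        try:
            dsk[key] = edit(a)  # only file-backed arrays support the edit
        except AttributeError:
            continue            # in-memory chunk: nothing to do
        updated = True

    if updated:
        dx = d.to_dask_array()
        # Rebuild the array from the edited graph; name, chunk structure,
        # dtype and meta are unchanged
        dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta)
        # The real methods store this back with d._set_dask(dx, clear=_NONE)
        return dx

    return d.to_dask_array()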
data_axes = d._axes d._axes = [data_axes[i] for i in iaxes] @@ -10824,6 +11243,7 @@ def transpose(self, axes=None, inplace=False, i=False): raise ValueError( f"Can't transpose: Axes don't match array: {axes}" ) + d._set_dask(dx) return d diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index 6d23df6b46..2ce2dafa60 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,3 +1,3 @@ -from .missingfragmentarray import MissingFragmentArray +from .fullfragmentarray import FullFragmentArray from .netcdffragmentarray import NetCDFFragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/abstract/__init__.py b/cf/data/fragment/abstract/__init__.py deleted file mode 100644 index e91f73b8de..0000000000 --- a/cf/data/fragment/abstract/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .fragmentarray import FragmentArray diff --git a/cf/data/fragment/abstract/fragmentarray.py b/cf/data/fragment/abstract/fragmentarray.py deleted file mode 100644 index 14f60bda90..0000000000 --- a/cf/data/fragment/abstract/fragmentarray.py +++ /dev/null @@ -1,390 +0,0 @@ -from numbers import Integral - -from ....units import Units -from ...array.abstract import FileArray - - -class FragmentArray(FileArray): - """A CFA fragment array. - - .. versionadded:: 3.14.0 - - """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - aggregated_units=False, - aggregated_calendar=None, - array=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: `str` - The name of the netCDF fragment file containing the - array. - - address: `str`, optional - The name of the netCDF variable containing the - fragment array. Required unless *varid* is set. - - dtype: `numpy.dtype` - The data type of the aggregated array. May be `None` - if the numpy data-type is not known (which can be the - case for netCDF string types, for example). This may - differ from the data type of the netCDF fragment - variable. - - shape: `tuple` - The shape of the fragment within the aggregated - array. This may differ from the shape of the netCDF - fragment variable in that the latter may have fewer - size 1 dimensions. - - {{aggregated_units: `str` or `None`, optional}} - - {{aggregated_calendar: `str` or `None`, optional}} - - array: `Array` - The fragment array stored in a file. - - source: optional - Initialise the array from the given object. 
- - {{init source}} - - {{deep copy}} - - """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - aggregated_units = source._get_component( - "aggregated_units", False - ) - except AttributeError: - aggregated_units = False - - try: - aggregated_calendar = source._get_component( - "aggregated_calendar", False - ) - except AttributeError: - aggregated_calendar = False - - try: - array = source._get_component("array", None) - except AttributeError: - array = None - - self._set_component("filename", filename, copy=False) - self._set_component("address", address, copy=False) - self._set_component("dtype", dtype, copy=False) - self._set_component("shape", shape, copy=False) - self._set_component("aggregated_units", aggregated_units, copy=False) - self._set_component( - "aggregated_calendar", aggregated_calendar, copy=False - ) - - if array is not None: - self._set_component("array", array, copy=copy) - - def __getitem__(self, indices): - """Returns a subspace of the fragment as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - Indexing is similar to numpy indexing, with the following - differences: - - * A dimension's index can't be rank-reducing, i.e. it can't - be an integer, nor a scalar `numpy` or `dask` array. - - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). - - .. versionadded:: 3.14.0 - - """ - array = self.get_array() - indices = self._parse_indices(indices) - array = array[indices] - array = self._conform_units(array) - return array - - def _parse_indices(self, indices): - """Parse the indices that retrieve the fragment data. - - Ellipses are replaced with the approriate number `slice(None)` - instances, and rank-reducing indices (such as an integer or - scalar array) are disallowed. - - .. versionadded:: 3.14.0 - - :Parameters: - - indices: `tuple` or `Ellipsis` - The array indices to be parsed. - - :Returns: - - `tuple` - The parsed indices. - - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a._parse_indices(([2, 4, 5], Ellipsis, slice(45, 67)) - ([2, 4, 5], slice(None), slice(None), slice(45, 67)) - - """ - ndim = self.ndim - if indices is Ellipsis: - return (slice(None),) * ndim - - # Check indices - has_ellipsis = False - for i in indices: - if isinstance(i, slice): - continue - - if i is Ellipsis: - has_ellipsis = True - continue - - if isinstance(i, Integral) or not getattr(i, "ndim", True): - # TODOCFA: what about [] or np.array([])? 
- - # 'i' is an integer or a scalar numpy/dask array - raise ValueError( - f"Can't subspace {self.__class__.__name__} with a " - f"rank-reducing index: {i!r}" - ) - - if has_ellipsis: - # Replace Ellipsis with one or more slice(None) - indices2 = [] - length = len(indices) - n = ndim - for i in indices: - if i is Ellipsis: - m = n - length + 1 - indices2.extend([slice(None)] * m) - n -= m - else: - indices2.append(i) - n -= 1 - - length -= 1 - - indices = tuple(indices2) - - return indices - - def _conform_units(self, array): - """Conform the array to have the aggregated units. - - .. versionadded:: 3.14.0 - - :Parameters: - - array: `numpy.ndarray` - The array to be conformed. - - :Returns: - - `numpy.ndarray` - The conformed array. The returned array may or may not - be the input array updated in-place, depending on its - data type and the nature of its units and the - aggregated units. - - """ - units = self.Units - if units: - aggregated_units = self.aggregated_Units - if not units.equivalent(aggregated_units): - raise ValueError( - f"Can't convert fragment data with units {units!r} to " - f"have aggregated units {aggregated_units!r}" - ) - - if units != aggregated_units: - array = Units.conform( - array, units, aggregated_units, inplace=True - ) - - return array - - @property - def aggregated_Units(self): - """The units of the aggregated data. - - .. versionadded:: 3.14.0 - - :Returns: - - `Units` - The units of the aggregated data. - - """ - return Units( - self.get_aggregated_units(), self.get_aggregated_calendar(None) - ) - - def close(self): - """Close the dataset containing the data.""" - return NotImplemented # pragma: no cover - - def get_address(self): - """The address of the fragment in the file. - - .. versionadded:: 3.14.0 - - :Returns: - - The file address of the fragment, or `None` if there - isn't one. - - """ - return self._get_component("address", None) - - def get_aggregated_calendar(self, default=ValueError()): - """The calendar of the aggregated array. - - If the calendar is `None` then the CF default calendar is - assumed, if applicable. - - .. versionadded:: 3.14.0 - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - calendar has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` or `None` - The calendar value. - - """ - calendar = self._get_component("aggregated_calendar", False) - if calendar is False: - if default is None: - return - - return self._default( - default, - f"{self.__class__.__name__} 'aggregated_calendar' has not " - "been set", - ) - - return calendar - - def get_aggregated_units(self, default=ValueError()): - """The units of the aggregated array. - - If the units are `None` then the aggregated array has no - defined units. - - .. versionadded:: 3.14.0 - - .. seealso:: `get_aggregated_calendar` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - units have not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` or `None` - The units value. - - """ - units = self._get_component("aggregated_units", False) - if units is False: - if default is None: - return - - return self._default( - default, - f"{self.__class__.__name__} 'aggregated_units' have not " - "been set", - ) - - return units - - def get_array(self): - """The fragment array stored in a file. - - .. versionadded:: 3.14.0 - - :Returns: - - `Array` - The object defining the fragment array. 
- - """ - return self._get_component("array") - - def get_units(self, default=ValueError()): - """The units of the netCDF variable. - - .. versionadded:: (cfdm) 1.10.0.1 - - .. seealso:: `get_calendar` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - units have not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` or `None` - The units value. - - """ - return self.get_array().get_units(default) - - def open(self): - """Returns an open dataset containing the data array.""" - return NotImplemented # pragma: no cover diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py new file mode 100644 index 0000000000..f456dae2b4 --- /dev/null +++ b/cf/data/fragment/fullfragmentarray.py @@ -0,0 +1,93 @@ +from ..array.fullarray import FullArray +from .mixin import FragmentArrayMixin + + +class FullFragmentArray(FragmentArrayMixin, FullArray): + """A CFA fragment array that is filled with a value. + + .. versionadded:: TODOCFAVER + + """ + + def __init__( + self, + fill_value=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + units=False, + calendar=False, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + fill_value: scalar + The fill value. + + dtype: `numpy.dtype` + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple` + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + units: `str` or `None`, optional + The units of the fragment data. Set to `None` to + indicate that there are no units. If unset then the + units will be set to `None` during the first + `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the fragment data. Set to `None` to + indicate the CF default calendar, if applicable. If + unset then the calendar will be set to `None` during + the first `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + fill_value=fill_value, + dtype=dtype, + shape=shape, + units=units, + calendar=calendar, + source=source, + copy=False, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/missingfragmentarray.py b/cf/data/fragment/missingfragmentarray.py deleted file mode 100644 index 3d702efb0e..0000000000 --- a/cf/data/fragment/missingfragmentarray.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np - -from ..array.fullarray import FullArray -from .abstract import FragmentArray - - -class MissingFragmentArray(FragmentArray): - """A CFA fragment array that is wholly missing data. - - .. 
versionadded:: 3.14.0 - - """ - - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - aggregated_units=False, - aggregated_calendar=False, - units=False, - calendar=False, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: `str` or `None` - The name of the netCDF fragment file containing the - array. - - address: `str`, optional - The name of the netCDF variable containing the - fragment array. Required unless *varid* is set. - - dtype: `numpy.dtype` - The data type of the aggregated array. May be `None` - if the numpy data-type is not known (which can be the - case for netCDF string types, for example). This may - differ from the data type of the netCDF fragment - variable. - - shape: `tuple` - The shape of the fragment within the aggregated - array. This may differ from the shape of the netCDF - fragment variable in that the latter may have fewer - size 1 dimensions. - - units: `str` or `None`, optional - The units of the fragment data. Ignored, as the data - are all missing values. - - calendar: `str` or `None`, optional - The calendar of the fragment data. Ignored, as the data - are all missing values. - - {{aggregated_units: `str` or `None`, optional}}" - - {{aggregated_calendar: `str` or `None`, optional}} - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - if source is not None: - super().__init__(source=source, copy=copy) - return - - array = FullArray( - fill_value=np.ma.masked, - dtype=dtype, - shape=shape, - units=None, - calendar=None, - copy=False, - ) - - super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - aggregated_units=aggregated_units, - aggregated_calendar=aggregated_calendar, - array=array, - source=source, - copy=False, - ) diff --git a/cf/data/fragment/mixin/__init__.py b/cf/data/fragment/mixin/__init__.py new file mode 100644 index 0000000000..a4a35a1129 --- /dev/null +++ b/cf/data/fragment/mixin/__init__.py @@ -0,0 +1 @@ +from .fragmentarraymixin import FragmentArrayMixin diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py new file mode 100644 index 0000000000..c2c549423b --- /dev/null +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -0,0 +1,359 @@ +from numbers import Integral + +import numpy as np + +from ....units import Units + + +class FragmentArrayMixin: + """Mixin class for a CFA fragment array. + + .. versionadded:: TODOCFAVER + + """ + + def __getitem__(self, indices): + """Returns a subspace of the fragment as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + Indexing is similar to numpy indexing, with the following + differences: + + * A dimension's index can't be rank-reducing, i.e. it can't + be an integer, a scalar `numpy` array, nor a scalar `dask` + array. + + * When two or more dimension's indices are sequences of + integers then these indices work independently along each + dimension (similar to the way vector subscripts work in + Fortran). + + .. versionadded:: TODOCFAVER + + """ + # TODOACTIVE: modify this for the case when + # super().__getitem__(tuple(indices)) returns a + # dictionary + + indices = self._parse_indices(indices) + + try: + array = super().__getitem__(tuple(indices)) + except ValueError: + # A ValueError is expected to be raised when the fragment + # variable has fewer than 'self.ndim' dimensions (we know + # this because because 'indices' has 'self.ndim' + # elements). 
+ axis = self._size_1_axis(indices) + if axis is not None: + # There is a unique size 1 index that must correspond + # to the missing dimension => Remove it from the + # indices, get the fragment array with the new + # indices; and then insert the missing size one + # dimension. + indices.pop(axis) + array = super().__getitem__(tuple(indices)) + array = np.expand_dims(array, axis) + else: + # There are multiple size 1 indices so we don't know + # how many missing dimensions the fragment has, nor + # their positions => Get the full fragment array and + # then reshape it to the shape of the dask compute + # chunk. + array = super().__getitem__(Ellipsis) + if array.size != self.size: + raise ValueError( + f"Can't get CFA fragment data from ({self}) when " + "the fragment has two or more missing size 1 " + "dimensions, whilst also spanning two or more " + "dask compute chunks." + "\n\n" + "Consider re-creating the data with exactly one " + "dask compute chunk per fragment (e.g. by setting " + "'chunks=None' as a keyword to cf.read)." + ) + + array = array.reshape(self.shape) + + array = self._conform_to_aggregated_units(array) + return array + + def _conform_to_aggregated_units(self, array): + """Conform the array to have the aggregated units. + + .. versionadded:: TODOCFAVER + + :Parameters: + + array: `numpy.ndarray` or `dict` + The array to be conformed. If *array* is a `dict` with + `numpy` array values then selected values are + conformed. + + :Returns: + + `numpy.ndarray` or `dict` + The conformed array. The returned array may or may not + be the input array updated in-place, depending on its + data type and the nature of its units and the + aggregated units. + + If *array* is a `dict` then a dictionary of conformed + arrays is returned. + + """ + units = self.Units + if units: + aggregated_units = self.aggregated_Units + if not units.equivalent(aggregated_units): + raise ValueError( + f"Can't convert fragment data with units {units!r} to " + f"have aggregated units {aggregated_units!r}" + ) + + if units != aggregated_units: + if isinstance(array, dict): + # 'array' is a dictionary. + raise ValueError( + "TODOACTIVE. This error is notification of an " + "unreplaced placeholder for dealing with active " + "storage reductions on CFA fragments." + ) + else: + # 'array' is a numpy array + array = Units.conform( + array, units, aggregated_units, inplace=True + ) + + return array + + def _parse_indices(self, indices): + """Parse the indices that retrieve the fragment data. + + Ellipses are replaced with the approriate number of `slice` + instances, and rank-reducing indices (such as an integer or + scalar array) are disallowed. + + .. versionadded:: TODOCFAVER + + :Parameters: + + indices: `tuple` or `Ellipsis` + The array indices to be parsed. + + :Returns: + + `list` + The parsed indices. 
+ + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a._parse_indices([2, 4, 5], Ellipsis, slice(45, 67)) + [[2, 4, 5], slice(0, 1), slice(0, 73), slice(45, 67)] + >>> a._parse_indices([2, 4, 5], [0], slice(None), slice(45, 67)) + [[2, 4, 5], [0], slice(0, 73), slice(45, 67)] + + """ + shape = self.shape + if indices is Ellipsis: + return [slice(0, n) for n in shape] + + indices = list(indices) + + # Check indices + has_ellipsis = False + for i, (index, n) in enumerate(zip(indices, shape)): + if isinstance(index, slice): + if index == slice(None): + indices[i] = slice(0, n) + + continue + + if index is Ellipsis: + has_ellipsis = True + continue + + if isinstance(index, Integral) or not getattr(index, "ndim", True): + # TODOCFA: what about [] or np.array([])? + + # 'index' is an integer or a scalar numpy/dask array + raise ValueError( + f"Can't subspace {self.__class__.__name__} with a " + f"rank-reducing index: {index!r}" + ) + + if has_ellipsis: + # Replace Ellipsis with one or more slices + indices2 = [] + length = len(indices) + n = self.ndim + for index in indices: + if index is Ellipsis: + m = n - length + 1 + indices2.extend([slice(None)] * m) + n -= m + else: + indices2.append(index) + n -= 1 + + length -= 1 + + indices = indices2 + + for i, (index, n) in enumerate(zip(indices, shape)): + if index == slice(None): + indices[i] = slice(0, n) + + return indices + + def _size_1_axis(self, indices): + """Find the position of a unique size 1 index. + + .. versionadded:: TODOCFAVER + + .. seealso:: `_parse_indices`, `__getitem__` + + :Parameters: + + indices: sequence of index + The array indices to be parsed, as returned by + `_parse_indices`. + + :Returns: + + `int` or `None` + The position of the unique size 1 index, or `None` if + there are zero or at least two of them. + + **Examples** + + >>> a._size_1_axis(([2, 4, 5], slice(0, 1), slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], slice(3, 4), slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], [0], slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], slice(0, 144), slice(0, 73))) + None + >>> a._size_1_axis(([2, 4, 5], slice(3, 7), [0, 1])) + None + >>> a._size_1_axis(([2, 4, 5], slice(0, 1), [0])) + None + + """ + axis = None + + n_size_1 = 0 # Number of size 1 indices + for i, (index, n) in enumerate(zip(indices, self.shape)): + try: + x = index.indices(n) + if abs(x[1] - x[0]) == 1: + # Index is a size 1 slice + n_size_1 += 1 + axis = i + except AttributeError: + try: + if index.size == 1: + # Index is a size 1 numpy or dask array + n_size_1 += 1 + axis = i + except AttributeError: + if len(index) == 1: + # Index is a size 1 list + n_size_1 += 1 + axis = i + + if n_size_1 > 1: + # There are two or more size 1 indices + axis = None + + return axis + + @property + def aggregated_Units(self): + """The units of the aggregated data. + + .. versionadded:: TODOCFAVER + + :Returns: + + `Units` + The units of the aggregated data. + + """ + return Units( + self.get_aggregated_units(), self.get_aggregated_calendar(None) + ) + + def get_aggregated_calendar(self, default=ValueError()): + """The calendar of the aggregated array. + + If the calendar is `None` then the CF default calendar is + assumed, if applicable. + + .. versionadded:: TODOCFAVER + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + aggregated calendar has not been set. If set to an + `Exception` instance then it will be raised instead. + + :Returns: + + `str` or `None` + The calendar value. 
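The dimension-reconciliation logic in `FragmentArrayMixin.__getitem__` above can be summarised with a small standalone sketch (plain numpy, illustrative names only): if the fragment comes back with a dimension missing and exactly one of the requested indices is size 1, that axis is re-inserted at the known position; otherwise the whole fragment is reshaped to the expected compute-chunk shape, which is only valid when the element counts agree.

import numpy as np

def conform_fragment(fragment, expected_shape, size1_axis):
    # 'fragment' is the array read from the file, 'expected_shape' is the
    # shape of the dask compute chunk, and 'size1_axis' is the position of
    # the unique size 1 index (or None), as found by _size_1_axis()
    if fragment.shape == expected_shape:
        return fragment

    if size1_axis is not None:
        # Exactly one size 1 axis is missing from the fragment variable:
        # re-insert it at the known position
        return np.expand_dims(fragment, size1_axis)

    # Otherwise fall back to reshaping the full fragment
    if fragment.size != np.prod(expected_shape):
        raise ValueError("fragment does not fill the compute chunk")

    return fragment.reshape(expected_shape)

# e.g. a (73, 144) fragment standing in for a (1, 73, 144) chunk:
print(conform_fragment(np.zeros((73, 144)), (1, 73, 144), 0).shape)  # (1, 73, 144)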
+ + """ + calendar = self._get_component("aggregated_calendar", False) + if calendar is False: + if default is None: + return + + return self._default( + default, + f"{self.__class__.__name__} 'aggregated_calendar' has not " + "been set", + ) + + return calendar + + def get_aggregated_units(self, default=ValueError()): + """The units of the aggregated array. + + If the units are `None` then the aggregated array has no + defined units. + + .. versionadded:: TODOCFAVER + + .. seealso:: `get_aggregated_calendar` + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + aggregated units have not been set. If set to an + `Exception` instance then it will be raised instead. + + :Returns: + + `str` or `None` + The units value. + + """ + units = self._get_component("aggregated_units", False) + if units is False: + if default is None: + return + + return self._default( + default, + f"{self.__class__.__name__} 'aggregated_units' have not " + "been set", + ) + + return units diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 4bf56408ac..578412f124 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,8 +1,8 @@ from ..array.netcdfarray import NetCDFArray -from .abstract import FragmentArray +from .mixin import FragmentArrayMixin -class NetCDFFragmentArray(FragmentArray): +class NetCDFFragmentArray(FragmentArrayMixin, NetCDFArray): """A CFA fragment array stored in a netCDF file. .. versionadded:: 3.14.0 @@ -26,22 +26,22 @@ def __init__( :Parameters: - filename: `str` - The name of the netCDF fragment file containing the + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the array. - address: `str`, optional + address: (sequence of `str`), optional The name of the netCDF variable containing the fragment array. Required unless *varid* is set. - dtype: `numpy.dtype` + dtype: `numpy.dtype`, optional The data type of the aggregated array. May be `None` if the numpy data-type is not known (which can be the case for netCDF string types, for example). This may differ from the data type of the netCDF fragment variable. - shape: `tuple` + shape: `tuple`, optional The shape of the fragment within the aggregated array. This may differ from the shape of the netCDF fragment variable in that the latter may have fewer @@ -67,86 +67,34 @@ def __init__( {{init copy: `bool`, optional}} """ - if source is not None: - super().__init__(source=source, copy=copy) - return - - if isinstance(address, int): - ncvar = None - varid = address - else: - ncvar = address - varid = None - - # TODO set groups from ncvar - group = None - - array = NetCDFArray( + super().__init__( filename=filename, - ncvar=ncvar, - varid=varid, - group=group, + address=address, dtype=dtype, shape=shape, mask=True, units=units, calendar=calendar, - copy=False, - ) - - super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - aggregated_units=aggregated_units, - aggregated_calendar=aggregated_calendar, - array=array, source=source, - copy=False, + copy=copy, ) - def __getitem__(self, indices): - """Returns a subspace of the fragment as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - Indexing is similar to numpy indexing, with the following - differences: - - * A dimension's index can't be rank-reducing, i.e. it can't - be an integer, nor a scalar `numpy` or `dask` array. 
- - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). - - **Performance** - - If the netCDF fragment variable has fewer than `ndim` - dimensions then the entire array is read into memory before - the requested subspace of it is returned. - - .. versionadded:: 3.14.0 - - """ - indices = self._parse_indices(indices) - array = self.get_array() - - try: - array = array[indices] - except ValueError: - # A value error is raised if indices has at least ndim - # elements but the netCDF fragment variable has fewer than - # ndim dimensions. In this case we get the entire fragment - # array, insert the missing size 1 dimensions, and then - # apply the requested slice. - array = array[Ellipsis] - if array.ndim < self.ndim: - array = array.reshape(self.shape) - - array = array[indices] - - array = self._conform_units(array) - return array + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index 4ecff13303..a30737f46d 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -1,8 +1,8 @@ from ..array.umarray import UMArray -from .abstract import FragmentArray +from .mixin import FragmentArrayMixin -class UMFragmentArray(FragmentArray): +class UMFragmentArray(FragmentArrayMixin, UMArray): """A CFA fragment array stored in a UM or PP file. .. versionadded:: 3.14.0 @@ -26,11 +26,11 @@ def __init__( :Parameters: - filename: `str` - The name of the UM or PP file containing the fragment. + filename: (sequence of `str`), optional + The names of the UM or PP files containing the fragment. - address: `int`, optional - The start word in the file of the header. + address: (sequence of `str`), optional + The start words in the files of the header. dtype: `numpy.dtype` The data type of the aggregated array. 
May be `None` @@ -65,28 +65,33 @@ def __init__( {{init copy: `bool`, optional}} """ - if source is not None: - super().__init__(source=source, copy=copy) - return - - array = UMArray( + super().__init__( filename=filename, - header_offset=address, + address=address, dtype=dtype, shape=shape, units=units, calendar=calendar, + source=source, copy=False, ) - super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - aggregated_units=aggregated_units, - aggregated_calendar=aggregated_calendar, - array=array, - source=source, - copy=False, + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False ) diff --git a/cf/data/utils.py b/cf/data/utils.py index 3b501fb538..abb1a835a4 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -423,7 +423,7 @@ def chunk_positions(chunks): .. versionadded:: 3.14.0 - .. seealso:: `chunk_shapes` + .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_shapes` :Parameters: @@ -453,7 +453,7 @@ def chunk_shapes(chunks): .. versionadded:: 3.14.0 - .. seealso:: `chunk_positions` + .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_positions` :Parameters: @@ -478,6 +478,43 @@ def chunk_shapes(chunks): return product(*chunks) +def chunk_locations(chunks): + """Find the shape of each chunk. + + .. versionadded:: TODOCFAVER + + .. seealso:: `chunk_indices`, `chunk_positions`, `chunk_shapes` + + :Parameters: + + chunks: `tuple` + The chunk sizes along each dimension, as output by + `dask.array.Array.chunks`. + + **Examples** + + >>> chunks = ((1, 2), (9,), (4, 5, 6)) + >>> for location in cf.data.utils.chunk_locations(chunks): + ... print(location) + ... + ((0, 1), (0, 9), (0, 4)) + ((0, 1), (0, 9), (4, 9)) + ((0, 1), (0, 9), (9, 15)) + ((1, 3), (0, 9), (0, 4)) + ((1, 3), (0, 9), (4, 9)) + ((1, 3), (0, 9), (9, 15)) + + """ + from dask.utils import cached_cumsum + + cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] + locations = [ + [(s, s + dim) for s, dim in zip(starts, shapes)] + for starts, shapes in zip(cumdims, chunks) + ] + return product(*locations) + + def scalar_masked_array(dtype=float): """Return a scalar masked array. diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index c07262f3fc..243e8d8096 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -491,6 +491,22 @@ culled. See `dask.optimization.cull` for details. .. versionadded:: 3.14.0""", + # cfa substitutions + "{{cfa substitutions: `dict`}}": """substitutions: `dict` + The substitution definitions in a dictionary whose + key/value pairs are the file name parts to be + substituted and their corresponding substitution text. + + Each substitution definition may be specified with or + without the ``${...}`` syntax. For instance, the + following are equivalent: ``{'base': 'sub'}``, + ``{'${base}': 'sub'}``.""", + # cfa base + "{{cfa base: `str`}}": """base: `str` + The substitution definition to be removed. May be + specified with or without the ``${...}`` syntax. 
For + instance, the following are equivalent: ``'base'`` and + ``'${base}'``.""", # ---------------------------------------------------------------- # Method description substitutions (4 levels of indentation) # ---------------------------------------------------------------- @@ -523,4 +539,19 @@ checked. The coordinates check will be carried out, however, if the *check_coordinates* parameter is True.""", + # Returns cfa_file_substitutions + "{{Returns cfa_file_substitutions}}": """The CFA-netCDF file name substitutions in a dictionary + whose key/value pairs are the file name parts to be + substituted and their corresponding substitution + text.""", + # Returns cfa_clear_file_substitutions + "{{Returns cfa_clear_file_substitutions}}": """The removed CFA-netCDF file name substitutions in a + dictionary whose key/value pairs are the file name + parts to be substituted and their corresponding + substitution text.""", + # Returns cfa_clear_file_substitutions + "{{Returns cfa_del_file_substitution}}": """ + The removed CFA-netCDF file name substitution. If the + substitution was not defined then an empty dictionary + is returned.""", } diff --git a/cf/domain.py b/cf/domain.py index 7b463207b1..78eff09ab9 100644 --- a/cf/domain.py +++ b/cf/domain.py @@ -1,4 +1,5 @@ from math import prod +from os import sep import cfdm @@ -12,6 +13,7 @@ from .functions import ( _DEPRECATION_ERROR_ARG, _DEPRECATION_ERROR_METHOD, + abspath, indices_shape, parse_indices, ) @@ -130,6 +132,141 @@ def size(self): [domain_axis.get_size(0) for domain_axis in domain_axes.values()] ) + def add_file_location( + self, + location, + ): + """Add a new file location in-place. + + All data definitions that reference files are additionally + referenced from the given location. + + .. versionadded:: TODOCFAVER + + .. seealso:: `del_file_location`, `file_locations` + + :Parameters: + + location: `str` + The new location. + + :Returns: + + `str` + The new location as an absolute path with no trailing + separate pathname component separator. + + **Examples** + + >>> f.add_file_location('/data/model/') + '/data/model' + + """ + location = abspath(location).rstrip(sep) + + for c in self.constructs.filter_by_data(todict=True).values(): + c.add_file_location(location) + + return location + + def cfa_clear_file_substitutions( + self, + ): + """Remove all of the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_clear_file_substitutions}} + + **Examples** + + >>> d.cfa_clear_file_substitutions() + {} + + """ + out = {} + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.cfa_clear_file_substitutions()) + + return out + + def cfa_file_substitutions(self): + """Return the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_file_substitutions}} + + **Examples** + + >>> d.cfa_file_substitutions() + {} + + """ + out = {} + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.cfa_file_substitutions()) + + return out + + def cfa_del_file_substitution( + self, + base, + ): + """Remove a CFA-netCDF file name substitution. + + .. 
versionadded:: TODOCFAVER + + :Parameters: + + base: `str` + {{cfa base: `str`}} + + :Returns: + + `dict` + {{Returns cfa_del_file_substitution}} + + **Examples** + + >>> f.cfa_del_file_substitution('base') + + """ + for c in self.constructs.filter_by_data(todict=True).values(): + c.cfa_del_file_substitution( + base, + ) + + def cfa_update_file_substitutions( + self, + substitutions, + ): + """Set CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Parameters: + + {{cfa substitutions: `dict`}} + + :Returns: + + `None` + + **Examples** + + >>> d.cfa_update_file_substitutions({'base': '/data/model'}) + + """ + for c in self.constructs.filter_by_data(todict=True).values(): + c.cfa_update_file_substitutions(substitutions) + def close(self): """Close all files referenced by the domain construct. @@ -156,6 +293,75 @@ def close(self): removed_at="5.0.0", ) # pragma: no cover + def del_file_location( + self, + location, + ): + """Remove a file location in-place. + + All data definitions that reference files will have references + to files in the given location removed from them. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `file_locations` + + :Parameters: + + location: `str` + The file location to remove. + + :Returns: + + `str` + The removed location as an absolute path with no + trailing separate pathname component separator. + + **Examples** + + >>> d.del_file_location('/data/model/') + '/data/model' + + """ + location = abspath(location).rstrip(sep) + + for c in self.constructs.filter_by_data(todict=True).values(): + c.del_file_location(location) + + return location + + def file_locations( + self, + ): + """The locations of files containing parts of the components data. + + Returns the locations of any files that may be required to + deliver the computed data arrays of any of the component + constructs (such as dimension coordinate constructs, cell + measure constructs, etc.). + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `del_file_location` + + :Returns: + + `set` + The unique file locations as absolute paths with no + trailing separate pathname component separator. + + **Examples** + + >>> d.file_locations() + {'/home/data1', 'file:///data2'} + + """ + out = set() + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.file_locations()) + + return out + @_inplace_enabled(default=False) def flip(self, axes=None, inplace=False): """Flip (reverse the direction of) domain axes. diff --git a/cf/field.py b/cf/field.py index 233fd61346..73afbec9ab 100644 --- a/cf/field.py +++ b/cf/field.py @@ -2,6 +2,7 @@ from collections import namedtuple from functools import reduce from operator import mul as operator_mul +from os import sep import cfdm import numpy as np @@ -46,6 +47,7 @@ _DEPRECATION_ERROR_METHOD, DeprecationError, _section, + abspath, flat, parse_indices, ) @@ -3643,6 +3645,164 @@ def cell_area( return w + def cfa_clear_file_substitutions( + self, + ): + """Remove all of the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_clear_file_substitutions}} + + **Examples** + + >>> f.cfa_clear_file_substitutions() + {} + + """ + out = super().cfa_clear_file_substitution() + + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.cfa_clear_file_substitutions()) + + return out + + def cfa_del_file_substitution( + self, + base, + constructs=True, + ): + """Remove a CFA-netCDF file name substitution. + + .. 
versionadded:: TODOCFAVER + + :Parameters: + + {{cfa base: `str`}} + + constructs: `bool`, optional + If True (the default) then metadata constructs also + have the file substitutions removed from them. + + :Returns: + + `dict` + {{Returns cfa_del_file_substitution}} + + **Examples** + + >>> f.cfa_del_file_substitution('base') + + """ + super().cfa_del_file_substitution(base) + + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + c.cfa_del_file_substitution(base) + + def cfa_file_substitutions(self, constructs=True): + """Return the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_file_substitutions}} + + **Examples** + + >>> f.cfa_file_substitutions() + {} + + """ + out = super().cfa_file_substitutions() + + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.cfa_file_substitutions()) + + return out + + def del_file_location( + self, + location, + constructs=True, + ): + """Remove a file location in-place. + + All data definitions that reference files will have references + to files in the given location removed from them. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `file_locations` + + :Parameters: + + location: `str` + The file location to remove. + + constructs: `bool`, optional + If True (the default) then metadata constructs also + have the new file location removed from them. + + :Returns: + + `str` + The removed location as an absolute path with no + trailing separate pathname component separator. + + **Examples** + + >>> d.del_file_location('/data/model/') + '/data/model' + + """ + location = abspath(location).rstrip(sep) + super().del_file_location(location) + + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + c.del_file_location(location, inplace=True) + + return location + + def cfa_update_file_substitutions( + self, + substitutions, + constructs=True, + ): + """Set CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Parameters: + + {{cfa substitutions: `dict`}} + + constructs: `bool`, optional + If True (the default) then metadata constructs also + have the file substitutions set on them. + + :Returns: + + `None` + + **Examples** + + >>> f.cfa_update_file_substitutions({'base': '/data/model'}) + + """ + super().cfa_update_file_substitutions(substitutions) + + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + c.cfa_update_file_substitutions(substitutions) + def radius(self, default=None): """Return the radius of a latitude-longitude plane defined in spherical polar coordinates. @@ -11453,6 +11613,41 @@ def cumsum( return f + def file_locations(self, constructs=True): + """The locations of files containing parts of the data. + + Returns the locations of any files that may be required to + deliver the computed data array. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `del_file_location` + + :Parameters: + + constructs: `bool`, optional + If True (the default) then the file locations from + metadata constructs are also returned. + + :Returns: + + `set` + The unique file locations as absolute paths with no + trailing separate pathname component separator. 
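A hedged sketch of the field-level substitution workflow introduced here, assuming a hypothetical CFA-netCDF file ``parent_cfa.nc`` whose fragment file names use a ``${base}`` substitution; the printed dictionary is indicative only. A substitution may be named with or without the ``${...}`` syntax, and by default the call also updates the field's metadata constructs.

import cf

f = cf.read('parent_cfa.nc')[0]   # hypothetical CFA-netCDF input

# Define the substitution on the field and its metadata constructs
f.cfa_update_file_substitutions({'base': '/data/model'})

# 'base' and '${base}' refer to the same substitution
print(f.cfa_file_substitutions())   # e.g. {'${base}': '/data/model'}

# Remove it again, from the field and its constructs
f.cfa_del_file_substitution('base')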
+ + **Examples** + + >>> f.file_locations() + {'/home/data1', 'file:///data2'} + + """ + out = super().file_locations() + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + out.update(c.file_locations()) + + return out + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def flip(self, axes=None, inplace=False, i=False, **kwargs): @@ -11796,7 +11991,7 @@ def transpose( provided, or if no axes are specified then the axis order is reversed. - constructs: `bool` + constructs: `bool`, optional If True then metadata constructs are also transposed so that their axes are in the same relative order as in the transposed data array of the field. By default metadata @@ -13846,6 +14041,48 @@ def subspace(self): """ return SubspaceField(self) + def add_file_location( + self, + location, + constructs=True, + ): + """Add a new file location in-place. + + All data definitions that reference files are additionally + referenced from the given location. + + .. versionadded:: TODOCFAVER + + .. seealso:: `del_file_location`, `file_locations` + + :Parameters: + + location: `str` + The new location. + + constructs: `bool`, optional + If True (the default) then metadata constructs also + have the new file location added to them. + + :Returns: + + `str` + The new location as an absolute path with no trailing + separate pathname component separator. + + **Examples** + + >>> f.add_file_location('/data/model/') + '/data/model' + + """ + location = super().add_file_location(location) + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + c.add_file_location(location) + + return location + def section(self, axes=None, stop=None, min_step=1, **kwargs): """Return a FieldList of m dimensional sections of a Field of n dimensions, where M <= N. diff --git a/cf/functions.py b/cf/functions.py index d9a987bf8f..dbfbeffe8e 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -21,6 +21,7 @@ from os.path import expandvars as _os_path_expandvars from os.path import join as _os_path_join from os.path import relpath as _os_path_relpath +from urllib.parse import urlparse import cfdm import netCDF4 @@ -30,7 +31,7 @@ from dask.utils import parse_bytes from psutil import virtual_memory -from . import __file__, __version__ +from . import __cfa_version__, __file__, __version__ from .constants import ( CONSTANTS, OperandBoundsCombination, @@ -1149,6 +1150,33 @@ def CF(): CF.__doc__ = cfdm.CF.__doc__.replace("cfdm.", "cf.") + +def CFA(): + """The version of the CFA conventions. + + This indicates which version of the CFA conventions are + represented by this release of the cf package, and therefore the + version can not be changed. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cf.CF` + + :Returns: + + `str` + The version of the CFA conventions represented by this + release of the cf package. + + **Examples** + + >>> cf.CFA() + '0.6.2' + + """ + return __cfa_version__ + + # Module-level alias to avoid name clashes with function keyword # arguments (corresponding to 'import atol as cf_atol' etc. 
in other # modules) @@ -2473,14 +2501,15 @@ def abspath(filename): 'http://data/archive/file.nc' """ - if filename is None: - return + u = urlparse(filename) + scheme = u.scheme + if not scheme: + return _os_path_abspath(filename) - u = urllib.parse.urlparse(filename) - if u.scheme != "": - return filename + if scheme == "file": + return u.path - return _os_path_abspath(filename) + return filename def relpath(filename, start=None): @@ -3207,6 +3236,44 @@ def _DEPRECATION_ERROR_FUNCTION_KWARGS( ) +def _DEPRECATION_ERROR_FUNCTION_KWARG_VALUE( + func, + kwarg, + value, + message="", + version=None, + removed_at=None, +): + if removed_at: + removed_at = f" and will be removed at version {removed_at}" + + raise DeprecationError( + f"Value {value!r} of keyword {kwarg!r} of function {func!r} " + f"has been deprecated at version {version} and is no longer " + f"available{removed_at}. {message}" + ) + + +def _DEPRECATION_ERROR_FUNCTION_KWARG( + func, + kwarg=None, + message="", + version=None, + removed_at=None, +): + if version is None: + raise ValueError("Must provide deprecation version, e.g. '3.14.0'") + + if removed_at: + removed_at = f" and will be removed at version {removed_at}" + + raise DeprecationError( + f"Keyword {kwarg!r} of function {func} has been deprecated " + f"at version {version} and is no longer available{removed_at}. " + f"{message}" + ) + + def _DEPRECATION_ERROR_KWARGS( instance, method, diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 00599b0fca..6ab2e0bc31 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -1,5 +1,6 @@ import logging from itertools import chain +from os import sep import numpy as np @@ -16,6 +17,7 @@ _DEPRECATION_ERROR_ATTRIBUTE, _DEPRECATION_ERROR_KWARGS, _DEPRECATION_ERROR_METHOD, + abspath, default_netCDF_fillvals, ) from ..functions import equivalent as cf_equivalent @@ -1593,6 +1595,39 @@ def units(self): self.Units = Units(None, getattr(self, "calendar", None)) + def add_file_location(self, location): + """Add a new file location in-place. + + All data definitions that reference files are additionally + referenced from the given location. + + .. versionadded:: TODOCFAVER + + .. seealso:: `del_file_location`, `file_locations` + + :Parameters: + + location: `str` + The new location. + + :Returns: + + `str` + The new location as an absolute path with no trailing + separate pathname component separator. + + **Examples** + + >>> d.add_file_location('/data/model/') + '/data/model' + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + return data.add_file_location(location) + + return abspath(location).rstrip(sep) + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def mask_invalid(self, inplace=False, i=False): @@ -2473,6 +2508,100 @@ def ceil(self, inplace=False, i=False): delete_props=True, ) + def cfa_update_file_substitutions(self, substitutions): + """Set CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Parameters: + + {{cfa substitutions: `dict`}} + + :Returns: + + `None` + + **Examples** + + >>> f.cfa_update_file_substitutions({'base', '/data/model'}) + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + data.cfa_update_file_substitutions(substitutions) + + @_inplace_enabled(default=False) + def cfa_clear_file_substitutions(self, inplace=False): + """Remove all of the CFA-netCDF file name substitutions. + + .. 
versionadded:: TODOCFAVER + + :Parameters: + + {{inplace: `bool`, optional}} + + :Returns: + + `dict` + {{Returns cfa_clear_file_substitutions}} + + **Examples** + + >>> f.cfa_clear_file_substitutions() + {} + + """ + data = self.get_data(None) + if data is None: + return {} + + return data.cfa_clear_file_substitutions({}) + + def cfa_del_file_substitution( + self, + base, + ): + """Remove a CFA-netCDF file name substitution. + + .. versionadded:: TODOCFAVER + + :Parameters: + + `dict` + {{Returns cfa_del_file_substitution}} + + **Examples** + + >>> f.cfa_del_file_substitution('base') + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + data.cfa_del_file_substitution(base) + + def cfa_file_substitutions( + self, + ): + """Return the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_file_substitutions}} + + **Examples** + + >>> g = f.cfa_file_substitutions() + + """ + data = self.get_data(None) + if data is None: + return {} + + return data.cfa_file_substitutions({}) + def chunk(self, chunksize=None): """Partition the data array. @@ -2899,6 +3028,39 @@ def datum(self, *index): return data.datum(*index) + def del_file_location(self, location): + """Remove a file location in-place. + + All data definitions that reference files will have references + to files in the given location removed from them. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `file_locations` + + :Parameters: + + location: `str` + The file location to remove. + + :Returns: + + `str` + The removed location as an absolute path with no + trailing separate pathname component separator. + + **Examples** + + >>> f.del_file_location('/data/model/') + '/data/model' + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + return data.del_file_location(location) + + return abspath(location).rstrip(sep) + @_manage_log_level_via_verbosity def equals( self, @@ -3226,6 +3388,34 @@ def convert_reference_time( calendar_years=calendar_years, ) + def file_locations(self): + """The locations of files containing parts of the data. + + Returns the locations of any files that may be required to + deliver the computed data array. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `del_file_location` + + :Returns: + + `set` + The unique file locations as absolute paths with no + trailing separate pathname component separator. + + **Examples** + + >>> d.file_locations() + {'/home/data1', 'file:///data2'} + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + return data.file_locations() + + return set() + @_inplace_enabled(default=False) def flatten(self, axes=None, inplace=False): """Flatten axes of the data. diff --git a/cf/mixin/propertiesdatabounds.py b/cf/mixin/propertiesdatabounds.py index 0fa5d716bc..bc0ad1ac2f 100644 --- a/cf/mixin/propertiesdatabounds.py +++ b/cf/mixin/propertiesdatabounds.py @@ -1135,6 +1135,45 @@ def dtype(self): if data is not None: del data.dtype + def add_file_location(self, location): + """Add a new file location in-place. + + All data definitions that reference files are additionally + referenced from the given location. + + .. versionadded:: TODOCFAVER + + .. seealso:: `del_file_location`, `file_locations` + + :Parameters: + + location: `str` + The new location. + + :Returns: + + `str` + The new location as an absolute path with no trailing + separate pathname component separator. 
+ + **Examples** + + >>> d.add_file_location('/data/model/') + '/data/model' + + """ + location = super().add_file_location(location) + + bounds = self.get_bounds(None) + if bounds is not None: + bounds.add_file_location(location) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + interior_ring.add_file_location(location) + + return location + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def ceil(self, bounds=True, inplace=False, i=False): @@ -1182,6 +1221,121 @@ def ceil(self, bounds=True, inplace=False, i=False): i=i, ) + def cfa_clear_file_substitutions( + self, + ): + """Remove all of the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_clear_file_substitutions}} + + **Examples** + + >>> f.cfa_clear_file_substitutions() + {} + + """ + out = super().cfa_clear_file_substitutions() + + bounds = self.get_bounds(None) + if bounds is not None: + out.update(bounds.cfa_clear_file_substitutions()) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + out.update(interior_ring.cfa_clear_file_substitutions()) + + return out + + def cfa_del_file_substitution(self, base): + """Remove a CFA-netCDF file name substitution. + + .. versionadded:: TODOCFAVER + + :Parameters: + + {{cfa base: `str`}} + + :Returns: + + `dict` + {{Returns cfa_del_file_substitution}} + + **Examples** + + >>> c.cfa_del_file_substitution('base') + + """ + super().cfa_del_file_substitution(base) + + bounds = self.get_bounds(None) + if bounds is not None: + bounds.cfa_del_file_substitution(base) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + interior_ring.cfa_del_file_substitution(base) + + def cfa_file_substitutions(self): + """Return the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Returns: + + `dict` + {{Returns cfa_file_substitutions}} + + **Examples** + + >>> c.cfa_file_substitutions() + {} + + """ + out = super().cfa_file_substitutions() + + bounds = self.get_bounds(None) + if bounds is not None: + out.update(bounds.cfa_file_substitutions({})) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + out.update(interior_ring.cfa_file_substitutions({})) + + return out + + def cfa_update_file_substitutions(self, substitutions): + """Set CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + :Parameters: + + {{cfa substitutions: `dict`}} + + :Returns: + + `None` + + **Examples** + + >>> c.cfa_add_file_substitutions({'base', '/data/model'}) + + """ + super().cfa_update_file_substitutions(substitutions) + + bounds = self.get_bounds(None) + if bounds is not None: + bounds.cfa_update_file_substitutions(substitutions) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + interior_ring.cfa_update_file_substitutions(substitutions) + def chunk(self, chunksize=None): """Partition the data array. @@ -1899,6 +2053,40 @@ def get_property(self, prop, default=ValueError(), bounds=False): return super().get_property(prop, default) + def file_locations(self): + """The locations of files containing parts of the data. + + Returns the locations of any files that may be required to + deliver the computed data array. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `del_file_location` + + :Returns: + + `set` + The unique file locations as absolute paths with no + trailing separate pathname component separator. 
+ + **Examples** + + >>> d.file_locations() + {'/home/data1', 'file:///data2'} + + """ + out = super().file_locations() + + bounds = self.get_bounds(None) + if bounds is not None: + out.update(bounds.file_locations()) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + out.update(interior_ring.file_locations()) + + return out + @_inplace_enabled(default=False) def flatten(self, axes=None, inplace=False): """Flatten axes of the data. @@ -1970,6 +2158,45 @@ def flatten(self, axes=None, inplace=False): return v + def del_file_location(self, location): + """Remove a file location in-place. + + All data definitions that reference files will have references + to files in the given location removed from them. + + .. versionadded:: TODOCFAVER + + .. seealso:: `add_file_location`, `file_locations` + + :Parameters: + + location: `str` + The file location to remove. + + :Returns: + + `str` + The removed location as an absolute path with no + trailing separate pathname component separator. + + **Examples** + + >>> c.del_file_location('/data/model/') + '/data/model' + + """ + location = super().del_file_location(location) + + bounds = self.get_bounds(None) + if bounds is not None: + bounds.del_file_location(location) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + interior_ring.del_file_location(location) + + return location + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def floor(self, bounds=True, inplace=False, i=False): diff --git a/cf/mixin2/__init__.py b/cf/mixin2/__init__.py new file mode 100644 index 0000000000..3dc304f232 --- /dev/null +++ b/cf/mixin2/__init__.py @@ -0,0 +1,2 @@ +from .cfanetcdf import CFANetCDF +from .container import Container diff --git a/cf/mixin2/cfanetcdf.py b/cf/mixin2/cfanetcdf.py new file mode 100644 index 0000000000..0eb80c9d1f --- /dev/null +++ b/cf/mixin2/cfanetcdf.py @@ -0,0 +1,506 @@ +"""This class is not in the cf.mixin package because it needs to be +imported by cf.Data, and some of the other mixin classes in cf.mixin +themsleves import cf.Data, which would lead to a circular import +situation. + +""" +from re import split + +from cfdm.mixin import NetCDFMixin + + +class CFANetCDF(NetCDFMixin): + """Mixin class for CFA-netCDF. + + .. versionadded:: TODOCFAVER + + """ + + def cfa_del_aggregated_data(self): + """Remove the CFA-netCDF aggregation instruction terms. + + The aggregation instructions are stored in the + ``aggregation_data`` attribute of a CFA-netCDF aggregation + variable. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_get_aggregated_data`, + `cfa_has_aggregated_data`, + `cfa_set_aggregated_data` + + :Returns: + + `dict` + The removed CFA-netCDF aggregation instruction terms. + + **Examples** + + >>> f.cfa_set_aggregated_data( + ... {'location': 'cfa_location', + ... 'file': 'cfa_file', + ... 'address': 'cfa_address', + ... 'format': 'cfa_format', + ... 'tracking_id': 'tracking_id'} + ... 
) + >>> f.cfa_has_aggregated_data() + True + >>> f.cfa_get_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'c ', + 'tracking_id': 'tracking_id'} + >>> f.cfa_del_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_has_aggregated_data() + False + >>> f.cfa_del_aggregated_data() + {} + >>> f.cfa_get_aggregated_data() + {} + + """ + return self._nc_del("cfa_aggregated_data", {}).copy() + + def cfa_get_aggregated_data(self): + """Return the CFA-netCDF aggregation instruction terms. + + The aggregation instructions are stored in the + ``aggregation_data`` attribute of a CFA-netCDF aggregation + variable. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_del_aggregated_data`, + `cfa_has_aggregated_data`, + `cfa_set_aggregated_data` + + :Returns: + + `dict` + The aggregation instruction terms and their + corresponding netCDF variable names in a dictionary + whose key/value pairs are the aggregation instruction + terms and their corresponding variable names. + + **Examples** + + >>> f.cfa_set_aggregated_data( + ... {'location': 'cfa_location', + ... 'file': 'cfa_file', + ... 'address': 'cfa_address', + ... 'format': 'cfa_format', + ... 'tracking_id': 'tracking_id'} + ... ) + >>> f.cfa_has_aggregated_data() + True + >>> f.cfa_get_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_del_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_has_aggregated_data() + False + >>> f.cfa_del_aggregated_data() + {} + >>> f.cfa_get_aggregated_data() + {} + + """ + out = self._nc_get("cfa_aggregated_data", default=None) + if out is not None: + return out.copy() + + return {} + + def cfa_has_aggregated_data(self): + """Whether any CFA-netCDF aggregation instruction terms have been set. + + The aggregation instructions are stored in the + ``aggregation_data`` attribute of a CFA-netCDF aggregation + variable. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_del_aggregated_data`, + `cfa_get_aggregated_data`, + `cfa_set_aggregated_data` + + :Returns: + + `bool` + `True` if the CFA-netCDF aggregation instruction terms + have been set, otherwise `False`. + + **Examples** + + >>> f.cfa_set_aggregated_data( + ... {'location': 'cfa_location', + ... 'file': 'cfa_file', + ... 'address': 'cfa_address', + ... 'format': 'cfa_format', + ... 'tracking_id': 'tracking_id'} + ... ) + >>> f.cfa_has_aggregated_data() + True + >>> f.cfa_get_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_del_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_has_aggregated_data() + False + >>> f.cfa_del_aggregated_data() + {} + >>> f.cfa_get_aggregated_data() + {} + + """ + return self._nc_has("cfa_aggregated_data") + + def cfa_set_aggregated_data(self, value): + """Set the CFA-netCDF aggregation instruction terms. + + The aggregation instructions are stored in the + ``aggregation_data`` attribute of a CFA-netCDF aggregation + variable. 
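+
+        The terms may be set either from a dictionary or from the
+        string value of an ``aggregated_data`` attribute, e.g.
+        ``'location: cfa_location file: cfa_file address: cfa_address'``.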
+ + If there are any ``/`` (slash) characters in the netCDF + variable names then these act as delimiters for a group + hierarchy. By default, or if the name starts with a ``/`` + character and contains no others, the name is assumed to be in + the root group. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_del_aggregated_data`, + `cfa_get_aggregated_data`, + `cfa_has_aggregated_data` + + :Parameters: + + value: `str` or `dict` + The aggregation instruction terms and their + corresponding netCDF variable names. Either a + CFA-netCDF-compliant string value of an + ``aggregated_data`` attribute, or a dictionary whose + key/value pairs are the aggregation instruction terms + and their corresponding variable names. + + :Returns: + + `None` + + **Examples** + + >>> f.cfa_set_aggregated_data( + ... {'location': 'cfa_location', + ... 'file': 'cfa_file', + ... 'address': 'cfa_address', + ... 'format': 'cfa_format', + ... 'tracking_id': 'tracking_id'} + ... ) + >>> f.cfa_has_aggregated_data() + True + >>> f.cfa_get_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_del_aggregated_data() + {'location': 'cfa_location', + 'file': 'cfa_file', + 'address': 'cfa_address', + 'format': 'cfa_format', + 'tracking_id': 'tracking_id'} + >>> f.cfa_has_aggregated_data() + False + >>> f.cfa_del_aggregated_data() + {} + >>> f.cfa_get_aggregated_data() + {} + + """ + if value: + if isinstance(value, str): + v = split("\s+", value) + value = {term[:-1]: var for term, var in zip(v[::2], v[1::2])} + else: + # 'value' is a dictionary + value = value.copy() + + self._nc_set("cfa_aggregated_data", value) + + def cfa_clear_file_substitutions(self): + """Remove all of the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_del_file_substitution`, + `cfa_file_substitutions`, + `cfa_has_file_substitutions`, + `cfa_update_file_substitutions` + + :Returns: + + `dict` + {{Returns cfa_clear_file_substitutions}} + + **Examples** + + >>> f.cfa_update_file_substitutions({'base': 'file:///data/'}) + >>> f.cfa_has_file_substitutions() + True + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/'} + >>> f.cfa_update_file_substitutions({'${base2}': '/home/data/'}) + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/', '${base2}': '/home/data/'} + >>> f.cfa_update_file_substitutions({'${base}': '/new/location/'}) + >>> f.cfa_file_substitutions() + {'${base}': '/new/location/', '${base2}': '/home/data/'} + >>> f.cfa_del_file_substitution('${base}') + {'${base}': '/new/location/'} + >>> f.cfa_clear_file_substitutions() + {'${base2}': '/home/data/'} + >>> f.cfa_has_file_substitutions() + False + >>> f.cfa_file_substitutions() + {} + >>> f.cfa_clear_file_substitutions() + {} + >>> print(f.cfa_del_file_substitution('base', None)) + None + + """ + return self._nc_del("cfa_file_substitutions", {}).copy() + + def cfa_del_file_substitution(self, base): + """Remove a CFA-netCDF file name substitution. + + .. versionadded:: TODOCFAVER + + .. 
seealso:: `cfa_clear_file_substitutions`, + `cfa_file_substitutions`, + `cfa_has_file_substitutions`, + `cfa_update_file_substitutions` + + :Parameters: + + {{cfa base: `str`}} + + :Returns: + + `dict` + {{Returns cfa_del_file_substitution}} + + **Examples** + + >>> f.cfa_update_file_substitutions({'base': 'file:///data/'}) + >>> f.cfa_has_file_substitutions() + True + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/'} + >>> f.cfa_update_file_substitutions({'${base2}': '/home/data/'}) + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/', '${base2}': '/home/data/'} + >>> f.cfa_update_file_substitutions({'${base}': '/new/location/'}) + >>> f.cfa_file_substitutions() + {'${base}': '/new/location/', '${base2}': '/home/data/'} + >>> f.cfa_del_file_substitution('${base}') + {'${base}': '/new/location/'} + >>> f.cfa_clear_file_substitutions() + {'${base2}': '/home/data/'} + >>> f.cfa_has_file_substitutions() + False + >>> f.cfa_file_substitutions() + {} + >>> f.cfa_clear_file_substitutions() + {} + >>> print(f.cfa_del_file_substitution('base')) + {} + + """ + if not (base.startswith("${") and base.endswith("}")): + base = f"${{{base}}}" + + subs = self.cfa_file_substitutions() + if base not in subs: + return {} + + out = {base: subs.pop(base)} + if subs: + self._nc_set("cfa_file_substitutions", subs) + else: + self._nc_del("cfa_file_substitutions", None) + + return out + + def cfa_file_substitutions(self): + """Return the CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_clear_file_substitutions`, + `cfa_del_file_substitution`, + `cfa_has_file_substitutions`, + `cfa_update_file_substitutions` + :Returns: + + `dict` + The CFA-netCDF file name substitutions. + + **Examples** + + >>> f.cfa_update_file_substitutions({'base': 'file:///data/'}) + >>> f.cfa_has_file_substitutions() + True + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/'} + >>> f.cfa_update_file_substitutions({'${base2}': '/home/data/'}) + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/', '${base2}': '/home/data/'} + >>> f.cfa_update_file_substitutions({'${base}': '/new/location/'}) + >>> f.cfa_file_substitutions() + {'${base}': '/new/location/', '${base2}': '/home/data/'} + >>> f.cfa_del_file_substitution('${base}') + {'${base}': '/new/location/'} + >>> f.cfa_clear_file_substitutions() + {'${base2}': '/home/data/'} + >>> f.cfa_has_file_substitutions() + False + >>> f.cfa_file_substitutions() + {} + >>> f.cfa_clear_file_substitutions() + {} + >>> print(f.cfa_del_file_substitution('base', None)) + None + + """ + out = self._nc_get("cfa_file_substitutions", default=None) + if out is not None: + return out.copy() + + return {} + + def cfa_has_file_substitutions(self): + """Whether any CFA-netCDF file name substitutions have been set. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_clear_file_substitutions`, + `cfa_del_file_substitution`, + `cfa_file_substitutions`, + `cfa_update_file_substitutions` + + :Returns: + + `bool` + `True` if any CFA-netCDF file name substitutions have + been set, otherwise `False`. 
+ + **Examples** + + >>> f.cfa_update_file_substitutions({'base': 'file:///data/'}) + >>> f.cfa_has_file_substitutions() + True + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/'} + >>> f.cfa_update_file_substitutions({'${base2}': '/home/data/'}) + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/', '${base2}': '/home/data/'} + >>> f.cfa_update_file_substitutions({'${base}': '/new/location/'}) + >>> f.cfa_file_substitutions() + {'${base}': '/new/location/', '${base2}': '/home/data/'} + >>> f.cfa_del_file_substitution('${base}') + {'${base}': '/new/location/'} + >>> f.cfa_clear_file_substitutions() + {'${base2}': '/home/data/'} + >>> f.cfa_has_file_substitutions() + False + >>> f.cfa_file_substitutions() + {} + >>> f.cfa_clear_file_substitutions() + {} + >>> print(f.cfa_del_file_substitution('base', None)) + None + + """ + return self._nc_has("cfa_file_substitutions") + + def cfa_update_file_substitutions(self, substitutions): + """Set CFA-netCDF file name substitutions. + + .. versionadded:: TODOCFAVER + + .. seealso:: `cfa_clear_file_substitutions`, + `cfa_del_file_substitution`, + `cfa_file_substitutions`, + `cfa_has_file_substitutions` + + :Parameters: + + {{cfa substitutions: `dict`}} + + :Returns: + + `None` + + **Examples** + + >>> f.cfa_update_file_substitutions({'base': 'file:///data/'}) + >>> f.cfa_has_file_substitutions() + True + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/'} + >>> f.cfa_update_file_substitutions({'${base2}': '/home/data/'}) + >>> f.cfa_file_substitutions() + {'${base}': 'file:///data/', '${base2}': '/home/data/'} + >>> f.cfa_update_file_substitutions({'${base}': '/new/location/'}) + >>> f.cfa_file_substitutions() + {'${base}': '/new/location/', '${base2}': '/home/data/'} + >>> f.cfa_del_file_substitution('${base}') + {'${base}': '/new/location/'} + >>> f.cfa_clear_file_substitutions() + {'${base2}': '/home/data/'} + >>> f.cfa_has_file_substitutions() + False + >>> f.cfa_file_substitutions() + {} + >>> f.cfa_clear_file_substitutions() + {} + >>> print(f.cfa_del_file_substitution('base', None)) + None + + """ + if not substitutions: + return + + substitutions = substitutions.copy() + for base, sub in tuple(substitutions.items()): + if not (base.startswith("${") and base.endswith("}")): + substitutions[f"${{{base}}}"] = substitutions.pop(base) + + subs = self.cfa_file_substitutions() + subs.update(substitutions) + self._nc_set("cfa_file_substitutions", subs) diff --git a/cf/mixin2/container.py b/cf/mixin2/container.py new file mode 100644 index 0000000000..c8ca130d3e --- /dev/null +++ b/cf/mixin2/container.py @@ -0,0 +1,45 @@ +"""This class is not in the cf.mixin package because it needs to be +imported by cf.Data, and some of the other mixin classes in cf.mixin +themsleves import cf.Data, which would lead to a circular import +situation. + +""" +from ..docstring import _docstring_substitution_definitions + + +class Container: + """Mixin class for storing components. + + .. versionadded:: 3.7.0 + + """ + + def __docstring_substitutions__(self): + """Define docstring substitutions that apply to this class and + all of its subclasses. + + These are in addtion to, and take precendence over, docstring + substitutions defined by the base classes of this class. + + See `_docstring_substitutions` for details. + + .. versionadded:: 3.7.0 + + .. seealso:: `_docstring_substitutions` + + :Returns: + + `dict` + The docstring substitutions that have been applied. 
+ + """ + return _docstring_substitution_definitions + + def __docstring_package_depth__(self): + """Return the package depth for {{package}} docstring + substitutions. + + See `_docstring_package_depth` for details. + + """ + return 0 diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index eaca6abdd3..2f3eed94d0 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -1,24 +1,7 @@ import cfdm +import netCDF4 import numpy as np - -""" -TODOCFA: remove aggregation_* properties from constructs - -TODOCFA: Create auxiliary coordinates from non-standardised terms - -TODOCFA: Reference instruction variables (and/or set as - "do_not_create_field") - -TODOCFA: Create auxiliary coordinates from non-standardised terms - -TODOCFA: Consider scanning for cfa variables to the top (e.g. where - scanning for geometry varables is). This will probably need a - change in cfdm so that a customizable hook can be overlaoded - (like `_customize_read_vars` does). - -TODOCFA: What about groups/netcdf_flattener? - -""" +from packaging.version import Version class NetCDFRead(cfdm.read_write.netcdf.NetCDFRead): @@ -144,7 +127,9 @@ def _get_domain_axes(self, ncvar, allow_external=False, parent_ncvar=None): parent_ncvar=parent_ncvar, ) - # Still here? Then we have a CFA variable. + # ------------------------------------------------------------ + # Still here? Then we have a CFA-netCDF variable. + # ------------------------------------------------------------ g = self.read_vars ncdimensions = g["variable_attributes"][ncvar][ @@ -168,6 +153,7 @@ def _create_data( uncompress_override=None, parent_ncvar=None, coord_ncvar=None, + cfa_term=None, ): """Create data for a netCDF or CFA-netCDF variable. @@ -176,7 +162,8 @@ def _create_data( :Parameters: ncvar: `str` - The name of the netCDF variable that contains the data. + The name of the netCDF variable that contains the + data. See the *cfa_term* parameter. construct: optional @@ -184,20 +171,24 @@ def _create_data( uncompress_override: `bool`, optional - parent_ncvar: `str`, optional - coord_ncvar: `str`, optional - .. versionadded:: TODO + cfa_term: `dict`, optional + The name of a non-standard aggregation instruction + term from which to create the data. If set then + *ncvar* must be the value of the term in the + ``aggregation_data`` attribute. + + .. versionadded:: TODOCFAVER :Returns: `Data` """ - if not self._is_cfa_variable(ncvar): + if not cfa_term and not self._is_cfa_variable(ncvar): # Create data for a normal netCDF variable - return super()._create_data( + data = super()._create_data( ncvar=ncvar, construct=construct, unpacked_dtype=unpacked_dtype, @@ -206,25 +197,80 @@ def _create_data( coord_ncvar=coord_ncvar, ) + # Set the CFA write status to True when there is exactly + # one dask chunk + if data.npartitions == 1: + data._cfa_set_write(True) + + self._cache_data_elements(data, ncvar) + + return data + # ------------------------------------------------------------ - # Still here? Then create data for a CFA-netCDF variable + # Still here? 
Create data for a CFA variable # ------------------------------------------------------------ - cfa_array, kwargs = self._create_cfanetcdfarray( - ncvar, - unpacked_dtype=unpacked_dtype, - coord_ncvar=coord_ncvar, - ) + if construct is not None: + # Remove the aggregation attributes from the construct + self.implementation.del_property( + construct, "aggregated_dimensions", None + ) + aggregated_data = self.implementation.del_property( + construct, "aggregated_data", None + ) + else: + aggregated_data = None + + if cfa_term: + term, term_ncvar = tuple(cfa_term.items())[0] + cfa_array, kwargs = self._create_cfanetcdfarray_term( + ncvar, term, term_ncvar + ) + else: + cfa_array, kwargs = self._create_cfanetcdfarray( + ncvar, + unpacked_dtype=unpacked_dtype, + coord_ncvar=coord_ncvar, + ) - # Return the data - return self._create_Data( + data = self._create_Data( cfa_array, ncvar, units=kwargs["units"], calendar=kwargs["calendar"], ) + # Note: We don't cache elements from CFA variables, because + # the data are in fragment files which have not been + # opened; and may not not even be openable, such as + # could be the case if a fragement was on tape storage. + + # Set the CFA write status to True iff each non-aggregated + # axis has exactly one dask storage chunk + if cfa_term: + data._cfa_set_term(True) + else: + cfa_write = True + for n, numblocks in zip( + cfa_array.get_fragment_shape(), data.numblocks + ): + if n == 1 and numblocks > 1: + # Note: 'n == 1' is True for non-aggregated axes + cfa_write = False + break + + data._cfa_set_write(cfa_write) + + # Store the 'aggregated_data' attribute + if aggregated_data: + data.cfa_set_aggregated_data(aggregated_data) + + # Store the file substitutions + data.cfa_update_file_substitutions(kwargs.get("substitutions")) + + return data + def _is_cfa_variable(self, ncvar): - """Return True if *ncvar* is a CFA variable. + """Return True if *ncvar* is a CFA aggregated variable. .. versionadded:: 3.14.0 @@ -240,39 +286,11 @@ def _is_cfa_variable(self, ncvar): """ g = self.read_vars - - if not g["cfa"] or ncvar in g["external_variables"]: - return False - - attributes = g["variable_attributes"][ncvar] - - # TODOCFA: test on the version of CFA given by g["cfa"]. See - # also `_customize_read_vars`. - cfa = "aggregated_dimensions" in attributes - if cfa: - # TODOCFA: Modify this message for v4.0.0 - raise ValueError( - "The reading of CFA files has been temporarily disabled, " - "but will return for CFA-0.6 files at version 4.0.0. " - "CFA-0.4 functionality is still available at version 3.13.1." - ) - - # TODOCFA: The 'return' remains when the exception is - # removed at v4.0.0. - return True - - cfa_04 = attributes.get("cf_role") == "cfa_variable" - if cfa_04: - # TODOCFA: Modify this message for v4.0.0. - raise ValueError( - "The reading of CFA-0.4 files was permanently disabled at " - "version 3.14.0. However, CFA-0.4 functionality is " - "still available at version 3.13.1. " - "The reading and writing of CFA-0.6 files will become " - "available at version 4.0.0." - ) - - return False + return ( + g["cfa"] + and ncvar in g["cfa_aggregated_data"] + and ncvar not in g["external_variables"] + ) def _create_Data( self, @@ -295,11 +313,6 @@ def _create_Data( ncvar: `str` The netCDF variable containing the array. - ncdimensions: sequence of `str`, optional - The netCDF dimensions spanned by the array. - - .. versionadded:: 3.14.0 - units: `str`, optional The units of *array*. By default, or if `None`, it is assumed that there are no units. 
@@ -308,6 +321,11 @@ def _create_Data( The calendar of *array*. By default, or if `None`, it is assumed that there is no calendar. + ncdimensions: sequence of `str`, optional + The netCDF dimensions spanned by the array. + + .. versionadded:: 3.14.0 + kwargs: optional Extra parameters to pass to the initialisation of the returned `Data` object. @@ -335,51 +353,68 @@ def _create_Data( chunks=chunks, **kwargs, ) - self._cache_data_elements(data, ncvar) return data - def _customize_read_vars(self): - """Customize the read parameters. + def _customise_read_vars(self): + """Customise the read parameters. + + Take the opportunity to apply CFA updates to + `read_vars['variable_dimensions']` and + `read_vars['do_not_create_field']`. .. versionadded:: 3.0.0 """ - super()._customize_read_vars() - + super()._customise_read_vars() g = self.read_vars + if not g["cfa"]: + return + + g["cfa_aggregated_data"] = {} + g["cfa_aggregation_instructions"] = {} + g["cfa_file_substitutions"] = {} + # ------------------------------------------------------------ - # Find out if this is a CFA file + # Still here? Then this is a CFA-netCDF file # ------------------------------------------------------------ - g["cfa"] = "CFA" in g["global_attributes"].get("Conventions", ()) + if g["CFA_version"] < Version("0.6.2"): + raise ValueError( + f"Can't read file {g['filename']} that uses obsolete " + f"CFA conventions version CFA-{g['CFA_version']}. " + "(Note that cf version 3.13.1 can be used to read and " + "write CFA-0.4 files.)" + ) - if g["cfa"]: - attributes = g["variable_attributes"] - dimensions = g["variable_dimensions"] + # Get the directory of the CFA-netCDF file being read + from os.path import abspath + from pathlib import PurePath - # Do not create fields from CFA private - # variables. TODOCFA: get private variables from - # CFANetCDFArray instances - for ncvar in g["variables"]: - if attributes[ncvar].get("cf_role", None) == "cfa_private": - g["do_not_create_field"].add(ncvar) + g["cfa_dir"] = PurePath(abspath(g["filename"])).parent - for ncvar, ncdims in tuple(dimensions.items()): - if ncdims != (): - continue + # Process the aggregation instruction variables, and the + # aggregated dimensions. + dimensions = g["variable_dimensions"] + attributes = g["variable_attributes"] - if not ( - ncvar not in g["external_variables"] - and "aggregated_dimensions" in attributes[ncvar] - ): - continue + for ncvar, attributes in attributes.items(): + if "aggregated_dimensions" not in attributes: + # This is not an aggregated variable + continue + + # Set the aggregated variable's dimensions as its + # aggregated dimensions + ncdimensions = attributes["aggregated_dimensions"].split() + dimensions[ncvar] = tuple(map(str, ncdimensions)) - ncdimensions = attributes[ncvar][ - "aggregated_dimensions" - ].split() - if ncdimensions: - dimensions[ncvar] = tuple(map(str, ncdimensions)) + # Do not create fields/domains from aggregation + # instruction variables + parsed_aggregated_data = self._cfa_parse_aggregated_data( + ncvar, attributes.get("aggregated_data") + ) + for term_ncvar in parsed_aggregated_data.values(): + g["do_not_create_field"].add(term_ncvar) def _cache_data_elements(self, data, ncvar): """Cache selected element values. @@ -464,19 +499,30 @@ def _create_cfanetcdfarray( ncvar, unpacked_dtype=False, coord_ncvar=None, + term=None, ): """Create a CFA-netCDF variable array. - .. versionadded:: (cfdm) 1.10.0.1 + .. versionadded:: 3.14.0 :Parameters: ncvar: `str` + The name of the CFA-netCDF aggregated variable. 
See + the *term* parameter. unpacked_dtype: `False` or `numpy.dtype`, optional coord_ncvar: `str`, optional + term: `str`, optional + The name of a non-standard aggregation instruction + term from which to create the array. If set then + *ncvar* must be the value of the non-standard term in + the ``aggregation_data`` attribute. + + .. versionadded:: TODOCFAVER + :Returns: (`CFANetCDFArray`, `dict`) @@ -484,6 +530,8 @@ def _create_cfanetcdfarray( kwargs used to create it. """ + g = self.read_vars + # Get the kwargs needed to instantiate a general NetCDFArray # instance kwargs = self._create_netcdfarray( @@ -493,17 +541,97 @@ def _create_cfanetcdfarray( return_kwargs_only=True, ) - # Get rid of the incorrect shape + # Get rid of the incorrect shape of (). This will end up + # getting set correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) - # Add the aggregated_data attribute (that can be used by - # dask.base.tokenize). - kwargs["instructions"] = self.read_vars["variable_attributes"][ - ncvar - ].get("aggregated_data") + aggregated_data = g["cfa_aggregated_data"][ncvar] + + standardised_terms = ("location", "file", "address", "format") + + instructions = [] + aggregation_instructions = {} + for t, term_ncvar in aggregated_data.items(): + if t not in standardised_terms: + continue + + aggregation_instructions[t] = g["cfa_aggregation_instructions"][ + term_ncvar + ] + instructions.append(f"{t}: {term_ncvar}") + + if t == "file": + kwargs["substitutions"] = g["cfa_file_substitutions"].get( + term_ncvar + ) + + kwargs["x"] = aggregation_instructions + kwargs["instructions"] = " ".join(sorted(instructions)) + + # Use the kwargs to create a CFANetCDFArray instance + array = self.implementation.initialise_CFANetCDFArray(**kwargs) + + return array, kwargs + + def _create_cfanetcdfarray_term( + self, + parent_ncvar, + term, + ncvar, + ): + """Create a CFA-netCDF variable array. + + .. versionadded:: 3.14.0 + + :Parameters: + + parent_ncvar: `str` + The name of the CFA-netCDF aggregated variable. See + the *term* parameter. + + term: `str`, optional + The name of a non-standard aggregation instruction + term from which to create the array. If set then + *ncvar* must be the value of the non-standard term in + the ``aggregation_data`` attribute. + + .. versionadded:: TODOCFAVER - # Use the kwargs to create a specialised CFANetCDFArray + ncvar: `str` + The name of the CFA-netCDF aggregated variable. See + the *term* parameter. + + :Returns: + + (`CFANetCDFArray`, `dict`) + The new `NetCDFArray` instance and dictionary of the + kwargs used to create it. 
+ + """ + g = self.read_vars + + # Get the kwargs needed to instantiate a general NetCDFArray # instance + kwargs = self._create_netcdfarray( + ncvar, + return_kwargs_only=True, + ) + + instructions = [] + aggregation_instructions = {} + for t, term_ncvar in g["cfa_aggregated_data"][parent_ncvar].items(): + if t in ("location", term): + aggregation_instructions[t] = g[ + "cfa_aggregation_instructions" + ][term_ncvar] + instructions.append(f"{t}: {ncvar}") + + kwargs["term"] = term + kwargs["dtype"] = aggregation_instructions[term].dtype + kwargs["x"] = aggregation_instructions + kwargs["instructions"] = " ".join(sorted(instructions)) + + # Use the kwargs to create a CFANetCDFArray instance array = self.implementation.initialise_CFANetCDFArray(**kwargs) return array, kwargs @@ -572,3 +700,201 @@ def _parse_chunks(self, ncvar): chunks = chunks2 return chunks + + def _customise_field_ancillaries(self, parent_ncvar, f): + """Create customised field ancillary constructs. + + This method currently creates: + + * Field ancillary constructs derived from non-standardised + terms in CFA aggregation instructions. Each construct spans + the same domain axes as the parent field construct. + Constructs are never created for `Domain` instances. + + .. versionadded:: TODOCFAVER + + :Parameters: + + parent_ncvar: `str` + The netCDF variable name of the parent variable. + + f: `Field` + The parent field construct. + + :Returns: + + `dict` + A mapping of netCDF variable names to newly-created + construct identifiers. + + **Examples** + + >>> n._customise_field_ancillaries('tas', f) + {} + + >>> n._customise_field_ancillaries('pr', f) + {'tracking_id': 'fieldancillary1'} + + """ + if not self._is_cfa_variable(parent_ncvar): + return {} + + # ------------------------------------------------------------ + # Still here? Then we have a CFA-netCDF variable: Loop round + # the aggregation instruction terms and convert each + # non-standard term into a field ancillary construct that + # spans the same domain axes as the parent field. + # ------------------------------------------------------------ + g = self.read_vars + + standardised_terms = ("location", "file", "address", "format") + + out = {} + for term, term_ncvar in g["cfa_aggregated_data"][parent_ncvar].items(): + if term in standardised_terms: + continue + + if g["variables"][term_ncvar].ndim != f.ndim: + # Can only create field ancillaries with the same rank + # as the field + continue + + # Still here? Then we've got a non-standard aggregation + # term from which we can create a field + # ancillary construct. + anc = self.implementation.initialise_FieldAncillary() + + self.implementation.set_properties( + anc, g["variable_attributes"][term_ncvar] + ) + anc.set_property("long_name", term) + + # Store the term name as the 'id' attribute. This will be + # used as the term name if the field field ancillary is + # written to disk as a non-standard CFA term. + anc.id = term + + data = self._create_data( + parent_ncvar, anc, cfa_term={term: term_ncvar} + ) + + self.implementation.set_data(anc, data, copy=False) + self.implementation.nc_set_variable(anc, term_ncvar) + + key = self.implementation.set_field_ancillary( + f, + anc, + axes=self.implementation.get_field_data_axes(f), + copy=False, + ) + out[term_ncvar] = key + + return out + + def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): + """Parse a CFA-netCDF ``aggregated_data`` attribute. + + .. versionadded:: TODOCFAVER + + :Parameters: + + ncvar: `str` + The netCDF variable name. 
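+                This is the name of the aggregation variable whose
+                ``aggregated_data`` attribute is being parsed.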
+ + aggregated_data: `str` or `None` + The CFA-netCDF ``aggregated_data`` attribute. + + :Returns: + + `dict` + The parsed attribute. + + """ + if not aggregated_data: + return {} + + g = self.read_vars + aggregation_instructions = g["cfa_aggregation_instructions"] + variable_attributes = g["variable_attributes"] + + out = {} + for x in self._parse_x( + ncvar, + aggregated_data, + keys_are_variables=True, + ): + term, term_ncvar = tuple(x.items())[0] + term_ncvar = term_ncvar[0] + out[term] = term_ncvar + + if term_ncvar in aggregation_instructions: + # Already processed this term + continue + + array = g["variables"][term_ncvar][...] + aggregation_instructions[term_ncvar] = self._cfa_conform_array( + array + ) + + if term == "file": + # Find URI substitutions that may be stored in the + # CFA file instruction variable's "substitutions" + # attribute + subs = variable_attributes[term_ncvar].get( + "substitutions", + ) + if subs: + # Convert the string "${base}: value" to the + # dictionary {"${base}": "value"} + s = subs.split() + subs = { + base[:-1]: sub for base, sub in zip(s[::2], s[1::2]) + } + + # Apply user-defined substitutions, which take + # precedence over those defined in the file. + subs.update(g["cfa_options"].get("substitutions", {})) + g["cfa_file_substitutions"][term_ncvar] = subs + + g["cfa_aggregated_data"][ncvar] = out + return out + + def _cfa_conform_array(self, array): + """Conform an array so that it is suitable for CFA processing. + + .. versionadded: TODOCFAVER + + :Parameters: + + array: `np.ndarray` + The array to conform. + + :Returns: + + array: `np.ndarray` + The conformed array. + + """ + if isinstance(array, str): + # string + return np.array(array, dtype=f"S{len(array)}").astype("U") + + kind = array.dtype.kind + if kind == "O": + # string + return array.astype("U") + + if kind in "SU": + # char + if kind == "U": + array = array.astype("S") + + array = netCDF4.chartostring(array) + shape = array.shape + array = np.array([x.rstrip() for x in array.flat], dtype="S") + array = np.reshape(array, shape) + array = np.ma.masked_where(array == b"", array) + return array.astype("U") + + # number + return array diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 149cd1cc46..757ac91fcc 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -1,19 +1,11 @@ -import random -from string import hexdigits +from os import remove import cfdm import dask.array as da import numpy as np -from ... import Bounds, Coordinate, DomainAncillary from .netcdfread import NetCDFRead -_cfa_message = ( - "Writing CFA files has been temporarily disabled, " - "and will return at version 4.0.0. " - "CFA-0.4 functionality is still available at version 3.13.x." -) - class NetCDFWrite(cfdm.read_write.netcdf.NetCDFWrite): """A container for writing Fields to a netCDF dataset.""" @@ -30,31 +22,74 @@ def __new__(cls, *args, **kwargs): instance._NetCDFRead = NetCDFRead return instance - def _write_as_cfa(self, cfvar): - """True if the variable should be written as a CFA variable. + def _write_as_cfa(self, cfvar, construct_type, domain_axes): + """Whether or not to write as a CFA variable. .. versionadded:: 3.0.0 + :Parameters: + + cfvar: cf instance that contains data + + construct_type: `str` + The construct type of the *cfvar*, or its parent if + *cfvar* is not a construct. + + .. versionadded:: TODOCFAVER + + domain_axes: `None`, or `tuple` of `str` + The domain axis construct identifiers for *cfvar*. + + .. 
versionadded:: TODOCFAVER + + :Returns: + + `bool` + True if the variable is to be written as a CFA + variable. + """ - if not self.write_vars["cfa"]: + if construct_type is None: + # This prevents recursion whilst writing CFA-netCDF term + # variables. return False - data = self.implementation.get_data(cfvar, None) - if data is None: + g = self.write_vars + if not g["cfa"]: return False - if data.size == 1: + data = self.implementation.get_data(cfvar, None) + if data is None: return False - if isinstance(cfvar, (Coordinate, DomainAncillary)): - return cfvar.ndim > 1 - - if isinstance(cfvar, Bounds): - return cfvar.ndim > 2 - - return True - - def _customize_createVariable(self, cfvar, kwargs): + cfa_options = g["cfa_options"] + for ctype, ndim in cfa_options.get("constructs", {}).items(): + # Write as CFA if it has an appropriate construct type ... + if ctype in ("all", construct_type): + # ... and then only if it satisfies the + # number-of-dimenions criterion and the data is + # flagged as OK. + if ndim is None or ndim == len(domain_axes): + cfa_get_write = data.cfa_get_write() + if not cfa_get_write and cfa_options["strict"]: + if g["mode"] == "w": + remove(g["filename"]) + + raise ValueError( + f"Can't write {cfvar!r} as a CFA-netCDF " + "aggregation variable. Consider setting " + "cfa={'strict': False}" + ) + + return cfa_get_write + + break + + return False + + def _customise_createVariable( + self, cfvar, construct_type, domain_axes, kwargs + ): """Customise keyword arguments for `netCDF4.Dataset.createVariable`. @@ -64,6 +99,17 @@ def _customize_createVariable(self, cfvar, kwargs): cfvar: cf instance that contains data + construct_type: `str` + The construct type of the *cfvar*, or its parent if + *cfvar* is not a construct. + + .. versionadded:: TODOCFAVER + + domain_axes: `None`, or `tuple` of `str` + The domain axis construct identifiers for *cfvar*. + + .. versionadded:: TODOCFAVER + kwargs: `dict` :Returns: @@ -73,11 +119,11 @@ def _customize_createVariable(self, cfvar, kwargs): `netCDF4.Dataset.createVariable`. """ - kwargs = super()._customize_createVariable(cfvar, kwargs) - - if self._write_as_cfa(cfvar): - raise ValueError(_cfa_message) + kwargs = super()._customise_createVariable( + cfvar, construct_type, domain_axes, kwargs + ) + if self._write_as_cfa(cfvar, construct_type, domain_axes): kwargs["dimensions"] = () kwargs["chunksizes"] = None @@ -89,9 +135,11 @@ def _write_data( cfvar, ncvar, ncdimensions, + domain_axes=None, unset_values=(), compressed=False, attributes={}, + construct_type=None, ): """Write a Data object. @@ -107,22 +155,47 @@ def _write_data( ncdimensions: `tuple` of `str` + domain_axes: `None`, or `tuple` of `str` + The domain axis construct identifiers for *cfvar*. + + .. versionadded:: TODOCFAVER + unset_values: sequence of numbers + attributes: `dict`, optional + The netCDF attributes for the constructs that have been + written to the file. + + construct_type: `str`, optional + The construct type of the *cfvar*, or its parent if + *cfvar* is not a construct. + + .. 
versionadded:: TODOCFAVER + + :Returns: + + `None` + """ g = self.write_vars - if self._write_as_cfa(cfvar): - raise ValueError(_cfa_message) - - self._write_cfa_data(ncvar, ncdimensions, data, cfvar) + if self._write_as_cfa(cfvar, construct_type, domain_axes): + # -------------------------------------------------------- + # Write the data as CFA aggregated data + # -------------------------------------------------------- + self._create_cfa_data( + ncvar, + ncdimensions, + data, + cfvar, + ) return - # Still here? + # ------------------------------------------------------------ + # Still here? The write a normal (non-CFA) variable + # ------------------------------------------------------------ if compressed: - # -------------------------------------------------------- # Write data in its compressed form - # -------------------------------------------------------- data = data.source().source() # Get the dask array @@ -145,9 +218,14 @@ def _write_data( # Check for out-of-range values if g["warn_valid"]: + if construct_type: + var = cfvar + else: + var = None + dx = dx.map_blocks( self._check_valid, - cfvar=cfvar, + cfvar=var, attributes=attributes, meta=np.array((), dx.dtype), ) @@ -175,9 +253,10 @@ def _write_dimension_coordinate( ncdim: `str` or `None` The name of the netCDF dimension for this dimension - coordinate construct, including any groups structure. Note - that the group structure may be different to the - corodinate variable, and the basename. + coordinate construct, including any groups + structure. Note that the group structure may be + different to the coordinate variable, and the + basename. .. versionadded:: 3.6.0 @@ -231,8 +310,7 @@ def _write_scalar_coordinate( The updated list of netCDF auxiliary coordinate names. """ - # Unsafe to set mutable '{}' as default in the func signature. - if extra is None: # distinguish from falsy '{}' + if extra is None: extra = {} coord_1d = self._change_reference_datetime(coord_1d) @@ -303,7 +381,7 @@ def _change_reference_datetime(self, coord): else: return coord2 - def _write_cfa_data(self, ncvar, ncdimensions, data, cfvar): + def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): """Write a CFA variable to the netCDF file. Any CFA private variables required will be autmatically created @@ -327,33 +405,142 @@ def _write_cfa_data(self, ncvar, ncdimensions, data, cfvar): `None` """ - raise ValueError(_cfa_message) + g = self.write_vars - def _random_hex_string(self, size=10): - """Return a random hexadecimal string with the given number of - characters. + ndim = data.ndim + + cfa = self._cfa_aggregation_instructions(data, cfvar) + + # ------------------------------------------------------------ + # Get the location netCDF dimensions. These always start with + # "f_{size}_loc". + # ------------------------------------------------------------ + location_ncdimensions = [] + for size in cfa["location"].shape: + l_ncdim = f"f_{size}_loc" + if l_ncdim not in g["dimensions"]: + # Create a new location dimension + self._write_dimension(l_ncdim, None, size=size) + + location_ncdimensions.append(l_ncdim) + + location_ncdimensions = tuple(location_ncdimensions) + + # ------------------------------------------------------------ + # Get the fragment netCDF dimensions. These always start with + # "f_". 
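+        # For example, the fragment dimension corresponding to a
+        # netCDF dimension called "time" is named "f_time".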
+ # ------------------------------------------------------------ + aggregation_address = cfa["address"] + fragment_ncdimensions = [] + for ncdim, size in zip( + ncdimensions + ("extra",) * (aggregation_address.ndim - ndim), + aggregation_address.shape, + ): + f_ncdim = f"f_{ncdim}" + if f_ncdim not in g["dimensions"]: + # Create a new fragement dimension + self._write_dimension(f_ncdim, None, size=size) + + fragment_ncdimensions.append(f_ncdim) + + fragment_ncdimensions = tuple(fragment_ncdimensions) + + # ------------------------------------------------------------ + # Write the standardised aggregation instruction variables to + # the CFA-netCDF file + # ------------------------------------------------------------ + substitutions = data.cfa_file_substitutions() + substitutions.update(g["cfa_options"].get("substitutions", {})) + + aggregated_data = data.cfa_get_aggregated_data() + aggregated_data_attr = [] + + # Location + term = "location" + term_ncvar = self._cfa_write_term_variable( + cfa[term], + aggregated_data.get(term, f"cfa_{term}"), + location_ncdimensions, + ) + aggregated_data_attr.append(f"{term}: {term_ncvar}") - .. versionadded:: 3.0.0 + # File + term = "file" + if substitutions: + # Create the "substitutions" netCDF attribute + subs = [] + for base, sub in substitutions.items(): + subs.append(f"{base}: {sub}") - :Parameters: + attributes = {"substitutions": " ".join(sorted(subs))} + else: + attributes = None - size: `int`, optional - The number of characters in the generated string. + term_ncvar = self._cfa_write_term_variable( + cfa[term], + aggregated_data.get(term, f"cfa_{term}"), + fragment_ncdimensions, + attributes=attributes, + ) + aggregated_data_attr.append(f"{term}: {term_ncvar}") - :Returns: + # Address + term = "address" - `str` - The hexadecimal string. 
+ # Attempt to reduce addresses to a common scalar value + u = cfa[term].unique().compressed().persist() + if u.size == 1: + cfa[term] = u.squeeze() + dimensions = () + else: + dimensions = fragment_ncdimensions - **Examples:** + term_ncvar = self._cfa_write_term_variable( + cfa[term], + aggregated_data.get(term, f"cfa_{term}"), + dimensions, + ) + aggregated_data_attr.append(f"{term}: {term_ncvar}") - >>> _random_hex_string() - 'C3eECbBBcf' - >>> _random_hex_string(6) - '7a4acc' + # Format + term = "format" - """ - return "".join(random.choice(hexdigits) for i in range(size)) + # Attempt to reduce addresses to a common scalar value + u = cfa[term].unique().compressed().persist() + if u.size == 1: + cfa[term] = u.squeeze() + dimensions = () + else: + dimensions = fragment_ncdimensions + + term_ncvar = self._cfa_write_term_variable( + cfa[term], + aggregated_data.get(term, f"cfa_{term}"), + dimensions, + ) + aggregated_data_attr.append(f"{term}: {term_ncvar}") + + # ------------------------------------------------------------ + # Look for non-standard CFA terms stored as field ancillaries + # on a field and write them to the CFA-netCDF file + # ------------------------------------------------------------ + if self.implementation.is_field(cfvar): + non_standard_terms = self._cfa_write_non_standard_terms( + cfvar, fragment_ncdimensions[:ndim], aggregated_data + ) + aggregated_data_attr.extend(non_standard_terms) + + # ------------------------------------------------------------ + # Add the CFA aggregation variable attributes + # ------------------------------------------------------------ + self._write_attributes( + None, + ncvar, + extra={ + "aggregated_dimensions": " ".join(ncdimensions), + "aggregated_data": " ".join(sorted(aggregated_data_attr)), + }, + ) def _convert_to_builtin_type(self, x): """Convert a non-JSON-encodable object to a JSON-encodable @@ -455,25 +642,400 @@ def _filled_string_array(self, array, fill_value=""): return array + def _write_field_ancillary(self, f, key, anc): + """Write a field ancillary to the netCDF file. + + If an equal field ancillary has already been written to the file + then it is not re-written. + + .. versionadded:: TODOCFAVER + + :Parameters: + + f: `Field` + + key: `str` + + anc: `FieldAncillary` + + :Returns: + + `str` + The netCDF variable name of the field ancillary + object. If no ancillary variable was written then an + empty string is returned. + + """ + if anc.data.cfa_get_term(): + # This field ancillary construct is to be written as a + # non-standard CFA term belonging to the parent field, or + # else not at all. + return "" + + return super()._write_field_ancillary(f, key, anc) + + def _cfa_write_term_variable( + self, data, ncvar, ncdimensions, attributes=None + ): + """Write a CFA aggregation instruction term variable + + .. versionadded:: TODOCFAVER + + :Parameters: + + data `Data` + The data to write. + + ncvar: `str` + The netCDF variable name. + + ncdimensions: `tuple` of `str` + The variable's netCDF dimensions. + + attributes: `dict`, optional + Any attributes to attach to the variable. + + :Returns: + + `str` + The netCDF variable name of the CFA term variable. 
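+
+        **Examples**
+
+        Assuming fragment dimensions called ``f_time`` and ``f_lat``:
+
+        >>> term_ncvar = n._cfa_write_term_variable(
+        ...     data, 'cfa_address', ('f_time', 'f_lat')
+        ... )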
+ + """ + create = not self._already_in_file(data, ncdimensions) + + if create: + # Create a new CFA term variable in the file + ncvar = self._netcdf_name(ncvar) + self._write_netcdf_variable( + ncvar, ncdimensions, data, None, extra=attributes + ) + else: + # This CFA term variable has already been written to the + # file + ncvar = self.write_vars["seen"][id(data)]["ncvar"] + + return ncvar + + def _cfa_write_non_standard_terms( + self, field, fragment_ncdimensions, aggregated_data + ): + """Write a non-standard CFA aggregation instruction term variable. + + Writes non-standard CFA terms stored as field ancillaries. + + .. versionadded:: TODOCFAVER + + :Parameters: + + field: `Field` + + fragment_ncdimensions: `list` of `str` + + aggregated_data: `dict` + + """ + aggregated_data_attr = [] + terms = ["location", "file", "address", "format"] + for key, field_anc in self.implementation.get_field_ancillaries( + field + ).items(): + if not field_anc.data.cfa_get_term(): + continue + + data = self.implementation.get_data(field_anc, None) + if data is None: + continue + + # Check that the field ancillary has the same axes as its + # parent field, and in the same order. + if field.get_data_axes(key) != field.get_data_axes(): + continue + + # Still here? Then this field ancillary can be represented + # by a non-standard aggregation term. + + # Then transform the data so that it spans the fragment + # dimensions, with one value per fragment. If a chunk has + # more than one unique value then the fragment's value is + # missing data. + dx = data.to_dask_array() + dx_ind = tuple(range(dx.ndim)) + out_ind = dx_ind + dx = da.blockwise( + self._cfa_unique, + out_ind, + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + dtype=dx.dtype, + ) + + # Get the non-standard term name from the field + # ancillary's 'id' attribute + term = getattr(field_anc, "id", "term") + term = term.replace(" ", "_") + name = term + n = 0 + while term in terms: + n += 1 + term = f"{name}_{n}" + + terms.append(term) + + # Create the new CFA term variable + term_ncvar = self._cfa_write_term_variable( + data=type(data)(dx), + ncvar=aggregated_data.get(term, f"cfa_{term}"), + ncdimensions=fragment_ncdimensions, + ) + + aggregated_data_attr.append(f"{term}: {term_ncvar}") + + return aggregated_data_attr + + @classmethod + def _cfa_unique(cls, a): + """Return the unique value of an array. + + If there are multiple unique vales then missing data is + returned. + + .. versionadded:: TODOCFAVER + + :Parameters: + + a: `numpy.ndarray` + The array. + + :Returns: + + `numpy.ndarray` + A size 1 array containing the unique value, or missing + data if there is not a unique value. + + """ + out_shape = (1,) * a.ndim + a = np.unique(a) + if np.ma.isMA(a): + # Remove a masked element + a = a.compressed() + + if a.size == 1: + return a.reshape(out_shape) + + return np.ma.masked_all(out_shape, dtype=a.dtype) -# def _convert_dtype(self, array, new_dtype=None): -# """Convert the data type of a numpy array. -# -# .. versionadded:: 3.14.0 -# -# :Parameters: -# -# array: `numpy.ndarray` -# The `numpy` array -# -# new_dtype: data-type -# The new data type. -# -# :Returns: -# -# `numpy.ndarray` -# The array with converted data type. -# -# """ -# return array.astype(new_dtype) -# + def _cfa_aggregation_instructions(self, data, cfvar): + """Convert data to standardised CFA aggregation instruction terms. + + .. versionadded:: TODOCFAVER + + :Parameters: + + data: `Data` + The data to be converted to standardised CFA + aggregation instruction terms. 
+
+            cfvar: construct
+                The construct that contains the *data*.
+
+        :Returns:
+
+            `dict`
+                A dictionary whose keys are the standardised CFA
+                aggregation instruction terms, with values of `Data`
+                instances containing the corresponding variables.
+
+        **Examples**
+
+        >>> n._cfa_aggregation_instructions(data, cfvar)
+        {'location': <CF Data: ...>,
+         'file': <CF Data: ...>,
+         'format': <CF Data: ...>,
+         'address': <CF Data: ...>}
+
+        """
+        from os.path import abspath, join, relpath
+        from pathlib import PurePath
+        from urllib.parse import urlparse
+
+        g = self.write_vars
+
+        # Define the CFA file substitutions, giving precedence to
+        # those provided by the CFA options over those set on the
+        # Data object.
+        substitutions = data.cfa_file_substitutions()
+        substitutions.update(g["cfa_options"].get("substitutions", {}))
+
+        absolute_paths = g["cfa_options"].get("absolute_paths")
+        cfa_dir = g["cfa_dir"]
+
+        # Size of the trailing dimension
+        n_trailing = 0
+
+        aggregation_file = []
+        aggregation_address = []
+        aggregation_format = []
+        for indices in data.chunk_indices():
+            file_details = self._cfa_get_file_details(data[indices])
+            if len(file_details) != 1:
+                if file_details:
+                    raise ValueError(
+                        "Can't write CFA-netCDF aggregation variable from "
+                        f"{cfvar!r} when the "
+                        f"dask storage chunk defined by indices {indices} "
+                        "spans two or more files"
+                    )
+
+                raise ValueError(
+                    "Can't write CFA-netCDF aggregation variable from "
+                    f"{cfvar!r} when the "
+                    f"dask storage chunk defined by indices {indices} spans "
+                    "zero files"
+                )
+
+            filenames, addresses, formats = file_details.pop()
+
+            if len(filenames) > n_trailing:
+                n_trailing = len(filenames)
+
+            filenames2 = []
+            for filename in filenames:
+                uri = urlparse(filename)
+                uri_scheme = uri.scheme
+                if not uri_scheme:
+                    filename = abspath(join(cfa_dir, filename))
+                    if absolute_paths:
+                        filename = PurePath(filename).as_uri()
+                    else:
+                        filename = relpath(filename, start=cfa_dir)
+                elif not absolute_paths and uri_scheme == "file":
+                    filename = relpath(uri.path, start=cfa_dir)
+
+                if substitutions:
+                    # Apply the CFA file substitutions
+                    for base, sub in substitutions.items():
+                        filename = filename.replace(sub, base)
+
+                filenames2.append(filename)
+
+            aggregation_file.append(tuple(filenames2))
+            aggregation_address.append(addresses)
+            aggregation_format.append(formats)
+
+        # Pad each value of the aggregation instruction arrays so that
+        # it has 'n_trailing' elements
+        a_shape = data.numblocks
+        pad = None
+        if n_trailing > 1:
+            a_shape += (n_trailing,)
+
+            # Pad the file names, formats and addresses
+            for i, (filenames, addresses, formats) in enumerate(
+                zip(aggregation_file, aggregation_address, aggregation_format)
+            ):
+                n = n_trailing - len(filenames)
+                if n:
+                    pad = ("",) * n
+                    aggregation_file[i] = filenames + pad
+                    aggregation_format[i] = formats + pad
+                    if isinstance(addresses[0], int):
+                        pad = (-1,) * n
+
+                    aggregation_address[i] = addresses + pad
+
+        # Reshape the 1-d aggregation instruction arrays to span the
+        # data dimensions, plus the extra trailing dimension if there
+        # is one.
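+        #
+        # For instance (an illustrative sketch, not taken from a real
+        # dataset): for data with two Dask chunks along one dimension
+        # (numblocks == (2,)) and a single fragment file per chunk,
+        # 'aggregation_file' is reshaped to a (2,)-shaped array of
+        # file names; if any chunk had offered two candidate files
+        # then 'a_shape' would be (2, 2), with the shorter entries
+        # padded (and masked below).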
+ aggregation_file = np.array(aggregation_file).reshape(a_shape) + aggregation_address = np.array(aggregation_address).reshape(a_shape) + aggregation_format = np.array(aggregation_format).reshape(a_shape) + + # Mask any padded elements + if pad: + aggregation_file = np.ma.where( + aggregation_file == "", np.ma.masked, aggregation_file + ) + mask = aggregation_file.mask + aggregation_address = np.ma.array(aggregation_address, mask=mask) + aggregation_format = np.ma.array(aggregation_format, mask=mask) + + # ------------------------------------------------------------ + # Create the location array + # ------------------------------------------------------------ + dtype = np.dtype(np.int32) + if max(data.to_dask_array().chunksize) > np.iinfo(dtype).max: + dtype = np.dtype(np.int64) + + ndim = data.ndim + aggregation_location = np.ma.masked_all( + (ndim, max(a_shape[:ndim])), dtype=dtype + ) + + for i, chunks in enumerate(data.chunks): + aggregation_location[i, : len(chunks)] = chunks + + # ------------------------------------------------------------ + # Return Data objects + # ------------------------------------------------------------ + data = type(data) + return { + "location": data(aggregation_location), + "file": data(aggregation_file), + "format": data(aggregation_format), + "address": data(aggregation_address), + } + + def _customise_write_vars(self): + """Customise the write parameters. + + .. versionadded:: TODOCFAVER + + """ + g = self.write_vars + + if g.get("cfa"): + from os.path import abspath + from pathlib import PurePath + + # Find the absolute directory path of the output + # CFA-netCDF file URI + g["cfa_dir"] = PurePath(abspath(g["filename"])).parent + + def _cfa_get_file_details(self, data): + """Get the details of all files referenced by the data. + + .. versionadded:: TODOCFAVER + + :Parameters: + + data: `Data` + The data + + :Returns: + + `set` of 3-tuples + A set containing 3-tuples giving the file names, + the addresses in the files, and the file formats. If + no files are required to compute the data then + an empty `set` is returned. + + **Examples** + + >>> n._cfa_get_file_details(data): + {(('/home/file.nc',), ('tas',), ('nc',))} + + >>> n._cfa_get_file_details(data): + {(('/home/file.pp',), (34556,), ('um',))} + + """ + out = set() + for a in data.todict().values(): + try: + out.update( + ((a.get_filenames(), a.get_addresses(), a.get_formats()),) + ) + except AttributeError: + pass + + return out diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 9fdcbef71c..852220b199 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -59,11 +59,16 @@ def read( warn_valid=False, chunks="auto", domain=False, + cfa=None, ): - """Read field constructs from netCDF, CDL, PP or UM fields datasets. + """Read field or domain constructs from files. - Input datasets are mapped to field constructs in memory which are - returned as elements of a `FieldList`. + The following file formats are supported: CF-netCDF, CFA-netCDF, + CDL, PP and UM fields datasets. + + Input datasets are mapped to constructs in memory which are + returned as elements of a `FieldList` or if the *domain* parameter + is True, a `DomainList`. NetCDF files may be on disk or on an OPeNDAP server. @@ -635,6 +640,28 @@ def read( .. versionadded:: 3.11.0 + cfa: `dict`, optional + Configure the reading of CFA-netCDF files. 
The dictionary + may have any subset of the following key/value pairs to + override the information read from the file: + + * ``'substitutions'``: `dict` + + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key may be specified with or without the + ``${...}`` syntax. For instance, the following are + equivalent: ``{'base': 'sub'}``, ``{'${base}': 'sub'}``. + The substitutions are used in conjunction with, and take + precedence over, any that are stored in the CFA-netCDF + file by the ``substitutions`` attribute of the ``file`` + CFA aggregation instruction variable. + + *Example:* + ``{'base': 'file:///data/'}`` + + .. versionadded:: TODOCFAVER + umversion: deprecated at version 3.0.0 Use the *um* parameter instead. @@ -768,6 +795,29 @@ def read( f"when recursive={recursive!r}" ) + # Parse the 'cfa' parameter + if cfa is None: + cfa_options = {} + else: + cfa_options = cfa.copy() + keys = ("substitutions",) + if not set(cfa_options).issubset(keys): + raise ValueError( + "Invalid dictionary key to the 'cfa' parameter." + f"Valid keys are {keys}. Got: {cfa_options}" + ) + + if "substitutions" in cfa_options: + substitutions = cfa_options["substitutions"].copy() + for base, sub in tuple(substitutions.items()): + if not (base.startswith("${") and base.endswith("}")): + # Add missing ${...} + substitutions[f"${{{base}}}"] = substitutions.pop(base) + else: + substitutions = {} + + cfa_options["substitutions"] = substitutions + # Initialise the output list of fields/domains if domain: out = DomainList() @@ -897,6 +947,7 @@ def read( warn_valid=warn_valid, select=select, domain=domain, + cfa_options=cfa_options, ) # -------------------------------------------------------- @@ -1008,6 +1059,7 @@ def _read_a_file( chunks="auto", select=None, domain=False, + cfa_options=None, ): """Read the contents of a single file into a field list. @@ -1038,6 +1090,11 @@ def _read_a_file( domain: `bool`, optional See `cf.read` for details. + cfa_options: `dict`, optional + See `cf.read` for details. + + .. versionadded:: TODOCFAVER + :Returns: `FieldList` or `DomainList` @@ -1071,11 +1128,7 @@ def _read_a_file( "chunks": chunks, "fmt": selected_fmt, "ignore_read_error": ignore_read_error, - # 'cfa' defaults to False. If the file has - # "CFA" in its Conventions global attribute - # then 'cfa' will be changed to True in - # netcdf.read - "cfa": False, + "cfa_options": cfa_options, } # ---------------------------------------------------------------- diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index d9e818d7e2..04ef52d8bf 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -38,7 +38,6 @@ _cached_time = {} _cached_ctime = {} _cached_size_1_height_coordinate = {} -_cached_z_reference_coordinate = {} _cached_date2num = {} _cached_model_level_number_coordinate = {} @@ -1077,12 +1076,7 @@ def __init__( for cm in cell_methods: self.implementation.set_cell_method(field, cm) - # Check for decreasing axes that aren't decreasing - down_axes = self.down_axes - logger.info(f"down_axes = {down_axes}") # pragma: no cover - - if down_axes: - field.flip(down_axes, inplace=True) + logger.info(f"down_axes = {self.down_axes}") # pragma: no cover # Force cyclic X axis for particular values of LBHEM if xkey is not None and int_hdr[lbhem] in (0, 1, 2, 4): @@ -1135,6 +1129,57 @@ def __str__(self): return "\n".join(out) + def _reorder_z_axis(self, indices, z_axis, pmaxes): + """Reorder the Z axis `Rec` instances. 
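+
+        This is needed when the Z axis is a "down" (decreasing) axis:
+        within each run of Z levels the `Rec` instances are
+        re-assigned in reverse order, so that the data records remain
+        consistent with the Z coordinate, which `coord_positive` flips
+        to be increasing for such axes.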
+ + :Parameters: + + indices: `list` + Aggregation axis indices. See `create_data` for + details. + + z_axis: `int` + The identifier of the Z axis. + + pmaxes: sequence of `int` + The aggregation axes, which include the Z axis. + + :Returns: + + `list` + + **Examples** + + >>> _reorder_z_axis([(0, ), (1, )], 0, [0]) + [(0, ), (1, )] + + >>> _reorder_z_axis( + ... [(0, 0, ), + ... (0, 1, ), + ... (1, 0, ), + ... (1, 1, )], + ... 1, [0, 1] + ... ) + [(0, 0, ), (0, 1, ), (1, 0, ), (1, 1, )] + + """ + indices_new = [] + zpos = pmaxes.index(z_axis) + aaa0 = indices[0] + indices2 = [aaa0] + for aaa in indices[1:]: + if aaa[zpos] > aaa0[zpos]: + indices2.append(aaa) + else: + indices_new.extend(indices2[::-1]) + aaa0 = aaa + indices2 = [aaa0] + + indices_new.extend(indices2[::-1]) + + indices = [a[:-1] + b[-1:] for a, b in zip(indices, indices_new)] + return indices + def atmosphere_hybrid_height_coordinate(self, axiscode): """`atmosphere_hybrid_height_coordinate` when not an array axis. @@ -1246,8 +1291,9 @@ def atmosphere_hybrid_height_coordinate(self, axiscode): dc = None else: array = array / toa_height + bounds = bounds / toa_height dc = self.implementation.initialise_DimensionCoordinate() - dc = self.coord_data(dc, array, bounds, units=_Units[""]) + dc = self.coord_data(dc, array, bounds, units=_Units["1"]) self.implementation.set_properties( dc, {"standard_name": "atmosphere_hybrid_height_coordinate"}, @@ -1644,10 +1690,12 @@ def coord_data( """ if array is not None: array = Data(array, units=units, fill_value=fill_value) + array._cfa_set_write(True) self.implementation.set_data(c, array, copy=False) if bounds is not None: bounds_data = Data(bounds, units=units, fill_value=fill_value) + bounds_data._cfa_set_write(True) bounds = self.implementation.initialise_Bounds() self.implementation.set_data(bounds, bounds_data, copy=False) self.implementation.set_bounds(c, bounds, copy=False) @@ -1685,7 +1733,8 @@ def coord_positive(self, c, axiscode, domain_axis_key): :Parameters: - c: Coordinate construct + c: `Coordinate` + A 1-d coordinate construct axiscode: `int` @@ -1701,6 +1750,7 @@ def coord_positive(self, c, axiscode, domain_axis_key): c.positive = positive if positive == "down" and axiscode != 4: self.down_axes.add(domain_axis_key) + c.flip(inplace=True) return c @@ -1870,6 +1920,9 @@ def create_data(self): name = (UMArray().__class__.__name__ + "-" + token,) dsk = {} full_slice = Ellipsis + klass_name = UMArray().__class__.__name__ + + fmt = self.fmt if len(recs) == 1: # -------------------------------------------------------- @@ -1888,19 +1941,19 @@ def create_data(self): subarray = UMArray( filename=filename, + address=rec.hdr_offset, shape=yx_shape, dtype=data_type_in_file(rec), - header_offset=rec.hdr_offset, - data_offset=rec.data_offset, - disk_length=rec.disk_length, - fmt=self.fmt, + fmt=fmt, word_size=self.word_size, byte_ordering=self.byte_ordering, units=units, calendar=calendar, ) - dsk[name + (0, 0)] = (getter, subarray, full_slice, False, False) + key = f"{klass_name}-{tokenize(subarray)}" + dsk[key] = subarray + dsk[name + (0, 0)] = (getter, key, full_slice, False, False) dtype = data_type_in_file(rec) chunks = normalize_chunks((-1, -1), shape=data_shape, dtype=dtype) @@ -1917,18 +1970,23 @@ def create_data(self): # ---------------------------------------------------- # 1-d partition matrix # ---------------------------------------------------- + z_axis = _axis[self.z_axis] if nz > 1: - pmaxes = [_axis[self.z_axis]] + pmaxes = [z_axis] data_shape = (nz, LBROW, LBNPT) else: 
pmaxes = [_axis["t"]] data_shape = (nt, LBROW, LBNPT) - fmt = self.fmt word_size = self.word_size byte_ordering = self.byte_ordering - for i, rec in enumerate(recs): + indices = [(i, rec) for i, rec in enumerate(recs)] + + if z_axis in self.down_axes: + indices = self._reorder_z_axis(indices, z_axis, pmaxes) + + for i, rec in indices: # Find the data type of the array in the file file_data_type = data_type_in_file(rec) file_data_types.add(file_data_type) @@ -1937,11 +1995,9 @@ def create_data(self): subarray = UMArray( filename=filename, + address=rec.hdr_offset, shape=shape, dtype=file_data_type, - header_offset=rec.hdr_offset, - data_offset=rec.data_offset, - disk_length=rec.disk_length, fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, @@ -1949,9 +2005,11 @@ def create_data(self): calendar=calendar, ) + key = f"{klass_name}-{tokenize(subarray)}" + dsk[key] = subarray dsk[name + (i, 0, 0)] = ( getter, - subarray, + key, full_slice, False, False, @@ -1965,17 +2023,21 @@ def create_data(self): # ---------------------------------------------------- # 2-d partition matrix # ---------------------------------------------------- - pmaxes = [_axis["t"], _axis[self.z_axis]] + z_axis = _axis[self.z_axis] + pmaxes = [_axis["t"], z_axis] + data_shape = (nt, nz, LBROW, LBNPT) - fmt = self.fmt word_size = self.word_size byte_ordering = self.byte_ordering - for i, rec in enumerate(recs): - # Find T and Z axis indices - t, z = divmod(i, nz) + indices = [ + divmod(i, nz) + (rec,) for i, rec in enumerate(recs) + ] + if z_axis in self.down_axes: + indices = self._reorder_z_axis(indices, z_axis, pmaxes) + for t, z, rec in indices: # Find the data type of the array in the file file_data_type = data_type_in_file(rec) file_data_types.add(file_data_type) @@ -1984,11 +2046,9 @@ def create_data(self): subarray = UMArray( filename=filename, + address=rec.hdr_offset, shape=shape, dtype=file_data_type, - header_offset=rec.hdr_offset, - data_offset=rec.data_offset, - disk_length=rec.disk_length, fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, @@ -1996,9 +2056,11 @@ def create_data(self): calendar=calendar, ) + key = f"{klass_name}-{tokenize(subarray)}" + dsk[key] = subarray dsk[name + (t, z, 0, 0)] = ( getter, - subarray, + key, full_slice, False, False, @@ -2021,6 +2083,7 @@ def create_data(self): # Create the Data object data = Data(array, units=um_Units, fill_value=fill_value) + data._cfa_set_write(True) self.data = data self.data_axes = data_axes @@ -3117,12 +3180,6 @@ def z_coordinate(self, axiscode): if _coord_positive.get(axiscode, None) == "down": bounds0, bounds1 = bounds1, bounds0 - # key = (axiscode, array, bounds0, bounds1) - # dc = _cached_z_coordinate.get(key, None) - - # if dc is not None: - # copy = True - # else: copy = False array = np.array(array, dtype=float) bounds0 = np.array(bounds0, dtype=float) @@ -3161,69 +3218,6 @@ def z_coordinate(self, axiscode): return dc - @_manage_log_level_via_verbose_attr - def z_reference_coordinate(self, axiscode): - """Create and return the Z reference coordinates.""" - logger.info( - "Creating Z reference coordinates from BRLEV" - ) # pragma: no cover - - array = np.array( - [rec.real_hdr.item(brlev) for rec in self.z_recs], dtype=float - ) - - LBVC = self.lbvc - atol = self.atol - - key = (axiscode, LBVC, array) - dc = _cached_z_reference_coordinate.get(key, None) - - if dc is not None: - copy = True - else: - if not 128 <= LBVC <= 139: - bounds = [] - for rec in self.z_recs: - BRLEV = rec.real_hdr.item(brlev) - BRSVD1 = 
rec.real_hdr.item(brsvd1) - - if abs(BRSVD1 - BRLEV) >= atol: - bounds = None - break - - bounds.append((BRLEV, BRSVD1)) - else: - bounds = None - - if bounds: - bounds = np.array((bounds,), dtype=float) - - dc = self.implementation.initialise_DimensionCoordinate() - dc = self.coord_data( - dc, - array, - bounds, - units=_axiscode_to_Units.setdefault(axiscode, None), - ) - dc = self.coord_axis(dc, axiscode) - dc = self.coord_names(dc, axiscode) - - if not dc.get("positive", True): # ppp - dc.flip(i=True) - - _cached_z_reference_coordinate[key] = dc - copy = False - - self.implementation.set_dimension_coordinate( - self.field, - dc, - axes=[_axis["z"]], - copy=copy, - autocyclic=_autocyclic_false, - ) - - return dc - # _stash2standard_name = {} # diff --git a/cf/read_write/write.py b/cf/read_write/write.py index c9cb5cb741..bba9f173d0 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -2,7 +2,12 @@ from ..cfimplementation import implementation from ..decorators import _manage_log_level_via_verbosity -from ..functions import _DEPRECATION_ERROR_FUNCTION_KWARGS, flat +from ..functions import ( + _DEPRECATION_ERROR_FUNCTION_KWARG, + _DEPRECATION_ERROR_FUNCTION_KWARG_VALUE, + CFA, + flat, +) from .netcdf import NetCDFWrite netcdf = NetCDFWrite(implementation()) @@ -27,7 +32,7 @@ def write( shuffle=True, reference_datetime=None, verbose=None, - cfa_options=None, + cfa=False, single=None, double=None, variable_attributes=None, @@ -36,9 +41,7 @@ def write( group=True, coordinates=False, omit_data=None, - HDF_chunksizes=None, - no_shuffle=None, - unlimited=None, + cfa_options=None, ): """Write field constructs to a netCDF file. @@ -135,14 +138,14 @@ def write( fmt: `str`, optional The format of the output file. One of: - ========================== ================================ + ========================== ============================== *fmt* Output file type - ========================== ================================ - ``'NETCDF4'`` NetCDF4 format file. This is the - default. + ========================== ============================== + ``'NETCDF4'`` NetCDF4 format file. This is + the default. - ``'NETCDF4_CLASSIC'`` NetCDF4 classic format file (see - below) + ``'NETCDF4_CLASSIC'`` NetCDF4 classic format file + (see below) ``'NETCDF3_CLASSIC'`` NetCDF3 classic format file (limited to file sizes less @@ -155,18 +158,20 @@ def write( ``'NETCDF3_64BIT_OFFSET'`` ``'NETCDF3_64BIT_DATA'`` NetCDF3 64-bit offset format - file with extensions (see below) + file with extensions (see + below) - ``'CFA'`` or ``'CFA4'`` CFA-netCDF4 format file + ``'CFA'`` or ``'CFA4'`` Deprecated at version + TODOCFAVER. See the *cfa* + parameter. - ``'CFA3'`` CFA-netCDF3 classic format file - ========================== ================================ + ``'CFA3'`` Deprecated at version + TODOCFAVER. See the *cfa* + parameter. + ========================== ============================== By default the format is ``'NETCDF4'``. - All formats support large files (i.e. those greater than - 2GB) except ``'NETCDF3_CLASSIC'``. - ``'NETCDF3_64BIT_DATA'`` is a format that requires version 4.4.0 or newer of the C library (use `cf.environment` to see which version if the netCDF-C library is in use). It @@ -356,31 +361,6 @@ def write( external to the named external file. Ignored if there are no such constructs. 
- cfa_options: `dict`, optional - A dictionary giving parameters for configuring the output - CFA-netCDF file: - - ========== =============================================== - Key Value - ========== =============================================== - ``'base'`` * If ``None`` (the default) then file names - within CFA-netCDF files are stored with - absolute paths. - - * If set to an empty string then file names - within CFA-netCDF files are given relative to - the directory or URL base containing the - output CFA-netCDF file. - - * If set to a string then file names within - CFA-netCDF files are given relative to the - directory or URL base described by the - value. For example: ``'../archive'``. - ========== =============================================== - - By default no parameters are specified. - - endian: `str`, optional The endian-ness of the output file. Valid values are ``'little'``, ``'big'`` or ``'native'``. By default the @@ -440,9 +420,6 @@ def write( `_ for more details. - This parameter replaces the deprecated *no_shuffle* - parameter. - datatype: `dict`, optional Specify data type conversions to be applied prior to writing data to disk. This may be useful as a means of @@ -609,15 +586,101 @@ def write( .. versionadded:: 3.14.0 - HDF_chunksizes: deprecated at version 3.0.0 - HDF chunk sizes may be set for individual constructs prior - to writing, instead. See `cf.Data.nc_set_hdf5_chunksizes`. - - no_shuffle: deprecated at version 3.0.0 - Use keyword *shuffle* instead. - - unlimited: deprecated at version 3.0.0 - Use method `DomainAxis.nc_set_unlimited` instead. + cfa: `bool` or `dict`, optional + If True or a (possibly empty) dictionary then write the + constructs as CFA-netCDF aggregation variables, where + possible and where requested. + + The netCDF format of the CFA-netCDF file is determined by + the *fmt* parameter, as usual. + + If *cfa* is a dictionary then it is used to configure the + CFA write process. The default options when CFA writing is + enabled are ``{'constructs': 'field', 'absolute_paths': + True, 'strict': True, 'substitutions': {}}``, and the + dictionary may have any subset of the following key/value + pairs to override these defaults: + + * ``'constructs'``: `dict` or (sequence of) `str` + + The types of construct to be written as CFA-netCDF + aggregation variables. By default only field constructs + are written as CFA-netCDF aggregation variables. + + The types may be given as a (sequence of) `str`, which + may take any of the values allowed by the *omit_data* + parameter. Alternatively, the same types may be given as + keys to a `dict` whose values specify the number of + dimensions that a construct must also have if it is to + be written as a CFA-netCDF aggregation variable. A value + of `None` means no restriction on the number of + dimensions, which is equivalent to a value of + ``cf.ge(0)``. + + *Example:* + Equivalent ways to only write cell measure constructs + as CFA-netCDF aggregation variables: + ``'cell_measure``, ``['cell_measure']``, + ``{'cell_measure': None}``, ``{'cell_measure': + cf.ge(0)}`` + + *Example:* + Equivalent ways to only write field and auxiliary + coordinate constructs as CFA-netCDF aggregation + variables: ``('field', 'auxiliary_coordinate')`` and + ``{'field': None, 'auxiliary_coordinate': None}``. + + *Example:* + Equivalent ways to only write two-dimensional + auxiliary coordinate constructs as CFA-netCDF + aggregation variables: ``{'auxiliary_coordinate': + 2}`` and ``{'auxiliary_coordinate': cf.eq(2)}``. 
+ + *Example:* + Only write auxiliary coordinate constructs with two or + more dimensions as CFA-netCDF variables, and also all + field constructs: ``{'field': None, + 'auxiliary_coordinate': cf.ge(2)}``. + + * ``'absolute_paths'``: `bool` + + How to write fragment file names. Set to True (the + default) for them to be written as fully qualified URIs, + or else set to False for them to be written as local + paths relative to the location of the CFA-netCDF file + being created. + + * ``'strict'``: `bool` + + If True (the default) then an exception is raised if it + is not possible to create a CFA aggregation variable + from data identified by the ``'constructs'`` option. If + False then a normal CF-netCDF variable will be written + in this case. + + * ``'substitutions'``: `dict` + + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key may be specified with or without the + ``${...}`` syntax. For instance, the following are + equivalent: ``{'base': 'sub'}``, ``{'${base}': 'sub'}``. + The substitutions are used in conjunction with, and take + precedence over, any that are also defined on individual + constructs (see `cf.Data.cfa_update_file_substitutions` + for details). + + Substitutions are stored in the output file by the + ``substitutions`` attribute of the ``file`` CFA + aggregation instruction variable. + + *Example:* + ``{'base': 'file:///data/'}`` + + .. versionadded:: TODOCFAVER + + cfa_options: Deprecated at version TODOCFAVER + Use the *cfa* parameter instead. :Returns: @@ -637,26 +700,23 @@ def write( >>> cf.write(f, 'file.nc', Conventions='CMIP-6.2') """ - if unlimited is not None: - _DEPRECATION_ERROR_FUNCTION_KWARGS( - "cf.write", - {"unlimited": unlimited}, - "Use method 'DomainAxis.nc_set_unlimited' instead.", - ) # pragma: no cover - - if no_shuffle is not None: - _DEPRECATION_ERROR_FUNCTION_KWARGS( + if fmt in ("CFA", "CFA4", "CFA3"): + return _DEPRECATION_ERROR_FUNCTION_KWARG_VALUE( "cf.write", - {"no_shuffle": no_shuffle}, - "Use keyword 'shuffle' instead.", + "fmt", + fmt, + "Use the 'cfa' keyword instead.", + version="TODOCFAVER", + removed_at="5.0.0", ) # pragma: no cover - if HDF_chunksizes is not None: - _DEPRECATION_ERROR_FUNCTION_KWARGS( + if cfa_options is not None: + return _DEPRECATION_ERROR_FUNCTION_KWARG( "cf.write", - {"HDF_chunksizes": HDF_chunksizes}, - "HDF chunk sizes may be set for individual field constructs " - "prior to writing, instead.", + "cfa_options", + "Use keyword 'cfa' instead.", + version="TODOCFAVER", + removed_at="5.0.0", ) # pragma: no cover # Flatten the sequence of intput fields @@ -670,9 +730,7 @@ def write( raise ValueError("Can't set datatype and double") if single is not None and double is not None: - raise ValueError( - "Can't set both the single and double " "parameters" - ) + raise ValueError("Can't set both the single and double parameters") if single is not None and not single: double = True @@ -692,32 +750,60 @@ def write( numpy.dtype("int32"): numpy.dtype(int), } - extra_write_vars = { - "cfa": False, - "cfa_options": {}, - "reference_datetime": reference_datetime, - } - - # CFA options - if fmt in ("CFA", "CFA4"): - extra_write_vars["cfa"] = True - fmt = "NETCDF4" - if cfa_options: - extra_write_vars["cfa_options"] = cfa_options - elif fmt == "CFA3": - extra_write_vars["cfa"] = True - fmt = "NETCDF3_CLASSIC" - if cfa_options: - extra_write_vars["cfa_options"] = cfa_options - - if extra_write_vars["cfa"]: - if Conventions: - if isinstance(Conventions, str): - 
Conventions = (Conventions,) - - Conventions = tuple(Conventions) + ("CFA",) + # Extra write variables + extra_write_vars = {"reference_datetime": reference_datetime} + + # ------------------------------------------------------------ + # CFA + # ------------------------------------------------------------ + if isinstance(cfa, dict): + cfa_options = cfa.copy() + cfa = True + else: + cfa_options = {} + cfa = bool(cfa) + + if cfa: + # Add CFA to the Conventions + cfa_conventions = f"CFA-{CFA()}" + if not Conventions: + Conventions = cfa_conventions + elif isinstance(Conventions, str): + Conventions = (Conventions, cfa_conventions) else: - Conventions = "CFA" + Conventions = tuple(Conventions) + (cfa_conventions,) + + keys = ("constructs", "absolute_paths", "strict", "substitutions") + if not set(cfa_options).issubset(keys): + raise ValueError( + "Invalid dictionary key to the 'cfa_options' " + f"parameter. Valid keys are {keys}. Got: {cfa_options}" + ) + + cfa_options.setdefault("constructs", "field") + cfa_options.setdefault("absolute_paths", True) + cfa_options.setdefault("strict", True) + cfa_options.setdefault("substitutions", {}) + + constructs = cfa_options["constructs"] + if isinstance(constructs, dict): + cfa_options["constructs"] = constructs.copy() + else: + if isinstance(constructs, str): + constructs = (constructs,) + + cfa_options["constructs"] = {c: None for c in constructs} + + substitutions = cfa_options["substitutions"].copy() + for base, sub in tuple(substitutions.items()): + if not (base.startswith("${") and base.endswith("}")): + # Add missing ${...} + substitutions[f"${{{base}}}"] = substitutions.pop(base) + + cfa_options["substitutions"] = substitutions + + extra_write_vars["cfa"] = cfa + extra_write_vars["cfa_options"] = cfa_options netcdf.write( fields, diff --git a/cf/test/create_test_files.py b/cf/test/create_test_files.py index fe49d28e63..0cd4f5dd42 100644 --- a/cf/test/create_test_files.py +++ b/cf/test/create_test_files.py @@ -15539,6 +15539,77 @@ def _make_regrid_file(filename): ] +def _make_cfa_file(filename): + n = netCDF4.Dataset(filename, "w", format="NETCDF4") + + n.Conventions = f"CF-{VN} CFA-0.6.2" + n.comment = ( + "A CFA-netCDF file with non-standarised aggregation instructions" + ) + + n.createDimension("time", 12) + level = n.createDimension("level", 1) + lat = n.createDimension("lat", 73) + lon = n.createDimension("lon", 144) + n.createDimension("f_time", 2) + n.createDimension("f_level", 1) + n.createDimension("f_lat", 1) + n.createDimension("f_lon", 1) + n.createDimension("i", 4) + n.createDimension("j", 2) + + lon = n.createVariable("lon", "f4", ("lon",)) + lon.standard_name = "longitude" + lon.units = "degrees_east" + + lat = n.createVariable("lat", "f4", ("lat",)) + lat.standard_name = "latitude" + lat.units = "degrees_north" + + time = n.createVariable("time", "f4", ("time",)) + time.standard_name = "time" + time.units = "days since 2000-01-01" + + level = n.createVariable("level", "f4", ("level",)) + + tas = n.createVariable("tas", "f4", ()) + tas.standard_name = "air_temperature" + tas.units = "K" + tas.aggregated_dimensions = "time level lat lon" + tas.aggregated_data = "location: aggregation_location file: aggregation_file format: aggregation_format address: aggregation_address tracking_id: aggregation_tracking_id" + + loc = n.createVariable("aggregation_location", "i4", ("i", "j")) + loc[0, :] = 6 + loc[1, 0] = level.size + loc[2, 0] = lat.size + loc[3, 0] = lon.size + + fil = n.createVariable( + "aggregation_file", str, ("f_time", 
"f_level", "f_lat", "f_lon") + ) + fil[0, 0, 0, 0] = "January-June.nc" + fil[1, 0, 0, 0] = "July-December.nc" + + add = n.createVariable( + "aggregation_address", str, ("f_time", "f_level", "f_lat", "f_lon") + ) + add[0, 0, 0, 0] = "tas0" + add[1, 0, 0, 0] = "tas1" + + fmt = n.createVariable("aggregation_format", str, ()) + fmt[()] = "nc" + + tid = n.createVariable( + "aggregation_tracking_id", str, ("f_time", "f_level", "f_lat", "f_lon") + ) + tid[0, 0, 0, 0] = "tracking_id0" + tid[1, 0, 0, 0] = "tracking_id1" + + n.close() + + return filename + + contiguous_file = _make_contiguous_file("DSG_timeSeries_contiguous.nc") indexed_file = _make_indexed_file("DSG_timeSeries_indexed.nc") indexed_contiguous_file = _make_indexed_contiguous_file( @@ -15572,6 +15643,8 @@ def _make_regrid_file(filename): regrid_file = _make_regrid_file("regrid.nc") +cfa_file = _make_cfa_file("cfa.nc") + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) cf.environment() diff --git a/cf/test/file1.pp b/cf/test/file1.pp new file mode 100644 index 0000000000..04d26869c1 Binary files /dev/null and b/cf/test/file1.pp differ diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh index 67bbfd91b8..f02a941197 100755 --- a/cf/test/individual_tests.sh +++ b/cf/test/individual_tests.sh @@ -1,12 +1,16 @@ #!/bin/bash -python create_test_files.py +file=create_test_files.py +echo "Running $file" +python $file rc=$? if [[ $rc != 0 ]]; then exit $rc fi -python setup_create_field.py +file=setup_create_field.py +echo "Running $file" +python $file rc=$? if [[ $rc != 0 ]]; then exit $rc @@ -14,6 +18,7 @@ fi for file in test_*.py do + echo "Running $file" python $file rc=$? if [[ $rc != 0 ]]; then diff --git a/cf/test/setup_create_field.py b/cf/test/setup_create_field.py index 6440bd372f..34e4bdcd58 100644 --- a/cf/test/setup_create_field.py +++ b/cf/test/setup_create_field.py @@ -83,7 +83,7 @@ def test_create_field(self): "iota", "kappa", ], - dtype="S", + dtype="U", ) ) ) diff --git a/cf/test/test_CFA.py b/cf/test/test_CFA.py new file mode 100644 index 0000000000..0fbe4b4996 --- /dev/null +++ b/cf/test/test_CFA.py @@ -0,0 +1,459 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest +from pathlib import PurePath + +import netCDF4 + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + +n_tmpfiles = 5 +tmpfiles = [ + tempfile.mkstemp("_test_CFA.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +( + tmpfile1, + tmpfile2, + tmpfile3, + tmpfile4, + tmpfile5, +) = tmpfiles + + +def _remove_tmpfiles(): + """Try to remove defined temporary files by deleting their paths.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class CFATest(unittest.TestCase): + netcdf3_fmts = [ + "NETCDF3_CLASSIC", + "NETCDF3_64BIT", + "NETCDF3_64BIT_OFFSET", + "NETCDF3_64BIT_DATA", + ] + netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] + netcdf_fmts = netcdf3_fmts + netcdf4_fmts + + def test_CFA_fmt(self): + """Test the cf.read 'fmt' and 'cfa' keywords.""" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + for fmt in self.netcdf_fmts: + cf.write(f, tmpfile2, fmt=fmt, cfa=True) + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_multiple_fragments(self): + """Test CFA with more than one fragment.""" + f = cf.example_field(0) + + cf.write(f[:2], tmpfile1) + cf.write(f[2:], tmpfile2) + + a = cf.read([tmpfile1, tmpfile2]) 
+ self.assertEqual(len(a), 1) + a = a[0] + + nc_file = tmpfile3 + cfa_file = tmpfile4 + cf.write(a, nc_file) + cf.write(a, cfa_file, cfa=True) + + n = cf.read(nc_file) + c = cf.read(cfa_file) + self.assertEqual(len(n), 1) + self.assertEqual(len(c), 1) + self.assertTrue(c[0].equals(f)) + self.assertTrue(n[0].equals(c[0])) + + def test_CFA_strict(self): + """Test CFA 'strict' option to the cfa.write 'cfa' keyword.""" + f = cf.example_field(0) + + # By default, can't write as CF-netCDF those variables + # selected for CFA treatment, but which aren't suitable. + with self.assertRaises(ValueError): + cf.write(f, tmpfile1, cfa=True) + + # The previous line should have deleted the output file + self.assertFalse(os.path.exists(tmpfile1)) + + cf.write(f, tmpfile1, cfa={"strict": False}) + g = cf.read(tmpfile1) + self.assertEqual(len(g), 1) + self.assertTrue(g[0].equals(f)) + + cf.write(g, tmpfile2, cfa={"strict": True}) + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(g[0].equals(f)) + + def test_CFA_field_ancillaries(self): + """Test creation of field ancillaries from non-standard CFA terms.""" + f = cf.example_field(0) + self.assertFalse(f.field_ancillaries()) + + a = f[:2] + b = f[2:] + a.set_property("foo", "bar_a") + b.set_property("foo", "bar_b") + cf.write(a, tmpfile1) + cf.write(b, tmpfile2) + + c = cf.read( + [tmpfile1, tmpfile2], aggregate={"field_ancillaries": "foo"} + ) + self.assertEqual(len(c), 1) + c = c[0] + self.assertEqual(len(c.field_ancillaries()), 1) + anc = c.field_ancillary() + self.assertTrue(anc.data.cfa_get_term()) + self.assertFalse(anc.data.cfa_get_write()) + + cf.write(c, tmpfile3, cfa=False) + c2 = cf.read(tmpfile3) + self.assertEqual(len(c2), 1) + self.assertFalse(c2[0].field_ancillaries()) + + cf.write(c, tmpfile4, cfa=True) + d = cf.read(tmpfile4) + self.assertEqual(len(d), 1) + d = d[0] + + self.assertEqual(len(d.field_ancillaries()), 1) + anc = d.field_ancillary() + self.assertTrue(anc.data.cfa_get_term()) + self.assertFalse(anc.data.cfa_get_write()) + self.assertTrue(d.equals(c)) + + cf.write(d, tmpfile5, cfa=False) + e = cf.read(tmpfile5) + self.assertEqual(len(e), 1) + self.assertFalse(e[0].field_ancillaries()) + + cf.write(d, tmpfile5, cfa=True) + e = cf.read(tmpfile5) + self.assertEqual(len(e), 1) + self.assertTrue(e[0].equals(d)) + + def test_CFA_substitutions_0(self): + """Test CFA substitution URI substitutions (0).""" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + cwd = os.getcwd() + + f.data.cfa_update_file_substitutions({"base": cwd}) + + cf.write( + f, + tmpfile2, + cfa={"absolute_paths": True}, + ) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual( + cfa_file.getncattr("substitutions"), + f"${{base}}: {cwd}", + ) + self.assertEqual( + cfa_file[...], f"file://${{base}}/{os.path.basename(tmpfile1)}" + ) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_substitutions_1(self): + """Test CFA substitution URI substitutions (1).""" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + cwd = os.getcwd() + for base in ("base", "${base}"): + cf.write( + f, + tmpfile2, + cfa={"absolute_paths": True, "substitutions": {base: cwd}}, + ) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual( + cfa_file.getncattr("substitutions"), + f"${{base}}: {cwd}", + ) + self.assertEqual( + cfa_file[...], 
f"file://${{base}}/{os.path.basename(tmpfile1)}" + ) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_substitutions_2(self): + """Test CFA substitution URI substitutions (2).""" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + cwd = os.getcwd() + + f.data.cfa_clear_file_substitutions() + f.data.cfa_update_file_substitutions({"base": cwd}) + + cf.write( + f, + tmpfile2, + cfa={ + "absolute_paths": True, + "substitutions": {"base2": "/bad/location"}, + }, + ) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual( + cfa_file.getncattr("substitutions"), + f"${{base2}}: /bad/location ${{base}}: {cwd}", + ) + self.assertEqual( + cfa_file[...], f"file://${{base}}/{os.path.basename(tmpfile1)}" + ) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + f.data.cfa_clear_file_substitutions() + f.data.cfa_update_file_substitutions({"base": "/bad/location"}) + + cf.write( + f, + tmpfile2, + cfa={"absolute_paths": True, "substitutions": {"base": cwd}}, + ) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual( + cfa_file.getncattr("substitutions"), + f"${{base}}: {cwd}", + ) + self.assertEqual( + cfa_file[...], f"file://${{base}}/{os.path.basename(tmpfile1)}" + ) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + f.data.cfa_clear_file_substitutions() + f.data.cfa_update_file_substitutions({"base2": "/bad/location"}) + + cf.write( + f, + tmpfile2, + cfa={"absolute_paths": True, "substitutions": {"base": cwd}}, + ) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual( + cfa_file.getncattr("substitutions"), + f"${{base2}}: /bad/location ${{base}}: {cwd}", + ) + self.assertEqual( + cfa_file[...], f"file://${{base}}/{os.path.basename(tmpfile1)}" + ) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_absolute_paths(self): + """Test CFA 'absolute_paths' option to the cfa.write 'cfa' keyword.""" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + for absolute_paths, filename in zip( + (True, False), + ( + PurePath(os.path.abspath(tmpfile1)).as_uri(), + os.path.basename(tmpfile1), + ), + ): + cf.write(f, tmpfile2, cfa={"absolute_paths": absolute_paths}) + + nc = netCDF4.Dataset(tmpfile2, "r") + cfa_file = nc.variables["cfa_file"] + self.assertEqual(cfa_file[...], filename) + nc.close() + + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_constructs(self): + """Test choice of constructs to write as CFA-netCDF variables.""" + f = cf.example_field(1) + f.del_construct("T") + f.del_construct("long_name=Grid latitude name") + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + + # No constructs + cf.write(f, tmpfile2, cfa={"constructs": []}) + nc = netCDF4.Dataset(tmpfile2, "r") + for var in nc.variables.values(): + attrs = var.ncattrs() + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # Field construct + cf.write(f, tmpfile2, cfa={"constructs": "field"}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ("ta",): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + 
self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # Dimension construct + for constructs in ( + "dimension_coordinate", + ["dimension_coordinate"], + {"dimension_coordinate": None}, + {"dimension_coordinate": 1}, + {"dimension_coordinate": cf.eq(1)}, + ): + cf.write(f, tmpfile2, cfa={"constructs": constructs}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ( + "x", + "x_bnds", + "y", + "y_bnds", + "atmosphere_hybrid_height_coordinate", + "atmosphere_hybrid_height_coordinate_bounds", + ): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # Dimension and auxiliary constructs + for constructs in ( + ["dimension_coordinate", "auxiliary_coordinate"], + {"dimension_coordinate": None, "auxiliary_coordinate": cf.ge(2)}, + ): + cf.write(f, tmpfile2, cfa={"constructs": constructs}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ( + "x", + "x_bnds", + "y", + "y_bnds", + "atmosphere_hybrid_height_coordinate", + "atmosphere_hybrid_height_coordinate_bounds", + "latitude_1", + "longitude_1", + ): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + def test_CFA_PP(self): + """Test writing CFA-netCDF with PP format fragments.""" + f = cf.read("file1.pp")[0] + cf.write(f, tmpfile1, cfa=True) + + # Check that only the fields have been aggregated + nc = netCDF4.Dataset(tmpfile1, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ("UM_m01s15i201_vn405",): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + g = cf.read(tmpfile1) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_multiple_files(self): + """Test storing multiple CFA frgament locations.""" + tmpfile1 = "delme1.nc" + tmpfile2 = "delme2.nc" + f = cf.example_field(0) + cf.write(f, tmpfile1) + f = cf.read(tmpfile1)[0] + f.add_file_location("/new/location") + + cf.write(f, tmpfile2, cfa=True) + g = cf.read(tmpfile2) + self.assertEqual(len(g), 1) + g = g[0] + self.assertTrue(f.equals(g)) + + self.assertEqual(len(g.data.get_filenames()), 2) + self.assertEqual(len(g.get_filenames()), 3) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print() + unittest.main(verbosity=2) diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 15ae5e59bb..87656e80b4 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1173,6 +1173,14 @@ def test_Data_concatenate(self): self.assertEqual(f.shape, f_np.shape) self.assertTrue((f.array == f_np).all()) + # Check cached elements + str(d) + str(e) + f = cf.Data.concatenate([d, e], axis=1) + cached = f._get_cached_elements() + self.assertEqual(cached[0], d.first_element()) + self.assertEqual(cached[-1], e.last_element()) + # Check concatenation with one invalid units 
d.override_units(cf.Units("foo"), inplace=1) with self.assertRaises(ValueError): @@ -4393,7 +4401,7 @@ def test_Data__init__datetime(self): def test_Data_get_filenames(self): """Test `Data.get_filenames`.""" - d = cf.Data.full((5, 8), 1, chunks=4) + d = cf.Data.ones((5, 8), float, chunks=4) self.assertEqual(d.get_filenames(), set()) f = cf.example_field(0) @@ -4528,6 +4536,108 @@ def test_Data_clear_after_dask_update(self): d._set_dask(dx, clear=_ALL) self.assertFalse(d._get_cached_elements()) + def test_Data_cfa_aggregated_data(self): + """Test `Data` CFA aggregated_data methods""" + d = cf.Data(9) + aggregated_data = { + "location": "cfa_location", + "file": "cfa_file", + "address": "cfa_address", + "format": "cfa_format", + "tracking_id": "tracking_id", + } + + self.assertFalse(d.cfa_has_aggregated_data()) + self.assertIsNone(d.cfa_set_aggregated_data(aggregated_data)) + self.assertTrue(d.cfa_has_aggregated_data()) + self.assertEqual(d.cfa_get_aggregated_data(), aggregated_data) + self.assertEqual(d.cfa_del_aggregated_data(), aggregated_data) + self.assertFalse(d.cfa_has_aggregated_data()) + self.assertEqual(d.cfa_get_aggregated_data(), {}) + self.assertEqual(d.cfa_del_aggregated_data(), {}) + + def test_Data_cfa_file_substitutions(self): + """Test `Data` CFA file_substitutions methods""" + d = cf.Data(9) + self.assertFalse(d.cfa_has_file_substitutions()) + self.assertIsNone( + d.cfa_update_file_substitutions({"base": "file:///data/"}) + ) + self.assertTrue(d.cfa_has_file_substitutions()) + self.assertEqual( + d.cfa_file_substitutions(), {"${base}": "file:///data/"} + ) + + d.cfa_update_file_substitutions({"${base2}": "/home/data/"}) + self.assertEqual( + d.cfa_file_substitutions(), + {"${base}": "file:///data/", "${base2}": "/home/data/"}, + ) + + d.cfa_update_file_substitutions({"${base}": "/new/location/"}) + self.assertEqual( + d.cfa_file_substitutions(), + {"${base}": "/new/location/", "${base2}": "/home/data/"}, + ) + self.assertEqual( + d.cfa_del_file_substitution("${base}"), + {"${base}": "/new/location/"}, + ) + self.assertEqual( + d.cfa_clear_file_substitutions(), {"${base2}": "/home/data/"} + ) + self.assertFalse(d.cfa_has_file_substitutions()) + self.assertEqual(d.cfa_file_substitutions(), {}) + self.assertEqual(d.cfa_clear_file_substitutions(), {}) + self.assertEqual(d.cfa_del_file_substitution("base"), {}) + + def test_Data_file_location(self): + """Test `Data` file location methods""" + f = cf.example_field(0) + + self.assertEqual( + f.data.add_file_location("/data/model/"), "/data/model" + ) + + cf.write(f, file_A) + d = cf.read(file_A, chunks=4)[0].data + self.assertGreater(d.npartitions, 1) + + e = d.copy() + location = os.path.dirname(os.path.abspath(file_A)) + + self.assertEqual(d.file_locations(), set((location,))) + self.assertEqual(d.add_file_location("/data/model/"), "/data/model") + self.assertEqual(d.file_locations(), set((location, "/data/model"))) + + # Check that we haven't changed 'e' + self.assertEqual(e.file_locations(), set((location,))) + + self.assertEqual(d.del_file_location("/data/model/"), "/data/model") + self.assertEqual(d.file_locations(), set((location,))) + d.del_file_location("/invalid") + self.assertEqual(d.file_locations(), set((location,))) + + def test_Data_todict(self): + """Test Data.todict""" + d = cf.Data([1, 2, 3, 4], chunks=2) + key = d.to_dask_array().name + + x = d.todict() + self.assertIsInstance(x, dict) + self.assertIn((key, 0), x) + self.assertIn((key, 1), x) + + e = d[0] + x = e.todict() + self.assertIn((key, 0), x) + 
self.assertNotIn((key, 1), x) + + x = e.todict(optimize_graph=False) + self.assertIsInstance(x, dict) + self.assertIn((key, 0), x) + self.assertIn((key, 1), x) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_NetCDFArray.py b/cf/test/test_NetCDFArray.py new file mode 100644 index 0000000000..c69a4654e7 --- /dev/null +++ b/cf/test/test_NetCDFArray.py @@ -0,0 +1,129 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest + +from dask.base import tokenize + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_NetCDFArray.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile1,) = tmpfiles + + +def _remove_tmpfiles(): + """Try to remove defined temporary files by deleting their paths.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class NetCDFArrayTest(unittest.TestCase): + def test_NetCDFArray_del_file_location(self): + a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) + b = a.del_file_location("/data1") + self.assertIsNot(b, a) + self.assertEqual(b.get_filenames(), ("/data2/file2",)) + self.assertEqual(b.get_addresses(), ("tas2",)) + + a = cf.NetCDFArray( + ("/data1/file1", "/data2/file1", "/data2/file2"), + ("tas1", "tas1", "tas2"), + ) + b = a.del_file_location("/data2") + self.assertEqual(b.get_filenames(), ("/data1/file1",)) + self.assertEqual(b.get_addresses(), ("tas1",)) + + # Can't be left with no files + self.assertEqual(b.file_locations(), ("/data1",)) + with self.assertRaises(ValueError): + b.del_file_location("/data1/") + + def test_NetCDFArray_file_locations(self): + a = cf.NetCDFArray("/data1/file1") + self.assertEqual(a.file_locations(), ("/data1",)) + + a = cf.NetCDFArray(("/data1/file1", "/data2/file2")) + self.assertEqual(a.file_locations(), ("/data1", "/data2")) + + a = cf.NetCDFArray(("/data1/file1", "/data2/file2", "/data1/file2")) + self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1")) + + def test_NetCDFArray_add_file_location(self): + a = cf.NetCDFArray("/data1/file1", "tas") + b = a.add_file_location("/home/user") + self.assertIsNot(b, a) + self.assertEqual( + b.get_filenames(), ("/data1/file1", "/home/user/file1") + ) + self.assertEqual(b.get_addresses(), ("tas", "tas")) + + a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) + b = a.add_file_location("/home/user") + self.assertEqual( + b.get_filenames(), + ( + "/data1/file1", + "/data2/file2", + "/home/user/file1", + "/home/user/file2", + ), + ) + self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1", "tas2")) + + a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + b = a.add_file_location("/home/user") + self.assertEqual( + b.get_filenames(), + ("/data1/file1", "/data2/file1", "/home/user/file1"), + ) + self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1")) + + a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + b = a.add_file_location("/data1/") + self.assertEqual(b.get_filenames(), a.get_filenames()) + self.assertEqual(b.get_addresses(), a.get_addresses()) + + def test_NetCDFArray__dask_tokenize__(self): + a = cf.NetCDFArray("/data1/file1", "tas", shape=(12, 2), mask=False) + self.assertEqual(tokenize(a), tokenize(a.copy())) + + b = cf.NetCDFArray("/home/file2", "tas", shape=(12, 2)) + self.assertNotEqual(tokenize(a), tokenize(b)) + + def 
test_NetCDFArray_multiple_files(self): + f = cf.example_field(0) + cf.write(f, tmpfile1) + + # Create instance with non-existent file + n = cf.NetCDFArray( + filename=os.path.join("/bad/location", os.path.basename(tmpfile1)), + address=f.nc_get_variable(), + shape=f.shape, + dtype=f.dtype, + ) + # Add file that exists + n = n.add_file_location(os.path.dirname(tmpfile1)) + + self.assertEqual(len(n.get_filenames()), 2) + self.assertTrue((n[...] == f.array).all()) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print() + unittest.main(verbosity=2) diff --git a/cf/test/test_aggregate.py b/cf/test/test_aggregate.py index e80a98878d..975458be38 100644 --- a/cf/test/test_aggregate.py +++ b/cf/test/test_aggregate.py @@ -334,6 +334,25 @@ def test_aggregate_relaxed_units(self): self.assertEqual(i.Units.__dict__, bad_units.__dict__) self.assertTrue((i.array == f.array).all()) + def test_aggregate_field_ancillaries(self): + f = cf.example_field(0) + self.assertFalse(f.field_ancillaries()) + + a = f[:2] + b = f[2:] + a.set_property("foo", "bar_a") + b.set_property("foo", "bar_b") + + c = cf.aggregate([a, b], field_ancillaries="foo") + self.assertEqual(len(c), 1) + c = c[0] + self.assertTrue(len(c.field_ancillaries()), 1) + + anc = c.field_ancillary() + self.assertEqual(anc.shape, c.shape) + self.assertTrue((anc[:2] == "bar_a").all()) + self.assertTrue((anc[2:] == "bar_b").all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 7a32b0d859..7210e35263 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -368,6 +368,9 @@ def test_size(self): x = da.arange(9) self.assertEqual(cf.size(x), x.size) + def test_CFA(self): + self.assertEqual(cf.CFA(), cf.__cfa_version__) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 64c5434e98..0720ef4173 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -42,8 +42,6 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -TEST_DASKIFIED_ONLY = True - class read_writeTest(unittest.TestCase): filename = os.path.join( @@ -72,7 +70,6 @@ class read_writeTest(unittest.TestCase): netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] netcdf_fmts = netcdf3_fmts + netcdf4_fmts - # @unittest.skipIf(TEST_DASKIFIED_ONLY, "KeyError: 'q'") def test_write_filename(self): f = self.f0 a = f.array diff --git a/docs/source/class/cf.AuxiliaryCoordinate.rst b/docs/source/class/cf.AuxiliaryCoordinate.rst index 228394b7ef..f697317e41 100644 --- a/docs/source/class/cf.AuxiliaryCoordinate.rst +++ b/docs/source/class/cf.AuxiliaryCoordinate.rst @@ -508,6 +508,24 @@ Groups ~cf.AuxiliaryCoordinate.nc_set_variable_groups ~cf.AuxiliaryCoordinate.nc_clear_variable_groups +CFA +--- + +.. rubric:: Methods + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.AuxiliaryCoordinate.add_file_location + ~cf.AuxiliaryCoordinate.cfa_clear_file_substitutions + ~cf.AuxiliaryCoordinate.cfa_del_file_substitution + ~cf.AuxiliaryCoordinate.cfa_file_substitutions + ~cf.AuxiliaryCoordinate.cfa_update_file_substitutions + ~cf.AuxiliaryCoordinate.del_file_location + ~cf.AuxiliaryCoordinate.file_locations + Aliases ------- diff --git a/docs/source/class/cf.Bounds.rst b/docs/source/class/cf.Bounds.rst index 2cdb1214c1..94b2bf2f42 100644 --- a/docs/source/class/cf.Bounds.rst +++ b/docs/source/class/cf.Bounds.rst @@ -405,6 +405,23 @@ NetCDF ~cf.Bounds.nc_has_dimension ~cf.Bounds.nc_set_dimension +CFA +--- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.Bounds.add_file_location + ~cf.Bounds.cfa_clear_file_substitutions + ~cf.Bounds.cfa_del_file_substitution + ~cf.Bounds.cfa_file_substitutions + ~cf.Bounds.cfa_update_file_substitutions + ~cf.Bounds.del_file_location + ~cf.Bounds.file_locations Aliases ------- diff --git a/docs/source/class/cf.CellMeasure.rst b/docs/source/class/cf.CellMeasure.rst index 466a7e348c..cff23ee73e 100644 --- a/docs/source/class/cf.CellMeasure.rst +++ b/docs/source/class/cf.CellMeasure.rst @@ -425,6 +425,23 @@ NetCDF ~cf.CellMeasure.nc_get_external ~cf.CellMeasure.nc_set_external +CFA +--- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.CellMeasure.add_file_location + ~cf.CellMeasure.cfa_clear_file_substitutions + ~cf.CellMeasure.cfa_del_file_substitution + ~cf.CellMeasure.cfa_file_substitutions + ~cf.CellMeasure.cfa_update_file_substitutions + ~cf.CellMeasure.del_file_location + ~cf.CellMeasure.file_locations Aliases ------- diff --git a/docs/source/class/cf.Count.rst b/docs/source/class/cf.Count.rst index f35a115757..47d9c4509c 100644 --- a/docs/source/class/cf.Count.rst +++ b/docs/source/class/cf.Count.rst @@ -402,6 +402,24 @@ NetCDF ~cf.Count.nc_has_sample_dimension ~cf.Count.nc_set_sample_dimension +CFA +--- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.Count.add_file_location + ~cf.Count.cfa_clear_file_substitutions + ~cf.Count.cfa_del_file_substitution + ~cf.Count.cfa_file_substitutions + ~cf.Count.cfa_update_file_substitutions + ~cf.Count.del_file_location + ~cf.Count.file_locations + Aliases ------- diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst index 3a01228d82..afe0f56dcb 100644 --- a/docs/source/class/cf.Data.rst +++ b/docs/source/class/cf.Data.rst @@ -70,6 +70,8 @@ Dask ~cf.Data.cull_graph ~cf.Data.dask_compressed_array ~cf.Data.rechunk + ~cf.Data.chunk_indices + ~cf.Data.todict ~cf.Data.to_dask_array .. rubric:: Attributes @@ -645,6 +647,32 @@ Performance ~cf.Data.npartitions ~cf.Data.numblocks + +CFA +--- + +.. 
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.Data.add_file_location
+   ~cf.Data.cfa_clear_file_substitutions
+   ~cf.Data.cfa_del_aggregated_data
+   ~cf.Data.cfa_del_file_substitution
+   ~cf.Data.cfa_file_substitutions
+   ~cf.Data.cfa_get_aggregated_data
+   ~cf.Data.cfa_get_term
+   ~cf.Data.cfa_get_write
+   ~cf.Data.cfa_has_aggregated_data
+   ~cf.Data.cfa_has_file_substitutions
+   ~cf.Data.cfa_set_aggregated_data
+   ~cf.Data.cfa_set_term
+   ~cf.Data.cfa_set_write
+   ~cf.Data.cfa_update_file_substitutions
+   ~cf.Data.del_file_location
+   ~cf.Data.file_locations
+
 Element-wise arithmetic, bit and comparison operations
 ------------------------------------------------------
 
diff --git a/docs/source/class/cf.DimensionCoordinate.rst b/docs/source/class/cf.DimensionCoordinate.rst
index 5b1d81a3d4..94b372776e 100644
--- a/docs/source/class/cf.DimensionCoordinate.rst
+++ b/docs/source/class/cf.DimensionCoordinate.rst
@@ -516,6 +516,24 @@ Groups
    ~cf.DimensionCoordinate.nc_set_variable_groups
    ~cf.DimensionCoordinate.nc_clear_variable_groups
 
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.DimensionCoordinate.add_file_location
+   ~cf.DimensionCoordinate.cfa_clear_file_substitutions
+   ~cf.DimensionCoordinate.cfa_del_file_substitution
+   ~cf.DimensionCoordinate.cfa_file_substitutions
+   ~cf.DimensionCoordinate.cfa_update_file_substitutions
+   ~cf.DimensionCoordinate.del_file_location
+   ~cf.DimensionCoordinate.file_locations
+
 Aliases
 -------
 
diff --git a/docs/source/class/cf.Domain.rst b/docs/source/class/cf.Domain.rst
index 6a8d424c01..0655568037 100644
--- a/docs/source/class/cf.Domain.rst
+++ b/docs/source/class/cf.Domain.rst
@@ -251,6 +251,24 @@ Groups
    ~cf.Domain.nc_set_group_attribute
    ~cf.Domain.nc_set_group_attributes
 
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.Domain.add_file_location
+   ~cf.Domain.cfa_clear_file_substitutions
+   ~cf.Domain.cfa_del_file_substitution
+   ~cf.Domain.cfa_file_substitutions
+   ~cf.Domain.cfa_update_file_substitutions
+   ~cf.Domain.del_file_location
+   ~cf.Domain.file_locations
+
 Geometries
 ^^^^^^^^^^
 
diff --git a/docs/source/class/cf.DomainAncillary.rst b/docs/source/class/cf.DomainAncillary.rst
index a478ef869d..2313f367eb 100644
--- a/docs/source/class/cf.DomainAncillary.rst
+++ b/docs/source/class/cf.DomainAncillary.rst
@@ -452,8 +452,26 @@ NetCDF
    ~cf.DomainAncillary.nc_del_variable
    ~cf.DomainAncillary.nc_get_variable
    ~cf.DomainAncillary.nc_has_variable
-   ~cf.DomainAncillary.nc_set_variable 
+   ~cf.DomainAncillary.nc_set_variable
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.DomainAncillary.add_file_location
+   ~cf.DomainAncillary.cfa_clear_file_substitutions
+   ~cf.DomainAncillary.cfa_del_file_substitution
+   ~cf.DomainAncillary.cfa_file_substitutions
+   ~cf.DomainAncillary.cfa_update_file_substitutions
+   ~cf.DomainAncillary.del_file_location
+   ~cf.DomainAncillary.file_locations
+
 Aliases
 -------
 
diff --git a/docs/source/class/cf.Field.rst b/docs/source/class/cf.Field.rst
index 28a6133615..b82562117c 100644
--- a/docs/source/class/cf.Field.rst
+++ b/docs/source/class/cf.Field.rst
@@ -426,7 +426,25 @@ Groups
    ~cf.Field.nc_clear_group_attributes
    ~cf.Field.nc_set_group_attribute
    ~cf.Field.nc_set_group_attributes
-    
+
+CFA
+^^^
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.Field.add_file_location
+   ~cf.Field.cfa_clear_file_substitutions
+   ~cf.Field.cfa_del_file_substitution
+   ~cf.Field.cfa_file_substitutions
+   ~cf.Field.cfa_update_file_substitutions
+   ~cf.Field.del_file_location
+   ~cf.Field.file_locations
+
 Geometries
 ^^^^^^^^^^
 
diff --git a/docs/source/class/cf.FieldAncillary.rst b/docs/source/class/cf.FieldAncillary.rst
index a9c14b3a13..0e53ca16f5 100644
--- a/docs/source/class/cf.FieldAncillary.rst
+++ b/docs/source/class/cf.FieldAncillary.rst
@@ -399,6 +399,23 @@ NetCDF
    ~cf.FieldAncillary.nc_has_variable
    ~cf.FieldAncillary.nc_set_variable
 
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.FieldAncillary.add_file_location
+   ~cf.FieldAncillary.cfa_clear_file_substitutions
+   ~cf.FieldAncillary.cfa_del_file_substitution
+   ~cf.FieldAncillary.cfa_file_substitutions
+   ~cf.FieldAncillary.cfa_update_file_substitutions
+   ~cf.FieldAncillary.del_file_location
+   ~cf.FieldAncillary.file_locations
 Aliases
 -------
 
diff --git a/docs/source/class/cf.Index.rst b/docs/source/class/cf.Index.rst
index 81405432b2..1f680f9477 100644
--- a/docs/source/class/cf.Index.rst
+++ b/docs/source/class/cf.Index.rst
@@ -403,6 +403,23 @@ NetCDF
    ~cf.Index.nc_has_sample_dimension
    ~cf.Index.nc_set_sample_dimension
 
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.Index.add_file_location
+   ~cf.Index.cfa_clear_file_substitutions
+   ~cf.Index.cfa_del_file_substitution
+   ~cf.Index.cfa_file_substitutions
+   ~cf.Index.cfa_update_file_substitutions
+   ~cf.Index.del_file_location
+   ~cf.Index.file_locations
 Aliases
 -------
 
diff --git a/docs/source/class/cf.List.rst b/docs/source/class/cf.List.rst
index 1a27978da7..3bcc834a01 100644
--- a/docs/source/class/cf.List.rst
+++ b/docs/source/class/cf.List.rst
@@ -395,6 +395,23 @@ NetCDF
    ~cf.List.nc_has_variable
    ~cf.List.nc_set_variable
 
+CFA
+---
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cf.List.add_file_location
+   ~cf.List.cfa_clear_file_substitutions
+   ~cf.List.cfa_del_file_substitution
+   ~cf.List.cfa_file_substitutions
+   ~cf.List.cfa_update_file_substitutions
+   ~cf.List.del_file_location
+   ~cf.List.file_locations
 Aliases
 -------
 
diff --git a/docs/source/class/cf.NetCDFArray.rst b/docs/source/class/cf.NetCDFArray.rst
index 9df7e9fda3..34d7bf0d65 100644
--- a/docs/source/class/cf.NetCDFArray.rst
+++ b/docs/source/class/cf.NetCDFArray.rst
@@ -17,14 +17,22 @@ cf.NetCDFArray
    :toctree: ../method/
    :template: method.rst
 
+   ~cf.NetCDFArray.add_file_location
    ~cf.NetCDFArray.close
    ~cf.NetCDFArray.copy
+   ~cf.NetCDFArray.del_file_location
+   ~cf.NetCDFArray.file_locations
+   ~cf.NetCDFArray.filename
    ~cf.NetCDFArray.get_address
+   ~cf.NetCDFArray.get_addresses
+   ~cf.NetCDFArray.get_format
+   ~cf.NetCDFArray.get_formats
    ~cf.NetCDFArray.get_calendar
    ~cf.NetCDFArray.get_compression_type
    ~cf.NetCDFArray.get_filename
    ~cf.NetCDFArray.get_filenames
    ~cf.NetCDFArray.get_group
+   ~cf.NetCDFArray.get_groups
    ~cf.NetCDFArray.get_mask
    ~cf.NetCDFArray.get_missing_values
    ~cf.NetCDFArray.get_ncvar
@@ -33,7 +41,8 @@ cf.NetCDFArray
    ~cf.NetCDFArray.get_varid
    ~cf.NetCDFArray.open
    ~cf.NetCDFArray.to_memory
-
+   ~cf.NetCDFArray.Units
+
 .. rubric:: Attributes
 
 .. autosummary::
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index a7a5be668b..353354d821 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -201,8 +201,8 @@ Required
 * `cftime `_, version 1.6.0 or newer (note that this
   package may be installed with netCDF4).
 
-* `cfdm `_, version 1.10.0.3 or up to,
-  but not including, 1.10.1.0.
+* `cfdm `_, version 1.10.1.0 or up to,
+  but not including, 1.10.2.0.
 
 * `cfunits `_, version 3.3.5 or newer.
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index a49e844acd..2ca8f7f1cb 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -119,10 +119,8 @@ elements.
 
 The following file types can be read:
 
-* All formats of netCDF3 and netCDF4 files (including `CFA-netCDF
-  `_ files)
-  can be read, containing datasets for any version of CF up to and
-  including CF-|version|.
+* All formats of netCDF3 and netCDF4 files can be read, containing
+  datasets for any version of CF up to and including CF-|version|.
 
 ..
 
@@ -132,6 +130,12 @@ The following file types can be read:
 
 ..
 
+* `CFA-netCDF
+  `_
+  files at version 0.6 or later.
+
+..
+
 * :ref:`PP and UM fields files `,
   whose contents are mapped into field constructs.
 
@@ -5256,6 +5260,10 @@ The `cf.write` function has optional parameters to
 
 * append to the netCDF file rather than over-writing it by default;
 
+* write as a `CFA-netCDF
+  `_
+  file.
+
 * specify which field construct properties should become netCDF data
   variable attributes and which should become netCDF global
   attributes;
@@ -5275,6 +5283,8 @@ The `cf.write` function has optional parameters to
 
 * set the endian-ness of the output data.
 
+* omit the data arrays of selected constructs.
+
 For example, to use the `mode` parameter to append a new field, or
 fields, to a netCDF file whilst preserving the field or fields already
 contained in that file:
diff --git a/requirements.txt b/requirements.txt
index 050ec56d78..cc13a7e9f4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 netCDF4>=1.5.4
 cftime>=1.6.0
 numpy>=1.22
-cfdm>=1.10.0.3, <1.10.1.0
+cfdm>=1.10.1.0, <1.10.2.0
 psutil>=0.6.0
 cfunits>=3.3.5
 dask>=2022.12.1
diff --git a/scripts/cfa b/scripts/cfa
index e5d637d48c..c411cd7bf9 100755
--- a/scripts/cfa
+++ b/scripts/cfa
@@ -294,13 +294,18 @@ containing the source property values.
 .
 .TP
 .B \-\-cfa_base=[value]
+Deprecated. Use \-\-cfa_paths instead.
+.
+.
+.TP
+.B \-\-cfa_paths=value
 For output CFA\-netCDF files only. File names referenced by an output
 CFA\-netCDF file have relative, as opposed to absolute, paths or URL
 bases. This may be useful when relocating a CFA\-netCDF file together
 with the datasets referenced by it.
 .PP
 .RS
-If set with no value (\-\-cfa_base=) or the value is empty then file
+If set with no value (\-\-cfa_paths=) or the value is empty then file
 names are given relative to the directory or URL base containing the
 output CFA\-netCDF file.
 If set with a non\-empty value then file names are given relative to the directory or URL base described by the
@@ -1069,7 +1074,7 @@ Written by David Hassell
 [--reference_datetime]  Override coordinate reference date-times
 [--overwrite]           Overwrite pre-existing output files
 [--unlimited=axis       Create an unlimited dimension
-[--cfa_base=[value]]    Configure CFA-netCDF output files
+[--cfa_paths=value]     Configure CFA-netCDF output files
 [--single]              Write out as single precision
 [--double]              Write out as double precision
 [--compress=N]          Compress the output data
@@ -1102,7 +1107,7 @@ Using cf-python library version {cf.__version__} at {library_path}"""
         "1aDd:f:hino:r:s:uv:x:",
         longopts=[
             "axis=",
-            "cfa_base=",
+            "cfa_paths=",
             "compress=",
             "contiguous",
             "Directory",
@@ -1150,6 +1155,7 @@ Using cf-python library version {cf.__version__} at {library_path}"""
             "read=",
             "write=",
             "verbose=",
+            "cfa_base=",
         ],
     )
 except GetoptError as err:
@@ -1183,6 +1189,7 @@ Using cf-python library version {cf.__version__} at {library_path}"""
 aggregate_options = {}
 read_options = {}  # Keyword parameters to cf.read
 write_options = {}  # Keyword parameters to cf.write
+cfa_options = {}
 
 for option, arg in opts:
     if option in ("-h", "--help"):
@@ -1274,16 +1281,29 @@ Using cf-python library version {cf.__version__} at {library_path}"""
         write_options["single"] = True
     elif option == "--double":
         write_options["double"] = True
-    elif option == "--cfa_base":
-        write_options["cfa_options"] = {"base": arg}
+    elif option == "--cfa_paths":
+        cfa_options["absolute_paths"] = arg == "absolute"
+        if arg not in ("absolute", "relative"):
+            print(
+                f"{iam} ERROR: The {option} option must have a value "
+                "of either 'absolute' or 'relative'.",
+                file=sys.stderr,
+            )
+            sys.exit(2)
     elif option == "--unlimited":
         unlimited.append(arg)
+    elif option == "--cfa_base":
+        print(
+            f"{iam} ERROR: The {option} option has been deprecated.",
+            file=sys.stderr,
+        )
+        sys.exit(2)
     elif option in ("-v", "--view"):
         view = arg
         if view not in "smc":
             print(
                 f"{iam} ERROR: The {option} option must have a value "
-                "of either s, m or c",
+                "of either 's', 'm' or 'c'.",
                 file=sys.stderr,
             )
             sys.exit(2)
@@ -1381,19 +1401,23 @@ Using cf-python library version {cf.__version__} at {library_path}"""
         )
         sys.exit(2)
 
-    write_options["fmt"] = fmt
-
     if unlimited:
         write_options["unlimited"] = unlimited
 
-    if fmt == "CFA":
-        print(
-            f"{iam} ERROR: '-f CFA' has been replaced by '-f CFA3' or "
-            "'-f CFA4' for netCDF3 classic and netCDF4 CFA output formats "
-            "respectively",
-            file=sys.stderr,
-        )
-        sys.exit(2)
+    if fmt.startswith("CFA"):
+        if fmt in ("CFA", "CFA4"):
+            fmt = "NETCDF4"
+        elif fmt == "CFA3":
+            fmt = "NETCDF3_CLASSIC"
+
+        read_options["chunks"] = -1
+
+        if cfa_options:
+            write_options["cfa"] = cfa_options
+        else:
+            write_options["cfa"] = True
+
+    write_options["fmt"] = fmt
 
 if not infiles:
     print(
@@ -1472,7 +1496,7 @@ Using cf-python library version {cf.__version__} at {library_path}"""
     # ------------------------------------------------------------
     if view is None and one_to_one:
         outfile = re_sub("(\.pp|\.nc|\.nca)$", ".nc", infile)
-        if fmt in ("CFA3", "CFA4"):
+        if write_options["cfa"]:
             outfile += "a"
 
         if directory is not None:
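
For orientation, the following is a minimal sketch of the library calls that the revised ``cfa`` script now delegates to. It assumes that the ``cfa`` keyword of `cf.write` accepts either `True` or an options dictionary with an ``absolute_paths`` entry, as the changes above suggest; the file names, the extra file location, and the exact command-line equivalence are hypothetical illustrations rather than definitive behaviour.

.. code-block:: python

   import cf

   # Hypothetical file names, for illustration only
   fragment = "fragment.nc"
   aggregation = "aggregation.nca"

   # Write an ordinary netCDF file to act as a fragment
   f = cf.example_field(0)
   cf.write(f, fragment)

   # Read it back in whole-array chunks and write a CFA-netCDF aggregation
   # file, roughly what "cfa -f CFA4 --cfa_paths=relative -o aggregation.nca
   # fragment.nc" would now do: a NETCDF4 file containing CFA aggregation
   # variables with relative fragment paths.
   g = cf.read(fragment, chunks=-1)
   cf.write(g, aggregation, fmt="NETCDF4", cfa={"absolute_paths": False})

   # Register an extra directory in which the fragment files may also be found
   h = cf.read(aggregation)[0]
   h.add_file_location("/new/location")

   # The CFA conventions version written by this release
   print(cf.CFA())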