diff --git a/Changelog.rst b/Changelog.rst index fa3bed26f5..1b1265acb7 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,10 +1,21 @@ +Version NEXTVERSION +---------------- + +**2025-??-??** + +* Introduction of reading and writing of aggregation datasets (for + CF-1.13) (https://github.com/NCAS-CMS/cfdm/issues/319) +* Changed dependency: ``h5py>=3.12.0`` + +---- + Version 1.11.2.0 ---------------- **2025-01-28** * Introduction of `dask` for all data manipulations - https://github.com/NCAS-CMS/cfdm/issues/317) + (https://github.com/NCAS-CMS/cfdm/issues/317) * Fix bug that returned incorrect results when an invalid identifer is provided to `cf.Field.cell_methods` (https://github.com/NCAS-CMS/cfdm/issues/299) @@ -31,6 +42,7 @@ Version 1.11.2.0 * New dependency: ``h5py>=3.10.0`` * New dependency: ``s3fs>=2024.6.0`` * New dependency: ``dask>=2024.6.0,<=2024.7.1`` +* New dependency: ``uritools>=4.0.3`` * Removed dependency: ``netcdf_flattener`` ---- diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 0827cb3da6..6374d8556f 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -164,6 +164,7 @@ atol, chunksize, configuration, + dirname, environment, integer_dtype, log_level, @@ -195,10 +196,12 @@ from .data import ( Array, + AggregatedArray, BoundsFromNodesArray, CellConnectivityArray, CompressedArray, Data, + FullArray, GatheredArray, H5netcdfArray, NetCDFArray, diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index ebec61b877..3070e4c716 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -26,6 +26,7 @@ ) from .abstract import Implementation from .data import ( + AggregatedArray, BoundsFromNodesArray, CellConnectivityArray, Data, @@ -1346,19 +1347,27 @@ def get_field_data_axes(self, field): """ return field.get_data_axes() - def get_filenames(self, parent): + def get_filenames(self, parent, normalise=True): """Return the name of the file or files containing the data. :Parameters: parent: + normalise: `bool`, optional + If True (the default) then normalise the filenames by + applying any text substitutions and resolving the name + to an absolute path. If False then neither of these is + carried out. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `set` """ - return parent.get_filenames() + return parent.get_filenames(normalise=normalise) def get_data_max(self, parent): """Use `get_data_maximum` instead (since cfdm version 1.8.0).""" @@ -1726,7 +1735,10 @@ def get_data(self, parent, default=None): """ - return parent.get_data(default=default) + try: + return parent.get_data(default=default) + except AttributeError: + return default def get_data_axes(self, parent, key, default=None): """Get domain axis identifiers. @@ -1868,6 +1880,24 @@ def initialise_CellMethod(self, axes=None, method=None, qualifiers=None): cls = self.get_class("CellMethod") return cls(axes=axes, method=method, qualifiers=qualifiers) + def initialise_AggregatedArray(self, **kwargs): + """Return a `AggregatedArray` instance. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + kwargs: optional + Initialisation parameters to pass to the new instance. 
+ + :Returns: + + `AggregatedArray` + + """ + cls = self.get_class("AggregatedArray") + return cls(**kwargs) + def initialise_CoordinateConversion( self, domain_ancillaries=None, parameters=None ): @@ -3653,25 +3683,59 @@ def has_property(self, parent, prop): """ return parent.has_property(prop) - def squeeze(self, construct, axes=None): + def squeeze(self, construct, axes=None, inplace=False): """Remove size 1 axes from construct data. :Parameters: construct: + The construct. axes: optional + The axes to squeeze. If `None` then all size 1 axes + are removed from the data. + + inplace: `bool`, optional + If True then do the operation in-place and return + `None`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + The construct with removed axes, or `None` if the + operation was in-place. + + """ + return construct.squeeze(axes=axes, inplace=inplace) + + def unsqueeze(self, field, inplace=False): + """Insert size 1 axes into the field data array. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + field: `Field` + The field construct. + + inplace: `bool`, optional + If True then do the operation in-place and return + `None`. :Returns: - The construct with removed axes. + `Field` or `None` + The field with inserted axes, or `None` if the + operation was in-place. """ - return construct.squeeze(axes=axes) + return field.unsqueeze(inplace=inplace) _implementation = CFDMImplementation( cf_version=CF(), + AggregatedArray=AggregatedArray, AuxiliaryCoordinate=AuxiliaryCoordinate, CellConnectivity=CellConnectivity, CellConnectivityArray=CellConnectivityArray, diff --git a/cfdm/core/__init__.py b/cfdm/core/__init__.py index d92be840c4..520bfbebcf 100644 --- a/cfdm/core/__init__.py +++ b/cfdm/core/__init__.py @@ -11,9 +11,9 @@ """ -__date__ = "2025-01-28" -__cf_version__ = "1.11" -__version__ = "1.11.2.0" +__date__ = "202?-??-??" +__cf_version__ = "1.12" +__version__ = "1.12.0.0" from packaging import __version__ as _packaging_ver from packaging import __file__ as _packaging_file diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 22a7835398..8a460ecb8b 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -15,9 +15,11 @@ from .subarray.abstract import MeshSubarray, Subarray, SubsampledSubarray +from .aggregatedarray import AggregatedArray from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray +from .fullarray import FullArray from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray from .netcdf4array import NetCDF4Array diff --git a/cfdm/data/abstract/__init__.py b/cfdm/data/abstract/__init__.py index 50063aa559..cd88bed134 100644 --- a/cfdm/data/abstract/__init__.py +++ b/cfdm/data/abstract/__init__.py @@ -1,4 +1,5 @@ from .array import Array from .compressedarray import CompressedArray +from .filearray import FileArray from .mesharray import MeshArray from .raggedarray import RaggedArray diff --git a/cfdm/data/abstract/compressedarray.py b/cfdm/data/abstract/compressedarray.py index c8c387c86f..cea7e126e6 100644 --- a/cfdm/data/abstract/compressedarray.py +++ b/cfdm/data/abstract/compressedarray.py @@ -338,7 +338,10 @@ def compressed_array(self): if ca is None: raise ValueError("There is no underlying compressed array") - return ca.array + try: + return ca.array + except AttributeError: + return ca def get_compressed_axes(self): """Return axes that are compressed in the underlying array. 
@@ -439,27 +442,45 @@ def conformed_data(self): """ return {"data": self.source().copy()} - def get_filenames(self): - """Return the names of any files containing the compressed data. + def get_filename(self, normalise=False, default=AttributeError()): + """Return the name of the file containing the compressed data. .. versionadded:: (cfdm) 1.10.0.2 + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + default: optional + Return the value of the *default* parameter if there + is no file name. + + {{default Exception}} + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: - `set` - The file names in normalised, absolute form. If the - data are all in memory then an empty `set` is - returned. + `str` + The file name. """ data = self._get_compressed_Array(None) if data is None: return set() - try: - return data.get_filenames() - except AttributeError: - return set() + filenames = data.get_filenames(normalise=normalise) + if len(filenames) != 1: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no unique file name" + ) + + return filenames[0] def get_Subarray(self): """Return the Subarray class. diff --git a/cfdm/data/abstract/filearray.py b/cfdm/data/abstract/filearray.py new file mode 100644 index 0000000000..297ad9850d --- /dev/null +++ b/cfdm/data/abstract/filearray.py @@ -0,0 +1,653 @@ +from copy import deepcopy +from os import sep +from os.path import join +from urllib.parse import urlparse + +from s3fs import S3FileSystem +from uritools import isuri, urisplit + +from ...functions import abspath, dirname +from . import Array + + +class FileArray(Array): + """Abstract base class for an array in a file. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + unpack=True, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The locations of datasets containing the array. + + address: (sequence of `str`), optional + How to find the array in the datasets. + + dtype: `numpy.dtype`, optional + The data type of the array. May be `None` if is not + known. This may differ from the data type of the + array in the datasets. + + shape: `tuple`, optional + The shape of the dataset array. + + {{init mask: `bool`, optional}} + + {{init unpack: `bool`, optional}} + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set during the first `__getitem__` + call. 
+ + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + unpack = source._get_component("unpack", True) + except AttributeError: + unpack = True + + try: + attributes = source._get_component("attributes", None) + except AttributeError: + attributes = None + + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + self._set_component("filename", filename, copy=False) + + if address is not None: + self._set_component("address", address, copy=False) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", bool(mask), copy=False) + self._set_component("unpack", bool(unpack), copy=False) + + if storage_options is not None: + self._set_component("storage_options", storage_options, copy=copy) + + if attributes is not None: + self._set_component("attributes", attributes, copy=copy) + + # By default, close the netCDF file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Return a subspace of the array. + + x.__getitem__(indices) <==> x[indices] + + Returns a subspace of the array as an independent numpy array. + + """ + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.__getitem__" + ) # pragma: no cover + + def __repr__(self): # noqa: D105 + return f"" + + def __str__(self): # noqa: D105 + return f"{self.get_filename()}, {self.get_address()}" + + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return ( + self.__class__, + self.shape, + self.get_filename(normalise=True, default=None), + self.get_address(), + self.get_mask(), + self.get_unpack(), + self.get_attributes(copy=False), + self.get_storage_options(), + ) + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + The subspace is defined by the `index` attributes, and is + applied with `cfdm.netcdf_indexer`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + raise NotImplementedError( + f"Must implement {self.__class__.__name__}._get_array" + ) # pragma: no cover + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + >>> n = {{package}}.{{class}}.array(a) + >>> isinstance(n, numpy.ndarray) + True + + """ + return self[...] 
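# ---------------------------------------------------------------------
# Illustrative sketch only (not part of this patch): a minimal concrete
# subclass of the abstract `FileArray` defined above, showing how the
# abstract hooks fit together: the `array` property calls `self[...]`,
# which a subclass implements by opening the file named by
# `get_filename` and reading the variable named by `get_address`.  The
# class name and the use of the netCDF4 library are assumptions for
# illustration; the real subclasses (e.g. `NetCDF4Array`,
# `H5netcdfArray`) presumably also apply masking and unpacking (e.g.
# via `cfdm.netcdf_indexer`), which this sketch omits.
# ---------------------------------------------------------------------
import netCDF4


class ExampleNetCDF4FileArray(FileArray):
    """Hypothetical concrete FileArray backed by a netCDF4 variable."""

    def __getitem__(self, indices):
        # Open the dataset and read a subspace of the named variable
        dataset, address = self.open(netCDF4.Dataset, mode="r")
        try:
            return dataset.variables[address][indices]
        finally:
            self.close(dataset)

    def close(self, dataset):
        # Close the dataset after data access, if configured to do so
        if self._get_component("close"):
            dataset.close()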
+ + @property + def dtype(self): + """Data-type of the array.""" + return self._get_component("dtype") + + @property + def shape(self): + """Shape of the array.""" + return self._get_component("shape") + + def close(self, dataset): + """Close the dataset containing the data.""" + raise NotImplementedError( + f"Must implement {self.__class__.__name__}.close" + ) # pragma: no cover + + def get_address(self, default=AttributeError()): + """The name of the file containing the array. + + If there are multiple files then an `AttributeError` is + raised by default. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Parameters: + + default: optional + Return the value of the *default* parameter if there + is no file. + + {{default Exception}} + + :Returns: + + `str` + The file name. + + """ + address = self._get_component("address", None) + if address is None: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no address" + ) + + return address + + def file_directory(self, normalise=False, default=AttributeError()): + """The file directory. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + {{normalise: `bool`, optional}} + + :Returns: + + `str` + The file directory name. + + **Examples** + + >>> a.get_filename() + '/data1/file1' + + """ + filename = self.get_filename(normalise=normalise, default=None) + if filename is None: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no file name" + ) + + return dirname(filename) + + def get_filename(self, normalise=False, default=AttributeError()): + """The name of the file containing the array. + + .. versionadded:: (cfdm) 1.10.0.2 + + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + default: optional + Return the value of the *default* parameter if there + is no file name. + + {{default Exception}} + + :Returns: + + `str` + The file name. + + """ + filename = self._get_component("filename", None) + if filename is None: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no file name" + ) + + if normalise: + filename = abspath(filename) + + return filename + + def get_mask(self): + """Whether or not to automatically mask the data. + + .. versionadded:: (cfdm) 1.8.2 + + **Examples** + + >>> b = a.get_mask() + + """ + return self._get_component("mask") + + def get_storage_options( + self, create_endpoint_url=True, filename=None, parsed_filename=None + ): + """Return `s3fs.S3FileSystem` options for accessing S3 files. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + create_endpoint_url: `bool`, optional + If True, the default, then create an + ``'endpoint_url'`` option if and only if one was not + set during object initialisation. In this case the + ``'endpoint_url'`` will be set from the file name + returned by `get_filename`, unless either of the + *filename* or *parsed_filename* parameters is also + set. + + filename: `str`, optional + Used to set the ``'endpoint_url'`` if it was not + set during object initialisation and + *create_endpoint_url* is True. Ignored if the + *parsed_filename* parameter has been set. + + parsed_filename: `urllib.parse.ParseResult`, optional + Used to set the ``'endpoint_url'`` if it was not + set during object initialisation and + *create_endpoint_url* is True. Ignored if the + *filename* parameter has been set. + + :Returns: + + `dict` or `None` + The `s3fs.S3FileSystem` options. 
+ + **Examples** + + >>> f.get_filename() + 's3://store/data/file.nc' + >>> f.get_storage_options(create_endpoint_url=False) + {} + >>> f.get_storage_options() + {'endpoint_url': 'https://store'} + >>> f.get_storage_options(filename='s3://other-store/data/file.nc') + {'endpoint_url': 'https://other-store'} + >>> f.get_storage_options(create_endpoint_url=False, + ... filename='s3://other-store/data/file.nc') + {} + + >>> f.get_storage_options() + {'key': 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} + + """ + storage_options = self._get_component("storage_options", None) + if not storage_options: + storage_options = {} + else: + storage_options = deepcopy(storage_options) + + client_kwargs = storage_options.get("client_kwargs", {}) + if ( + create_endpoint_url + and "endpoint_url" not in storage_options + and "endpoint_url" not in client_kwargs + ): + if parsed_filename is None: + if filename is None: + try: + filename = self.get_filename(normalise=False) + except AttributeError: + pass + else: + parsed_filename = urlparse(filename) + else: + parsed_filename = urlparse(filename) + + if parsed_filename is not None and parsed_filename.scheme == "s3": + # Derive endpoint_url from filename + storage_options["endpoint_url"] = ( + f"https://{parsed_filename.netloc}" + ) + + return storage_options + + def open(self, func, *args, **kwargs): + """Return a dataset file object and address. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and a file object is + returned from the first file that exists. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Parameters: + + func: callable + Function that opens a file. + + args, kwargs: optional + Optional arguments to *func*. + + :Returns: + + 2-`tuple` + The file object for the dataset, and the address of + the data within the file. + + """ + filename = self.get_filename(normalise=True) + url = urlparse(filename) + if url.scheme == "file": + # Convert a file URI into an absolute path + filename = url.path + elif url.scheme == "s3": + # Create an openable S3 file object + storage_options = self.get_storage_options( + create_endpoint_url=True, parsed_filename=url + ) + fs = S3FileSystem(**storage_options) + filename = fs.open(url.path[1:], "rb") + + try: + dataset = func(filename, *args, **kwargs) + except FileNotFoundError: + raise FileNotFoundError(f"No such file: {filename}") + except RuntimeError as error: + raise RuntimeError(f"{error}: {filename}") + + # Successfully opened a dataset, so return. + return dataset, self.get_address() + + def replace_directory(self, old=None, new=None, normalise=False): + """Replace the file directory. + + Modifies the name of the file. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `file_directory`, `get_filename` + + :Parameters: + + {{replace old: `str` or `None`, optional}} + + {{replace new: `str` or `None`, optional}} + + {{replace normalise: `bool`, optional}} + + :Returns: + + `{{class}}` + A new `{{class}}` with modified file locations. 
+ + **Examples** + + >>> a.get_filename() + '/data/file1.nc' + >>> b = a.replace_directory('/data', '/new/data/path/') + >>> b.get_filename() + '/new/data/path/file1.nc' + >>> c = b.replace_directory('/new/data', None) + >>> c.get_filename() + 'path/file1.nc' + >>> c = b.replace_directory('path', '../new_path', normalise=False) + >>> c.get_filename() + '../new_path/file1.nc' + >>> c = b.replace_directory(None, '/data') + >>> c.get_filename() + '/data/../new_path/file1.nc' + >>> c = b.replace_directory('/new_path/', None, normalise=True) + >>> c.get_filename() + 'file1.nc' + + """ + a = self.copy() + + filename = a.get_filename(normalise=normalise) + if old or new: + if normalise: + if not old: + raise ValueError( + "When 'normalise' is True and 'new' is a non-empty " + "string, 'old' must also be a non-empty string." + ) + + uri = isuri(filename) + try: + old = dirname(old, normalise=True, uri=uri, isdir=True) + except ValueError: + old = dirname(old, normalise=True, isdir=True) + + u = urisplit(old) + if not uri and u.scheme == "file": + old = u.getpath() + + if new: + try: + new = dirname(new, normalise=True, uri=uri, isdir=True) + except ValueError: + new = dirname(new, normalise=True, isdir=True) + + if old: + if filename.startswith(old): + if not new: + new = "" + if old and not old.endswith(sep): + old += sep + + filename = filename.replace(old, new) + elif new: + if filename.startswith(sep): + filename = filename[1:] + + filename = join(new, filename) + + a._set_component("filename", filename, copy=False) + return a + + def get_missing_values(self): + """The missing values of the data. + + Deprecated at version NEXTVERSION. Use `get_attributes` instead. + + """ + + class DeprecationError(Exception): + """Deprecation error.""" + + pass + + raise DeprecationError( + f"{self.__class__.__name__}.get_missing_values was deprecated " + "at version NEXTVERSION and is no longer available. " + f"Use {self.__class__.__name__}.get_attributes instead." + ) # pragma: no cover + + def to_memory(self): + """Bring data on disk into memory. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + The new array. + + """ + return self.array + + def _set_attributes(self, var): + """Set the netCDF variable attributes. + + These are set from the netCDF variable attributes, but only if + they have not already been defined, either during {{class}} + instantiation or by a previous call to `_set_attributes`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The netCDF variable. + + :Returns: + + `dict` + The attributes. + + """ + raise NotImplementedError( + f"Must implement {self.__class__.__name__}._set_attributes" + ) # pragma: no cover + + def get_unpack(self): + """Whether or not to automatically unpack the data. + + .. versionadded:: (cfdm) NEXTVERSION + + **Examples** + + >>> a.get_unpack() + True + + """ + return self._get_component("unpack") + + def replace_filename(self, filename): + """Replace the file location. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `file_directory`, `get_filename`, + `replace_directory` + + :Parameters: + + filename: `str` + The new file location. + + :Returns: + + `{{class}}` + A new `{{class}}` with modified file name. 
+ + """ + a = self.copy() + a._set_component("filename", filename, copy=False) + return a diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py new file mode 100644 index 0000000000..050db6e94d --- /dev/null +++ b/cfdm/data/aggregatedarray.py @@ -0,0 +1,782 @@ +from copy import deepcopy +from itertools import accumulate, product + +import numpy as np +from uritools import isuri, uricompose + +from ..functions import dirname +from . import abstract +from .fragment import FragmentFileArray, FragmentUniqueValueArray +from .netcdfindexer import netcdf_indexer +from .utils import chunk_locations, chunk_positions + + +class AggregatedArray(abstract.FileArray): + """An array stored in a CF aggregation variable. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __new__(cls, *args, **kwargs): + """Store fragment array classes. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + instance = super().__new__(cls) + instance._FragmentArray = { + "location": FragmentFileArray, + "unique_value": FragmentUniqueValueArray, + } + return instance + + def __init__( + self, + filename=None, + address=None, + dtype=None, + mask=True, + unpack=True, + fragment_array=None, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: `str`, optional + The name of the aggregation file containing the + aaggregation variable. + + address: `str`, optional + The name of the aggregation variable for the array. + + dtype: `numpy.dtype` + The data type of the aggregated data array. May be + `None` if the numpy data-type is not known (which can + be the case for some string types, for example). + + {{init mask: `bool`, optional}} + + {{init unpack: `bool`, optional}} + + fragment_array: `dict` + A dictionary representation of the fragment array, in + "location" form:: + + {'map': <'map' fragment array variable data>, + 'location': <'location' fragment array variable data>, + 'variable': <'variable' fragment array variable data>,} + + or "unique_value" form: + + {'map': <'map' fragment array variable data>, + 'unique_value': <'unique_value' fragment array data>} + + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in S3 object stores. Ignored + for files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing a fragment S3 + file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. 
+ + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + mask=True, + unpack=unpack, + attributes=attributes, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + shape = source.shape + except AttributeError: + shape = None + + try: + fragment_array_shape = source.get_fragment_array_shape() + except AttributeError: + fragment_array_shape = None + + try: + fragment_array = source.get_fragment_array(copy=False) + except AttributeError: + fragment_array = {} + + try: + fragment_type = source.get_fragment_type() + except AttributeError: + fragment_type = None + else: + if filename is not None: + ( + shape, + fragment_array_shape, + fragment_type, + fragment_array, + ) = self._parse_fragment_array(filename, fragment_array) + else: + shape = None + fragment_array_shape = None + fragment_array = None + fragment_type = None + + self._set_component("shape", shape, copy=False) + self._set_component( + "fragment_array_shape", fragment_array_shape, copy=False + ) + self._set_component("fragment_array", fragment_array, copy=False) + self._set_component("fragment_type", fragment_type, copy=False) + + def __getitem__(self, index): + """Return a subspace. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + dx = netcdf_indexer( + self.to_dask_array(), + mask=True, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + attributes=self.get_attributes(), + copy=False, + ) + return dx[index].compute() + + @property + def __in_memory__(self): + """True if the array data is in memory. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `bool` + + """ + return False + + def _parse_fragment_array(self, aggregated_filename, fragment_array): + """Parse the fragment array dictionary. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + aggregated_filename: `str` + The name of the aggregation file. + + fragment_array: `dict` + A dictionary representation of the fragment array, in + "location" form:: + + {'map': <'map' fragment array variable data>, + 'location': <'location' fragment array variable data>, + 'variable': <'variable' fragment array variable data>} + + or "unique_value" form:: + + {'map': <'map' fragment array variable data>, + 'unique_value': <'unique_value' fragment array data>} + + :Returns: + + 4-`tuple` + 1. The shape of the aggregated data. + 2. The shape of the array of fragments. + 3. The type of the fragments (either + ``'unique_value'`` or ``'location'``). + 4. The parsed aggregation instructions. + + """ + parsed_fragment_array = {} + + fa_map = fragment_array["map"] + if fa_map.ndim: + compressed = np.ma.compressed + chunks = [compressed(i).tolist() for i in fa_map] + else: + # Scalar 'map' variable + chunks = [] + + aggregated_shape = tuple([sum(c) for c in chunks]) + fragment_array_indices = chunk_positions(chunks) + fragment_shapes = chunk_locations(chunks) + + if "location" in fragment_array: + # -------------------------------------------------------- + # Each fragment is in a file, rather than given by a + # unique value. 
+ # -------------------------------------------------------- + fragment_type = "location" + fa_variable = fragment_array["variable"] + fa_location = fragment_array["location"] + fragment_array_shape = fa_location.shape + + if not fa_variable.ndim: + fa_variable = fa_variable.item() + scalar = True + else: + scalar = False + + for index, shape in zip(fragment_array_indices, fragment_shapes): + if scalar: + variable = fa_variable + else: + variable = fa_variable[index].item() + + parsed_fragment_array[index] = { + "map": shape, + "location": fa_location[index].item(), + "variable": variable, + } + else: + # -------------------------------------------------------- + # Each fragment comprises a unique value, rather than + # being in a file. + # -------------------------------------------------------- + fragment_type = "unique_value" + fa_unique_value = fragment_array["unique_value"] + fragment_array_shape = fa_unique_value.shape + parsed_fragment_array = { + index: { + "map": shape, + "unique_value": fa_unique_value[index].item(), + } + for index, shape in zip( + fragment_array_indices, fragment_shapes + ) + } + + return ( + aggregated_shape, + fragment_array_shape, + fragment_type, + parsed_fragment_array, + ) + + def get_fragment_array(self, copy=True): + """Get the aggregation data dictionary. + + The aggregation data dictionary contains the definitions of + the fragments and the instructions on how to aggregate them. + The keys are indices of the fragment array dimensions, + e.g. ``(1, 0, 0, 0)``. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_fragment_type`, + `get_fragment_array_shape`, + `get_fragmented_dimensions` + + :Parameters: + + copy: `bool`, optional + Whether or not to return a copy of the aggregation + dictionary. By default a deep copy is returned. + + .. warning:: If False then changing the returned + dictionary in-place will change the + aggregation dictionary stored in the + {{class}} instance, **as well as in any + copies of it**. + + :Returns: + + `dict` + The aggregation data dictionary. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_array_shape() + (2, 1, 1, 1) + >>> a.get_fragment_array() + {(0, 0, 0, 0): { + 'file': ('January-June.nc',), + 'variable': ('temp',), + 'format': 'nc', + 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, + (1, 0, 0, 0): { + 'file': ('July-December.nc',), + 'variable': ('temp',), + 'format': 'nc', + 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} + + """ + fragment_array = self._get_component("fragment_array") + if copy: + fragment_array = deepcopy(fragment_array) + + return fragment_array + + def get_fragment_array_shape(self): + """Get the sizes of the fragment dimensions. + + The fragment dimension sizes are given in the same order as + the aggregated dimension sizes given by `shape`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_fragment_array`, + `get_fragment_type`, + `get_fragmented_dimensions` + + :Returns: + + `tuple` + The shape of the fragment dimensions. + + """ + return self._get_component("fragment_array_shape") + + def get_fragment_type(self): + """The type of fragments in the fragment array. + + Either ``'location'`` to indicate that the fragments are + files, or else ``'unique_value'`` to indicate that they + are represented by their unique data values. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_fragment_array`, + `get_fragment_array_shape`, + `get_fragmented_dimensions` + + :Returns: + + `str` + The fragment type. 
+ + """ + return self._get_component("fragment_type", None) + + def get_fragmented_dimensions(self): + """The positions of dimensions spanned by two or more fragments. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_fragment_array`, + `get_fragment_array_shape`, + `get_fragment_type` + + :Returns: + + `list` + The dimension positions. + + **Examples** + + >>> a.get_fragment_array_shape() + (20, 1, 40, 1) + >>> a.get_fragmented_dimensions() + [0, 2] + + >>> a.get_fragment_array_shape() + (1, 1, 1) + >>> a.get_fragmented_dimensions() + [] + + """ + return [ + i + for i, size in enumerate(self.get_fragment_array_shape()) + if size > 1 + ] + + def subarray_shapes(self, shapes): + """Create the subarray shapes. + + A fragmented dimenion (i.e. one spanned by two or fragments) + will always have a subarray size equal to the size of each of + its fragments, overriding any other size implied by the + *shapes* parameter. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `subarrays` + + :Parameters: + + shapes: `int`, sequence, `dict` or `str`, optional + Define the subarray shapes. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The subarray sizes implied by *chunks* for a dimension + that has been fragmented are ignored, so their + specification is arbitrary. + + :Returns: + + `tuple` + The subarray sizes along each dimension. + + **Examples** + + >>> a.shape + (12, 1, 73, 144) + >>> a.get_fragment_array_shape() + (2, 1, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> a.subarray_shapes(-1) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes(None) + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes("auto") + ((6, 6), (1,), (73,), (144,)) + >>> a.subarray_shapes((None, 1, 40, 50)) + ((6, 6), (1,), (40, 33), (50, 50, 44)) + >>> a.subarray_shapes((None, None, "auto", 50)) + ((6, 6), (1,), (73,), (50, 50, 44)) + >>> a.subarray_shapes({2: 40}) + ((6, 6), (1,), (40, 33), (144,)) + + """ + from numbers import Number + + from dask.array.core import normalize_chunks + + # Positions of fragmented dimensions (i.e. those spanned by + # two or more fragments) + f_dims = self.get_fragmented_dimensions() + + shape = self.shape + fragment_array = self.get_fragment_array(copy=False) + + # Create the base chunks. + chunks = [] + ndim = self.ndim + for dim, (n_fragments, size) in enumerate( + zip(self.get_fragment_array_shape(), self.shape) + ): + if dim in f_dims: + # This aggregated dimension is spanned by two or more + # fragments => set the chunks to be the same size as + # the each fragment. + c = [] + index = [0] * ndim + for j in range(n_fragments): + index[dim] = j + loc = fragment_array[tuple(index)]["map"][dim] + chunk_size = loc[1] - loc[0] + c.append(chunk_size) + + chunks.append(tuple(c)) + else: + # This aggregated dimension is spanned by exactly one + # fragment => store `None` for now. This will get + # overwritten from 'shapes'. 
+ chunks.append(None) + + if isinstance(shapes, (str, Number)) or shapes is None: + chunks = [ + c if i in f_dims else shapes for i, c in enumerate(chunks) + ] + elif isinstance(shapes, dict): + chunks = [ + chunks[i] if i in f_dims else shapes.get(i, "auto") + for i, c in enumerate(chunks) + ] + else: + # chunks is a sequence + if len(shapes) != ndim: + raise ValueError( + f"Wrong number of 'shapes' elements in {shapes}: " + f"Got {len(shapes)}, expected {self.ndim}" + ) + + chunks = [ + c if i in f_dims else shapes[i] for i, c in enumerate(chunks) + ] + + return normalize_chunks(chunks, shape=shape, dtype=self.dtype) + + def subarrays(self, subarray_shapes): + """Return descriptors for every subarray. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `subarray_shapes` + + :Parameters: + + subarray_shapes: `tuple` + The subarray sizes along each dimension, as returned + by a prior call to `subarray_shapes`. + + :Returns: + + 6-`tuple` of iterators + Each iterator iterates over a particular descriptor + from each subarray. + + 1. The indices of the aggregated array that correspond + to each subarray. + + 2. The shape of each subarray. + + 3. The indices of the fragment that corresponds to each + subarray (some subarrays may be represented by a + part of a fragment). + + 4. The location of each subarray. + + 5. The location on the fragment dimensions of the + fragment that corresponds to each subarray. + + 6. The shape of each fragment that overlaps each chunk. + + **Examples** + + An aggregated array with shape (12, 73, 144) has two + fragments, both with with shape (6, 73, 144). + + >>> a.shape + (12, 73, 144) + >>> a.get_fragment_array_shape() + (2, 1, 1) + >>> a.fragmented_dimensions() + [0] + >>> subarray_shapes = a.subarray_shapes({1: 40}) + >>> print(subarray_shapes) + ((6, 6), (40, 33), (144,)) + >>> ( + ... u_indices, + ... u_shapes, + ... f_indices, + ... s_locations, + ... f_locations, + ... f_shapes, + ... ) = a.subarrays(subarray_shapes) + >>> for i in u_indices: + ... print(i) + ... + (slice(0, 6, None), slice(0, 40, None), slice(0, 144, None)) + (slice(0, 6, None), slice(40, 73, None), slice(0, 144, None)) + (slice(6, 12, None), slice(0, 40, None), slice(0, 144, None)) + (slice(6, 12, None), slice(40, 73, None), slice(0, 144, None)) + + >>> for i in u_shapes + ... print(i) + ... + (6, 40, 144) + (6, 33, 144) + (6, 40, 144) + (6, 33, 144) + >>> for i in f_indices: + ... print(i) + ... + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + (slice(None, None, None), slice(0, 40, None), slice(0, 144, None)) + (slice(None, None, None), slice(40, 73, None), slice(0, 144, None)) + >>> for i in s_locations: + ... print(i) + ... + (0, 0, 0) + (0, 1, 0) + (1, 0, 0) + (1, 1, 0) + >>> for i in f_locations: + ... print(i) + ... + (0, 0, 0) + (0, 0, 0) + (1, 0, 0) + (1, 0, 0) + >>> for i in f_shapes: + ... print(i) + ... 
+ (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + (6, 73, 144) + + """ + f_dims = self.get_fragmented_dimensions() + + # The indices of the uncompressed array that correspond to + # each subarray, the shape of each uncompressed subarray, and + # the location of each subarray + s_locations = [] + u_shapes = [] + u_indices = [] + f_locations = [] + for dim, c in enumerate(subarray_shapes): + nc = len(c) + s_locations.append(tuple(range(nc))) + u_shapes.append(c) + + if dim in f_dims: + f_locations.append(tuple(range(nc))) + else: + # No fragmentation along this dimension + f_locations.append((0,) * nc) + + c = tuple(accumulate((0,) + c)) + u_indices.append([slice(i, j) for i, j in zip(c[:-1], c[1:])]) + + # For each subarray, the part of the fragment that corresponds + # to it. + f_indices = [ + (slice(None),) * len(u) if dim in f_dims else u + for dim, u in enumerate(u_indices) + ] + + # For each subarray, the shape of the fragment that + # corresponds to it. + f_shapes = [ + u_shape if dim in f_dims else (size,) * len(u_shape) + for dim, (u_shape, size) in enumerate(zip(u_shapes, self.shape)) + ] + + return ( + product(*u_indices), + product(*u_shapes), + product(*f_indices), + product(*s_locations), + product(*f_locations), + product(*f_shapes), + ) + + def to_dask_array(self, chunks="auto"): + """Create a dask array with `FragmentArray` chunks. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + chunks: `int`, `tuple`, `dict` or `str`, optional + Specify the chunking of the returned dask array. + + Any value accepted by the *chunks* parameter of the + `dask.array.from_array` function is allowed. + + The chunk sizes implied by *chunks* for a dimension that + has been fragmented are ignored and replaced with values + that are implied by that dimensions fragment sizes. 
+ + :Returns: + + `dask.array.Array` + + """ + import dask.array as da + from dask.array.core import getter + from dask.base import tokenize + + name = (f"{self.__class__.__name__}-{tokenize(self)}",) + + dtype = self.dtype + fragment_array = self.get_fragment_array(copy=False) + storage_options = self.get_storage_options() + fragment_type = self.get_fragment_type() + aggregated_attributes = self.get_attributes() + unpack = self.get_unpack() + + if fragment_type == "location": + # Get the directory of the aggregation file as an absolute + # URI + aggregation_file_directory = dirname(self.get_filename()) + if not isuri(aggregation_file_directory): + aggregation_file_directory = uricompose( + scheme="file", + authority="", + path=aggregation_file_directory, + ) + + # Set the chunk sizes for the dask array + chunks = self.subarray_shapes(chunks) + + try: + FragmentArray = self._FragmentArray[fragment_type] + except KeyError: + raise ValueError( + "Can't get fragment array class for unknown " + f"fragment type: {fragment_type!r}" + ) + + dsk = {} + for ( + u_indices, + u_shape, + f_indices, + chunk_index, + fragment_index, + fragment_shape, + ) in zip(*self.subarrays(chunks)): + kwargs = fragment_array[fragment_index].copy() + kwargs.pop("map", None) + + if fragment_type == "location": + kwargs["filename"] = kwargs.pop("location") + kwargs["address"] = kwargs.pop("variable") + kwargs["storage_options"] = storage_options + kwargs["aggregation_file_directory"] = ( + aggregation_file_directory + ) + + fragment = FragmentArray( + dtype=dtype, + shape=fragment_shape, + unpack_aggregated_data=unpack, + aggregated_attributes=aggregated_attributes, + **kwargs, + ) + + key = f"{fragment.__class__.__name__}-{tokenize(fragment)}" + dsk[key] = fragment + dsk[name + chunk_index] = ( + getter, + key, + f_indices, + False, + False, + ) + + # Return the dask array + return da.Array(dsk, name[0], chunks=chunks, dtype=dtype) diff --git a/cfdm/data/data.py b/cfdm/data/data.py index d3943a56ac..88f5b3405f 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -4,6 +4,7 @@ from itertools import product, zip_longest from math import prod from numbers import Integral +from os.path import commonprefix import dask.array as da import numpy as np @@ -22,7 +23,7 @@ from ..functions import _numpy_allclose, is_log_level_info, parse_indices from ..mixin.container import Container from ..mixin.files import Files -from ..mixin.netcdf import NetCDFHDF5 +from ..mixin.netcdf import NetCDFAggregation, NetCDFHDF5 from ..units import Units from .abstract import Array from .creation import to_dask @@ -35,6 +36,8 @@ ) from .utils import ( allclose, + chunk_indices, + chunk_positions, collapse, convert_to_datetime, convert_to_reftime, @@ -47,7 +50,7 @@ logger = logging.getLogger(__name__) -class Data(Container, NetCDFHDF5, Files, core.Data): +class Data(Container, NetCDFAggregation, NetCDFHDF5, Files, core.Data): """An N-dimensional data array with units and masked values. * Contains an N-dimensional, indexable and broadcastable array with @@ -118,9 +121,10 @@ class Data(Container, NetCDFHDF5, Files, core.Data): # other constants. It is therefore convenient to define these # constants in binary. 
_NONE = 0b000 - _ARRAY = 0b01 - _CACHE = 0b10 - _ALL = 0b11 + _ARRAY = 0b001 + _CACHE = 0b010 + _CFA = 0b100 + _ALL = 0b111 # The default mask hardness _DEFAULT_HARDMASK = True @@ -461,8 +465,7 @@ def __init__( "for compressed input arrays" ) - # Bring the compressed data into memory without - # decompressing it + # Bring data into memory (compressed data is not decompressed) if to_memory: try: array = array.to_memory() @@ -732,7 +735,7 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Set the subspaced dask array # ------------------------------------------------------------ - new._set_dask(dx, clear=self._ALL, in_memory=None) + new._set_dask(dx, clear=self._ALL ^ self._CFA, in_memory=None) if 0 in new.shape: raise IndexError( @@ -987,7 +990,7 @@ def __setitem__(self, indices, value): # Do the assignment self._set_subspace(dx, indices, value) - self._set_dask(dx, in_memory=True) + self._set_dask(dx, clear=self._ALL ^ self._CFA, in_memory=True) return @@ -1010,14 +1013,7 @@ def __str__(self): try: first = self.first_element() except Exception: - raise - out = "" - if units and not isreftime: - out += f" {units}" - if calendar: - out += f" {calendar}" - - return out + first = "??" size = self.size shape = self.shape @@ -1038,12 +1034,16 @@ def __str__(self): first = type(self)( np.ma.array(first, mask=mask[0]), units, calendar ).datetime_array - except (ValueError, OverflowError): + except Exception: first = "??" out = f"{open_brackets}{first}{close_brackets}" else: - last = self.last_element() + try: + last = self.last_element() + except Exception: + last = "??" + if isreftime: if last is np.ma.masked: last = 0 @@ -1056,13 +1056,17 @@ def __str__(self): units, calendar, ).datetime_array - except (ValueError, OverflowError): + except Exception: first, last = ("??", "??") if size > 3: out = f"{open_brackets}{first}, ..., {last}{close_brackets}" elif shape[-1:] == (3,): - middle = self.second_element() + try: + middle = self.second_element() + except Exception: + last = "??" + if isreftime: # Convert reference time to date-time if middle is np.ma.masked: @@ -1399,7 +1403,7 @@ def __in_memory__(self): @property def __keepdims_indexing__(self): - """Flag indicating if dimensions indexed with integers are kept. + """Flag to indicate if axes indexed with integers are kept. If set to True (the default) then providing a single integer as a single-axis index does *not* reduce the number of array @@ -1574,18 +1578,10 @@ def _binary_operation(cls, data, other, method): >>> d = {{package}}.Data([0, 1, 2, 3]) >>> e = {{package}}.Data([1, 1, 3, 4]) - >>> f = d._binary_operation(e, '__add__') + >>> f = d._binary_operation(e, '__lt__') >>> print(f.array) - [1 2 5 7] - - >>> e = d._binary_operation(e, '__lt__') - >>> print(e.array) [ True False True True] - >>> d._binary_operation(2, '__imul__') - >>> print(d.array) - [0 2 4 6] - """ # Note: This method is a classmethod to allow its # functionality to be used with a LHS operand that is @@ -1650,7 +1646,7 @@ def _binary_operation(cls, data, other, method): else: d = data.copy() - d._set_dask(result, in_memory=True) + d._set_dask(result, clear=cls._ALL, in_memory=True) if axes is not None: d._axes = axes @@ -1677,7 +1673,7 @@ def _clear_after_dask_update(self, clear=None): .. versionadded:: (cfdm) 1.11.2.0 .. 
seealso:: `_del_Array`, `_del_cached_elements`, - `_set_dask` + `nc_del_aggregation_write_status`, `_set_dask` :Parameters: @@ -1700,7 +1696,7 @@ def _clear_after_dask_update(self, clear=None): :Returns: `int` - The integer value of *clear*. + The integer value of *clear*. """ if clear is None: @@ -1718,12 +1714,134 @@ def _clear_after_dask_update(self, clear=None): # Delete cached element values self._del_cached_elements() + if clear & self._CFA: + # Set the aggregation write status to False (under certain + # circumstances) + self._del_nc_aggregation_write_status() + return clear + @classmethod + def _concatenate_conform_units(cls, data1, units0, relaxed_units, copy): + """Check and conform the units of data prior to concatenation. + + This is a helper function for `concatenate` that may be easily + overridden in subclasses, to allow for customisation of the + concatenation process. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `concatenate` + + :Parameters: + + data1: `Data` + Data with units. + + units0: `Units` + The units to conform *data1* to. + + {{relaxed_units: `bool`, optional}} + + copy: `bool` + If False then modify *data1* in-place. Otherwise a + copy of it is modified. + + :Returns: + + `Data` + Returns *data1*, possibly modified so that it conforms + to *units0*. If *copy* is False and *data1* is + modified, then it is done so in-place. + + """ + # Check and conform, if necessary, the units of all inputs + units1 = data1.Units + if ( + relaxed_units + and not units0.isvalid + and not units1.isvalid + and units0.__dict__ == units1.__dict__ + ): + # Allow identical invalid units to be equal + pass + elif not units0.equals(units1): + raise ValueError( + "Can't concatenate: All the input arrays must have " + f"equal units. Got {units0!r} and {units1!r}" + ) + + return data1 + + @classmethod + def _concatenate_post_process( + cls, concatenated_data, axis, conformed_data + ): + """Post-process concatenated data. + + This is a helper function for `concatenate` that may be easily + overridden in subclasses, to allow for customisation of the + concatenation process. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `concatenate` + + :Parameters: + + concatenated_data: `Data` + The concatenated data array. + + axis: `int` + The axis of concatenation. + + conformed_data: sequence of `Data` + The ordered sequence of data arrays that were + concatenated. + + :Returns: + + `Data` + Returns *concatenated_data*, possibly modified + in-place. + + """ + # ------------------------------------------------------------ + # Set the aggregation 'aggregated_data' terms + # ------------------------------------------------------------ + if concatenated_data.nc_get_aggregation_write_status(): + # Set the netCDF aggregated_data terms, giving precedence + # to those towards the left hand side of the input + # list. If any input Data object has no aggregated_data + # terms, then nor will the concatenated data. 
+ aggregated_data = {} + for d in conformed_data[::-1]: + value = d.nc_get_aggregated_data() + if not value: + aggregated_data = {} + break + + aggregated_data.update(value) + + concatenated_data.nc_set_aggregated_data(aggregated_data) + + # ------------------------------------------------------------ + # Update the deterministic status of the concatenated data + # ------------------------------------------------------------ + deterministic = True + for d in conformed_data: + if not d.has_deterministic_name(): + deterministic = False + break + + concatenated_data._update_deterministic(deterministic) + + return concatenated_data + def _del_cached_elements(self): """Delete any cached element values. - Updates *data* in-place to remove the cached element values. + Updates the data in-place to remove the cached element values. .. versionadded:: (cfdm) 1.11.2.0 @@ -1736,6 +1854,29 @@ def _del_cached_elements(self): """ self._del_component("cached_elements", None) + def _del_nc_aggregation_write_status(self): + """Set the aggregation write status to False. + + Updates the data in-place to set the aggregation write status + to `False`, but only if the fragment type is not ``'value'``. + + A necessary (but not sufficient) condition for writing the + data as CF-netCDF aggregated data is that the write status is + True. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregation_write_status`, + `nc_get_aggregation_fragment_type` + + :Returns: + + `None` + + """ + if self.nc_get_aggregation_fragment_type() != "unique_value": + self.nc_del_aggregation_write_status() + def _del_dask(self, default=ValueError(), clear=None): """Remove the dask array. @@ -1849,11 +1990,7 @@ def _item(self, index): >>> d = {{package}}.Data([[1, 2, 3]], 'km') >>> d._item((0, -1)) - - - - - + array(3) >>> d[0, 1] = {{package}}.masked >>> d._item((slice(None), slice(1, 2))) masked @@ -1865,6 +2002,79 @@ def _item(self, index): return array + def _modify_dask_graph( + self, method, args=(), kwargs=None, exceptions=(AttributeError,) + ): + """Modify the Dask graph. + + The value of each node of the Dask graph is replaced with the + result of calling its *method* method. If attempting to call + the method results in any of the exceptions given by + *exceptions*, then that node is unmodified. If attempting to + call the method results in an exception not given by + *exceptions*, then that exception is raised. + + The `Data` object is modified in-place, but the embedded + Dask graph is not. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + method: `str` + The name of the callable method which returns a new + graph node value. + + args: `tuple`, optional + Positional arguments for the *method*. No arguments + (the default) are specified by an empty `tuple`. + + kwargs: `dict` or `None`, optional + Keyword arguments for the *method*. No keyword + arguments (the default) is specified by an empty + `dict` or `None`. + + exceptions: `tuple` of `Exception`, optional + Do not change graph node values if calling its + *method* results in any of the specified exceptions. 
+ + :Returns: + + `None` + + """ + if kwargs is None: + kwargs = {} + + updated = False + + dsk = self.todict( + optimize_graph=True, + _force_mask_hardness=False, + _force_to_memory=False, + ) + for key, a in dsk.items(): + try: + dsk[key] = getattr(a, method)(*args, **kwargs) + except exceptions: + # This graph node could not be modified + pass + else: + # This graph node was successfully modified + updated = True + + if updated: + # The Dask graph was modified, so recast the dictionary + # representation as a Dask array. + dx = self.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ) + dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) + self._set_dask(dx, clear=self._NONE, in_memory=None) + + # Update the deterministic status + self._update_deterministic(False) + def _parse_axes(self, axes): """Parses the data axes and returns valid non-duplicate axes. @@ -2411,6 +2621,8 @@ def array(self): elif not isinstance(a, np.ndarray): a = np.asanyarray(a) + ndim = a.ndim + shape = a.shape size = a.size if not size: return a @@ -2862,6 +3074,8 @@ def npartitions(self): >>> d = {{package}}.Data.empty((6, 5), chunks=(2, 4)) >>> d.chunks ((2, 2, 2), (4, 1)) + >>> d.chunksize + (2, 4) >>> d.numblocks (3, 2) >>> d.npartitions @@ -2886,6 +3100,8 @@ def numblocks(self): >>> d = {{package}}.Data.empty((6, 5), chunks=(2, 4)) >>> d.chunks ((2, 2, 2), (4, 1)) + >>> d.chunksize + (2, 4) >>> d.numblocks (3, 2) >>> d.npartitions @@ -3379,8 +3595,11 @@ def apply_masking( if mask is not None: dx = da.ma.masked_where(mask, dx) + CFA = self._CFA + else: + CFA = self._NONE - d._set_dask(dx, in_memory=True) + d._set_dask(dx, clear=self._ALL ^ CFA, in_memory=True) return d @@ -3455,11 +3674,11 @@ def asdata(cls, d, dtype=None, copy=False): return data def chunk_indices(self): - """Return indices that define each dask chunk. + """Return indices of the data that define each dask chunk. .. versionadded:: (cfdm) 1.11.2.0 - .. seealso:: `chunks` + .. seealso:: `chunks`, `chunk_positions` :Returns: @@ -3483,18 +3702,102 @@ def chunk_indices(self): (slice(1, 3, None), slice(0, 9, None), slice(9, 15, None)) """ - from dask.utils import cached_cumsum + return chunk_indices(self.chunks) - chunks = self.chunks + def chunk_positions(self): + """Find the position of each chunk. - cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] - indices = [ - [slice(s, s + dim) for s, dim in zip(starts, shapes)] - for starts, shapes in zip(cumdims, chunks) - ] - return product(*indices) + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `chunks`, `chunk_indices` - def compute(self): + :Parameters: + + chunks: `tuple` + + The chunk sizes along each dimension, as output by + `dask.array.Array.chunks`. + + **Examples** + + >>> d = {{package}}.Data(np.arange(405).reshape(3, 9, 15), + ... chunks=((1, 2), (9,), (4, 5, 6))) + >>> d.npartitions + 6 + >>> for position in d.chunk_positions(): + ... print(position) + ... + (0, 0, 0) + (0, 0, 1) + (0, 0, 2) + (1, 0, 0) + (1, 0, 1) + (1, 0, 2) + + """ + return chunk_positions(self.chunks) + + @_inplace_enabled(default=False) + def compressed(self, inplace=False): + """Return all non-masked values in a one dimensional data array. + + Not to be confused with compression by convention (see the + `uncompress` method). + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `flatten` + + :Parameters: + + {{inplace: `bool`, optional}} + + :Returns: + + `Data` or `None` + The non-masked values, or `None` if the operation was + in-place. 
+ + **Examples** + + >>> d = {{package}}.{class}}(numpy.arange(12).reshape(3, 4), 'm') + >>> print(d.array) + [[ 0 1 2 3] + [ 4 5 6 7] + [ 8 9 10 11]] + >>> print(d.compressed().array) + [ 0 1 2 3 4 5 6 7 8 9 10 11] + >>> d[1, 1] = {{package}}.masked + >>> d[2, 3] = {{package}}.masked + >>> print(d.array) + [[0 1 2 3] + [4 -- 6 7] + [8 9 10 --]] + >>> print(d.compressed().array) + [ 0 1 2 3 4 6 7 8 9 10] + + >>> d = {{package}}.{class}}(9) + >>> print(d.compressed().array) + [9] + + """ + d = _inplace_enabled_define_and_cleanup(self) + + dx = d.to_dask_array(_force_mask_hardness=True, _force_to_memory=True) + dx = da.blockwise( + np.ma.compressed, + "i", + dx.ravel(), + "i", + adjust_chunks={"i": lambda n: np.nan}, + dtype=dx.dtype, + meta=np.array((), dtype=dx.dtype), + ) + + d._set_dask(dx, clear=self._ALL, in_memory=True) + return d + + def compute(self, _force_to_memory=True): """A view of the computed data. In-place changes to the returned array *might* affect the @@ -3515,6 +3818,15 @@ def compute(self): .. seealso:: `persist`, `array`, `datetime_array`, `sparse_array` + :Parameters: + + _force_to_memory: `bool`, optional + If True (the default) then force the data resulting + from computing the returned Dask graph to be in + memory. If False then the data resulting from + computing the Dask graph may or may not be in memory, + depending on the nature of the stack + :Returns: An in-memory view of the data @@ -3537,12 +3849,25 @@ def compute(self): array([[0., 0., 0.], [0., 0., 0.]]) + >>> f = {{package}}.example_field(0) + >>> {{package}}.write(f, 'file.nc') + >>> f = {{package}}.read('file.nc')[0] + >>> print(f.data.compute()) + [[0.007 0.034 0.003 0.014 0.018 0.037 0.024 0.029] + [0.023 0.036 0.045 0.062 0.046 0.073 0.006 0.066] + [0.11 0.131 0.124 0.146 0.087 0.103 0.057 0.011] + [0.029 0.059 0.039 0.07 0.058 0.072 0.009 0.017] + [0.006 0.036 0.019 0.035 0.018 0.037 0.034 0.013]] + >>> f.data.compute(_force_to_memory=False) + <{{repr}}NetCDF4Array(5, 8): file.nc, q(5, 8)> + """ - dx = self.to_dask_array(_force_mask_hardness=False) + dx = self.to_dask_array( + _force_mask_hardness=False, _force_to_memory=_force_to_memory + ) a = dx.compute() if np.ma.isMA(a) and a is not np.ma.masked: - a.set_fill_value(999) if self.hardmask: a.harden_mask() else: @@ -3552,6 +3877,235 @@ def compute(self): return a + @classmethod + def concatenate( + cls, data, axis=0, cull_graph=False, relaxed_units=False, copy=True + ): + """Join a sequence of data arrays together. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `cull_graph` + + :Parameters: + + data: sequence of `Data` + The data arrays to be concatenated. Concatenation is + carried out in the order given. Each data array must + have equivalent units and the same shape, except in + the concatenation axis. Note that scalar arrays are + treated as if they were one dimensional. + + axis: `int`, optional + The axis along which the arrays will be joined. The + default is 0. Note that scalar arrays are treated as + if they were one dimensional. + + {{cull_graph: `bool`, optional}} + + {{relaxed_units: `bool`, optional}} + + copy: `bool`, optional + If True (the default) then make copies of the data, if + required, prior to the concatenation, thereby ensuring + that the input data arrays are not changed by the + concatenation process. If False then some or all input + data arrays might be changed in-place, but the + concatenation process will be faster. + + :Returns: + + `Data` + The concatenated data. 
+ + **Examples** + + >>> d = {{package}}.Data([[1, 2], [3, 4]]) + >>> e = {{package}}.Data([[5.0, 6.0]]) + >>> f = {{package}}.Data.concatenate((d, e)) + >>> print(f.array) + [[ 1. 2. ] + [ 3. 4. ] + [ 5. 6. ]] + >>> f.equals({{package}}.Data.concatenate((d, e), axis=-2)) + True + + >>> e = {{package}}.Data([[5.0], [6.0]]) + >>> f = {{package}}.Data.concatenate((d, e), axis=1) + >>> print(f.array) + [[ 1. 2. 5.] + [ 3. 4. 6.]] + + >>> d = {{package}}.Data(1) + >>> e = {{package}}.Data(50.0) + >>> f = {{package}}.Data.concatenate((d, e)) + >>> print(f.array) + [ 1. 50.] + + """ + if isinstance(data, cls): + raise ValueError("Must provide a sequence of Data objects") + + data = tuple(data) + n_data = len(data) + if not n_data: + raise ValueError( + "Can't concatenate: Must provide at least one Data object" + ) + + if cull_graph: + # Remove unnecessary components from the graph, which may + # improve performance, and because complicated task graphs + # can sometimes confuse da.concatenate. + for d in data: + d.cull_graph() + + data0 = data[0] + units0 = data0.Units + data0_cached_elements = data0._get_cached_elements() + + if copy: + data0 = data0.copy() + + if not data0.ndim: + data0.insert_dimension(inplace=True) + + if n_data == 1: + return data0 + + conformed_data = [data0] + for data1 in data[1:]: + # Turn any scalar array into a 1-d array + copied = False + if not data1.ndim: + if copy: + data1 = data1.copy() + copied = True + + data1.insert_dimension(inplace=True) + + # Check and conform the units of data1 with respect to + # those of data0 + data1 = cls._concatenate_conform_units( + data1, units0, relaxed_units, copy and not copied + ) + + conformed_data.append(data1) + + # Get data as dask arrays and apply concatenation operation + dxs = [ + d.to_dask_array(_force_mask_hardness=False, _force_to_memory=False) + for d in conformed_data + ] + dx = da.concatenate(dxs, axis=axis) + + # ------------------------------------------------------------ + # Set the aggregation write status + # ------------------------------------------------------------ + # + # Assume at first that all input data instances have True + # status, but then .. + CFA = cls._CFA + for d in conformed_data: + if not d.nc_get_aggregation_write_status(): + # 1) The status must be False when any input data + # object has False status. + CFA = cls._NONE + break + + if CFA != cls._NONE: + non_concat_axis_chunks0 = list(data[0].chunks) + non_concat_axis_chunks0.pop(axis) + for d in conformed_data[1:]: + non_concat_axis_chunks = list(d.chunks) + non_concat_axis_chunks.pop(axis) + if non_concat_axis_chunks != non_concat_axis_chunks0: + # 2) The status must be False when any two input + # data objects have different Dask chunk + # patterns for the non-concatenated axes. + CFA = cls._NONE + break + + # ------------------------------------------------------------ + # Set the aggregation fragment type + # ------------------------------------------------------------ + if CFA != cls._NONE: + fragment_type0 = data[0].nc_get_aggregation_fragment_type() + for d in conformed_data[1:]: + fragment_type1 = d.nc_get_aggregation_fragment_type() + if fragment_type1 != fragment_type0 and "location" in ( + fragment_type1, + fragment_type0, + ): + # 3) The status must be False when any two input + # Data objects have different fragment types, + # onew of which is 'location'. 
+ data0._nc_del_aggregation_fragment_type() + CFA = cls._NONE + break + + # ------------------------------------------------------------ + # Set the __in_memory__ status + # ------------------------------------------------------------ + in_memory = data[0].__in_memory__ + for d in conformed_data[1:]: + if d.__in_memory__ != in_memory: + # If and only if any two input Data objects have + # different __in_memory__ values, then set + # in_memory=False for the concatenation. + in_memory = False + break + + # ------------------------------------------------------------ + # Set the concatenated dask array + # ------------------------------------------------------------ + data0._set_dask(dx, clear=cls._ALL ^ CFA, in_memory=in_memory) + + # # ------------------------------------------------------------ + # # Set the aggregation 'aggregated_data' terms + # # ------------------------------------------------------------ + # if data0.nc_get_aggregation_write_status(): + # # Set the netCDF aggregated_data terms, giving precedence + # # to those towards the left hand side of the input + # # list. If any input Data object has no aggregated_data + # # terms, then nor will the concatenated data. + # aggregated_data = {} + # for d in conformed_data[::-1]: + # value = d.nc_get_aggregated_data() + # if not value: + # aggregated_data = {} + # break + # + # aggregated_data.update(value) + # + # data0.nc_set_aggregated_data(aggregated_data) + # + # ------------------------------------------------------------ + # Re-set appropriate cached elements (after '_set_dask' has + # just cleared them from data0) + # ------------------------------------------------------------ + cached_elements = {} + i = 0 + element = data0_cached_elements.get(i) + if element is not None: + cached_elements[i] = element + + i = -1 + element = conformed_data[i]._get_cached_elements().get(i) + if element is not None: + cached_elements[i] = element + + if cached_elements: + data0._set_cached_elements(cached_elements) + + # ------------------------------------------------------------ + # Apply extra post-processing to the concatenated data + # ------------------------------------------------------------ + data0 = cls._concatenate_post_process(data0, axis, conformed_data) + + # Return the concatenated data + return data0 + def creation_commands( self, name="data", namespace=None, indent=0, string=True ): @@ -4051,6 +4605,39 @@ def equals( else: return True + def file_directories(self, normalise=False): + """The directories of files containing parts of the data. + + Returns the locations of any files referenced by the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_filenames`, `replace_directory` + + :Returns: + + `set` + The unique set of file directories as absolute paths. + + **Examples** + + >>> d.file_directories() + {'https:///data/1', 'file:///data2'} + + """ + out = [] + append = out.append + for key, a in self.todict( + _force_mask_hardness=False, _force_to_memory=False + ).items(): + try: + append(a.file_directory(normalise=normalise)) + except AttributeError: + # This graph element doesn't contain a file name + pass + + return set(out) + @_inplace_enabled(default=False) def filled(self, fill_value=None, inplace=False): """Replace masked elements with a fill value. 
@@ -4104,7 +4691,7 @@ def filled(self, fill_value=None, inplace=False): _force_mask_hardness=False, _force_to_memory=False ) dx = dx.map_blocks(cfdm_filled, fill_value=fill_value, dtype=d.dtype) - d._set_dask(dx, in_memory=True) + d._set_dask(dx, clear=self._ALL, in_memory=True) return d @@ -4168,7 +4755,8 @@ def flatten(self, axes=None, inplace=False): .. versionadded:: (cfdm) 1.7.11 - .. seealso:: `insert_dimension`, `squeeze`, `transpose` + .. seealso:: `compressed`, `insert_dimension`, `squeeze`, + `transpose` :Parameters: @@ -4279,7 +4867,7 @@ def flatten(self, axes=None, inplace=False): dx = d.to_dask_array(_force_mask_hardness=True, _force_to_memory=True) dx = dx.reshape(new_shape) - d._set_dask(dx, in_memory=True) + d._set_dask(dx, clear=self._ALL, in_memory=True) # Update the axis names data_axes0 = d._axes @@ -4539,7 +5127,7 @@ def get_compressed_dimension(self, default=ValueError()): def get_data(self, default=ValueError(), _units=None, _fill_value=None): """Returns the data. - .. versionadded:: 3.0.0 + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -4657,21 +5245,27 @@ def get_deterministic_name(self): units._canonical_calendar, ) - def get_filenames(self): + def get_filenames(self, normalise=False, per_chunk=False): """The names of files containing parts of the data array. Returns the names of any files that may be required to deliver - the computed data array. This set may contain fewer names than - the collection of file names that defined the data when it was - first instantiated, as could be the case after the data has - been subspaced. + the computed data array. + + .. seealso:: `replace_filenames, `replace_directory` + + :Parameters: + + {{normalise: `bool`, optional}} - **Implementation** + .. versionadded:: (cfdm) NEXTVERSION - A `dask` chunk that contributes to the computed array is - assumed to reference data within a file if that chunk's array - object has a callable `get_filenames` method, the output of - which is added to the returned `set`. + per_chunk: `bool`, optional + Return a `numpy` array that provides the file name + that contributes to each Dask chunk. This array will + have the same shape as the Dask chunks (as returned by + the `numblocks` attribute). + + .. 
versionadded:: (cfdm) NEXTVERSION :Returns: @@ -4687,17 +5281,77 @@ def get_filenames(self): >>> f = {{package}}.example_field(0) >>> {{package}}.write(f, "file.nc") - >>> d = {{package}}.read("file.nc")[0].data + >>> d = {{package}}.read("file.nc", dask_chunks'128 B')[0].data >>> d.get_filenames() {'file.nc'} + >>> d.numblocks + (2, 2) + >>> filenames = d.get_filenames(per_chunk=True) + >>> filenames.shape + (2, 2) + >>> print(filenames) + [['file.nc' 'file.nc'] + ['file.nc' 'file.nc']] + """ + if per_chunk: + # -------------------------------------------------------- + # Return the filenames in a numpy array + # -------------------------------------------------------- + + # Maximum number of characters in any file name + n_char = 1 + + filenames = {} + for index, position in zip( + self.chunk_indices(), self.chunk_positions() + ): + for a in ( + self[index] + .todict(_force_mask_hardness=False, _force_to_memory=False) + .values() + ): + try: + filename = a.get_filename( + normalise=normalise, default=None + ) + except AttributeError: + pass + else: + if filename: + if position in filenames: + raise ValueError( + "Can't return 'per_chunk' file names: " + f"The Dask chunk in position {position} " + f"(defined by {index!r}) has multiple " + "file locations" + ) + + filenames[position] = filename + n_char = max(n_char, len(filename)) + + array = np.ma.masked_all( + self.numblocks, + dtype=f"U{n_char}", + ) + array.set_fill_value("") + + for position, filename in filenames.items(): + array[position] = filename + + return array + + # ------------------------------------------------------------ + # Return the filenames in a set + # ------------------------------------------------------------ out = [] + append = out.append for a in self.todict( _force_mask_hardness=False, _force_to_memory=False ).values(): try: - out.extend(a.get_filenames()) + append(a.get_filename(normalise=normalise)) except AttributeError: pass @@ -5022,6 +5676,8 @@ def insert_dimension(self, position=0, inplace=False): :Returns: `Data` or `None` + The new data with expanded data axes, or `None` if + the operation was in-place. 
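The implementation that follows inserts the new axis by indexing with `numpy.newaxis` rather than by reshaping, so the Dask chunking of the existing axes is left untouched. A standalone sketch of that indexing trick with plain dask (the array and position are chosen for illustration):

import dask.array as da
import numpy as np

dx = da.ones((4, 3), chunks=2)

position = 1
index = [slice(None)] * dx.ndim
index.insert(position, np.newaxis)
dx = dx[tuple(index)]

print(dx.shape)   # (4, 1, 3)
print(dx.chunks)  # ((2, 2), (1,), (2, 1))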
**Examples** @@ -5051,14 +5707,21 @@ def insert_dimension(self, position=0, inplace=False): f"Can't insert dimension: Invalid position {position!r}" ) - new_shape = list(d.shape) - new_shape.insert(position, 1) + # new_shape = list(d.shape) + # new_shape.insert(position, 1) - dx = d.to_dask_array(_force_mask_hardness=False, _force_to_memory=True) - dx = dx.reshape(new_shape) + dx = d.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ) + index = [slice(None)] * dx.ndim + index.insert(position, np.newaxis) + dx = dx[tuple(index)] - # Inserting a dimension doesn't affect the cached elements - d._set_dask(dx, clear=self._ALL ^ self._CACHE, in_memory=True) + # Inserting a dimension doesn't affect the cached elements or + # the CFA write status + d._set_dask( + dx, clear=self._ALL ^ self._CACHE ^ self._CFA, in_memory=None + ) # Expand _axes axis = new_axis_identifier(d._axes) @@ -5172,7 +5835,8 @@ def masked_values(self, value, rtol=None, atol=None, inplace=False): dx = d.to_dask_array(_force_mask_hardness=True, _force_to_memory=True) dx = da.ma.masked_values(dx, value, rtol=rtol, atol=atol) - d._set_dask(dx, in_memory=True) + d._set_dask(dx, clear=self._ALL, in_memory=True) + return d @_inplace_enabled(default=False) @@ -5218,7 +5882,10 @@ def masked_where(self, condition, inplace=False): array = cfdm_where(d.array, condition, masked, None, d.hardmask) dx = da.from_array(array, chunks=d.chunks) - d._set_dask(dx, in_memory=True) + d._set_dask(dx, clear=self._ALL, in_memory=True) + + # Update the deterministic status + d._update_deterministic(condition) # Update the deterministic status d._update_deterministic(condition) @@ -5495,20 +6162,15 @@ def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): @_inplace_enabled(default=False) def persist(self, inplace=False): - """Persist the underlying dask array into memory. + """Persist data into memory. - This turns an underlying lazy dask array into a equivalent - chunked dask array, but now with the results fully computed. - - `persist` is particularly useful when using distributed - systems, because the results will be kept in distributed - memory, rather than returned to the local process. + {{persist description}} Compare with `compute` and `array`. **Performance** - `persist` causes all delayed operations to be computed. + `persist` causes delayed operations to be computed. .. versionadded:: (cfdm) 1.11.2.0 @@ -5611,20 +6273,170 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - # Dask rechunking is essentially a wrapper for __getitem__ - # calls on the chunks, which means that we can use the same - # 'in_memory' and 'clear' keywords to `_set_dask` as are used - # in `__getitem__`. dx = d.to_dask_array( _force_mask_hardness=False, _force_to_memory=False ) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) d._set_dask( - dx, clear=self._ALL ^ self._ARRAY ^ self._CACHE, in_memory=None + dx, + clear=self._ALL ^ self._ARRAY ^ self._CACHE ^ self._CFA, + in_memory=None, ) return d + def replace_directory( + self, + old=None, + new=None, + normalise=False, + common=False, + ): + """Replace file directories in-place. + + Modifies the names of files that are required to deliver + the computed data array. + + .. versionadded:: (cfdm) NEXTVERSION + + .. 
seealso:: `file_directories`, `get_filenames` + + :Parameters: + + {{replace old: `str` or `None`, optional}} + + {{replace new: `str` or `None`, optional}} + + {{replace normalise: `bool`, optional}} + + common: `bool`, optional + If True the base directory structure that is common to + all files with *new*. + + :Returns: + + `None` + + **Examples** + + >>> d.get_filenames() + {'/data/file1.nc', '/home/file2.nc'} + >>> d.replace_directory('/data', '/new/data/path/') + >>> d.get_filenames() + {'/new/data/path/file1.nc', '/home/file2.nc'} + >>> d.replace_directory('/new/data, '/archive/location') + >>> d.get_filenames() + {'/archive/location/path/file1.nc', '/home/file2.nc'} + >>> d.replace_directory('/home') + >>> d.get_filenames() + {'/archive/location/path/file1.nc', 'file2.nc'} + + """ + if not old and not new and not normalise and not common: + return + + if common: + if old is not None: + raise ValueError( + "When 'common' is True, 'old' must be None " + "(because 'old' is going to be determined " + "automatically)" + ) + + old = commonprefix( + tuple(self.file_directories(normalise=normalise)) + ) + + self._modify_dask_graph( + "replace_directory", + (), + {"old": old, "new": new, "normalise": normalise}, + ) + + def replace_filenames(self, filenames): + """Replace file locations in-place. + + A fragment is a part of the data array that is stored in a + file. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_filenames` + + :Parameters: + + filenames: array_like + The replacement file names. It must either have the + same shape as the Dask chunks (as returned by the + `numblocks` attribute), or may also include an extra + trailing dimension for different file location + versions. Any output from the `get_filenames` method + with ``per_chunk=True`` is guaranteed to have an + acceptable shape. 
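For example, a possible round trip that relocates every per-chunk file, assuming `d` is a `Data` object in which each Dask chunk references at most one file (the paths are illustrative):

import numpy as np

filenames = d.get_filenames(per_chunk=True)   # shape == d.numblocks
new_filenames = np.char.replace(
    np.ma.filled(filenames, ""), "/old/location", "/new/location"
)
d.replace_filenames(new_filenames)  # chunks with no file name ("") are skipped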
+ + :Returns: + + `None` + + """ + filenames = np.ma.filled(filenames, "") + if self.numblocks != filenames.shape: + raise ValueError( + f"'filenames' shape {filenames.shape} must be the same " + f"as the Dask chunks shape {self.numblocks}" + ) + + dsk = self.todict(_force_mask_hardness=False, _force_to_memory=False) + + updated_keys = {} + for index, position in zip( + self.chunk_indices(), self.chunk_positions() + ): + filename = filenames[position] + if not filename: + # Don't replace the filename(s) for this chunk + continue + + chunk_updated = False + for key, a in ( + self[index] + .todict(_force_mask_hardness=False, _force_to_memory=False) + .items() + ): + try: + dsk[key] = a.replace_filename(filename) + except AttributeError: + pass + else: + if chunk_updated: + raise ValueError( + f"The Dask chunk in position {position} " + f"(defined by data index {index!r}) references multiple " + "file locations" + ) + + if key in updated_keys: + raise ValueError( + f"The Dask chunk in position {position} " + f"(defined by data index {index!r}) references a file " + "location that has already been replaced within " + "the Dask chunk in position " + f"{updated_keys[key][0]!r} (defined by " + f"{updated_keys[key][1]!r})" + ) + + chunk_updated = True + updated_keys[key] = (position, index) + + dx = self.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ) + dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) + self._set_dask(dx, clear=self._NONE, in_memory=None) + + # Update the deterministic status + self._update_deterministic(False) + @_inplace_enabled(default=False) def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): """Change the shape of the data without changing its values. @@ -5936,7 +6748,9 @@ def squeeze(self, axes=None, inplace=False): dx = dx.squeeze(axis=iaxes) # Squeezing a dimension doesn't affect the cached elements - d._set_dask(dx, clear=self._ALL ^ self._CACHE, in_memory=True) + d._set_dask( + dx, clear=self._ALL ^ self._CACHE ^ self._CFA, in_memory=True + ) # Remove the squeezed axis names d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] diff --git a/cfdm/data/fragment/__init__.py b/cfdm/data/fragment/__init__.py new file mode 100644 index 0000000000..8c7c76bd0f --- /dev/null +++ b/cfdm/data/fragment/__init__.py @@ -0,0 +1,4 @@ +from .fragmentfilearray import FragmentFileArray +from .fragmenth5netcdfarray import FragmentH5netcdfArray +from .fragmentnetcdf4array import FragmentNetCDF4Array +from .fragmentuniquevaluearray import FragmentUniqueValueArray diff --git a/cfdm/data/fragment/fragmentfilearray.py b/cfdm/data/fragment/fragmentfilearray.py new file mode 100644 index 0000000000..598dd53745 --- /dev/null +++ b/cfdm/data/fragment/fragmentfilearray.py @@ -0,0 +1,245 @@ +from os.path import join + +from uritools import urisplit + +from ...functions import abspath +from ..abstract import FileArray +from ..mixin import IndexMixin +from .mixin import FragmentArrayMixin + + +class FragmentFileArray( + FragmentArrayMixin, + IndexMixin, + FileArray, +): + """Fragment of aggregated data in a file. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __new__(cls, *args, **kwargs): + """Store fragment classes. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + # Import fragment classes. Do this here (as opposed to outside + # the class) to aid subclassing. + from . 
import FragmentH5netcdfArray, FragmentNetCDF4Array + + instance = super().__new__(cls) + instance._FragmentArrays = ( + FragmentNetCDF4Array, + FragmentH5netcdfArray, + ) + return instance + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + storage_options=None, + unpack_aggregated_data=True, + aggregated_attributes=None, + aggregation_file_directory=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The locations of fragment datasets containing the + array. + + address: (sequence of `str`), optional + How to find the array in the fragment datasets. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if is not known. This may differ from the data type of + the fragment's data. + + shape: `tuple`, optional + The shape of the fragment in its canonical form. + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the fragment dataset + during the first `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + unpack=True, + attributes=None, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_attributes = source._get_component( + "aggregated_attributes", None + ) + except AttributeError: + aggregated_attributes = None + + try: + unpack_aggregated_data = source._get_component( + "unpack_aggregated_data", True + ) + except AttributeError: + unpack_aggregated_data = True + + try: + aggregation_file_directory = source._get_component( + "aggregation_file_directory", None + ) + except AttributeError: + aggregation_file_directory = None + + self._set_component( + "aggregation_file_directory", + aggregation_file_directory, + copy=False, + ) + self._set_component( + "unpack_aggregated_data", + unpack_aggregated_data, + copy=False, + ) + if aggregated_attributes is not None: + self._set_component( + "aggregated_attributes", aggregated_attributes, copy=copy + ) + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + The method acts as a factory for either a + `NetCDF4FragmentArray`, `H5netcdfFragmentArray`, or + `UMFragmentArray` class, and it is the result of calling + `!_get_array` on the newly created instance that is returned. + + `H5netcdfFragmentArray` will only be used if + `NetCDF4FragmentArray` returns a `FileNotFoundError` + exception; and `UMFragmentArray` will only be used + if `H5netcdfFragmentArray` returns an `Exception`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + When a `tuple`, there must be a distinct entry for each + fragment dimension. + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + # Loop round the fragment array backends, in the order + # given by the `_FragmentArrays` attribute (which is + # defined in `__new__`), until we find one that can open + # the file. 
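+        # Each backend in `_FragmentArrays` is tried in turn. The
+        # error from every backend that fails is recorded, so that if
+        # none of them can read the fragment then all of the failures
+        # can be reported together in the final exception.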
+ if index is None: + index = self.index() + + errors = [] + for FragmentArray in self._FragmentArrays: + try: + array = FragmentArray(source=self, copy=False)._get_array( + index + ) + except Exception as error: # noqa: F841 + errors.append( + f"{FragmentArray().__class__.__name__}:\n" + f"{error.__class__.__name__}: {error}" + ) + else: + return array + + # Still here? + error = "\n\n".join(errors) + raise RuntimeError( + f"Can't access array index {index} from fragment file " + f"{self.get_filename()}:\n\n" + f"{error}" + ) + + def get_filename(self, normalise=False, default=AttributeError()): + """The name of the file containing the fragment. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + {{normalise: `bool`, optional}} + + default: optional + Return the value of the *default* parameter if there + is no file name. + + {{default Exception}} + + :Returns: + + `str` + The file name. + + """ + filename = super().get_filename(normalise=False, default=None) + if filename is None: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no file name" + ) + + if normalise: + uri = urisplit(filename) + + # Convert the file name to an absolute URI + if uri.isrelpath(): + # File name is a relative-path URI reference + filename = join( + self._get_component("aggregation_file_directory"), filename + ) + elif not (uri.isabsuri() or uri.isabspath()): + raise ValueError( + "Fragment file location must be an absolute URI, a " + "relative-path URI reference, or an absolute-path URI: " + f"Got: {filename!r}" + ) + + filename = abspath(filename, uri=True) + + return filename diff --git a/cfdm/data/fragment/fragmenth5netcdfarray.py b/cfdm/data/fragment/fragmenth5netcdfarray.py new file mode 100644 index 0000000000..ea9635a46c --- /dev/null +++ b/cfdm/data/fragment/fragmenth5netcdfarray.py @@ -0,0 +1,10 @@ +from ..h5netcdfarray import H5netcdfArray +from .mixin import FragmentFileArrayMixin + + +class FragmentH5netcdfArray(FragmentFileArrayMixin, H5netcdfArray): + """A fragment of aggregated data in a file accessed with `h5netcdf`. + + .. versionadded:: (cfdm) NEXTVERSION + + """ diff --git a/cfdm/data/fragment/fragmentnetcdf4array.py b/cfdm/data/fragment/fragmentnetcdf4array.py new file mode 100644 index 0000000000..0efa5d811a --- /dev/null +++ b/cfdm/data/fragment/fragmentnetcdf4array.py @@ -0,0 +1,10 @@ +from ..netcdf4array import NetCDF4Array +from .mixin import FragmentFileArrayMixin + + +class FragmentNetCDF4Array(FragmentFileArrayMixin, NetCDF4Array): + """A fragment of aggregated data in a file accessed with `netCDF4`. + + .. versionadded:: (cfdm) NEXTVERSION + + """ diff --git a/cfdm/data/fragment/fragmentuniquevaluearray.py b/cfdm/data/fragment/fragmentuniquevaluearray.py new file mode 100644 index 0000000000..4bec257917 --- /dev/null +++ b/cfdm/data/fragment/fragmentuniquevaluearray.py @@ -0,0 +1,87 @@ +from ..fullarray import FullArray +from .mixin import FragmentArrayMixin + + +class FragmentUniqueValueArray(FragmentArrayMixin, FullArray): + """A fragment of aggregated data that has a single unique value. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __init__( + self, + unique_value=None, + dtype=None, + shape=None, + unpack_aggregated_data=True, + aggregated_attributes=None, + attributes=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + unique_value: scalar + The unique value for the fragment. + + dtype: `numpy.dtype` + The data type of the aggregated array. 
May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple` + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + {{init attributes: `dict` or `None`, optional}} + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + fill_value=unique_value, + dtype=dtype, + shape=shape, + attributes=None, + source=source, + copy=False, + ) + + if source is not None: + + try: + aggregated_attributes = source._get_component( + "aggregated_attributes", None + ) + except AttributeError: + aggregated_attributes = None + + try: + unpack_aggregated_data = source._get_component( + "unpack_aggregated_data", True + ) + except AttributeError: + unpack_aggregated_data = True + + self._set_component( + "unpack_aggregated_data", + unpack_aggregated_data, + copy=False, + ) + if aggregated_attributes is not None: + self._set_component( + "aggregated_attributes", aggregated_attributes, copy=copy + ) diff --git a/cfdm/data/fragment/mixin/__init__.py b/cfdm/data/fragment/mixin/__init__.py new file mode 100644 index 0000000000..dad50f7451 --- /dev/null +++ b/cfdm/data/fragment/mixin/__init__.py @@ -0,0 +1,2 @@ +from .fragmentarraymixin import FragmentArrayMixin +from .fragmentfilearraymixin import FragmentFileArrayMixin diff --git a/cfdm/data/fragment/mixin/fragmentarraymixin.py b/cfdm/data/fragment/mixin/fragmentarraymixin.py new file mode 100644 index 0000000000..dda4ea0a80 --- /dev/null +++ b/cfdm/data/fragment/mixin/fragmentarraymixin.py @@ -0,0 +1,251 @@ +from math import prod + +import numpy as np + +from ....units import Units +from ...netcdfindexer import netcdf_indexer + + +class FragmentArrayMixin: + """Mixin class for a fragment of aggregated data. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + It is important that there is a distinct value for each + fragment dimension, which is guaranteed when the + default of the `index` attribute is being used. + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + if index is None: + index = self.index() + + try: + array = super()._get_array(index) + except ValueError: + # A ValueError is expected to be raised when the fragment + # variable has fewer than 'self.ndim' dimensions (we know + # that this is the case because 'index' has 'self.ndim' + # elements). + axis = self._size_1_axis() # index) + if axis is not None: + # There is a unique size 1 index that must correspond + # to the missing dimension => Remove it from the + # indices, get the fragment array with the new + # indices; and then insert the missing size one + # dimension. + index = list(index) + index.pop(axis) + array = super()._get_array(tuple(index)) + array = np.expand_dims(array, axis) + else: + # There are multiple size 1 indices so we don't know + # how many missing dimensions the fragment has, nor + # their positions => Get the full fragment array and + # then reshape it to the shape of the dask compute + # chunk; and then apply the index. 
+ array = super()._get_array(Ellipsis) + if array.size > prod(self.original_shape): + raise ValueError( + f"Can't get fragment data from ({self}) when " + "the fragment has two or more missing size 1 " + "dimensions, whilst also spanning two or more " + "Dask compute chunks." + "\n\n" + "Consider re-creating the data with exactly one " + "Dask compute chunk per fragment (e.g. by setting " + "'chunks=None' as a keyword to cf.read)." + ) + + array = array.reshape(self.original_shape) + array = array[index] + + array = self._conform_to_aggregated_units(array) + + # Apply any unpacking deinfed on the aggregation variable. Do + # this after conforming the units. + array = self._unpack_aggregated_data(array) + + return array + + def _conform_to_aggregated_units(self, array): + """Conform the array to have the aggregated units. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + array: `numpy.ndarray` or `dict` + The array to be conformed. If *array* is a `dict` with + `numpy` array values then selected values are + conformed. + + :Returns: + + `numpy.ndarray` or `dict` + The conformed array. The returned array may or may not + be the input array updated in-place, depending on its + data type and the nature of its units and the + aggregated units. + + If *array* is a `dict` then a dictionary of conformed + arrays is returned. + + """ + units = self.Units + if units: + aggregated_units = self.aggregated_Units + if not units.equivalent(aggregated_units): + raise ValueError( + f"Can't convert fragment data with units {units!r} to " + f"have aggregated units {aggregated_units!r}" + ) + + if units != aggregated_units: + if isinstance(array, dict): + # 'array' is a dictionary. + raise ValueError( + "TODOACTIVE. Placeholder notification thatn " + "we can't yet dealing with active " + "storage reductions on fragments." + ) + else: + # 'array' is a numpy array + array = Units.conform( + array, units, aggregated_units, inplace=True + ) + + return array + + def _size_1_axis(self): # , indices): + """Find the position of a unique size 1 index. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `_parse_indices`, `__getitem__` + + :Parameters: + + indices: sequence of index + The array indices to be parsed, as returned by + `_parse_indices`. + + :Returns: + + `int` or `None` + The position of the unique size 1 index, or `None` if + there are zero or at least two of them. + + **Examples** + + >>> a._size_1_axis(([2, 4, 5], slice(0, 1), slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], slice(3, 4), slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], [0], slice(0, 73))) + 1 + >>> a._size_1_axis(([2, 4, 5], slice(0, 144), slice(0, 73))) + None + >>> a._size_1_axis(([2, 4, 5], slice(3, 7), [0, 1])) + None + >>> a._size_1_axis(([2, 4, 5], slice(0, 1), [0])) + None + + """ + original_shape = self.original_shape + if original_shape.count(1): + return original_shape.index(1) + + return + + def _unpack_aggregated_data(self, array): + """Unpack the canonical data, if requested. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + if self.get_unpack_aggregated_data(): + array = netcdf_indexer( + array, + mask=False, + unpack=True, + attributes=self.get_aggregated_attributes(), + copy=False, + )[...] + + return array + + @property + def aggregated_Units(self): + """The units of the aggregated data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `Units` + The units of the aggregated data. 
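A small sketch of the conversion that `_conform_to_aggregated_units` applies to a fragment, assuming the same `Units` class that is imported at the top of this module (the values and units are illustrative):

import numpy as np
from cfdm.units import Units

fragment = np.array([1.0, 2.5, 4.0])   # fragment data stored in km
units = Units("km")
aggregated_units = Units("m")

if units and units != aggregated_units:
    if not units.equivalent(aggregated_units):
        raise ValueError("Fragment and aggregated units are not equivalent")

    # Convert the fragment values to the units of the aggregation
    fragment = Units.conform(fragment, units, aggregated_units, inplace=True)

print(fragment)  # [1000. 2500. 4000.]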
+ + """ + aggregated_attributes = self.get_aggregated_attributes(copy=False) + calendar = aggregated_attributes.get("calendar", None) + units = aggregated_attributes.get("units", None) + return Units(units, calendar) + + def get_aggregated_attributes(self, copy=True): + """The calendar of the aggregated array. + + If the calendar is `None` then the CF default calendar is + assumed, if applicable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + aggregated calendar has not been set. If set to an + `Exception` instance then it will be raised instead. + + :Returns: + + `str` or `None` + The calendar value. + + """ + attributes = self._get_component("aggregated_attributes") + return attributes.copy() + + def get_unpack_aggregated_data(self): + """Whether or not to unpack the canonical data. + + If `True` and there are aggregated variable packing + attributes, then the array is unpacked according to those + attributes. + + .. versionadded:: (cfdm) NEXTVERSION + + **Examples** + + >>> a.get_unpack_aggregated_data() + True + + """ + return self._get_component("unpack_aggregated_data") diff --git a/cfdm/data/fragment/mixin/fragmentfilearraymixin.py b/cfdm/data/fragment/mixin/fragmentfilearraymixin.py new file mode 100644 index 0000000000..6384f8f570 --- /dev/null +++ b/cfdm/data/fragment/mixin/fragmentfilearraymixin.py @@ -0,0 +1,94 @@ +from .fragmentarraymixin import FragmentArrayMixin + + +class FragmentFileArrayMixin(FragmentArrayMixin): + """Mixin class for a fragment of aggregated data in a file. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + storage_options=None, + unpack_aggregated_data=True, + aggregated_attributes=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. 
+ + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + unpack=True, + attributes=None, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_attributes = source._get_component( + "aggregated_attributes", None + ) + except AttributeError: + aggregated_attributes = None + + try: + unpack_aggregated_data = source._get_component( + "unpack_aggregated_data", True + ) + except AttributeError: + unpack_aggregated_data = True + + self._set_component( + "unpack_aggregated_data", + unpack_aggregated_data, + copy=False, + ) + self._set_component( + "aggregated_attributes", aggregated_attributes, copy=False + ) diff --git a/cfdm/data/fullarray.py b/cfdm/data/fullarray.py new file mode 100644 index 0000000000..42814224c3 --- /dev/null +++ b/cfdm/data/fullarray.py @@ -0,0 +1,253 @@ +import numpy as np + +from ..functions import indices_shape, parse_indices +from .abstract import Array +from .mixin import IndexMixin + +_FULLARRAY_HANDLED_FUNCTIONS = {} + + +class FullArray(IndexMixin, Array): + """A array filled with a given value. + + The array may be empty or all missing values. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __init__( + self, + fill_value=None, + dtype=None, + shape=None, + attributes=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + fill_value : scalar, optional + The fill value for the array. May be set to + `cf.masked` or `np.ma.masked`. + + dtype: `numpy.dtype` + The data type of the array. + + shape: `tuple` + The array dimension sizes. + + {{init attributes: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + fill_value = source._get_component("full_value", None) + except AttributeError: + fill_value = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + attributes = source._get_component("attributes", False) + except AttributeError: + attributes = None + + self._set_component("full_value", fill_value, copy=False) + self._set_component("dtype", dtype, copy=False) + self._set_component("shape", shape, copy=False) + self._set_component("attributes", attributes, copy=False) + + def __array_function__(self, func, types, args, kwargs): + """The `numpy` `__array_function__` protocol. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + if func not in _FULLARRAY_HANDLED_FUNCTIONS: + return NotImplemented + + # Note: This allows subclasses that don't override + # __array_function__ to handle FullArray objects + if not all(issubclass(t, self.__class__) for t in types): + return NotImplemented + + return _FULLARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) + + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return f"" + + def __str__(self): + """Called by the `str` built-in function. 
+ + x.__str__() <==> str(x) + + """ + fill_value = self.get_full_value() + if fill_value is None: + return "Uninitialised" + + return f"Filled with {fill_value!r}" + + def _get_array(self, index=None): + """Returns the data as a `numpy` array. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + :Returns: + + `numpy.ndarray` + The data. + + """ + if index is None: + shape = self.shape + else: + original_shape = self.original_shape + index = parse_indices(original_shape, index, keepdims=False) + shape = indices_shape(index, original_shape, keepdims=False) + + fill_value = self.get_full_value() + if fill_value is np.ma.masked: + array = np.ma.masked_all(shape, dtype=self.dtype) + elif fill_value is not None: + array = np.full(shape, fill_value=fill_value, dtype=self.dtype) + else: + array = np.empty(shape, dtype=self.dtype) + + return array + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + """ + return self._get_array() + + @property + def dtype(self): + """Data-type of the data elements.""" + return self._get_component("dtype") + + @property + def shape(self): + """Tuple of array dimension sizes.""" + return self._get_component("shape") + + def get_full_value(self, default=AttributeError()): + """Return the data array fill value. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `set_full_value` + + :Parameters: + + default: optional + Return the value of the *default* parameter if the + fill value has not been set. If set to an `Exception` + instance then it will be raised instead. + + :Returns: + + The fill value. + + """ + return self._get_component("full_value", default=default) + + def set_full_value(self, fill_value): + """Set the data array fill value. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_full_value` + + :Parameters: + + fill_value : scalar, optional + The fill value for the array. May be set to + `cf.masked` or `np.ma.masked`. + + :Returns: + + `None` + + """ + self._set_component("full_value", fill_value, copy=False) + + +def fullarray_implements(numpy_function): + """An __array_function__ implementation for `FullArray` objects. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def decorator(func): + _FULLARRAY_HANDLED_FUNCTIONS[numpy_function] = func + return func + + return decorator + + +@fullarray_implements(np.unique) +def unique( + a, return_index=False, return_inverse=False, return_counts=False, axis=None +): + """Version of `np.unique` that is optimised for `FullArray` objects. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + if return_index or return_inverse or return_counts or axis is not None: + # Fall back to the slow unique. (I'm sure we could probably do + # something more clever here, but there is no use case at + # present.) + return np.unique( + a[...], + return_index=return_index, + return_inverse=return_inverse, + return_counts=return_counts, + axis=axis, + ) + + # Fast unique based on the full value + x = a.get_full_value() + if x is np.ma.masked: + return np.ma.masked_all((1,), dtype=a.dtype) + + return np.array((x,), dtype=a.dtype) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index e98edf43d5..f20b7f0ef6 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -4,153 +4,19 @@ from . 
import abstract from .locks import netcdf_lock -from .mixin import FileArrayMixin, IndexMixin, NetCDFFileMixin +from .mixin import IndexMixin from .netcdfindexer import netcdf_indexer logger = logging.getLogger(__name__) -class H5netcdfArray( - IndexMixin, NetCDFFileMixin, FileArrayMixin, abstract.Array -): +class H5netcdfArray(IndexMixin, abstract.FileArray): """A netCDF array accessed with `h5netcdf`. .. versionadded:: (cfdm) 1.11.2.0 """ - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - unpack=True, - attributes=None, - storage_options=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the file(s) containing the array. - - address: (sequence of) `str`, optional - The identity of the variable in each file defined by - *filename*. Must be a netCDF variable name. - - dtype: `numpy.dtype` - The data type of the array in the file. May be `None` - if the numpy data-type is not known (which can be the - case for string types, for example). - - shape: `tuple` - The array dimension sizes in the file. - - {{init mask: `bool`, optional}} - - {{init unpack: `bool`, optional}} - - {{init attributes: `dict` or `None`, optional}} - - If *attributes* is `None`, the default, then the - attributes will be set from the netCDF variable during - the first `__getitem__` call. - - .. versionadded:: (cfdm) 1.11.2.0 - - {{init storage_options: `dict` or `None`, optional}} - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - unpack = source._get_component("unpack", True) - except AttributeError: - unpack = True - - try: - attributes = source._get_component("attributes", None) - except AttributeError: - attributes = None - - try: - storage_options = source._get_component( - "storage_options", None - ) - except AttributeError: - storage_options = None - - if shape is not None: - self._set_component("shape", shape, copy=False) - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, (str, int)): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", bool(mask), copy=False) - self._set_component("unpack", bool(unpack), copy=False) - self._set_component("storage_options", storage_options, copy=False) - self._set_component("attributes", attributes, copy=False) - - # By default, close the file after data array access - self._set_component("close", True, copy=False) - - def __dask_tokenize__(self): - """Return a value fully representative of the object. - - .. 
versionadded:: (cfdm) 1.11.2.0 - - """ - return super().__dask_tokenize__() + (self.get_mask(),) - @property def _lock(self): """Return the lock used for netCDF file access. @@ -218,6 +84,33 @@ def _get_array(self, index=None): return array + def _group(self, dataset, groups): + """Return the group object containing a variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `h5netcdf.File` + The dataset containing the variable. + + groups: sequence of `str` + The definition of which group the variable is in. For + instance, if the variable is in group + ``/forecast/model`` then *groups* would be + ``['forecast', 'model']``. + + :Returns: + + `h5netcdf.File` or `h5netcdf.Group` + The group object, which might be the root group. + + """ + for g in groups: + dataset = dataset.groups[g] + + return dataset + def _set_attributes(self, var): """Set the netCDF variable attributes. diff --git a/cfdm/data/mixin/__init__.py b/cfdm/data/mixin/__init__.py index d79c9e5f2f..2c9e1ef283 100644 --- a/cfdm/data/mixin/__init__.py +++ b/cfdm/data/mixin/__init__.py @@ -1,5 +1,3 @@ from .arraymixin import ArrayMixin from .compressedarraymixin import CompressedArrayMixin -from .filearraymixin import FileArrayMixin from .indexmixin import IndexMixin -from .netcdffilemixin import NetCDFFileMixin diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 1cd123b6a2..8856e12fb3 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -112,7 +112,7 @@ def Units(self): """ return Units(self.get_units(None), self.get_calendar(None)) - def get_attributes(self, default=ValueError()): + def get_attributes(self, copy=True): """The attributes of the array. .. versionadded:: (cfdm) 1.11.2.0 @@ -132,15 +132,11 @@ def get_attributes(self, default=ValueError()): """ attributes = self._get_component("attributes", None) if attributes is None: - if default is None: - return - - return self._default( - default, - f"{self.__class__.__name__} attributes have not yet been set", - ) + attributes = {} + elif copy: + attributes = deepcopy(attributes) - return deepcopy(attributes) + return attributes def get_calendar(self, default=ValueError()): """The calendar of the array. diff --git a/cfdm/data/mixin/compressedarraymixin.py b/cfdm/data/mixin/compressedarraymixin.py index 459a915b4e..c61a1304ad 100644 --- a/cfdm/data/mixin/compressedarraymixin.py +++ b/cfdm/data/mixin/compressedarraymixin.py @@ -41,7 +41,7 @@ def _lock_file_read(self, array): pass try: - array.get_filenames() + array.get_filename() except AttributeError: pass else: diff --git a/cfdm/data/mixin/indexmixin.py b/cfdm/data/mixin/indexmixin.py index 33509f0449..c97e751616 100644 --- a/cfdm/data/mixin/indexmixin.py +++ b/cfdm/data/mixin/indexmixin.py @@ -98,16 +98,28 @@ def __getitem__(self, index): """ shape0 = self.shape index0 = self.index(conform=False) - original_shape = self.original_shape + reference_shape = list(self.reference_shape) - index1 = parse_indices(shape0, index, keepdims=False) + index1 = parse_indices(shape0, index, keepdims=False, newaxis=True) new = self.copy() new_indices = [] new_shape = [] + newaxis = np.newaxis + if len(index1) > len(index0): + # Take new axes out of 'index1' for now. We'll put them + # back later. 
+ none_positions = [ + i for i, ind1 in enumerate(index1) if ind1 is newaxis + ] + index1 = [ind1 for ind1 in index1 if ind1 is not newaxis] + else: + none_positions = [] + i = 0 - for ind0, original_size in zip(index0, original_shape): + j = 0 + for ind0, reference_size in zip(index0, reference_shape[:]): if isinstance(ind0, Integral): # The previous call to __getitem__ resulted in a # dimension being removed (i.e. 'ind0' is @@ -120,7 +132,24 @@ def __getitem__(self, index): ind1 = index1[i] size0 = shape0[i] + i += 1 + if ind0 is newaxis: + if isinstance(ind1, Integral): + # A previously introduced new axis is being + # removed by an integer index + if ind1 not in (0, -1): + raise IndexError( + f"index {ind1} is out of bounds for axis {i - 1} " + "with size 1" + ) + + reference_shape.pop(i - 1 - j) + j += 1 + else: + new_indices.append(ind0) + + continue # If this dimension is not subspaced by the new index then # we don't need to update the old index. @@ -147,7 +176,7 @@ def __getitem__(self, index): if isinstance(ind1, slice): # ind0: slice # ind1: slice - start, stop, step = ind0.indices(original_size) + start, stop, step = ind0.indices(reference_size) start1, stop1, step1 = ind1.indices(size0) size1, mod1 = divmod(stop1 - start1, step1) @@ -170,7 +199,7 @@ def __getitem__(self, index): else: # ind0: slice # ind1: int, or array of int/bool - new_index = np.arange(*ind0.indices(original_size))[ind1] + new_index = np.arange(*ind0.indices(reference_size))[ind1] else: # ind0: array of int. If we made it to here then it # can't be anything else. This is @@ -187,10 +216,16 @@ def __getitem__(self, index): new_indices.append(new_index) + if none_positions: + for i in none_positions: + new_indices.insert(i, newaxis) + reference_shape.insert(i, 1) + new._custom["index"] = tuple(new_indices) + new._custom["reference_shape"] = tuple(reference_shape) # Find the shape defined by the new index - new_shape = indices_shape(new_indices, original_shape, keepdims=False) + new_shape = indices_shape(new_indices, reference_shape, keepdims=False) new._set_component("shape", tuple(new_shape), copy=False) return new @@ -243,6 +278,26 @@ def _get_array(self, index=None): f"Must implement {self.__class__.__name__}._get_array" ) + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + >>> n = numpy.asanyarray(a) + >>> isinstance(n, numpy.ndarray) + True + + """ + return self.__array__() + def index(self, conform=True): """The index to be applied when converting to a `numpy` array. @@ -251,7 +306,7 @@ def index(self, conform=True): .. versionadded:: (cfdm) 1.11.2.0 - .. seealso:: `shape`, `original_shape` + .. seealso:: `shape`, `original_shape`, `reference_shape` :Parameters: @@ -294,9 +349,11 @@ def index(self, conform=True): if ind is None: # No indices have been applied yet, so define indices that # are equivalent to Ellipsis, and set the original shape. - ind = (slice(None),) * self.ndim + shape = self.shape + ind = tuple([slice(0, n, 1) for n in shape]) self._custom["index"] = ind - self._custom["original_shape"] = self.shape + self._custom["original_shape"] = shape + self._custom["reference_shape"] = shape return ind if not conform: @@ -344,16 +401,37 @@ def index(self, conform=True): return tuple(ind) + @property + def reference_shape(self): + """The shape of the data in the file with added dimensions. 
+ + This is the same as `original_shape`, but with added size 1 + dimensions if `index` has new dimensions added with index + values of `numpy.newaxis`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `index`, `shape`, `original_shape` + + """ + out = self._custom.get("reference_shape") + if out is None: + # No subspace has been defined yet + out = self.original_shape + self._custom["reference_shape"] = out + + return out + @property def original_shape(self): - """The original shape of the data, before any subspacing. + """The shape of the data in the file. The `shape` is defined by the result of subspacing the data in its original shape with the indices given by `index`. .. versionadded:: (cfdm) 1.11.2.0 - .. seealso:: `index`, `shape` + .. seealso:: `index`, `shape`, `reference_shape` """ out = self._custom.get("original_shape") @@ -363,3 +441,19 @@ def original_shape(self): self._custom["original_shape"] = out return out + + def is_subspace(self): + """True if the index represents a subspace of the data. + + The presence of `numpy.newaxis` (i.e. added size 1 dimensions) + in `index` will not, on their own, cause `is_subspace` to + return `False` + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `index`, `shape` + + """ + newaxis = np.newaxis + index = [ind for ind in self.index() if ind is not newaxis] + return index != [slice(0, n, 1) for n in self.original_shape] diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 722730d475..6b93cbac21 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -2,202 +2,17 @@ from . import abstract from .locks import netcdf_lock -from .mixin import FileArrayMixin, IndexMixin, NetCDFFileMixin +from .mixin import IndexMixin from .netcdfindexer import netcdf_indexer -class NetCDF4Array( - IndexMixin, NetCDFFileMixin, FileArrayMixin, abstract.Array -): +class NetCDF4Array(IndexMixin, abstract.FileArray): """A netCDF array accessed with `netCDF4`. .. versionadded:: (cfdm) 1.7.0 """ - def __init__( - self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - unpack=True, - attributes=None, - storage_options=None, - source=None, - copy=True, - ): - """**Initialisation** - - :Parameters: - - filename: (sequence of) `str`, optional - The name of the netCDF file(s) containing the array. - - address: (sequence of) `str` or `int`, optional - The identity of the netCDF variable in each file - defined by *filename*. Either a netCDF variable name - or an integer netCDF variable ID. - - .. versionadded:: (cfdm) 1.10.1.0 - - dtype: `numpy.dtype` - The data type of the array in the netCDF file. May be - `None` if the numpy data-type is not known (which can be - the case for netCDF string types, for example). - - shape: `tuple` - The array dimension sizes in the netCDF file. - - {{init mask: `bool`, optional}} - - .. versionadded:: (cfdm) 1.8.2 - - {{init unpack: `bool`, optional}} - - .. versionadded:: (cfdm) 1.11.2.0 - - {{init attributes: `dict` or `None`, optional}} - - If *attributes* is `None`, the default, then the - attributes will be set from the netCDF variable during - the first `__getitem__` call. - - .. versionadded:: (cfdm) 1.11.2.0 - - {{init storage_options: `dict` or `None`, optional}} - - .. versionadded:: (cfdm) 1.11.2.0 - - {{init source: optional}} - - .. versionadded:: (cfdm) 1.10.0.0 - - {{init copy: `bool`, optional}} - - .. 
versionadded:: (cfdm) 1.10.0.0 - - missing_values: Deprecated at version 1.11.2.0 - The missing value indicators defined by the netCDF - variable attributes. They may now be recorded via the - *attributes* parameter - - ncvar: Deprecated at version 1.10.1.0 - Use the *address* parameter instead. - - varid: Deprecated at version 1.10.1.0 - Use the *address* parameter instead. - - group: Deprecated at version 1.10.1.0 - Use the *address* parameter instead. - - units: `str` or `None`, optional - Deprecated at version 1.11.2.0. Use the - *attributes* parameter instead. - - calendar: `str` or `None`, optional - Deprecated at version 1.11.2.0. Use the - *attributes* parameter instead. - - """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - unpack = source._get_component("unpack", True) - except AttributeError: - unpack = True - - try: - attributes = source._get_component("attributes", None) - except AttributeError: - attributes = None - - try: - storage_options = source._get_component( - "storage_options", None - ) - except AttributeError: - storage_options = None - - if shape is not None: - self._set_component("shape", shape, copy=False) - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, (str, int)): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", bool(mask), copy=False) - self._set_component("unpack", bool(unpack), copy=False) - self._set_component("storage_options", storage_options, copy=False) - self._set_component("attributes", attributes, copy=False) - - # By default, close the netCDF file after data array access - self._set_component("close", True, copy=False) - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return f"<{self.__class__.__name__}{self.shape}: {self}>" - - def __str__(self): - """Called by the `str` built-in function. - - x.__str__() <==> str(x) - - """ - return f"{self.get_filename(None)}, {self.get_address()}" - - def __dask_tokenize__(self): - """Return a value fully representative of the object. - - .. versionadded:: (cfdm) 1.11.2.0 - - """ - return super().__dask_tokenize__() + (self.get_mask(),) - @property def _lock(self): """Return the lock used for netCDF file access. @@ -276,6 +91,33 @@ def _get_array(self, index=None): return array + def _group(self, dataset, groups): + """Return the group object containing a variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `netCDF4.Dataset + The dataset containing the variable. + + groups: sequence of `str` + The definition of which group the variable is in. For + instance, if the variable is in group + ``/forecast/model`` then *groups* would be + ``['forecast', 'model']``. 
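A short sketch of the traversal that `_group` performs, written against the `netCDF4` API directly (the file and group names are hypothetical):

import netCDF4

ds = netCDF4.Dataset("file.nc", "r")
group = ds
for g in ["forecast", "model"]:
    group = group.groups[g]

# 'group' is now the /forecast/model group object (or the root group
# when the list of group names is empty).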
+ + :Returns: + + `netCDF4.Dataset` or `netCDF4.Group` + The group object, which might be the root group. + + """ + for g in groups: + dataset = dataset.groups[g] + + return dataset + def _set_attributes(self, var): """Set the netCDF variable attributes. @@ -303,6 +145,24 @@ def _set_attributes(self, var): attributes = {attr: var.getncattr(attr) for attr in var.ncattrs()} self._set_component("attributes", attributes, copy=False) + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Parameters: + + dataset: + The dataset to be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + def get_groups(self, address): """The netCDF4 group structure of a netCDF variable. @@ -347,24 +207,6 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] - def close(self, dataset): - """Close the dataset containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Parameters: - - dataset: `netCDF4.Dataset` - The netCDF dataset to be closed. - - :Returns: - - `None` - - """ - if self._get_component("close"): - dataset.close() - def open(self): """Return a dataset file object and address. diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 75ce87227e..ec0b176867 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -225,7 +225,39 @@ def __getitem__(self, index): # ------------------------------------------------------------ # Index the variable # ------------------------------------------------------------ - data = self._index(index) + try: + data = self._index(index) + except (IndexError, AttributeError): + # Assume we are here because we have one or more + # np.newaxis values in 'index', and the variable doesn't + # support that type of indexing. It is known that + # `netCDF4` raises an IndexError and h5netcdf raises an + # AttributeError. + + # Subspace the variable with the np.newaxis elements + # removed + newaxis = np.newaxis + index1 = [i for i in index if i is not newaxis] + data = self._index(tuple(index1)) + + # Now subspace the result (which we're assuming is + # something that likes np.newaxis indices) with the + # np.newaxis elements reinstated. + index2 = [i if i is newaxis else slice(None) for i in index] + data = self._index(tuple(index2), data=data) + + # E.g. index : (1, np.newaxis, slice(1, 5)) + # => index1: (1, slice(1, 5)) + # and index2: (slice(None), np.newaxis, slice(None)) + except ValueError: + # Something went wrong, which is indicative of the + # variable not supporting the appropriate slicing method + # (e.g. `h5netcdf` might have returned "ValueError: Step + # must be >= 1 (got -2)"). Therefore we'll just get the + # entire array as a numpy array, and then try indexing + # that. + data = self._index(Ellipsis) + data = self._index(index, data=data) # Reset a netCDF4 variable's scale and mask behaviour if netCDF4_scale: @@ -362,7 +394,7 @@ def _default_FillValue(self, dtype): return default_fillvals[dtype.str[1:]] - def _index(self, index): + def _index(self, index, data=None): """Get a subspace of the variable. .. versionadded:: (cfdm) 1.11.2.0 @@ -374,13 +406,20 @@ def _index(self, index): index: The indices that define the subspace. + data: array_like, optional + The data to be indexed. If `None` (the default) then + the data given by the `variable` attribute will be + used. + :Returns: `numpy.ndarray` The subspace of the variable. 
""" - data = self.variable + if data is None: + data = self.variable + if index is Ellipsis: return data[...] @@ -754,6 +793,24 @@ def _unpack(self, data, attributes): return data + def _size_1_axis(self): + """Find the position of a unique size 1 index. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `int` or `None` + The position of the unique size 1 index, or `None` if + there are zero or at least two of them. + + """ + shape = self.shape + if shape.count(1): + return shape.index(1) + + return + @property def dtype(self): """The data type of the array elements. diff --git a/cfdm/data/numpyarray.py b/cfdm/data/numpyarray.py index 114c2b0c45..b84e987256 100644 --- a/cfdm/data/numpyarray.py +++ b/cfdm/data/numpyarray.py @@ -1,58 +1,22 @@ -from .. import core -from .mixin import ArrayMixin -from .netcdfindexer import netcdf_indexer +class DeprecationError(Exception): + """Deprecation error.""" + pass -class NumpyArray(ArrayMixin, core.NumpyArray): + +class NumpyArray: """An underlying `numpy` array. + Deprecated at version NEXTVERSION and is no longer available. Use + `numpy` instead. + .. versionadded:: (cfdm) 1.7.0 """ - def __getitem__(self, indices): - """Returns a subspace of the array as a `numpy` array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` - or a sequence that contains an index for each dimension. In - the latter case, each dimension's index must either be a - `slice` object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). - - .. versionadded:: (cfdm) 1.7.0 - - """ - array = netcdf_indexer( - self._get_component("array"), - mask=False, - unpack=False, - always_masked_array=False, - orthogonal_indexing=True, - copy=True, + def __init__(self, *args, **kwargs): + """**Initialisation**""" + raise DeprecationError( + f"{self.__class__.__name__} was deprecated at version NEXTVERSION " + "and is no longer available. Use numpy instead." ) - return array[indices] - - def to_memory(self): - """Bring data on disk into memory. - - There is no change to data that is already in memory. - - .. versionadded:: (cfdm) 1.7.0 - - :Returns: - - `{{class}}` - A copy of the array with all of its data in memory. - - """ - return self diff --git a/cfdm/data/sparsearray.py b/cfdm/data/sparsearray.py index 93b1829e40..337a886715 100644 --- a/cfdm/data/sparsearray.py +++ b/cfdm/data/sparsearray.py @@ -1,7 +1,7 @@ -from .numpyarray import NumpyArray +from .abstract import Array -class SparseArray(NumpyArray): +class SparseArray(Array): """An underlying sparse array. The sparse array is assumed to have the same API as `scipy` sparse @@ -11,6 +11,32 @@ class SparseArray(NumpyArray): """ + def __init__(self, array=None, source=None, copy=True): + """**Initialisation** + + :Parameters: + + array: `numpy.ndarray` + The numpy array. 
+ + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + array = source._get_component("array", None) + except AttributeError: + array = None + + self._set_component("array", array, copy=False) + if array is not None: + self._set_component("dtype", array.dtype, copy=False) + self._set_component("shape", array.shape, copy=False) + @property def array(self): """Return an independent numpy array containing the data. @@ -52,3 +78,16 @@ def sparse_array(self): """ return self._get_component("array").copy() + + def to_memory(self): + """Bring data on disk into memory. + + .. versionadded:: (cfdm) 1.11.0.0 + + :Returns: + + Returns the contained sparse array, which is already + in memory. + + """ + return self diff --git a/cfdm/data/subarray/abstract/subarray.py b/cfdm/data/subarray/abstract/subarray.py index fc1d061758..c7f9025414 100644 --- a/cfdm/data/subarray/abstract/subarray.py +++ b/cfdm/data/subarray/abstract/subarray.py @@ -263,20 +263,38 @@ def compressed_dimensions(self): return c.copy() - def get_filenames(self): - """Return the names of any files containing the data. + def get_filename(self, normalise=True, default=AttributeError()): + """Return the name of the file containing the data. .. versionadded:: (cfdm) 1.10.0.2 + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + default: optional + Return the value of the *default* parameter if there + is no file name. + + {{default Exception}} + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: - `set` - The file names in normalised, absolute form. If the - data are all in memory then an empty `set` is - returned. + `str` + The file name. """ - try: - return tuple(self.data.get_filenames()) - except AttributeError: - return () + filenames = self.data.get_filenames(normalise=normalise) + if len(filenames) != 1: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no unique file name" + ) + + return filenames[0] diff --git a/cfdm/data/subarray/abstract/subsampledsubarray.py b/cfdm/data/subarray/abstract/subsampledsubarray.py index 0193a9c77f..3b7f489604 100644 --- a/cfdm/data/subarray/abstract/subsampledsubarray.py +++ b/cfdm/data/subarray/abstract/subsampledsubarray.py @@ -603,33 +603,60 @@ def subarea_indices(self): """ return self._get_component("subarea_indices") - def get_filenames(self): - """Return the names of any files containing the data. - - Includes the names of files that contain any parameters and - dependent tie points. + def get_filename(self, normalise=True, default=AttributeError()): + """Return the name of the file containing the data. .. versionadded:: (cfdm) 1.10.0.2 + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + default: optional + Return the value of the *default* parameter if there + is no file name. + + {{default Exception}} + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: - `tuple` - The file names in normalised, absolute form. If the - data are all in memory then an empty `set` is - returned. + `str` + The file name. 
""" - filenames = super().get_filenames() + filename = super().get_filename(normalise=normalise, default=None) + if filename is None: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no unique file name" + ) + filenames = [filename] for x in tuple(self.parameters.values()) + tuple( self.dependent_tie_points.values() ): try: - filenames += x.get_filenames() + filenames2 = x.get_filenames(normalise=normalise) except AttributeError: pass + else: + filenames.extend(filenames2) + + if len(set(filenames)) != 1: + if default is None: + return + + return self._default( + default, f"{self.__class__.__name__} has no unique file name" + ) - return tuple(set(filenames)) + return filenames[0] def get_interpolation_description(self, default=ValueError()): """Get a non-standardised interpolation method description. diff --git a/cfdm/data/utils.py b/cfdm/data/utils.py index a2ec44c786..7be7ee095e 100644 --- a/cfdm/data/utils.py +++ b/cfdm/data/utils.py @@ -278,10 +278,10 @@ def convert_to_reftime(a, units=None, first_value=None): An array of string or object date-times units: `Units`, optional - Specify the units for the output reference time - values. By default the units are inferred from the first - non-missing value in the array, or set to ```` if all values are missing. + Specify the units for the output reference time values. By + default the units are inferred from the first non-missing + value in the array, or set to ```` if all values are missing. first_value: optional If set, then assumed to be equal to the first non-missing @@ -567,6 +567,48 @@ def new_axis_identifier(existing_axes=(), basename="dim"): return axis +def chunk_indices(chunks): + """Return indices that define each dask chunk. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `chunks` + + :Parameters: + + chunks: `tuple` + The chunk sizes along each dimension, as output by + `dask.array.Array.chunks`. + + :Returns: + + `itertools.product` + An iterator over tuples of indices of the data array. + + **Examples** + + >>> chunks = ((1, 2), (9,), (4, 5, 6))) + >>> for index in cfdm.data.utils.chunk_indices(): + ... print(index) + ... + (slice(0, 1, None), slice(0, 9, None), slice(0, 4, None)) + (slice(0, 1, None), slice(0, 9, None), slice(4, 9, None)) + (slice(0, 1, None), slice(0, 9, None), slice(9, 15, None)) + (slice(1, 3, None), slice(0, 9, None), slice(0, 4, None)) + (slice(1, 3, None), slice(0, 9, None), slice(4, 9, None)) + (slice(1, 3, None), slice(0, 9, None), slice(9, 15, None)) + + """ + from dask.utils import cached_cumsum + + cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] + indices = [ + [slice(s, s + dim) for s, dim in zip(starts, shapes)] + for starts, shapes in zip(cumdims, chunks) + ] + return product(*indices) + + def chunk_positions(chunks): """Find the position of each chunk. @@ -582,7 +624,7 @@ def chunk_positions(chunks): **Examples** - >>> chunks = ((1, 2), (9,), (44, 55, 66)) + >>> chunks = ((1, 2), (9,), (4, 5, 6)) >>> for position in cfdm.data.utils.chunk_positions(chunks): ... print(position) ... 
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 3f04c64d65..2cfd5ef5d4 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -119,6 +119,540 @@ was produced by combining other objects that also store their original file names, then the returned files will be the collection of original files from all contributing sources.""", + # read external + "{{read external: (sequence of) `str`, optional}}": """external: (sequence of) `str`, optional + Read external variables (i.e. variables which are named by + attributes, but are not present, in the parent file given + by the *filename* parameter) from the given external + files. Ignored if the parent file does not contain a + global ``external_variables`` attribute. Multiple external + files may be provided, which are searched in random order + for the required external variables. + + If an external variable is not found in any external + files, or is found in multiple external files, then the + relevant metadata construct is still created, but without + any metadata or data. In this case the construct's + `!is_external` method will return `True`. + + *Parameter example:* + ``external='cell_measure.nc'`` + + *Parameter example:* + ``external=['cell_measure.nc']`` + + *Parameter example:* + ``external=('cell_measure_A.nc', 'cell_measure_O.nc')``""", + # read extra + "{{read extra: (sequence of) `str`, optional}}": """extra: (sequence of) `str`, optional + Create extra, independent fields from netCDF variables + that correspond to particular types of metadata constructs. + Ignored if *domain* is True. + + The *extra* parameter may be one, or a sequence, of: + + ========================== =============================== + *extra* Metadata constructs + ========================== =============================== + ``'field_ancillary'`` Field ancillary constructs + ``'domain_ancillary'`` Domain ancillary constructs + ``'dimension_coordinate'`` Dimension coordinate constructs + ``'auxiliary_coordinate'`` Auxiliary coordinate constructs + ``'cell_measure'`` Cell measure constructs + ========================== =============================== + + *Parameter example:* + To create fields from auxiliary coordinate constructs: + ``extra='auxiliary_coordinate'`` or + ``extra=['auxiliary_coordinate']``. + + *Parameter example:* + To create fields from domain ancillary and cell measure + constructs: ``extra=['domain_ancillary', + 'cell_measure']``. + + An extra field construct created via the *extra* parameter + will have a domain limited to that which can be inferred + from the corresponding netCDF variable, but without the + connections that are defined by the parent netCDF data + variable. It is possible to create independent fields from + metadata constructs that do incorporate as much of the + parent field construct's domain as possible by using the + `~{{package}}.Field.convert` method of a returned field + construct, instead of setting the *extra* parameter.""", + # read verbose + "{{read verbose: `int` or `str` or `None`, optional}}": """verbose: `int` or `str` or `None`, optional + If an integer from ``-1`` to ``3``, or an equivalent string + equal ignoring case to one of: + + * ``'DISABLE'`` (``0``) + * ``'WARNING'`` (``1``) + * ``'INFO'`` (``2``) + * ``'DETAIL'`` (``3``) + * ``'DEBUG'`` (``-1``) + + set for the duration of the method call only as the + minimum cut-off for the verboseness level of displayed + output (log) messages, regardless of the + globally-configured `{{package}}.log_level`. 
Note that + increasing numerical value corresponds to increasing + verbosity, with the exception of ``-1`` as a special case + of maximal and extreme verbosity. + + Otherwise, if `None` (the default value), output messages + will be shown according to the value of the + `{{package}}.log_level` setting. + + Overall, the higher a non-negative integer or equivalent string + that is set (up to a maximum of ``3``/``'DETAIL'``) for + increasing verbosity, the more description that is printed to + convey how the contents of the netCDF file were parsed and + mapped to CF data model constructs.""", + # read warnings + "{{read warnings: `bool`, optional}}": """warnings: `bool`, optional + If True then print warnings when an output field construct + is incomplete due to structural non-compliance of the + dataset. By default such warnings are not displayed.""", + # read warn_valid + "{{read warn_valid: `bool`, optional}}": """warn_valid: `bool`, optional + If True then print a warning for the presence of + ``valid_min``, ``valid_max`` or ``valid_range`` properties + on field constructs and metadata constructs that have + data. By default no such warning is issued. + + "Out-of-range" data values in the file, as defined by any + of these properties, are automatically masked by default, + which may not be as intended. See the *mask* parameter for + turning off all automatic masking.""", + # read mask + "{{read mask: `bool`, optional}}": """mask: `bool`, optional + If True (the default) then mask by convention the data of + field and metadata constructs. + + A netCDF array is masked depending on the values of any of + the netCDF attributes ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_min``, ``valid_max``, and + ``valid_range``.""", + # read unpack + "{{read unpack: `bool`}}": """unpack: `bool` + If True, the default, then unpack arrays by convention + when the data is read from disk. + + Unpacking is determined by netCDF conventions for the + following variable attributes: ``add_offset``, + ``scale_factor``, and ``_Unsigned``.""", + # read domain + "{{read domain: `bool`, optional}}": """domain: `bool`, optional + If True then return only the domain constructs that are + explicitly defined by CF-netCDF domain variables, ignoring + all CF-netCDF data variables. By default only the field + constructs defined by CF-netCDF data variables are + returned. + + CF-netCDF domain variables are only defined from CF-1.9, + so older datasets automatically contain no CF-netCDF + domain variables. + + The unique domain constructs of the dataset are + found with the `{{package}}.unique_constructs` + function. For example:: + + >>> d = {{package}}.read('file.nc', domain=True) + >>> ud = {{package}}.unique_constructs(d) + >>> f = {{package}}.read('file.nc') + >>> ufd = {{package}}.unique_constructs(x.domain for x in f)""", + # read netcdf_backend + "{{read netcdf_backend: `None` or (sequence of) `str`, optional}": """netcdf_backend: `None` or (sequence of) `str`, optional + Specify which library, or libraries, to use for opening + and reading netCDF files. By default, or if `None`, then + the first one of `h5netcdf` and `netCDF4` to successfully + open the netCDF file is used. The libraries will be used + in the order given, until a file is successfully + opened.""", + # read storage_options + "{{read storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional + Pass parameters to the backend file system driver, such as + username, password, server, port, etc. 
How the storage + options are interpreted depends on the location of the + file: + + * **Local File System**: Storage options are ignored for + local files. + + * **HTTP(S)**: Storage options are ignored for files + available across the network via OPeNDAP. + + * **S3-compatible services**: The backend used is `s3fs`, + and the storage options are used to initialise an + `s3fs.S3FileSystem` file system object. By default, or + if `None`, then *storage_options* is taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options*, nor in a dictionary defined by the + ``'client_kwargs'`` key (both of which are the case when + *storage_options* is `None`), then one will be + automatically inserted for accessing an S3 file. For + instance, with a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. To + disable this, set the ``'endpoint_url'`` key to `None`. + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``None``, ``{}``, + ``{'endpoint_url': 'https://store'}``, and + ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` + + *Parameter example:* + ``{'key': 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}``""", + # read cache + "{{read cache: `bool`, optional}}": """cache: `bool`, optional + If True, the default, then cache the first and last array + elements of metadata constructs (not field constructs) for + fast future access. In addition, the second and + penultimate array elements will be cached from coordinate + bounds when there are two bounds per cell. For remote + data, setting *cache* to False may speed up the parsing of + the file.""", + # read dask_chunks + "{{read dask_chunks: `str`, `int`, `None`, or `dict`, optional}}": """dask_chunks: `str`, `int`, `None`, or `dict`, optional + Specify the Dask chunking for data. May be one of the + following: + + * ``'storage-aligned'`` + + This is the default. The Dask chunk size in bytes will + be as close as possible the size given by + `{{package}}.chunksize`, favouring square-like chunk + shapes, with the added restriction that the entirety of + each storage chunk lies within exactly one Dask + chunk. This strategy is general the most performant, as + it ensures that when accessing the data, each storage + chunk is read from disk at most once (as opposed to once + per Dask chunk in which it lies). + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60). This has an overall size of + 54.93 MiB, partitioned into 240 storage chunks each of + size 100*5*60*8 bytes = 0.23 MiB. Then: + + * If `{{package}}.chunksize` returns 134217728 (i.e. 128 + MiB), then the storage-aligned Dask chunks will have + shape (400, 300, 60), giving 1 Dask chunk with size of + 54.93 MiB (compare with a Dask chunk shape of (400, + 300, 60) and size 54.93 MiB when *dask_chunks* is + ``'auto'``.) + + * If `{{package}}.chunksize` returns 33554432 (i.e. 32 + MiB), then the storage-aligned Dask chunks will have + shape (200, 260, 60), giving 4 Dask chunks with a + maximum size of 23.80 MiB (compare with a Dask chunk + shape of (264, 264, 60) and maximum size 31.90 MiB + when *dask_chunks* is ``'auto'``.) + + * If `{{package}}.chunksize` returns 4194304 (i.e. 
4 + MiB), then the storage-aligned Dask chunks will have + shape (100, 85, 60), giving 16 Dask chunks with a + maximum size of 3.89 MiB (compare with a Dask chunk + shape of (93, 93, 60) and maximum size 3.96 MiB when + *dask_chunks* is ``'auto'``.) + + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-aligned Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``'storage-exact'`` + + Each Dask chunk will contain exactly one storage chunk + and each storage chunk will lie within exactly one Dask + chunk. + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60). This has an overall size of + 54.93 MiB, partitioned into 240 storage chunks each of + size 100*5*60*8 bytes = 0.23 MiB. The corresponding + storage-exact Dask chunks will also have shape (100, 5, + 60), giving 240 Dask chunks with a maximum size of 0.23 + MiB. + + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-exact Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``auto`` + + The Dask chunk size in bytes will be as close as + possible to the size given by `{{package}}.chunksize`, + favouring square-like chunk shapes. This may give + similar Dask chunk shapes as the ``'storage-aligned'`` + option, but without the guarantee that each storage + chunk will lie within exactly one Dask chunk. + + * A byte-size given by a `str` + + The Dask chunk size in bytes will be as close as + possible to the given byte-size, favouring square-like + chunk shapes. Any string value, accepted by the *chunks* + parameter of the `dask.array.from_array` function is + permitted. + + *Example:* + A Dask chunksize of 2 MiB may be specified as + ``'2097152'`` or ``'2 MiB'``. + + * `-1` or `None` + + There is no Dask chunking, i.e. every data array has one + Dask chunk regardless of its size. + + * Positive `int` + + Every dimension of all Dask chunks has this number of + elements. + + *Example:* + For 3-dimensional data, *dask_chunks* of `10` will + give Dask chunks with shape (10, 10, 10). + + * `dict` + + Each of dictionary key identifies a file dimension, with + a value that defines the Dask chunking for that + dimension whenever it is spanned by a data array. A file + dimension is identified in one of three ways: + + 1. the netCDF dimension name, preceded by ``ncdim%`` + (e.g. ``'ncdim%lat'``); + + 2. the value of the "standard name" attribute of a + CF-netCDF coordinate variable that spans the + dimension (e.g. ``'latitude'``); + + 3. the value of the "axis" attribute of a CF-netCDF + coordinate variable that spans the dimension + (e.g. ``'Y'``). 
+
+                The dictionary values may be a byte-size string,
+                ``'auto'``, `int` or `None`, with the same meanings as
+                those types for the *dask_chunks* parameter itself, but
+                applying only to the specified dimension. In addition, a
+                dictionary value may be a `tuple` or `list` of integers
+                that sum to the dimension size.
+
+                Not specifying a file dimension in the dictionary is
+                equivalent to it being defined with a value of
+                ``'auto'``.
+
+                *Example:*
+                  ``{'T': '0.5 MiB', 'Z': 'auto', 'Y': [36, 37], 'X':
+                  None}``
+
+                *Example:*
+                  If a netCDF file contains dimensions ``time``, ``z``,
+                  ``lat`` and ``lon``, then ``{'ncdim%time': 12,
+                  'ncdim%lat': None, 'ncdim%lon': None}`` will ensure
+                  that, for all applicable data arrays, all ``time``
+                  axes have a `dask` chunksize of 12; all ``lat`` and
+                  ``lon`` axes are not `dask` chunked; and all ``z``
+                  axes are `dask` chunked to comply as closely as
+                  possible with the default `dask` chunk size.
+
+                  If the netCDF file also contains a ``time`` coordinate
+                  variable with a "standard_name" attribute of
+                  ``'time'`` or "axis" attribute of ``'T'``, then the
+                  same `dask` chunking could be specified with either
+                  ``{'time': 12, 'ncdim%lat': None, 'ncdim%lon': None}``
+                  or ``{'T': 12, 'ncdim%lat': None, 'ncdim%lon':
+                  None}``.""",
+    # read store_dataset_chunks
+    "{{read store_dataset_chunks: `bool`, optional}}": """store_dataset_chunks: `bool`, optional
+            If True (the default) then store the dataset chunking
+            strategy for each returned data array. The dataset
+            chunking strategy is then accessible via an object's
+            `nc_hdf5_chunksizes` method. When the dataset chunking
+            strategy is stored, it will be used when the data is
+            written to a new netCDF4 file with `{{package}}.write`
+            (unless the strategy is modified prior to writing).
+
+            If False, or if the dataset format does not support
+            chunking, then no dataset chunking strategy is stored
+            (i.e. an `nc_hdf5_chunksizes` method will return `None`
+            for all `Data` objects). In this case, when the data is
+            written to a new netCDF4 file, the dataset chunking
+            strategy will be determined by `{{package}}.write`.
+
+            See the `{{package}}.write` *hdf5_chunks* parameter for
+            details on how the dataset chunking strategy is determined
+            at the time of writing.""",
+    # read cfa
+    "{{read cfa: `dict`, optional}}": """cfa: `dict`, optional
+            Configure the reading of CF-netCDF aggregation files.
+
+            The dictionary may have any subset of the following
+            key/value pairs that supplement or override the
+            information read from the file:
+
+            * ``'replace_directory'``: `dict`
+
+              A dictionary whose key/value pairs define modifications
+              to be applied to the directories of the fragment file
+              locations. The dictionary comprises keyword arguments to
+              the `{{package}}.Data.replace_directory` method, which is
+              used to make the changes. The aggregation file being
+              read is unaltered. An empty dictionary results in no
+              modifications.
+
+              *Example:*
+                Replace a leading ``data/model`` with ``home``,
+                wherever it occurs: ``{'replace_directory': {'old':
+                'data/model', 'new': 'home'}}``
+
+              *Example:*
+                Normalise all file locations and replace a leading
+                ``/archive`` with ``/data/obs``, wherever it occurs:
+                ``{'replace_directory': {'old': '/archive', 'new':
+                '/data/obs', 'normalise': True}}``
+
+              *Example:*
+                Normalise all file locations and remove a leading
+                ``/data``, wherever it occurs: ``{'replace_directory':
+                {'old': '/data', 'normalise': True}}``.""",
+    # read cfa_write
+    "{{read cfa_write: (sequence of) `str`, optional}}": """cfa_write: (sequence of) `str`, optional
+            Register the intention for named construct types to be
+            subsequently written as CF-netCDF aggregation variables.
+
+            This makes no difference to the logical content of any
+            construct, but ensures that the data of each of the
+            specified construct types will have only one Dask chunk,
+            regardless of the setting of *dask_chunks*, which is a
+            requirement for the creation of CF-netCDF aggregation
+            variables.
+
+            The *cfa_write* parameter may be one, or a sequence, of:
+
+            ========================== ===============================
+            *cfa_write*                Construct types
+            ========================== ===============================
+            ``'field'``                Field constructs
+            ``'field_ancillary'``      Field ancillary constructs
+            ``'domain_ancillary'``     Domain ancillary constructs
+            ``'dimension_coordinate'`` Dimension coordinate constructs
+            ``'auxiliary_coordinate'`` Auxiliary coordinate constructs
+            ``'cell_measure'``         Cell measure constructs
+            ``'domain_topology'``      Domain topology constructs
+            ``'cell_connectivity'``    Cell connectivity constructs
+            ``'all'``                  All constructs
+            ========================== ===============================
+
+            .. note:: If the *dask_chunks* parameter is set to `None`
+                      or ``-1`` then the data of all constructs will
+                      already have only one Dask chunk, so in this
+                      case setting *cfa_write* will have no further
+                      effect.
+
+            *Example:*
+              To register field constructs to be written as CF-netCDF
+              aggregation variables: ``cfa_write='field'`` or
+              ``cfa_write=['field']``.
+
+            *Example:*
+              To register field and auxiliary coordinate constructs to
+              be written as CF-netCDF aggregation variables:
+              ``cfa_write=['field', 'auxiliary_coordinate']``.""",
+    # read to_memory
+    "{{read to_memory: (sequence of) `str`, optional}}": """to_memory: (sequence of) `str`, optional
+            Read all data arrays of the named construct types into
+            memory. By default, lazy loading is employed for all data
+            arrays.
+
+            The *to_memory* parameter may be one, or a sequence, of:
+
+            ========================== ===============================
+            *to_memory*                Construct types
+            ========================== ===============================
+            ``'all'``                  All constructs
+            ``'metadata'``             All metadata constructs (i.e.
+                                       all constructs except Field
+                                       constructs)
+
+            ``'field'``                Field constructs
+            ``'field_ancillary'``      Field ancillary constructs
+            ``'domain_ancillary'``     Domain ancillary constructs
+            ``'dimension_coordinate'`` Dimension coordinate constructs
+            ``'auxiliary_coordinate'`` Auxiliary coordinate constructs
+            ``'cell_measure'``         Cell measure constructs
+            ``'domain_topology'``      Domain topology constructs
+            ``'cell_connectivity'``    Cell connectivity constructs
+            ========================== ===============================
+
+            *Example:*
+              To read field construct data arrays into memory:
+              ``to_memory='field'`` or ``to_memory=['field']``.
+ + *Example:* + To read field and auxiliary coordinate construct data + arrays into memory: ``to_memory=['field', + 'auxiliary_coordinate']``.""", + # read squeeze + "{{read squeeze: `bool`, optional}}": """squeeze: `bool`, optional + If True then remove all size 1 dimensions from field + construct data arrays, regardless of how the data are + stored in the dataset. If False (the default) then the + presence or not of size 1 dimensions is determined by how + the data are stored in its dataset.""", + # read unsqueeze + "{{read unsqueeze: `bool`, optional}}": """unsqueeze: `bool`, optional + If True then ensure that field construct data arrays span + all of the size 1 dimensions, regardless of how the data + are stored in the dataset. If False (the default) then the + presence or not of size 1 dimensions is determined by how + the data are stored in its dataset.""", + # read file_type + "{{read file_type: `None` or (sequence of) `str`, optional}}": """file_type: `None` or (sequence of) `str`, optional + Only read files of the given type(s). All other file + types are ignored. If `None` (the default) then files of + any valid type are read. If there are no files of the + given type(s), or *file_type* is empty sequence, then an + empty list is returned.""", + # read ignore_unknown_type + "{{read ignore_unknown_type: `bool`, optional}}": """ignore_unknown_type: `bool`, optional + If True then ignore any file which does not have one of + the valid types specified by the *file_type* + parameter. If False (the default) then attempting to read + a file with an unrecognised type will result in an + error.""", + # persist + "{{persist description}}": """Persisting turns an underlying lazy dask array into an + equivalent chunked dask array, but now with the results fully + computed and in memory. This can avoid the expense of + re-reading the data from disk, or re-computing it, when the + data is accessed on multiple occassions.""", # ---------------------------------------------------------------- # Method description substitutions (3 levels of indentation) # ------------------------1---------------------------------------- @@ -501,20 +1035,11 @@ # init mask "{{init mask: `bool`, optional}}": """mask: `bool`, optional If True (the default) then mask by convention when - reading data from disk. - - A netCDF array is masked depending on the values of - any of the netCDF attributes ``_FillValue``, - ``missing_value``, ``_Unsigned``, ``valid_min``, - ``valid_max``, and ``valid_range``.""", + reading data from disk.""", # init unpack "{{init unpack: `bool`, optional}}": """unpack: `bool`, optional If True (the default) then unpack by convention when - reading data from disk. - - A netCDF array is unpacked depending on the values of - the netCDF attributes ``add_offset`` and - ``scale_factor``.""", + reading data from disk.""", # init attributes "{{init attributes: `dict` or `None`, optional}}": """attributes: `dict` or `None`, optional Provide netCDF attributes for the data as a dictionary @@ -663,7 +1188,7 @@ so that any of the intermediate or final aggregation steps operates on no more than ``split_every`` inputs. The depth of the aggregation graph will be - :math:`log_{split\_every}}(\textnormal{input chunks + :math:`log_{split\_every}(\textnormal{input chunks along reduced axes})`. Setting to a low value can reduce cache size and network transfers, at the cost of more CPU and a larger dask graph. 
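An illustrative sketch (not part of the patch) of the behaviour described by the ``persist description`` substitution above; the file name is hypothetical::

    import cfdm

    f = cfdm.read('file.nc')[0]  # data are lazy, dask-backed
    f.persist(inplace=True)      # compute once and keep the chunks in memory

    # Subsequent accesses reuse the in-memory chunks rather than
    # re-reading the file or re-computing intermediate results.
    print(f.data.array.mean())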
@@ -684,12 +1209,58 @@ "{{to_size: `int`, optional}}": """to_size: `int`, optional Pad the axis after so that the new axis has the given size.""", + # cull_graph + "{{cull_graph: `bool`, optional}}": """cull_graph: `bool`, optional + If True then unnecessary tasks are removed (culled) + from each array's dask graph before + concatenation. This process can have a considerable + overhead but can sometimes improve the overall + performance of a workflow. If False (the default) then + dask graphs are not culled. See + `dask.optimization.cull` for details.""", + # relaxed_units + "{{relaxed_units: `bool`, optional}}": """relaxed_units: `bool`, optional + If True then allow the concatenation of data with + invalid but otherwise equal units. By default, if any + data array has invalid units then the concatenation + will fail. A `Units` object is considered to be + invalid if its `!isvalid` attribute is `False`.""", + # relaxed_units + "{{concatenate copy: `bool`, optional}}": """copy: `bool`, optional + If True (the default) then make copies of the + `{{class}}` objects prior to the concatenation, + thereby ensuring that the input constructs are not + changed by the concatenation process. If False then + some or all input constructs might be changed + in-place, but the concatenation process will be + faster.""", + # normalise + "{{normalise: `bool`, optional}}": """normalise: `bool`, optional + If True then normalise to an absolute path. If False + (the default) then no normalisation is done.""", + # replace old + "{{replace old: `str` or `None`, optional}}": """old: `str` or `None`, optional + The base directory structure to be replaced by + *new*. If `None` (the default) or an empty string, and + *normalise* is False, then *new* (if set) is prepended + to each file name.""", + # replace new + "{{replace new: `str` or `None`, optional}}": """new: `str` or `None`, optional + The new directory that replaces the base directory + structure identified by *old*. If `None` (the default) + or an empty string, then *old* (if set) is replaced + with an empty string.""", + # replace normalise + "{{replace normalise: `bool`, optional}}": """normalise: `bool`, optional + If True then *old* and *new* directories, and the file + names, are normalised to absolute paths prior to the + replacement. If False (the default) then no + normalisation is done.""", # ---------------------------------------------------------------- # Method description substitutions (4 levels of indentation) # ---------------------------------------------------------------- # Returns constructs - "{{Returns constructs}}": """ - The selected constructs in a new `Constructs` object, + "{{Returns constructs}}": """The selected constructs in a new `Constructs` object, unless modified by any *filter_kwargs* parameters. The returned object will contain no constructs if none were selected.""", diff --git a/cfdm/domain.py b/cfdm/domain.py index 4332d251b4..ef48284168 100644 --- a/cfdm/domain.py +++ b/cfdm/domain.py @@ -940,9 +940,15 @@ def get_data_axes(self, *identity, default=ValueError(), **filter_kwargs): return axes - def get_filenames(self): + def get_filenames(self, normalise=True): """Return the file names containing the metadata construct data. + :Parameters: + + {{normalise: `bool`, optional}} + + .. 
versionadded:: (cfdm) NEXTVERSION + :Returns: `set` @@ -962,7 +968,7 @@ def get_filenames(self): out = set() for c in self.constructs.filter_by_data(todict=True).values(): - out.update(c.get_filenames()) + out.update(c.get_filenames(normalise=normalise)) return out @@ -1075,6 +1081,38 @@ def identities(self): return out + @_inplace_enabled(default=False) + def persist(self, inplace=False): + """Persist data into memory. + + {{persist description}} + + **Performance** + + `persist` causes delayed operations to be computed. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `{{package}}.Data.persist` + + :Parameters: + + {{inplace: `bool`, optional}} + + :Returns: + + `{{class}}` or `None` + The domain construct with persisted metadata. If the + operation was in-place then `None` is returned. + + """ + d = _inplace_enabled_define_and_cleanup(self) + + for c in d.constructs.filter_by_data(todict=True).values(): + c.persist(inplace=True) + + return d + @_inplace_enabled(default=False) def uncompress(self, inplace=False): """Uncompress the domain construct. diff --git a/cfdm/field.py b/cfdm/field.py index 12ea8ae423..7a7bd4b35e 100644 --- a/cfdm/field.py +++ b/cfdm/field.py @@ -1421,6 +1421,136 @@ def _derive_count(flattened_data): return f + @classmethod + def concatenate( + cls, fields, axis, cull_graph=False, relaxed_units=False, copy=True + ): + """Join together a sequence of Field constructs. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `Data.concatenate`, `Data.cull_graph` + + :Parameters: + + fields: sequence of `{{class}}` + The fields to concatenate. + + axis: + Select the domain axis to along which to concatenate, + defined by that which would be selected by passing + *axis* to a call of the field construct's + `domain_axis` method. For example, for a value of + 'time', the domain axis construct returned by + ``f.domain_axis('time')`` is selected. + + {{cull_graph: `bool`, optional}} + + {{relaxed_units: `bool`, optional}} + + {{concatenate copy: `bool`, optional}} + + :Returns: + + `{{class}}` + The concatenated construct. 
+ + """ + if isinstance(fields, cls): + raise ValueError("Must provide a sequence of Field constructs") + + fields = tuple(fields) + field0 = fields[0] + data_axes = field0.get_data_axes() + axis_key = field0.domain_axis( + axis, + key=True, + default=ValueError( + f"Can't identify a unique concatenation axis from {axis!r}" + ), + ) + try: + axis = data_axes.index(axis_key) + except ValueError: + raise ValueError( + "The field's data must span the concatenation axis" + ) + + out = field0 + if copy: + out = out.copy() + + if len(fields) == 1: + return out + + new_data = out._Data.concatenate( + [f.get_data(_fill_value=False) for f in fields], + axis=axis, + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + + # Change the domain axis size + out.set_construct( + out._DomainAxis(size=new_data.shape[axis]), key=axis_key + ) + + # Insert the concatenated data + out.set_data(new_data, axes=data_axes, copy=False) + + # ------------------------------------------------------------ + # Concatenate constructs with data + # ------------------------------------------------------------ + for key, construct in field0.constructs.filter_by_data( + todict=True + ).items(): + construct_axes = field0.get_data_axes(key) + + if axis_key not in construct_axes: + # This construct does not span the concatenating axis + # in the first field + continue + + constructs = [construct] + for f in fields[1:]: + c = f.constructs.get(key) + if c is None: + # This field does not have this construct + constructs = None + break + + constructs.append(c) + + if not constructs: + # Not every field has this construct, so remove it + # from the output field. + out.del_construct(key) + continue + + # Still here? Then try concatenating the constructs from + # each field. + try: + construct = construct.concatenate( + constructs, + axis=construct_axes.index(axis_key), + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + except ValueError: + # Couldn't concatenate this construct, so remove it from + # the output field. + out.del_construct(key) + else: + # Successfully concatenated this construct, so insert + # it into the output field. + out.set_construct( + construct, key=key, axes=construct_axes, copy=False + ) + + return out + def creation_commands( self, representative_data=False, @@ -1798,6 +1928,40 @@ def dump(self, display=True, _level=0, _title=None): return "\n".join(string) + def file_directories(self, constructs=True): + """The directories of files containing parts of the data. + + Returns the locations of any files referenced by the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `replace_directory` + + :Parameters: + + constructs: `bool`, optional + If True (the default) then add also the directory to + the data of metadata constructs. If False then don't + do this. + + :Returns: + + `set` + The unique set of file directories as absolute paths. + + **Examples** + + >>> d.file_directories() + {'/home/data1', 'file:///data2'} + + """ + directories = super().file_directories() + if constructs: + for c in self.constructs.filter_by_data(todict=True).values(): + directories.update(c.file_directories()) + + return directories + def get_data_axes(self, *identity, default=ValueError(), **filter_kwargs): """Gets the keys of the axes spanned by the construct data. @@ -1912,12 +2076,18 @@ def get_domain(self): return domain - def get_filenames(self): + def get_filenames(self, normalise=True): """Return the names of the files containing the data. 
The names of the files containing the data of the field constructs and of any metadata constructs are returned. + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `set` @@ -1934,10 +2104,10 @@ def get_filenames(self): {'temp_file.nc'} """ - out = super().get_filenames() + out = super().get_filenames(normalise=normalise) for c in self.constructs.filter_by_data(todict=True).values(): - out.update(c.get_filenames()) + out.update(c.get_filenames(normalise=normalise)) return out @@ -2135,37 +2305,28 @@ def insert_dimension( ): """Expand the shape of the data array. - Inserts a new size 1 axis, corresponding to an existing domain - axis construct, into the data array. - .. versionadded:: (cfdm) 1.7.0 - .. seealso:: `squeeze`, `transpose` + .. seealso:: `squeeze`, `transpose`, `unsqueeze` :Parameters: - axis: `str` - The identifier of the domain axis construct - corresponding to the inserted axis. + axis: + Select the domain axis to insert, generally defined by that + which would be selected by passing the given axis description + to a call of the field construct's `domain_axis` method. For + example, for a value of ``'time'``, the domain axis construct + returned by ``f.domain_axis('time')`` is selected. If *axis* is `None` then a new domain axis construct will be created for the inserted dimension. - *Parameter example:* - ``axis='domainaxis2'`` - position: `int`, optional Specify the position that the new axis will have in the data array. By default the new axis has position 0, the slowest varying position. Negative integers counting from the last position are allowed. - *Parameter example:* - ``position=2`` - - *Parameter example:* - ``position=-1`` - constructs: `bool` If True then also insert the new axis into all metadata constructs that don't already include it. By @@ -2177,24 +2338,43 @@ def insert_dimension( :Returns: - `Field` or `None` - The new field construct with expanded data axes. If - the operation was in-place then `None` is returned. + `{{class}}` or `None` + The field construct with expanded data, or `None` if the + operation was in-place. 
        **Examples**
 
-        >>> f.data.shape
-        (19, 73, 96)
-        >>> f.insert_dimension('domainaxis3').data.shape
-        (1, 96, 73, 19)
-        >>> f.insert_dimension('domainaxis3', position=3).data.shape
-        (19, 73, 96, 1)
-        >>> f.insert_dimension('domainaxis3', position=-1, inplace=True)
-        (19, 73, 1, 96)
-        >>> f.data.shape
-        (19, 73, 1, 96)
-        >>> f.insert_dimension(None, 1).data.shape
-        (19, 1, 73, 1, 96)
+        >>> f = {{package}}.example_field(0)
+        >>> print(f)
+        Field: specific_humidity (ncvar%q)
+        ----------------------------------
+        Data            : specific_humidity(latitude(5), longitude(8)) 1
+        Cell methods    : area: mean
+        Dimension coords: latitude(5) = [-75.0, ..., 75.0] degrees_north
+                        : longitude(8) = [22.5, ..., 337.5] degrees_east
+                        : time(1) = [2019-01-01 00:00:00]
+        >>> g = f.insert_dimension('time', 0)
+        >>> print(g)
+        Field: specific_humidity (ncvar%q)
+        ----------------------------------
+        Data            : specific_humidity(time(1), latitude(5), longitude(8)) 1
+        Cell methods    : area: mean
+        Dimension coords: latitude(5) = [-75.0, ..., 75.0] degrees_north
+                        : longitude(8) = [22.5, ..., 337.5] degrees_east
+                        : time(1) = [2019-01-01 00:00:00]
+
+        A previously non-existent size 1 axis must be created prior to
+        insertion:
+
+        >>> g.insert_dimension(None, 1, inplace=True)
+        >>> print(g)
+        Field: specific_humidity (ncvar%q)
+        ----------------------------------
+        Data            : specific_humidity(time(1), key%domainaxis3(1), latitude(5), longitude(8)) 1
+        Cell methods    : area: mean
+        Dimension coords: latitude(5) = [-75.0, ..., 75.0] degrees_north
+                        : longitude(8) = [22.5, ..., 337.5] degrees_east
+                        : time(1) = [2019-01-01 00:00:00]
 
         """
         f = _inplace_enabled_define_and_cleanup(self)
@@ -2214,6 +2394,9 @@ def insert_dimension(
                     f"{domain_axis.get_size()}"
                 )
 
+        if position < 0:
+            position = position + f.ndim + 1
+
         data_axes = f.get_data_axes(default=None)
         if data_axes is not None:
             data_axes0 = data_axes[:]
@@ -2456,6 +2639,138 @@ def convert(self, *identity, full_domain=True, **filter_kwargs):
 
         return f
 
+    @_inplace_enabled(default=False)
+    def persist(self, metadata=False, inplace=False):
+        """Persist the data into memory.
+
+        This turns the underlying lazy dask array into an equivalent
+        chunked dask array, but now with the results fully computed
+        and in memory. This can avoid the expense of re-reading the
+        data from disk, or re-computing it, when the data is accessed
+        on multiple occasions.
+
+        **Performance**
+
+        `persist` causes delayed operations to be computed.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        .. seealso:: `persist_metadata`, `array`, `datetime_array`,
+                     `{{package}}.Data.persist`
+
+        :Parameters:
+
+            metadata: `bool`
+                If True then also persist the metadata constructs. By
+                default, metadata constructs are not changed.
+
+            {{inplace: `bool`, optional}}
+
+        :Returns:
+
+            `Field` or `None`
+                The field construct with persisted data. If the
+                operation was in-place then `None` is returned.
+
+        """
+        f = _inplace_enabled_define_and_cleanup(self)
+
+        super(Field, f).persist(inplace=True)
+        if metadata:
+            f.persist_metadata(inplace=True)
+
+        return f
+
+    @_inplace_enabled(default=False)
+    def persist_metadata(self, inplace=False):
+        """Persist the data of metadata constructs into memory.
+
+        {{persist description}}
+
+        **Performance**
+
+        `persist_metadata` causes delayed operations to be computed.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        .. seealso:: `persist`, `array`, `datetime_array`,
+                     `dask.array.Array.persist`
+
+        :Parameters:
+
+            {{inplace: `bool`, optional}}
+
+        :Returns:
+
+            `Field` or `None`
+                The field construct with persisted metadata. If the
+                operation was in-place then `None` is returned.
+
+        """
+        f = _inplace_enabled_define_and_cleanup(self)
+
+        for c in f.constructs.filter_by_data(todict=True).values():
+            c.persist(inplace=True)
+
+        return f
+
+    def replace_directory(
+        self,
+        old=None,
+        new=None,
+        normalise=False,
+        common=False,
+        constructs=True,
+    ):
+        """Replace a file directory in-place.
+
+        Every file referenced by the data that is in the *old*
+        directory is redefined to be in *new*.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        .. seealso:: `file_directories`, `get_filenames`
+
+        :Parameters:
+
+            {{replace old: `str` or `None`, optional}}
+
+            {{replace new: `str` or `None`, optional}}
+
+            {{replace normalise: `bool`, optional}}
+
+            common: `bool`, optional
+                If True then replace the base directory structure
+                that is common to all files with *new*.
+
+            constructs: `bool`, optional
+                If True (the default) then also replace the directory
+                in the data of metadata constructs. If False then
+                don't do this.
+
+        :Returns:
+
+            `None`
+
+        **Examples**
+
+        >>> f.get_filenames()
+        {'/data/file1.nc', '/home/file2.nc'}
+        >>> f.replace_directory('/data', '/new/data/path')
+        >>> f.get_filenames()
+        {'/new/data/path/file1.nc', '/home/file2.nc'}
+
+        """
+        super().replace_directory(
+            old=old, new=new, normalise=normalise, common=common
+        )
+        if constructs:
+            for c in self.constructs.filter_by_data(todict=True).values():
+                c.replace_directory(
+                    old=old, new=new, normalise=normalise, common=common
+                )
+
     def nc_hdf5_chunksizes(self, todict=False):
         """Get the HDF5 chunking strategy for the data.
 
@@ -2764,59 +3079,73 @@ def nc_set_hdf5_chunksizes(
 
     @_inplace_enabled(default=False)
     def squeeze(self, axes=None, inplace=False):
-        """Remove size one axes from the data.
+        """Remove size 1 axes from the data.
 
         By default all size one axes are removed, but particular size
        one axes may be selected for removal.
 
+        Squeezed domain axis constructs are not removed from the
+        metadata constructs, nor from the domain of the field
+        construct.
+
         .. versionadded:: (cfdm) 1.7.0
 
-        .. seealso:: `insert_dimension`, `transpose`
+        .. seealso:: `insert_dimension`, `transpose`, `unsqueeze`
 
         :Parameters:
 
-            axes: (sequence of) `int`, optional
-                The positions of the size one axes to be removed. By
-                default all size one axes are removed.
+            axes:
+                Select the domain axes to squeeze, defined by the
+                domain axes that would be selected by passing each
+                given axis description to a call of the field
+                construct's `domain_axis` method. For example, for a
+                value of ``'time'``, the domain axis construct
+                returned by ``f.domain_axis('time')`` is selected.
 
-                {{axes int examples}}
+                If *axes* is `None` (the default) then all size 1 axes
+                are removed.
 
             {{inplace: `bool`, optional}}
 
        :Returns:
 
-            `Field` or `None`
-                The field construct with removed data axes. If the
-                operation was in-place then `None` is returned.
+            `{{class}}` or `None`
+                The field construct with squeezed data, or `None` if the
+                operation was in-place.
**Examples** - >>> f.data.shape - (1, 73, 1, 96) - >>> f.squeeze().data.shape - (73, 96) - >>> f.squeeze(0).data.shape - (73, 1, 96) - >>> f.squeeze([-3, 2], inplace=True) - >>> f.data.shape - (73, 96) + >>> g = f.squeeze() + >>> g = f.squeeze('time') + >>> g = f.squeeze(1) + >>> g = f.squeeze(['time', 1, 'dim2']) + >>> f.squeeze(['dim2'], inplace=True) """ f = _inplace_enabled_define_and_cleanup(self) + data_axes = f.get_data_axes(default=None) + if data_axes is None: + return f + if axes is None: - iaxes = [i for i, n in enumerate(f.data.shape) if n == 1] + domain_axes = f.domain_axes(todict=True) + axes = [ + axis + for axis in data_axes + if domain_axes[axis].get_size(None) == 1 + ] else: - try: - iaxes = f.data._parse_axes(axes) - except ValueError as error: - raise ValueError(f"Can't squeeze data: {error}") + if isinstance(axes, (str, int)): + axes = (axes,) - data_axes = f.get_data_axes(default=None) - if data_axes is not None: - new_data_axes = [ - data_axes[i] for i in range(f.data.ndim) if i not in iaxes - ] + axes = [f.domain_axis(x, key=True) for x in axes] + axes = set(axes).intersection(data_axes) + + iaxes = [data_axes.index(axis) for axis in axes] + + new_data_axes = [ + data_axes[i] for i in range(f.data.ndim) if i not in iaxes + ] # Squeeze the field's data array super(Field, f).squeeze(iaxes, inplace=True) @@ -2832,14 +3161,21 @@ def transpose(self, axes=None, constructs=False, inplace=False): .. versionadded:: (cfdm) 1.7.0 - .. seealso:: `insert_dimension`, `squeeze` + .. seealso:: `insert_dimension`, `squeeze`, `unsqueeze` :Parameters: - axes: (sequence of) `int`, optional - The new axis order. By default the order is reversed. + axes: sequence or `None` + Select the domain axis order, defined by the domain + axes that would be selected by passing each given axis + description to a call of the field construct's + `domain_axis` method. For example, for a value of + ``'time'``, the domain axis construct returned by + ``f.domain_axis('time')`` is selected. - {{axes int examples}} + Each dimension of the field construct's data must be + provided, or if *axes* is `None` (the default) then + the axis order is reversed. constructs: `bool` If True then transpose the metadata constructs to have @@ -2870,13 +3206,22 @@ def transpose(self, axes=None, constructs=False, inplace=False): """ f = _inplace_enabled_define_and_cleanup(self) - try: - iaxes = f.data._parse_axes(axes) - except ValueError as error: - raise ValueError(f"Can't transpose data: {error}") - - if iaxes is None: + if axes is None: iaxes = tuple(range(f.data.ndim - 1, -1, -1)) + else: + data_axes = self.get_data_axes(default=()) + if isinstance(axes, (str, int)): + axes = (axes,) + + axes2 = [f.domain_axis(axis, key=True) for axis in axes] + + if sorted(axes2) != sorted(data_axes): + raise ValueError( + f"Can't transpose {self.__class__.__name__}: " + f"Bad axis specification: {axes!r}" + ) + + iaxes = [data_axes.index(axis) for axis in axes2] data_axes = f.get_data_axes(default=None) @@ -2996,3 +3341,56 @@ def uncompress(self, inplace=False): c.uncompress(inplace=True) return f + + @_inplace_enabled(default=False) + def unsqueeze(self, inplace=None): + """Insert size 1 axes into the data array. + + All size 1 domain axes which are not spanned by the field + construct's data are inserted. + + The axes are inserted into the slowest varying data array positions. + + .. versionadded:: (cfdm) NEXTVERSION + + .. 
seealso:: `insert_dimension`, `squeeze`, `transpose` + + :Parameters: + + {{inplace: `bool`, optional}} + + :Returns: + + `Field` or `None` + The field construct with size-1 axes inserted in its + data, or `None` if the operation was in-place. + + **Examples** + + >>> f = {{package}}.example_field(0) + >>> print(f) + Field: specific_humidity (ncvar%q) + ---------------------------------- + Data : specific_humidity(latitude(5), longitude(8)) 1 + Cell methods : area: mean + Dimension coords: latitude(5) = [-75.0, ..., 75.0] degrees_north + : longitude(8) = [22.5, ..., 337.5] degrees_east + : time(1) = [2019-01-01 00:00:00] + >>> g = f.unsqueeze() + >>> print(g) + Field: specific_humidity (ncvar%q) + ---------------------------------- + Data : specific_humidity(time(1), latitude(5), longitude(8)) 1 + Cell methods : area: mean + Dimension coords: latitude(5) = [-75.0, ..., 75.0] degrees_north + : longitude(8) = [22.5, ..., 337.5] degrees_east + : time(1) = [2019-01-01 00:00:00] + + """ + f = _inplace_enabled_define_and_cleanup(self) + + size_1_axes = self.domain_axes(filter_by_size=(1,), todict=True) + for axis in set(size_1_axes).difference(self.get_data_axes()): + f.insert_dimension(axis, position=0, inplace=True) + + return f diff --git a/cfdm/functions.py b/cfdm/functions.py index 849eaa72ea..e96a4cd13c 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -4,12 +4,16 @@ from functools import total_ordering from math import isnan from numbers import Integral -from urllib.parse import urlparse +from os import sep as os_sep +from os.path import abspath as os_abspath +from os.path import dirname as os_dirname +from os.path import join import numpy as np from dask import config as _config from dask.base import is_dask_collection from dask.utils import parse_bytes +from uritools import uricompose, urisplit from . import __cf_version__, __file__, __version__, core from .constants import CONSTANTS, ValidLogLevels @@ -442,52 +446,270 @@ def CF(): **Examples** >>> CF() - '1.11' + '1.12' """ return __cf_version__ -def abspath(filename): - """Return a normalised absolute version of a file name. - - If a string containing URL is provided then it is returned - unchanged. +def abspath(path, uri=None): + """Return a normalised absolute version of a path. .. versionadded:: (cfdm) 1.7.0 :Parameters: - filename: `str` - The name of the file. + path: `str` + The file or directory path. + + uri: `None` or `bool`, optional + If True then the returned path will begin with a URI + scheme component followed by a ``:`` character, such as + ``file://data/file.nc``, ``https://remote/data/file.nc``, + etc.). If False then the returned path will not begin with + a URI scheme component only if the input *path* does. If + `None` (the default) then the returned path will begin + with a URI scheme component if the input *path* does. + + .. versionadded:: (cfdm) NEXTVERSION :Returns: `str` - The normalised absolutised version of *filename*. + The normalised absolutised version of the path. 
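`abspath` now delegates its URI handling to `uritools` (imported above). A short standalone illustration of the splitting and recomposition it relies on, with hypothetical paths::

    from uritools import uricompose, urisplit

    u = urisplit('file:///data/archive/file.nc')
    print(u.scheme, u.path)    # file /data/archive/file.nc

    # Recompose an absolute local path as a "file:" URI, as abspath(..., uri=True) does
    print(uricompose(scheme='file', authority='', path='/data/file.nc'))
    # file:///data/file.nc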
**Examples** >>> import os >>> os.getcwd() '/data/archive' + >>> cfdm.abspath('file.nc') '/data/archive/file.nc' - >>> cfdm.abspath('..//archive///file.nc') + >>> cfdm.abspath('../file.nc') + '/data/file.nc' + >>> cfdm.abspath('file:///file.nc') + 'file:///file.nc' + >>> cfdm.abspath('file://file.nc') + 'file:///data/archive' + >>> cfdm.abspath('file:/file.nc') + 'file:///file.nc' + + >>> cfdm.abspath('http:///file.nc') + 'http:///file.nc' + >>> cfdm.abspath('http://file.nc') + 'http://' + >>> cfdm.abspath('http:/file.nc') + 'http:///file.nc' + + >>> cfdm.abspath('file.nc', uri=True) + 'file:///data/archive/file.nc' + >>> cfdm.abspath('../file.nc', uri=True) + 'file:///data/file.nc' + >>> cfdm.abspath('file:///file.nc', uri=True) + 'file:///file.nc' + >>> cfdm.abspath('file://file.nc', uri=True) + 'file:///data/archive' + >>> cfdm.abspath('file:/file.nc', uri=True) + 'file:///file.nc' + + >>> cfdm.abspath('http:///file.nc', uri=True) + 'http:///file.nc' + >>> cfdm.abspath('http://file.nc', uri=True) + 'http://' + >>> cfdm.abspath('http:/file.nc', uri=True) + 'http:///file.nc' + + >>> cfdm.abspath('file.nc', uri=False) '/data/archive/file.nc' - >>> cfdm.abspath('http://data/archive/file.nc') - 'http://data/archive/file.nc' + + >>> cfdm.abspath('../file.nc', uri=False) + '/data/file.nc' + >>> cfdm.abspath('file:///file.nc', uri=False) + '/file.nc' + >>> cfdm.abspath('file://file.nc', uri=False) + '/data/archive' + >>> cfdm.abspath('file:/file.nc', uri=False) + '/file.nc' + + >>> cfdm.abspath('') + '/data/archive" + + """ + u = urisplit(path) + scheme = u.scheme + path = u.path + if scheme: + if scheme == "file" or path.startswith(os_sep): + path = os_abspath(path) + + if uri or uri is None: + path = uricompose(scheme=scheme, authority="", path=path) + elif scheme != "file": + raise ValueError(f"Can't set uri=False for path={u.geturi()!r}") + + return path + + path = os_abspath(path) + if uri: + path = uricompose(scheme="file", authority="", path=path) + + return path + + +def dirname(path, normalise=False, uri=None, isdir=False, sep=False): + """Return the directory of a path. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + path: `str` + The name of the path. + + normalise: `bool`, optional + If True then normalise the path by resolving it to an + absolute path. If False (the default) then no + normalisation is done. + + uri: `None` or `bool`, optional + If True then the returned directory will begin with a URI + scheme component followed by a ``:`` character, such as + ``file://data/file.nc``, ``https://remote/data/file.nc``, + etc.). If False then the returned directory will not begin + with a URI scheme component. If `None` (the default) then + the returned directory will begin with a URI scheme + component if the input *path* does. + + isdir: `bool`, optional + Set to True if *path* represents a directory, rather than + a file. + + sep: `bool`, optional + Set to True to add a trailing path separator to the + returned directory. + + :Returns: + + `str` + The directory of the path. 
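A small complement to the doctests below, sketching the *isdir* and *sep* options together (the paths are hypothetical)::

    import cfdm

    # Treat the input as a directory, so no trailing component is stripped,
    # and keep a trailing path separator on the result
    print(cfdm.dirname('/data/archive', isdir=True, sep=True))   # /data/archive/
    print(cfdm.dirname('/data/archive/file.nc', sep=True))       # /data/archive/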
+ + **Examples** + + >>> import os + >>> os.getcwd() + '/data/archive' + + >>> cfdm.dirname('file.nc') + '/data/archive' + >>> cfdm.dirname('file.nc', normalise=True) + '/data/archive' + >>> cfdm.dirname('file.nc', normalise=True, uri=True) + 'file:///data/archive' + >>> cfdm.dirname('file.nc', normalise=True, uri=False) + '/data/archive' + >>> cfdm.dirname('file.nc', normalise=True, sep=True) + '/data/archive/' + + >>> cfdm.dirname('model/file.nc') + 'model' + >>> cfdm.dirname('model/file.nc', normalise=True) + /data/archive/model' + >>> cfdm.dirname('model/file.nc', normalise=True, uri=True) + 'file:///data/archive/model' + >>> cfdm.dirname('model/file.nc', normalise=True, uri=False) + /data/archive/model' + + >>> cfdm.dirname('../file.nc') + '..' + >>> cfdm.dirname('../file.nc', normalise=True) + '/data' + >>> cfdm.dirname('../file.nc', normalise=True, uri=True) + 'file:///data' + >>> cfdm.dirname('../file.nc', normalise=True, uri=False) + '/data' + + >>> cfdm.dirname('/model/file.nc') + '/model' + >>> cfdm.dirname('/model/file.nc', normalise=True) + '/model' + >>> cfdm.dirname('/model/file.nc', normalise=True, uri=True) + 'file:///model' + >>> cfdm.dirname('/model/file.nc', normalise=True, uri=False) + '/model' + + >>> cfdm.dirname('') + '' + >>> cfdm.dirname('', normalise=True) + '/data/archive' + >>> cfdm.dirname('', normalise=True, uri=True) + 'file:///data/archive' + >>> cfdm.dirname('', normalise=True, uri=False) + '/data/archive' + + >>> cfdm.dirname('https:///data/archive/file.nc') + 'https:///data/archive' + >>> cfdm.dirname('https:///data/archive/file.nc', normalise=True) + 'https:///data/archive' + >>> cfdm.dirname('https:///data/archive/file.nc', normalise=True, uri=True) + 'https:///data/archive' + >>> cfdm.dirname('https:///data/archive/file.nc', normalise=True, uri=False) + ValueError: Can't set uri=False for path='https:///data/archive/file.nc' + + >>> cfdm.dirname('file:///data/archive/file.nc') + 'file:///data/archive' + >>> cfdm.dirname('file:///data/archive/file.nc', normalise=True) + 'file:///data/archive' + >>> cfdm.dirname('file:///data/archive/file.nc', normalise=True, uri=True) + 'file:///data/archive' + >>> cfdm.dirname('file:///data/archive/file.nc', normalise=True, uri=False) + '/data/archive' + + >>> cfdm.dirname('file:///data/archive/../file.nc') + 'file:///data/archive/..' 
+ >>> cfdm.dirname('file:///data/archive/../file.nc', normalise=True) + 'file::///data' + >>> cfdm.dirname('file:///data/archive/../file.nc', normalise=True, uri=True) + 'file::///data' + >>> cfdm.dirname('file:///data/archive/../file.nc', normalise=True, uri=False) + '/data' """ - u = urlparse(filename) + u = urisplit(path) scheme = u.scheme - if not scheme: - return os.path.abspath(filename) + path = u.path + if scheme: + # Remote (or "file:") + if normalise and (scheme == "file" or path.startswith(os_sep)): + path = os_abspath(path) + + if not isdir: + path = os_dirname(path) + + if sep: + path = join(path, "") - if scheme == "file": - return u.path + if uri or uri is None: + path = uricompose(scheme=scheme, authority="", path=path) + elif scheme != "file": + raise ValueError(f"Can't set uri=False for path={u.geturi()!r}") - return filename + return path + + # Local file + if not isdir: + path = os_dirname(path) + + if normalise: + path = os_abspath(path) + + if uri: + path = uricompose(scheme="file", authority="", path=path) + + if sep: + path = join(path, "") + + return path def unique_constructs(constructs, ignore_properties=None, copy=True): @@ -1947,7 +2169,16 @@ def indices_shape(indices, full_shape, keepdims=True): """ shape = [] + # i = 0 for index, full_size in zip(indices, full_shape): + # for index in indices: + if index is None: + shape.append(1) + continue + + # full_size = full_shape[i] + # i += 1 + if isinstance(index, slice): start, stop, step = index.indices(full_size) if (stop - start) * step < 0: @@ -1992,7 +2223,7 @@ def indices_shape(indices, full_shape, keepdims=True): return shape -def parse_indices(shape, indices, keepdims=True): +def parse_indices(shape, indices, keepdims=True, newaxis=False): """Parse indices for array access and assignment. .. versionadded:: (cfdm) 1.11.2.0 @@ -2009,6 +2240,11 @@ def parse_indices(shape, indices, keepdims=True): If True then an integral index is converted to a slice. For instance, ``3`` would become ``slice(3, 4)``. + newaxis: `bool`, optional + If True then allow *indices* to include one or more + `numpy.newaxis` elements. If False (the default) then + these elements are not allowed. 
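A hedged sketch of the effect of these options, calling `parse_indices` directly from ``cfdm.functions`` (the outputs in the comments are indicative)::

    import numpy as np
    from cfdm.functions import parse_indices

    # keepdims: an integral index becomes a slice, and missing dimensions
    # are padded with full slices
    print(parse_indices((5, 8), (1,)))
    # [slice(1, 2, None), slice(None, None, None)]

    # newaxis=True allows numpy.newaxis elements to pass through
    print(parse_indices((5, 8), (Ellipsis, np.newaxis), newaxis=True))
    # [slice(None, None, None), slice(None, None, None), None]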
+ :Returns: `list` @@ -2040,22 +2276,31 @@ def parse_indices(shape, indices, keepdims=True): length = len(indices) n = len(shape) ndim = n - for index in indices: + for i, index in enumerate(indices): if index is Ellipsis: m = n - length + 1 + try: + if indices[i + 1] is np.newaxis: + m += 1 + except IndexError: + pass + parsed_indices.extend([slice(None)] * m) n -= m else: parsed_indices.append(index) - n -= 1 + if index is np.newaxis: + ndim += 1 + length += 1 + else: + n -= 1 length -= 1 len_parsed_indices = len(parsed_indices) - if ndim and len_parsed_indices > ndim: raise IndexError( - f"Invalid indices {parsed_indices} for array with shape {shape}" + f"Invalid indices {indices!r} for array with shape {shape}" ) if len_parsed_indices < ndim: @@ -2067,6 +2312,12 @@ def parse_indices(shape, indices, keepdims=True): ) for i, (index, size) in enumerate(zip(parsed_indices, shape)): + if not newaxis and index is np.newaxis: + raise IndexError( + f"Invalid indices {indices!r} for array with shape {shape}: " + "New axis indices are not allowed" + ) + if keepdims and isinstance(index, Integral): # Convert an integral index to a slice if index == -1: diff --git a/cfdm/mixin/fielddomain.py b/cfdm/mixin/fielddomain.py index 869cbbbd2b..a4ced461c9 100644 --- a/cfdm/mixin/fielddomain.py +++ b/cfdm/mixin/fielddomain.py @@ -88,7 +88,15 @@ def _construct( c = getattr(self, _constructs_method)(*identities, **filter_kwargs) # Return construct, or key, or both, or default - return self._filter_return_construct(c, key, item, default, _method) + return self._filter_return_construct( + c, + key, + item, + default, + _method, + identities=identities, + filter_kwargs=filter_kwargs, + ) def _get_data_compression_variables(self, component): """TODO.""" @@ -157,7 +165,16 @@ def _get_coordinate_geometry_variables(self, component): return out - def _filter_return_construct(self, c, key, item, default, _method): + def _filter_return_construct( + self, + c, + key, + item, + default, + _method, + identities=None, + filter_kwargs=None, + ): """Return construct, or key, or both, or default. :Parameters: @@ -178,6 +195,20 @@ def _filter_return_construct(self, c, key, item, default, _method): _method: `str` The name of the ultimate calling method. + identities: `None` or sequence, optional + The *identity* positional argument passed in by the + calling function (i.e. by the user). Used for + informative error messages. + + .. versionadded:: (cfdm) NEXTVERSION + + filter_kwargs: `None` or `dict`, optional + The filter_kwargs that were passed in by the calling + function (i.e. by the user). Used for informative + error messages. + + .. 
versionadded:: (cfdm) NEXTVERSION + :Returns: The selected construct, or its identifier if *key* is @@ -198,10 +229,26 @@ def _filter_return_construct(self, c, key, item, default, _method): if default is None: return default + # Create a nice error message + if identities is None: + identities = [] + else: + identities = [f"{i!r}" for i in identities] + + if filter_kwargs is None: + filter_kwargs = [] + else: + filter_kwargs.pop("todict", None) + filter_kwargs = [ + f"{key}={value!r}" for key, value in filter_kwargs.items() + ] + + args = ", ".join(identities + filter_kwargs) + construct_type = _method.replace("_", " ") + return self._default( default, - f"{self.__class__.__name__}.{_method}() can't return {n} " - "constructs", + f"No {construct_type} constructs found with identity {args}", ) def _filter_interface( @@ -286,7 +333,11 @@ def _filter_interface( # Return construct, or key, or both, or default return self._filter_return_construct( - c, key, item, default, _method + c, + key, + item, + default, + _method, ) kwargs = {"filter_by_type": _ctypes} @@ -329,7 +380,15 @@ def _filter_interface( return c # Return construct, or key, or both, or default - return self._filter_return_construct(c, key, item, default, _method) + return self._filter_return_construct( + c, + key, + item, + default, + _method, + identities=identities, + filter_kwargs=filter_kwargs, + ) def _original_filenames(self, define=None, update=None, clear=False): """The names of files containing the original data and metadata. diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index 8d76c38852..db7301a2d2 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -1,4 +1,5 @@ from numbers import Integral +from re import split from dask.utils import parse_bytes @@ -4603,3 +4604,397 @@ def nc_clear_node_coordinate_variable_groups(self): nc_set=self.nc_set_node_coordinate_variable, nc_groups=self.nc_node_coordinate_variable_groups, ) + + +class NetCDFAggregation(NetCDFMixin): + """Mixin class for netCDF aggregated variables. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def nc_del_aggregated_data(self): + """Remove the netCDF aggregated_data terms. + + The aggregated data terms define the names of the fragment + array variables, and are stored in a netCDF file in an + "aggregated_data" attribute. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_get_aggregated_data`, + `nc_has_aggregated_data`, + `nc_set_aggregated_data` + + :Returns: + + `dict` + The removed netCDF aggregated_data elements in a + dictionary whose key/value pairs are the feature names + and their corresponding fragment array variable names. + + **Examples** + + >>> f.nc_set_aggregated_data( + ... {'shape': 'shape', + ... 'location': 'location', + ... 'address': 'address'} + ... ) + >>> f.nc_has_aggregated_data() + True + >>> f.nc_get_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_del_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_has_aggregated_data() + False + >>> f.nc_del_aggregated_data() + {} + >>> f.nc_get_aggregated_data() + {} + >>> f.nc_set_aggregated_data( + ... 'shape: shape, location: location address: address' + ... ) + + """ + out = self._nc_del("aggregated_data", None) + if out is None: + return {} + + return out.copy() + + def nc_get_aggregated_data(self): + """Return the netCDF aggregated data terms. 
+ + The aggregated data terms define the names of the fragment + array variables, and are stored in a netCDF file in an + "aggregated_data" attribute. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregated_data`, + `nc_has_aggregated_data`, + `nc_set_aggregated_data` + + :Returns: + + `dict` + The netCDF aggregated_data terms in a dictionary whose + key/value pairs are the feature names and their + corresponding fragment array variable names. + + **Examples** + + >>> f.nc_set_aggregated_data( + ... {'shape': 'shape', + ... 'location': 'location', + ... 'address': 'address'} + ... ) + >>> f.nc_has_aggregated_data() + True + >>> f.nc_get_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_del_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_has_aggregated_data() + False + >>> f.nc_del_aggregated_data() + {} + >>> f.nc_get_aggregated_data() + {} + >>> f.nc_set_aggregated_data( + ... 'shape: shape, location: location address: address' + ... ) + + """ + out = self._nc_get("aggregated_data", None) + if out is None: + return {} + + return out.copy() + + def nc_has_aggregated_data(self): + """Whether any netCDF aggregated_data terms have been set. + + The aggregated data terms define the names of the fragment + array variables, and are stored in a netCDF file in an + "aggregated_data" attribute. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregated_data`, + `nc_get_aggregated_data`, + `nc_set_aggregated_data` + + :Returns: + + `bool` + `True` if the netCDF aggregated_data terms have been + set, otherwise `False`. + + **Examples** + + >>> f.nc_set_aggregated_data( + ... {'shape': 'shape', + ... 'location': 'location', + ... 'address': 'address'} + ... ) + >>> f.nc_has_aggregated_data() + True + >>> f.nc_get_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_del_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_has_aggregated_data() + False + >>> f.nc_del_aggregated_data() + {} + >>> f.nc_get_aggregated_data() + {} + >>> f.nc_set_aggregated_data( + ... 'shape: shape, location: location address: address' + ... ) + + """ + return self._nc_has("aggregated_data") + + def nc_set_aggregated_data(self, value): + """Set the netCDF aggregated_data elements. + + The aggregated data terms define the names of the fragment + array variables, and are stored in a netCDF file in an + "aggregated_data" attribute. + + If there are any ``/`` (slash) characters in the netCDF + variable names then these act as delimiters for a group + hierarchy. By default, or if the name starts with a ``/`` + character and contains no others, the name is assumed to be in + the root group. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregated_data`, + `nc_get_aggregated_data`, + `nc_has_aggregated_data` + + :Parameters: + + value: `dict` or `str` + The netCDF aggregated_data terms in a dictionary whose + key/value pairs are the feature names and their + corresponding fragment array variable names; or else + an equivalent string formated with the the CF-netCDF + encoding. + + :Returns: + + `None` + + **Examples** + + >>> f.nc_set_aggregated_data( + ... {'shape': 'shape', + ... 'location': 'location', + ... 'address': 'address'} + ... 
) + >>> f.nc_has_aggregated_data() + True + >>> f.nc_get_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_del_aggregated_data() + {'shape': 'shape', + 'location': 'location', + 'address': 'address'} + >>> f.nc_has_aggregated_data() + False + >>> f.nc_del_aggregated_data() + {} + >>> f.nc_get_aggregated_data() + {} + >>> f.nc_set_aggregated_data( + ... 'shape: shape, location: location address: address' + ... ) + + """ + if not value: + self.nc_del_aggregated_data() + + if isinstance(value, str): + v = split(r"\s+", value) + value = {term[:-1]: var for term, var in zip(v[::2], v[1::2])} + else: + # 'value' is a dictionary + value = value.copy() + + self._get_component("netcdf")["aggregated_data"] = value + + def _nc_del_aggregation_fragment_type(self): + """Remove the type of fragments in the aggregated data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `str` or `None` + The removed fragment type, either ``'location'`` for + fragment files, or ``'value'`` for fragment unique + values, or `None` if no fragment type was set. + + """ + return self._nc_del("aggregation_fragment_type", None) + + def nc_get_aggregation_fragment_type(self): + """The type of fragments in the aggregated data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `str` + The fragment type, either ``'location'`` for fragment + files, or ``'value'`` for fragment unique values. + + """ + return self._nc_get("aggregation_fragment_type", None) + + def _nc_set_aggregation_fragment_type(self, value): + """Set the type of fragments in the aggregated data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + value: `str` or `None` + The fragment type, either ``'location'`` for fragment + files, ``'value'`` for fragment unique values, or + `None` for an unspecified fragment type. + + :Returns: + + `None` + + """ + self._get_component("netcdf")["aggregation_fragment_type"] = value + if value == "unique_value": + self._nc_set_aggregation_write_status(True) + + def nc_del_aggregation_write_status(self): + """Set the netCDF aggregation write status to `False`. + + A necessary (but not sufficient) condition for writing the + data as CF-netCDF aggregated data is that the write status is + True. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_get_aggregation_write_status`, + `nc_set_aggregation_write_status` + + :Returns: + + `bool` + The netCDF aggregation write status prior to deletion. + + """ + return self._nc_del("aggregation_write_status", False) + + def nc_get_aggregation_write_status(self): + """Get the netCDF aggregation write status. + + A necessary (but not sufficient) condition for writing the + data as CF-netCDF aggregated data is that the write status is + True. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregation_write_status`, + `nc_set_aggregation_write_status` + + :Returns: + + `bool` + The netCDF aggregation write status. + + """ + status = self._nc_get("aggregation_write_status", False) + if ( + not status + and self.nc_get_aggregation_fragment_type() == "unique_value" + ): + status = True + self._nc_set_aggregation_write_status(status) + + return status + + def _nc_set_aggregation_write_status(self, status): + """Set the netCDF aggregation write status. + + A necessary (but not sufficient) condition for writing the + data as CF-netCDF aggregated data is that the write status is + True. + + .. versionadded:: (cfdm) NEXTVERSION + + .. 
seealso:: `nc_del_aggregation_write_status`, + `nc_get_aggregation_write_status`, + `nc_set_aggregation_write_status` + + :Parameters: + + status: `bool` + The new write status. + + :Returns: + + `None` + + """ + self._get_component("netcdf")["aggregation_write_status"] = bool( + status + ) + + def nc_set_aggregation_write_status(self, status): + """Set the netCDF aggregation write status. + + A necessary (but not sufficient) condition for writing the + data as CF-netCDF aggregated data is that the write status is + True. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_del_aggregation_write_status`, + `nc_get_aggregation_write_status` + + :Parameters: + + status: `bool` + The new write status. + + :Returns: + + `None` + + """ + if status: + raise ValueError( + "'nc_set_aggregation_write_status' only allows the netCDF " + "aggregation write status to be set to False. At your own " + "risk you may use '_nc_set_aggregation_write_status' to set " + "the status to True." + ) + + self._nc_set_aggregation_write_status(status) diff --git a/cfdm/mixin/propertiesdata.py b/cfdm/mixin/propertiesdata.py index d36e2340ab..db9246cc7b 100644 --- a/cfdm/mixin/propertiesdata.py +++ b/cfdm/mixin/propertiesdata.py @@ -343,6 +343,60 @@ def apply_masking(self, inplace=False): return v + @classmethod + def concatenate( + cls, + variables, + axis=0, + cull_graph=False, + relaxed_units=False, + copy=True, + ): + """Join a together sequence of `{{class}}`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `Data.concatenate`, `Data.cull_graph` + + :Parameters: + + variables: sequence of constructs. + + axis: `int`, optional + Select the axis along which to concatenate, defined + by its position in the data array. By default + concatenatoin is along the axis in position 0. + + {{cull_graph: `bool`, optional}} + + {{relaxed_units: `bool`, optional}} + + {{concatenate copy: `bool`, optional}} + + :Returns: + + `{{class}}` + The concatenated construct. + + """ + out = variables[0] + if copy: + out = out.copy() + + if len(variables) == 1: + return out + + data = out.get_data(_fill_value=False, _units=False) + new_data = type(data).concatenate( + [v.get_data(_fill_value=False) for v in variables], + axis=axis, + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + out.set_data(new_data, copy=False) + return out + def creation_commands( self, representative_data=False, @@ -654,9 +708,41 @@ def equals( return True - def get_filenames(self): + def file_directories(self): + """The directories of files containing parts of the data. + + Returns the locations of any files referenced by the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `get_filenames`, `replace_directory` + + :Returns: + + `set` + The unique set of file directories as absolute paths. + + **Examples** + + >>> d.file_directories() + {'/home/data1', 'file:///data2'} + + """ + data = self.get_data(None, _fill_value=False, _units=False) + if data is not None: + return data.file_directories() + + return set() + + def get_filenames(self, normalise=True): """Return the name of the file or files containing the data. + :Parameters: + + {{normalise: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `set` The file names in normalised, absolute form. 
If the @@ -665,7 +751,7 @@ def get_filenames(self): """ data = self.get_data(None, _units=False, _fill_value=False) if data is not None: - return data.get_filenames() + return data.get_filenames(normalise=normalise) return set() @@ -719,6 +805,25 @@ def insert_dimension(self, position=0, inplace=False): return v + def nc_clear_hdf5_chunksizes(self): + """Clear the HDF5 chunking strategy for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_hdf5_chunksizes`, `nc_set_hdf5_chunksizes`, + `{{package}}.read`, `{{package}}.write` + + :Returns: + + `None` or `str` or `int` or `tuple` of `int` + The chunking strategy prior to being cleared, as would + be returned by `nc_hdf5_chunksizes`. + + """ + data = self.get_data(None, _units=False, _fill_value=False) + if data is not None: + return data.nc_clear_hdf5_chunksizes() + def nc_hdf5_chunksizes(self, todict=False): """Get the HDF5 chunking strategy for the data. @@ -741,47 +846,98 @@ def nc_hdf5_chunksizes(self, todict=False): if data is not None: return data.nc_hdf5_chunksizes(todict=todict) - def nc_clear_hdf5_chunksizes(self): - """Clear the HDF5 chunking strategy for the data. + def nc_set_hdf5_chunksizes(self, chunksizes): + """Set the HDF5 chunking strategy. .. versionadded:: (cfdm) 1.11.2.0 - .. seealso:: `nc_hdf5_chunksizes`, `nc_set_hdf5_chunksizes`, + .. seealso:: `nc_hdf5_chunksizes`, `nc_clear_hdf5_chunksizes`, `{{package}}.read`, `{{package}}.write` + :Parameters: + + {{hdf5 chunksizes}} + Each dictionary key is an integer that specifies an + axis by its position in the data array. + :Returns: - `None` or `str` or `int` or `tuple` of `int` - The chunking strategy prior to being cleared, as would - be returned by `nc_hdf5_chunksizes`. + `None` """ data = self.get_data(None, _units=False, _fill_value=False) if data is not None: - return data.nc_clear_hdf5_chunksizes() + data.nc_set_hdf5_chunksizes(chunksizes) - def nc_set_hdf5_chunksizes(self, chunksizes): - """Set the HDF5 chunking strategy. + @_inplace_enabled(default=False) + def persist(self, inplace=False): + """Persist data into memory. - .. versionadded:: (cfdm) 1.11.2.0 + {{persist description}} - .. seealso:: `nc_hdf5_chunksizes`, `nc_clear_hdf5_chunksizes`, - `{{package}}.read`, `{{package}}.write` + **Performance** + + `persist` causes delayed operations to be computed. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `array`, `datetime_array`, + `{{package}}.Data.persist` :Parameters: - {{hdf5 chunksizes}} - Each dictionary key is an integer that specifies an - axis by its position in the data array. + {{inplace: `bool`, optional}} + + :Returns: + + `{{class}}` or `None` + The construct with persisted data. If the operation + was in-place then `None` is returned. + + """ + v = _inplace_enabled_define_and_cleanup(self) + + data = v.get_data(None) + if data is not None: + data.persist(inplace=True) + + return v + + def replace_directory( + self, + old=None, + new=None, + normalise=False, + common=False, + ): + """Replace a file directory in-place. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `file_directories`, `get_filenames` + + :Parameters: + + {{replace old: `str` or `None`, optional}} + + {{replace new: `str` or `None`, optional}} + + {{replace normalise: `bool`, optional}} + + common: `bool`, optional + If True the base directory structure that is common to + all files with *new*. 
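A hedged sketch of the intended workflow, assuming a dataset ``old.nc`` whose underlying files have moved from ``/data/archive`` to ``/scratch/archive`` (all names are hypothetical)::

    import cfdm

    f = cfdm.read('old.nc')[0]
    print(f.file_directories())        # e.g. {'/data/archive'}

    # Point the lazily-read data at the new location of the files, in-place
    f.replace_directory(old='/data/archive', new='/scratch/archive')
    print(f.get_filenames(normalise=False))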
:Returns: `None` """ - data = self.get_data(None, _units=False, _fill_value=False) + data = self.get_data(None, _fill_value=False, _units=False) if data is not None: - data.nc_set_hdf5_chunksizes(chunksizes) + return data.replace_directory( + old=old, new=new, normalise=normalise, common=common + ) @_inplace_enabled(default=False) def squeeze(self, axes=None, inplace=False): diff --git a/cfdm/mixin/propertiesdatabounds.py b/cfdm/mixin/propertiesdatabounds.py index fa75c66b14..22731dca7d 100644 --- a/cfdm/mixin/propertiesdatabounds.py +++ b/cfdm/mixin/propertiesdatabounds.py @@ -384,6 +384,81 @@ def apply_masking(self, bounds=True, inplace=False): return c + @classmethod + def concatenate( + cls, + variables, + axis=0, + cull_graph=False, + relaxed_units=False, + copy=True, + ): + """Join a together sequence of `{{class}}`. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `Data.concatenate`, `Data.cull_graph` + + :Parameters: + + variables: sequence of constructs + + axis: `int`, optional + Select the axis to along which to concatenate, defined + by its position in the data array. By default + concatenation is along the axis in position 0. + + {{cull_graph: `bool`, optional}} + + {{relaxed_units: `bool`, optional}} + + {{concatenate copy: `bool`, optional}} + + :Returns: + + `{{class}}` + The concatenated construct. + + """ + variable0 = variables[0] + if copy: + variable0 = variable0.copy() + + if len(variables) == 1: + return variable0 + + out = super().concatenate( + variables, + axis=axis, + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + + bounds = variable0.get_bounds(None) + if bounds is not None: + bounds = bounds.concatenate( + [v.get_bounds() for v in variables], + axis=axis, + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + out.set_bounds(bounds, copy=False) + + interior_ring = variable0.get_interior_ring(None) + if interior_ring is not None: + interior_ring = interior_ring.concatenate( + [v.get_interior_ring() for v in variables], + axis=axis, + cull_graph=cull_graph, + relaxed_units=relaxed_units, + copy=copy, + ) + out.set_interior_ring(interior_ring, copy=False) + + return out + def creation_commands( self, representative_data=False, @@ -925,6 +1000,36 @@ def get_node_count(self, default=ValueError()): return out + def file_directories(self): + """The directories of files containing parts of the data. + + Returns the locations of any files referenced by the data. + + .. seealso:: `get_filenames`, `replace_directory` + + :Returns: + + `set` + The unique set of file directories as absolute paths. + + **Examples** + + >>> d.file_directories() + {'https:///data/1', 'file:///data2'} + + """ + out = super().file_directories() + + bounds = self.get_bounds(None) + if bounds is not None: + out.update(bounds.file_directories()) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + out.update(interior_ring.file_directories()) + + return out + def get_part_node_count(self, default=ValueError()): """Return the part node count variable for geometry bounds. @@ -1365,6 +1470,100 @@ def insert_dimension(self, position, inplace=False): return c + @_inplace_enabled(default=False) + def persist(self, bounds=True, inplace=False): + """Persist data into memory. + + {{persist description}} + + **Performance** + + `persist` causes delayed operations to be computed. + + .. versionadded:: (cfdm) NEXTVERSION + + .. 
seealso:: `array`, `datetime_array`, + `{{package}}.Data.persist` + + :Parameters: + + bounds: `bool`, optional + If True, the default, then also persist any bounds + data. + + {{inplace: `bool`, optional}} + + :Returns: + + `{{class}}` or `None` + The construct with persisted data. If the operation + was in-place then `None` is returned. + + """ + c = _inplace_enabled_define_and_cleanup(self) + + super(PropertiesDataBounds, c).persist(inplace=True) + + # Bounds + bounds = c.get_bounds(None) + if bounds is not None: + bounds.persist(inplace=True) + + # Interior_ring + interior_ring = c.get_interior_ring(None) + if interior_ring is not None: + interior_ring.persist(inplace=True) + + return c + + def replace_directory( + self, + old=None, + new=None, + normalise=False, + common=False, + ): + """Replace file directories in-place. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `file_directories`, `get_filenames` + + :Parameters: + + {{replace old: `str` or `None`, optional}} + + {{replace new: `str` or `None`, optional}} + + {{replace normalise: `bool`, optional}} + + common: `bool`, optional + If True the base directory structure that is common to + all files with *new*. + + :Returns: + + `None` + + """ + directory = super().replace_directory( + old=old, new=new, normalise=normalise, common=common + ) + + bounds = self.get_bounds(None) + if bounds is not None: + bounds.replace_directory( + old=old, new=new, normalise=normalise, common=common + ) + + interior_ring = self.get_interior_ring(None) + if interior_ring is not None: + interior_ring.replace_directory( + old=old, new=new, normalise=normalise, common=common + ) + + return directory + def nc_clear_hdf5_chunksizes(self, bounds=True, interior_ring=True): """Clear the HDF5 chunking strategy for the data. diff --git a/cfdm/read_write/abstract/__init__.py b/cfdm/read_write/abstract/__init__.py index 222f1e2814..951e0c548c 100644 --- a/cfdm/read_write/abstract/__init__.py +++ b/cfdm/read_write/abstract/__init__.py @@ -1 +1,2 @@ from .abstractio import IO, IORead, IOWrite +from .readwrite import ReadWrite diff --git a/cfdm/read_write/abstract/readwrite.py b/cfdm/read_write/abstract/readwrite.py new file mode 100644 index 0000000000..ce6a6df6b1 --- /dev/null +++ b/cfdm/read_write/abstract/readwrite.py @@ -0,0 +1,83 @@ +from collections.abc import Iterable + +from ...cfdmimplementation import implementation +from ...core import DocstringRewriteMeta +from ...docstring import _docstring_substitution_definitions + + +class ReadWrite(metaclass=DocstringRewriteMeta): + """TODOCFA.""" + + implementation = implementation() + + def __docstring_substitutions__(self): + """Defines applicable docstring substitutions. + + Substitutons are considered applicable if they apply to this + class as well as all of its subclasses. + + These are in addtion to, and take precendence over, docstring + substitutions defined by the base classes of this class. + + See `_docstring_substitutions` for details. + + .. versionaddedd:: (cfdm) NEXTVERSION + + :Returns: + + `dict` + The docstring substitutions that have been applied. + + """ + return _docstring_substitution_definitions + + def __docstring_package_depth__(self): + """Returns the package depth for {{package}} substitutions. + + See `_docstring_package_depth` for details. + + .. versionaddedd:: (cfdm) NEXTVERSION + + """ + return 0 + + @classmethod + def _flat(cls, x): + """Return an iterator over an arbitrarily nested sequence. + + .. 
versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + x: scalar or arbitrarily nested sequence + The arbitrarily nested sequence to be flattened. Note + that if *x* is a string or a scalar then this is + equivalent to passing a single element sequence + containing *x*. + + :Returns: + + generator + An iterator over the flattened sequence. + + **Examples** + + >>> list({{package}}.write._flat([1, (2, [3, 4])])) + [1, 2, 3, 4] + + >>> list({{package}}.write._flat(['a', ['bc', ['def', 'ghij']]])) + ['a', 'bc', 'def', 'ghij'] + + >>> list({{package}}.write._flat(2004)) + [2004] + + """ + if not isinstance(x, Iterable) or isinstance(x, str): + x = (x,) + + for a in x: + if not isinstance(a, str) and isinstance(a, Iterable): + for sub in cls._flat(a): + yield sub + else: + yield a diff --git a/cfdm/read_write/exceptions.py b/cfdm/read_write/exceptions.py new file mode 100644 index 0000000000..613306e204 --- /dev/null +++ b/cfdm/read_write/exceptions.py @@ -0,0 +1,5 @@ +"""Custom exceptions for read/write errors.""" + + +class DatasetTypeError(Exception): + """Raised when an input dataset is of an unknown type.""" diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index d1eeff14e4..7c3d07a616 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -155,22 +155,18 @@ def subst(s): ) # Regex for 'dict form': "k1: v1 v2 k2: v3" - pat_value = subst("(?PWORD)SEP") - pat_values = "({})*".format(pat_value) - pat_mapping = subst( - "(?PWORD):SEP(?P{})".format(pat_values) - ) - pat_mapping_list = "({})+".format(pat_mapping) + pat_value = subst(r"(?PWORD)SEP") + pat_values = f"({pat_value})*" + pat_mapping = subst(rf"(?PWORD):SEP(?P{pat_values})") + pat_mapping_list = f"({pat_mapping})+" # Regex for 'list form': "v1 v2 v3" (including single-item form) - pat_list_item = subst("(?PWORD)SEP") - pat_list = "({})+".format(pat_list_item) + pat_list_item = subst(r"(?PWORD)SEP") + pat_list = f"({pat_list_item})+" # Regex for any form: pat_all = subst( - "((?P{})|(?P{}))$".format( - pat_list, pat_mapping_list - ) + rf"((?P{pat_list})|(?P{pat_mapping_list}))$" ) m = re.match(pat_all, attribute) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index c6a2522d84..6c2940cd7b 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -1,6 +1,5 @@ import logging import operator -import os import re import struct import subprocess @@ -12,7 +11,6 @@ from math import log, nan, prod from numbers import Integral from typing import Any -from urllib.parse import urlparse from uuid import uuid4 import h5netcdf @@ -22,10 +20,13 @@ from dask.base import tokenize from packaging.version import Version from s3fs import S3FileSystem +from uritools import urisplit +from ...data.netcdfindexer import netcdf_indexer from ...decorators import _manage_log_level_via_verbosity -from ...functions import is_log_level_debug, is_log_level_detail +from ...functions import abspath, is_log_level_debug, is_log_level_detail from .. 
import IORead +from ..exceptions import DatasetTypeError from .flatten import netcdf_flatten from .flatten.config import ( flattener_attribute_map, @@ -507,15 +508,26 @@ def file_open(self, filename, flatten=True, verbose=None): """ g = self.read_vars - netcdf = False - hdf = False netcdf_backend = g["netcdf_backend"] - # Deal with a file in an S3 object store - u = urlparse(filename) + if g["ftype"] == "CDL": + # -------------------------------------------------------- + # Convert a CDL file to a local netCDF4 file + # -------------------------------------------------------- + cdl_filename = filename + filename = self.cdl_to_netcdf(filename) + g["filename"] = filename + else: + cdl_filename = None + + u = urisplit(filename) storage_options = self._get_storage_options(filename, u) if u.scheme == "s3": + # -------------------------------------------------------- + # A file in an S3 object store + # -------------------------------------------------------- + # Create an openable S3 file object fs_key = tokenize(("s3", storage_options)) file_systems = g["file_systems"] @@ -536,39 +548,38 @@ def file_open(self, filename, flatten=True, verbose=None): f" S3: s3fs.S3FileSystem options: {storage_options}\n" ) # pragma: no cover - if netcdf_backend is None: - try: - # Try opening the file with netCDF4 - nc = self._open_netCDF4(filename) - netcdf = True - except Exception: - # The file could not be read by netCDF4 so try opening - # it with h5netcdf - try: - nc = self._open_h5netcdf(filename) - hdf = True - except Exception as error: - raise error - - elif netcdf_backend == "netCDF4": - try: - nc = self._open_netCDF4(filename) - netcdf = True - except Exception as error: - raise error + # Map backend names to file-open functions + file_open_function = { + "h5netcdf": self._open_h5netcdf, + "netCDF4": self._open_netCDF4, + } - elif netcdf_backend == "h5netcdf": + # Loop around the netCDF backends until we successfully open + # the file + nc = None + errors = [] + for backend in netcdf_backend: try: - nc = self._open_h5netcdf(filename) - hdf = True + nc = file_open_function[backend](filename) + except KeyError: + errors.append(f"{backend}: Unknown netCDF backend name") except Exception as error: - raise error + errors.append( + f"{backend}:\n{error.__class__.__name__}: {error}" + ) + else: + break - else: - raise ValueError(f"Unknown netCDF backend: {netcdf_backend!r}") + if nc is None: + if cdl_filename is not None: + filename = f"{filename} (created from CDL file {cdl_filename})" - g["original_h5netcdf"] = hdf - g["original_netCDF4"] = netcdf + error = "\n\n".join(errors) + raise DatasetTypeError( + f"Can't interpret {filename} as a netCDF dataset " + f"with any of the netCDF backends {netcdf_backend!r}:\n\n" + f"{error}" + ) # ------------------------------------------------------------ # If the file has a group structure then flatten it (CF>=1.8) @@ -600,14 +611,9 @@ def file_open(self, filename, flatten=True, verbose=None): nc = flat_nc - netcdf = True - hdf = False - g["has_groups"] = True g["flat_files"].append(flat_file) - g["netCDF4"] = netcdf - g["h5netcdf"] = hdf g["nc"] = nc return nc @@ -626,7 +632,9 @@ def _open_netCDF4(self, filename): `netCDF4.Dataset` """ - return netCDF4.Dataset(filename, "r") + nc = netCDF4.Dataset(filename, "r") + self.read_vars["file_opened_with"] = "netCDF4" + return nc def _open_h5netcdf(self, filename): """Return an open `h5netcdf.File`. 
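A standalone sketch of the backend fallback pattern implemented above (not itself part of the cfdm API): each named backend is tried in turn and all failures are reported together::

    import h5netcdf
    import netCDF4

    def open_dataset(filename, backends=('h5netcdf', 'netCDF4')):
        """Try each netCDF backend in turn, collecting any errors."""
        openers = {
            'h5netcdf': lambda name: h5netcdf.File(name, 'r'),
            'netCDF4': lambda name: netCDF4.Dataset(name, 'r'),
        }
        errors = []
        for backend in backends:
            try:
                return openers[backend](filename)
            except KeyError:
                errors.append(f"{backend}: Unknown netCDF backend name")
            except Exception as error:
                errors.append(f"{backend}: {error.__class__.__name__}: {error}")

        raise RuntimeError('\n\n'.join(errors))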
@@ -649,7 +657,7 @@ def _open_h5netcdf(self, filename): `h5netcdf.File` """ - return h5netcdf.File( + nc = h5netcdf.File( filename, "r", decode_vlen_strings=True, @@ -657,9 +665,10 @@ def _open_h5netcdf(self, filename): rdcc_w0=0.75, rdcc_nslots=4133, ) + self.read_vars["file_opened_with"] = "h5netcdf" + return nc - @classmethod - def cdl_to_netcdf(cls, filename): + def cdl_to_netcdf(self, filename): """Create a temporary netCDF-4 file from a CDL text file. :Parameters: @@ -678,98 +687,78 @@ def cdl_to_netcdf(cls, filename): ) tmpfile = x.name - # ---------------------------------------------------------------- - # Need to cache the TemporaryFile object so that it doesn't get - # deleted too soon - # ---------------------------------------------------------------- - _cached_temporary_files[tmpfile] = x + ncgen_command = ["ncgen", "-knc4", "-o", tmpfile, filename] + + if self.read_vars["debug"]: + logger.debug( + f"Converting CDL file {filename} to netCDF file {tmpfile} " + f"with `{' '.join(ncgen_command)}`" + ) # pragma: no cover try: - subprocess.run( - ["ncgen", "-knc4", "-o", tmpfile, filename], check=True - ) + subprocess.run(ncgen_command, check=True) except subprocess.CalledProcessError as error: msg = str(error) if msg.startswith( "Command '['ncgen', '-knc4', '-o'" ) and msg.endswith("returned non-zero exit status 1."): - raise ValueError( - "The CDL provided is invalid so cannot be converted " - "to netCDF." + raise RuntimeError( + f"The CDL file {filename} is invalid so cannot be " + f"converted to netCDF with `{' '.join(ncgen_command)}`. " + "ncgen output:\n\n" + f"{msg}" ) else: raise + # Need to cache the TemporaryFile object so that it doesn't get + # deleted too soon + _cached_temporary_files[tmpfile] = x + return tmpfile @classmethod - def is_netcdf_file(cls, filename): - """Return `True` if the file is a netCDF file. + def string_to_cdl(cls, cdl_string): + """Create a temporary CDL file from a CDL string. - The file type is determined by inspecting the file's contents - and any file suffix is not not considered. However, file names - starting ``https://`` or ``http://`` are assumed, without - checking, to be netCDF files. + .. versionadded:: (cfdm) NEXTVERSION :Parameters: - filename: `str` - The name of the file. + cdl_string: `str` + The CDL string. :Returns: - `bool` - `True` if the file is netCDF, otherwise `False` - - **Examples** - - >>> {{package}}.{{class}}.is_netcdf_file('file.nc') - True + `str` + The name of the new netCDF file. """ - # Assume that URLs are in netCDF format - if ( - filename.startswith("https://") - or filename.startswith("http://") - or filename.startswith("s3://") - ): - return True - - # Read the magic number - try: - fh = open(filename, "rb") - magic_number = struct.unpack("=L", fh.read(4))[0] - except Exception: - magic_number = None - - try: - fh.close() - except Exception: - pass + x = tempfile.NamedTemporaryFile( + mode="w", + dir=tempfile.gettempdir(), + prefix="cfdm_", + suffix=".cdl", + ) + tmpfile = x.name - if magic_number in ( - 21382211, - 1128547841, - 1178880137, - 38159427, - 88491075, - ): - return True - else: - return False + with open(tmpfile, "w") as f: + f.write(cdl_string) - def is_cdl_file(cls, filename): - """True if the file is in CDL format. + # Need to cache the TemporaryFile object so that it doesn't + # get deleted too soon + _cached_temporary_files[tmpfile] = x - Return True if the file is a CDL text representation of a - netCDF file. 
+ return tmpfile - Note that the file type is determined by inspecting the file's - contents and any file suffix is not not considered. The file is - assumed to be a CDL file if it is a text file that starts with - "netcdf ". + @classmethod + def ftype(cls, filename): + """Return type of the file. - .. versionaddedd:: (cfdm) 1.7.8 + The file type is determined by inspecting the file's contents + and any file suffix is not considered. However, file names + that are non-local URIs (such as those starting ``https:`` or + ``s3:``) are assumed, without checking, to be netCDF files. :Parameters: @@ -778,103 +767,66 @@ def is_cdl_file(cls, filename): :Returns: - `bool` - `True` if the file is CDL, otherwise `False` - - **Examples** + `str` or `None` + The file type: - >>> {{package}}.{{class}}.is_cdl_file('file.nc') - False + * ``'netCDF'`` for a binary netCDF-3 or netCDF-4 file, + * ``'CDL'`` a text CDL file, + * `None` for anything else. """ - # Read the magic number - cdl = False + # Assume that non-local URIs are in netCDF format + if urisplit(filename).scheme not in (None, "file"): + return "netCDF" + + f_type = None + try: - fh = open(filename, "rt") - except UnicodeDecodeError: - pass + # Read the first 4 bytes from the file + fh = open(filename, "rb") + magic_number = struct.unpack("=L", fh.read(4))[0] except Exception: + # Can't read 4 bytes from the file, so it can't be netCDF + # or CDL. pass else: - try: - line = fh.readline() - # Match comment and blank lines at the top of the file - while re.match(r"^\s*//|^\s*$", line): - line = fh.readline() - if not line: - break + # Is it a netCDF-C binary file? + if magic_number in ( + 21382211, + 1128547841, + 1178880137, + 38159427, + 88491075, + ): + f_type = "netCDF" + else: + # Is it a CDL text file? + fh.seek(0) + try: + line = fh.readline().decode("utf-8") + except Exception: + pass + else: + netcdf = line.startswith("netcdf ") + if not netcdf: + # Match comment and blank lines at the top of + # the file + while re.match(r"^\s*//|^\s*$", line): + line = fh.readline().decode("utf-8") + if not line: + break + + netcdf = line.startswith("netcdf ") - if line.startswith("netcdf "): - cdl = True - except UnicodeDecodeError: - pass + if netcdf: + f_type = "CDL" try: fh.close() except Exception: pass - return cdl - - @classmethod - def is_file(cls, filename): - """Return `True` if *filename* is a file. - - Note that a remote URL starting with ``http://`` or - ``https://`` is always considered as a file. - - .. versionadded:: (cfdm) 1.10.1.1 - - :Parameters: - - filename: `str` - The name of the file. - - :Returns: - - `bool` - Whether or not *filename* is a file. - - **Examples** - - >>> {{package}}.{{class}}.is_file('file.nc') - True - >>> {{package}}.{{class}}.is_file('http://file.nc') - True - >>> {{package}}.{{class}}.is_file('https://file.nc') - True - - """ - # Assume that URLs are files - u = urlparse(filename) - if u.scheme in ("http", "https", "s3"): - return True - - return os.path.isfile(filename) - - @classmethod - def is_dir(cls, filename): - """Return `True` if *filename* is a directory. - - .. versionadded:: (cfdm) 1.10.1.1 - - :Parameters: - - filename: `str` - The name of the file. - - :Returns: - - `bool` - Whether or not *filename* is a directory. - - **Examples** - - >>> {{package}}.{{class}}.is_dir('file.nc') - False - - """ - return os.path.isdir(filename) + return f_type def default_netCDF_fill_value(self, ncvar): """The default netCDF fill value for a variable. 
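A standalone illustration of the magic-number test that `ftype` uses to recognise binary netCDF files; the set of values is taken from the implementation above, and ``file.nc`` is hypothetical::

    import struct

    # Unsigned 32-bit values of the first four bytes of netCDF-3 and
    # netCDF-4 (HDF5) files
    NETCDF_MAGIC = {21382211, 1128547841, 1178880137, 38159427, 88491075}

    def looks_like_netcdf(filename):
        """Return True if the file starts with a known netCDF signature."""
        with open(filename, 'rb') as fh:
            return struct.unpack('=L', fh.read(4))[0] in NETCDF_MAGIC

    print(looks_like_netcdf('file.nc'))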
@@ -917,7 +869,14 @@ def read( netcdf_backend=None, cache=True, dask_chunks="storage-aligned", - store_hdf5_chunks=True, + store_dataset_chunks=True, + cfa=None, + cfa_write=None, + to_memory=None, + squeeze=False, + unsqueeze=False, + file_type=None, + ignore_unknown_type=False, ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -968,30 +927,74 @@ def read( .. versionadded:: (cfdm) 1.9.0.0 storage_options: `bool`, optional - See `cfdm.read` for details + See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 netcdf_backend: `None` or `str`, optional - See `cfdm.read` for details + See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 cache: `bool`, optional Control array element caching. See `cfdm.read` for - details + details. .. versionadded:: (cfdm) 1.11.2.0 dask_chunks: `str`, `int`, `None`, or `dict`, optional Specify the `dask` chunking of dimensions for data in - the input files. See `cfdm.read` for details + the input files. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 - store_hdf_chunks: `bool`, optional - Storing the HDF5 chunking strategy. See `cfdm.read` - for details. + store_dataset_chunks: `bool`, optional + Storing the dataset chunking strategy. See + `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + cfa: `dict`, optional + Configure the reading of CF-netCDF aggregation files. + See `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + cfa_write: sequence of `str`, optional + Configure the reading of CF-netCDF aggregation files. + See `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + to_memory: (sequence) of `str`, optional + Whether or not to bring data arrays into memory. See + `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + squeeze: `bool`, optional + Whether or not to remove all size 1 axes from field + construct data arrays. See `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + unsqueeze: `bool`, optional + Whether or not to ensure that all size 1 axes are + spanned by field construct data arrays. See + `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + file_type: `None` or (sequence of) `str`, optional + Only read files of the given type(s). See `cfdm.read` + for details. + + .. versionadded:: (cfdm) NEXTVERSION + + ignore_unknown_type: `bool`, optional + If True then ignore any file which does not have one + of the valid types specified by the *file_type* + parameter. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 @@ -1006,14 +1009,183 @@ def read( The field or domain constructs in the file. """ + debug = is_log_level_debug(logger) + + # ------------------------------------------------------------ + # Parse the 'filename' keyword parameter + # ------------------------------------------------------------ + try: + filename = abspath(filename, uri=False) + except ValueError: + filename = abspath(filename) + + # ------------------------------------------------------------ + # Parse the 'file_type' keyword parameter + # ------------------------------------------------------------ + if isinstance(file_type, str): + file_type = (file_type,) + + # ------------------------------------------------------------ + # Check the file type, returning/failing now if the type is + # not recognised. (It is much faster to do this with `ftype` + # than waiting for `file_open` to fail.) 
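A hedged sketch of how these options might be passed to the top-level `cfdm.read`, to which the parameter descriptions above defer; ``data.nc`` is hypothetical::

    import cfdm

    fields = cfdm.read(
        'data.nc',
        netcdf_backend='h5netcdf',   # which library to open the file with
        file_type='netCDF',          # fail early on non-netCDF input
        squeeze=True,                # remove size 1 axes from field data
        to_memory='metadata',        # load metadata construct arrays into memory
    )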
+ # ------------------------------------------------------------ + ftype = self.ftype(filename) + if not ftype: + raise DatasetTypeError( + f"Can't interpret {filename} as a netCDF or CDL dataset" + ) + + if file_type and ftype not in file_type: + raise DatasetTypeError( + f"Can't interpret {filename} as one of the " + f"requested types: {file_type}" + ) + + # ------------------------------------------------------------ + # Parse the 'netcdf_backend' keyword parameter + # ------------------------------------------------------------ + if netcdf_backend is None: + # By default, try netCDF backends in this order: + netcdf_backend = ("h5netcdf", "netCDF4") + elif isinstance(netcdf_backend, str): + netcdf_backend = (netcdf_backend,) + + # ------------------------------------------------------------ + # Parse the 'external' keyword parameter + # ------------------------------------------------------------ + if external: + if isinstance(external, str): + external = (external,) + + external = set(external) + else: + external = set() + + # ------------------------------------------------------------ + # Parse 'extra' keyword parameter + # ------------------------------------------------------------ + get_constructs = { + "auxiliary_coordinate": self.implementation.get_auxiliary_coordinates, + "cell_measure": self.implementation.get_cell_measures, + "dimension_coordinate": self.implementation.get_dimension_coordinates, + "domain_ancillary": self.implementation.get_domain_ancillaries, + "field_ancillary": self.implementation.get_field_ancillaries, + } + + if extra: + if isinstance(extra, str): + extra = (extra,) + + for f in extra: + if f not in get_constructs: + raise ValueError( + f"Can't read: Bad parameter value: extra={extra!r}" + ) + + extra = set(extra) + else: + extra = set() + + # ------------------------------------------------------------ + # Parse 'dask_chunks' keyword parameter + # ------------------------------------------------------------ + if dask_chunks is not None and not isinstance( + dask_chunks, (str, Integral, dict) + ): + raise ValueError( + "The 'dask_chunks' keyword must be of type str, int, None or " + f"dict. Got: {dask_chunks!r}" + ) + + # ------------------------------------------------------------ + # Parse the 'cfa' keyword parameter + # ------------------------------------------------------------ + if cfa is None: + cfa = {} + else: + cfa = cfa.copy() + keys = ("replace_directory",) + if not set(cfa).issubset(keys): + raise ValueError( + "Invalid dictionary key to the 'cfa' parameter." + f"Valid keys are {keys}. Got: {cfa}" + ) + + if not isinstance(cfa.get("replace_directory", {}), dict): + raise ValueError( + "The 'replace_directory' key of the 'cfa' parameter " + "must have a dictionary value. 
" + f"Got: {cfa['replace_directory']!r}" + ) + + # ------------------------------------------------------------ + # Parse the 'cfa_write' keyword parameter + # ------------------------------------------------------------ + if cfa_write: + if isinstance(cfa_write, str): + cfa_write = (cfa_write,) + else: + cfa_write = tuple(cfa_write) + else: + cfa_write = () + + # ------------------------------------------------------------ + # Parse the 'squeeze' and 'unsqueeze' keyword parameters + # ------------------------------------------------------------ + if squeeze and unsqueeze: + raise ValueError( + "'squeeze' and 'unsqueeze' parameters can not both be True" + ) + + # ------------------------------------------------------------ + # Parse the 'to_memory' keyword parameter + # ------------------------------------------------------------ + if to_memory: + if isinstance(to_memory, str): + to_memory = (to_memory,) + + if "metadata" in to_memory: + to_memory = tuple(to_memory) + ( + "field_ancillary", + "domain_ancillary", + "dimension_coordinate", + "auxiliary_coordinate", + "cell_measure", + "domain_topology", + "cell_connectivity", + ) + to_memory = set(to_memory) + to_memory.remove("metadata") + else: + to_memory = () + + # ------------------------------------------------------------ + # Parse the 'storage_options' keyword parameter + # ------------------------------------------------------------ + if storage_options is None: + storage_options = {} + + # ------------------------------------------------------------ + # Parse the '_file_systems' keyword parameter + # ------------------------------------------------------------ + if _file_systems is None: + _file_systems = {} + # ------------------------------------------------------------ # Initialise netCDF read parameters # ------------------------------------------------------------ self.read_vars = { + # -------------------------------------------------------- + # File + # -------------------------------------------------------- + "filename": filename, + "ftype": ftype, + "ignore_unknown_type": bool(ignore_unknown_type), # -------------------------------------------------------- # Verbosity # -------------------------------------------------------- - "debug": is_log_level_debug(logger), + "debug": debug, # "new_dimension_sizes": {}, "formula_terms": {}, @@ -1047,11 +1219,18 @@ def read( # -------------------------------------------------------- # Variables listed by the global external_variables # attribute + "external_files": external, "external_variables": set(), # External variables that are actually referenced from # within the parent file "referenced_external_variables": set(), # -------------------------------------------------------- + # Create extra, independent fields from netCDF variables + # that correspond to particular types metadata constructs + # -------------------------------------------------------- + "extra": extra, + "get_constructs": get_constructs, + # -------------------------------------------------------- # Coordinate references # -------------------------------------------------------- # Grid mapping attributes that describe horizontal datum @@ -1102,10 +1281,6 @@ def read( # Interpolation parameter variables "interpolation_parameter": {}, # -------------------------------------------------------- - # CFA - # -------------------------------------------------------- - "cfa": False, - # -------------------------------------------------------- # NetCDF backend # -------------------------------------------------------- 
"netcdf_backend": netcdf_backend, @@ -1117,7 +1292,7 @@ def read( # File system storage options for each file "file_system_storage_options": {}, # Cached s3fs.S3FileSystem objects - "file_systems": {}, + "file_systems": _file_systems, # Cache of open s3fs.File objects "s3fs_File_objects": [], # -------------------------------------------------------- @@ -1129,27 +1304,36 @@ def read( # -------------------------------------------------------- "dask_chunks": dask_chunks, # -------------------------------------------------------- - # Whether or not to store HDF chunks + # Aggregation + # -------------------------------------------------------- + "parsed_aggregated_data": {}, + # fragment_array_variables as numpy arrays + "fragment_array_variables": {}, + # Aggregation configuration overrides + "cfa": cfa, + # Dask chunking of aggregated data for selected constructs + "cfa_write": cfa_write, + # -------------------------------------------------------- + # Whether or not to store the dataset chunking strategy + # -------------------------------------------------------- + "store_dataset_chunks": bool(store_dataset_chunks), + # -------------------------------------------------------- + # Constructs to read into memory + # -------------------------------------------------------- + "to_memory": to_memory, + # -------------------------------------------------------- + # Squeeze/unsqueeze fields # -------------------------------------------------------- - "store_hdf5_chunks": bool(store_hdf5_chunks), + "squeeze": bool(squeeze), + "unsqueeze": bool(unsqueeze), } g = self.read_vars - debug = g["debug"] - # Set versions - for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): + for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"): g["version"][version] = Version(version) - if storage_options is None: - g["storage_options"] = {} - - if _file_systems is not None: - # Update S3 file systems with those passed in as keyword - # parameter - g["file_systems"] = _file_systems - # ------------------------------------------------------------ # Add custom read vars # ------------------------------------------------------------ @@ -1157,62 +1341,25 @@ def read( g.update(deepcopy(extra_read_vars)) # ------------------------------------------------------------ - # Parse field parameter + # Open the netCDF file to be read # ------------------------------------------------------------ - g["get_constructs"] = { - "auxiliary_coordinate": self.implementation.get_auxiliary_coordinates, - "cell_measure": self.implementation.get_cell_measures, - "dimension_coordinate": self.implementation.get_dimension_coordinates, - "domain_ancillary": self.implementation.get_domain_ancillaries, - "field_ancillary": self.implementation.get_field_ancillaries, - } - - # Parse the 'external' keyword parameter - if external: - if isinstance(external, str): - external = (external,) - else: - external = () - - g["external_files"] = set(external) - - # Parse 'extra' keyword parameter - if extra: - if isinstance(extra, str): - extra = (extra,) - - for f in extra: - if f not in g["get_constructs"]: - raise ValueError( - f"Can't read: Bad parameter value: extra={extra!r}" - ) - - # Check dask_chunks - if dask_chunks is not None and not isinstance( - dask_chunks, (str, Integral, dict) - ): - raise ValueError( - "The 'dask_chunks' keyword must be of type str, int, None or " - f"dict. 
Got: {dask_chunks!r}" - ) - - g["extra"] = extra - - filename = os.path.expanduser(os.path.expandvars(filename)) - - if self.is_dir(filename): - raise IOError(f"Can't read directory {filename}") + try: + nc = self.file_open(filename, flatten=True, verbose=None) + except DatasetTypeError: + if not g["ignore_unknown_type"]: + raise - if not self.is_file(filename): - raise IOError(f"Can't read non-existent file {filename}") + if debug: + logger.debug( + f"Ignoring {filename}: Can't interpret as a " + "netCDF dataset" + ) # pragma: no cover - g["filename"] = filename + return [] - # ------------------------------------------------------------ - # Open the netCDF file to be read - # ------------------------------------------------------------ - nc = self.file_open(filename, flatten=True, verbose=None) - logger.info(f"Reading netCDF file: {filename}\n") # pragma: no cover + logger.info( + f"Reading netCDF file: {g['filename']}\n" + ) # pragma: no cover if debug: logger.debug( f" Input netCDF dataset:\n {nc}\n" @@ -1237,7 +1384,7 @@ def read( ) # pragma: no cover # ------------------------------------------------------------ - # Find the CF version for the file, and the CFA version. + # Find the CF version for the file # ------------------------------------------------------------ Conventions = g["global_attributes"].get("Conventions", "") @@ -1255,12 +1402,6 @@ def read( # Allow UGRID if it has been specified in Conventions, # regardless of the version of CF. g["UGRID_version"] = Version(c.replace("UGRID-", "", 1)) - elif c.startswith("CFA-"): - g["cfa"] = True - g["CFA_version"] = Version(c.replace("CFA-", "", 1)) - elif c == "CFA": - g["cfa"] = True - g["CFA_version"] = Version("0.4") if file_version is None: if default_version is not None: @@ -1274,7 +1415,7 @@ def read( g["file_version"] = Version(file_version) # Set minimum/maximum versions - for vn in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): + for vn in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"): g["CF>=" + vn] = g["file_version"] >= g["version"][vn] # From CF-1.11 we can assume UGRID-1.0 @@ -1412,7 +1553,7 @@ def read( # structure that was prepended to the netCDF # variable name by the netCDF flattener. ncvar_basename = re.sub( - f"^{flattener_separator.join(groups)}{flattener_separator}", + rf"^{flattener_separator.join(groups)}{flattener_separator}", "", ncvar_flat, ) @@ -1479,7 +1620,7 @@ def read( if groups: # This dimension is in a group. ncdim_basename = re.sub( - "^{flattener_separator.join(groups)}{flattener_separator}", + r"^{flattener_separator.join(groups)}{flattener_separator}", "", ncdim_flat, ) @@ -1638,6 +1779,30 @@ def read( # the read parameters. 
self._customise_read_vars() + # ------------------------------------------------------------ + # Aggregation variables (CF>=1.12) + # ------------------------------------------------------------ + if g["CF>=1.12"]: + for ncvar, attributes in variable_attributes.items(): + aggregated_dimensions = attributes.get("aggregated_dimensions") + if aggregated_dimensions is None: + # This is not an aggregated variable + continue + + # Set the aggregated variable's dimensions as its + # aggregated dimensions + ncdimensions = aggregated_dimensions.split() + variable_dimensions[ncvar] = tuple(map(str, ncdimensions)) + + # Parse the fragment array variables + self._cfa_parse_aggregated_data( + ncvar, attributes.get("aggregated_data") + ) + + # Do not create fields/domains from fragment array + # variables + g["do_not_create_field"].update(g["fragment_array_variables"]) + # ------------------------------------------------------------ # List variables # @@ -2018,6 +2183,17 @@ def read( # ------------------------------------------------------------ self.file_close() + # ------------------------------------------------------------ + # Squeeze/unsqueeze size 1 axes in field constructs + # ------------------------------------------------------------ + if not g["domain"]: + if g["unsqueeze"]: + for f in out: + self.implementation.unsqueeze(f, inplace=True) + elif g["squeeze"]: + for f in out: + self.implementation.squeeze(f, inplace=True) + # ------------------------------------------------------------ # Return the fields/domains # ------------------------------------------------------------ @@ -5582,11 +5758,6 @@ def _create_bounded_construct( if tie_points: # Add interpolation variable properties (CF>=1.9) pass - # nc = geometry.get("node_count") - # if nc is not None: - # self.implementation.set_interpolation_properties( - # parent=c, interpolation=i - # ) # Store the netCDF variable name self.implementation.nc_set_variable(c, ncvar) @@ -6234,15 +6405,69 @@ def _create_netcdfarray( "storage_options": g["file_system_storage_options"].get(filename), } + if not self._cfa_is_aggregation_variable(ncvar): + # Normal (non-aggregation) variable + if return_kwargs_only: + return kwargs + + file_opened_with = g["file_opened_with"] + if file_opened_with == "netCDF4": + array = self.implementation.initialise_NetCDF4Array(**kwargs) + elif file_opened_with == "h5netcdf": + array = self.implementation.initialise_H5netcdfArray(**kwargs) + + return array, kwargs + + # ------------------------------------------------------------ + # Still here? Then create a netCDF array for an + # aggregation variable + # ------------------------------------------------------------ + + # Only keep the relevant attributes + a = {} + for attr in ("units", "calendar", "add_offset", "scale_factor"): + value = attributes.get(attr) + if value is not None: + a[attr] = value + + kwargs["attributes"] = a + + # Get rid of the incorrect shape. This will end up getting set + # correctly by the AggregatedArray instance. + kwargs.pop("shape", None) + + # 'mask' must be True, to indicate that the aggregated data is + # to be masked by convention. 
+ kwargs["mask"] = True + + fragment_array_variables = g["fragment_array_variables"] + standardised_terms = ("map", "location", "variable", "unique_value") + + fragment_array = {} + for term, term_ncvar in g["parsed_aggregated_data"][ncvar].items(): + if term not in standardised_terms: + logger.warning( + "Ignoring non-standardised fragment array feature found " + "in the aggregated_data attribute of variable " + f"{ncvar!r}: {term!r}" + ) + continue + + fragment_array_variable = fragment_array_variables[term_ncvar] + fragment_array[term] = fragment_array_variable + + if term == "unique_value" and kwargs["dtype"] is None: + # This is a string-valued aggregation variable with a + # 'value' fragment array variable, so set the correct + # numpy data type. + kwargs["dtype"] = fragment_array_variable.dtype + + kwargs["fragment_array"] = fragment_array if return_kwargs_only: return kwargs - if g["original_netCDF4"]: - array = self.implementation.initialise_NetCDF4Array(**kwargs) - else: - # h5netcdf - array = self.implementation.initialise_H5netcdfArray(**kwargs) - + # Use the kwargs to create a AggregatedArray instance + array = self.implementation.initialise_AggregatedArray(**kwargs) return array, kwargs def _create_data( @@ -6289,15 +6514,22 @@ def _create_data( """ g = self.read_vars - array, kwargs = self._create_netcdfarray( - ncvar, unpacked_dtype=unpacked_dtype, coord_ncvar=coord_ncvar + construct_type = self.implementation.get_construct_type(construct) + + netcdf_array, netcdf_kwargs = self._create_netcdfarray( + ncvar, + unpacked_dtype=unpacked_dtype, + coord_ncvar=coord_ncvar, ) - if array is None: + + if netcdf_array is None: return None - filename = kwargs["filename"] + array = netcdf_array - attributes = kwargs["attributes"] + filename = netcdf_kwargs["filename"] + + attributes = netcdf_kwargs["attributes"] units = attributes.get("units") calendar = attributes.get("calendar") @@ -6528,16 +6760,19 @@ def _create_data( calendar=calendar, ncvar=ncvar, compressed=compressed, + construct_type=construct_type, ) data._original_filenames(define=filename) # ------------------------------------------------------------ # Cache selected values from disk # ------------------------------------------------------------ + aggregation_variable = self._cfa_is_aggregation_variable(ncvar) if ( not compression_index and g.get("cache") - and self.implementation.get_construct_type(construct) != "field" + and construct_type != "field" + and not aggregation_variable ): # Only cache values from non-field data and # non-compression-index data, on the assumptions that: @@ -6549,6 +6784,60 @@ def _create_data( # compression index data. self._cache_data_elements(data, ncvar) + # ------------------------------------------------------------ + # Set data aggregation parameters + # ------------------------------------------------------------ + if not aggregation_variable: + # For non-aggregation variables, set the aggregated write + # status to True when there is exactly one dask chunk. 
+ if data.npartitions == 1: + data._nc_set_aggregation_write_status(True) + data._nc_set_aggregation_fragment_type("location") + else: + if construct is not None: + # Remove the aggregation attributes from the construct + self.implementation.del_property( + construct, "aggregated_dimensions", None + ) + aggregated_data = self.implementation.del_property( + construct, "aggregated_data", None + ) + # Store the 'aggregated_data' attribute information + if aggregated_data: + data.nc_set_aggregated_data(aggregated_data) + + # Set the aggregated write status to True iff each + # non-aggregated axis has exactly one Dask chunk + cfa_write_status = True + for n, numblocks in zip( + netcdf_array.get_fragment_array_shape(), data.numblocks + ): + if n == 1 and numblocks > 1: + # Note: n is always 1 for non-aggregated axes + cfa_write_status = False + break + + data._nc_set_aggregation_write_status(cfa_write_status) + + # Store the fragment type + fragment_type = netcdf_array.get_fragment_type() + data._nc_set_aggregation_fragment_type(fragment_type) + + # Replace the directories of fragment locations + if fragment_type == "location": + replace_directory = g["cfa"].get("replace_directory") + if replace_directory: + try: + data.replace_directory(**replace_directory) + except TypeError: + raise TypeError( + "The 'replace_directory' key of the 'cfa' " + "parameter must provide valid parameters to " + "the 'Data.replace_directory' method. " + f"Got: {replace_directory!r}" + ) + + # Return the data object return data def _create_domain_axis(self, size, ncdim=None): @@ -7533,6 +7822,7 @@ def _create_Data( calendar=None, ncdimensions=None, compressed=False, + construct_type=None, **kwargs, ): """Create a Data object from a netCDF variable. @@ -7561,6 +7851,12 @@ def _create_Data( .. versionadded:: (cfdm) 1.11.2.0 + construct_type: `str` or `None`, optional + The type of the construct that contains *array*. Set + to `None` if the array does not belong to a construct. + + .. versionadded:: (cfdm) NEXTVERSION + kwargs: optional Extra parameters to pass to the initialisation of the returned `Data` object. @@ -7571,11 +7867,6 @@ def _create_Data( """ if array.dtype is None: - # The array is based on a netCDF VLEN variable, and - # therefore has unknown data type. To find the correct - # data type (e.g. "WORD)SEP") + pat_value = subst(r"(?PWORD)SEP") pat_values = f"({pat_value})+" pat_mapping = subst( - f"(?PWORD):SEP(?P{pat_values})" + rf"(?PWORD):SEP(?P{pat_values})" ) pat_mapping_list = f"({pat_mapping})+" pat_all = subst( - f"((?PWORD)|(?P{pat_mapping_list}))$" + rf"((?PWORD)|(?P{pat_mapping_list}))$" ) m = re.match(pat_all, string) @@ -10410,9 +10708,8 @@ def _get_storage_options(self, filename, parsed_filename): filename: `str` The name of the file. - parsed_filename: `urllib.parse.ParseResult` - The parsed file name, as returned by - ``urllib.parse.urlparse(filename)``. + parsed_filename: `uritools.SplitResultString` + The parsed file name. :Returns: @@ -10428,16 +10725,18 @@ def _get_storage_options(self, filename, parsed_filename): "endpoint_url" not in storage_options and "endpoint_url" not in client_kwargs ): - storage_options["endpoint_url"] = ( - f"https://{parsed_filename.netloc}" - ) + authority = parsed_filename.authority + if not authority: + authority = "" + + storage_options["endpoint_url"] = f"https://{authority}" g["file_system_storage_options"].setdefault(filename, storage_options) return storage_options - def _get_hdf5_chunks(self, ncvar): - """Return a netCDF variable's HDF5 chunks. 
+ def _get_dataset_chunks(self, ncvar): + """Return a netCDF variable's dataset chunks. .. versionadded:: (cfdm) 1.11.2.0 @@ -10455,11 +10754,11 @@ def _get_hdf5_chunks(self, ncvar): **Examples** - >>> n._get_hdf5_chunks('tas') + >>> n._get_dataset_chunks('tas') [1, 324, 432], (12, 324, 432) - >>> n._get_hdf5_chunks('pr') + >>> n._get_dataset_chunks('pr') 'contiguous', (12, 324, 432) - >>> n._get_hdf5_chunks('ua') + >>> n._get_dataset_chunks('ua') None, (12, 324, 432) """ @@ -10484,7 +10783,7 @@ def _get_hdf5_chunks(self, ncvar): return chunks, var.shape - def _dask_chunks(self, array, ncvar, compressed): + def _dask_chunks(self, array, ncvar, compressed, construct_type=None): """Set the Dask chunking strategy for a netCDF variable. .. versionadded:: (cfdm) 1.11.2.0 @@ -10503,6 +10802,10 @@ def _dask_chunks(self, array, ncvar, compressed): Whether or not the netCDF variable is compressed by convention. + construct_type: `str` or `None` + The type of the construct that contains *array*. Set + to `None` if the array does not belong to a construct. + :Returns: `str` or `int` or `list` @@ -10512,9 +10815,30 @@ def _dask_chunks(self, array, ncvar, compressed): """ g = self.read_vars - dask_chunks = g.get("dask_chunks", "storage-aligned") + cfa_write = g["cfa_write"] + if ( + cfa_write + and construct_type is not None + and construct_type in cfa_write + or "all" in cfa_write + ): + # The intention is for this array to be written out as an + # aggregation variable, so set dask_chunks=None to ensure + # that each Dask chunk contains exactly one complete + # fragment. + dask_chunks = None + else: + dask_chunks = g.get("dask_chunks", "storage-aligned") + storage_chunks = self._netcdf_chunksizes(g["variables"][ncvar]) + # ------------------------------------------------------------ + # None + # ------------------------------------------------------------ + if dask_chunks is None: + # No Dask chunking + return -1 + ndim = array.ndim if ( storage_chunks is not None @@ -10574,11 +10898,11 @@ def _dask_chunks(self, array, ncvar, compressed): # original Dask: (5, 15, 150, 5, 160) 9000000 # storage-aligned: (50, 100, 150, 20, 5) 75000000 # -------------------------------------------------------- - # 1) Initialise the Dask chunk shape dask_chunks = normalize_chunks( "auto", shape=array.shape, dtype=array.dtype ) + dask_chunks = [sizes[0] for sizes in dask_chunks] n_dask_elements = prod(dask_chunks) @@ -10753,13 +11077,6 @@ def _dask_chunks(self, array, ncvar, compressed): # chunked variables. return storage_chunks - # ------------------------------------------------------------ - # None - # ------------------------------------------------------------ - if dask_chunks is None: - # No Dask chunking - return -1 - # ------------------------------------------------------------ # dict # ------------------------------------------------------------ @@ -10960,7 +11277,7 @@ def _netcdf_chunksizes(self, variable): :Parameters: - variable: + variable: The variable, that has the same API as `netCDF4.Variable` or `h5netcdf.Variable`. @@ -10990,3 +11307,82 @@ def _netcdf_chunksizes(self, variable): chunks = variable.chunks return chunks + + def _cfa_is_aggregation_variable(self, ncvar): + """Return True if *ncvar* is a CF-netCDF aggregated variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + ncvar: `str` + The name of the netCDF variable. + + :Returns: + + `bool` + Whether or not *ncvar* is an aggregated variable. 
+ + """ + g = self.read_vars + return ( + ncvar in g["parsed_aggregated_data"] + and ncvar not in g["external_variables"] + ) + + def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): + """Parse a CF-netCDF 'aggregated_data' attribute. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + ncvar: `str` + The netCDF variable name. + + aggregated_data: `str` or `None` + The CF-netCDF ``aggregated_data`` attribute. + + :Returns: + + `dict` + The parsed attribute. + + """ + if not aggregated_data: + return {} + + g = self.read_vars + fragment_array_variables = g["fragment_array_variables"] + variables = g["variables"] + variable_attributes = g["variable_attributes"] + + # Loop round aggregation instruction terms + out = {} + for x in self._parse_x( + ncvar, + aggregated_data, + keys_are_variables=True, + ): + term, term_ncvar = tuple(x.items())[0] + term_ncvar = term_ncvar[0] + out[term] = term_ncvar + + if term_ncvar in fragment_array_variables: + # We've already processed this term + continue + + attributes = variable_attributes[term_ncvar] + array = netcdf_indexer( + variables[term_ncvar], + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + attributes=attributes, + copy=False, + ) + fragment_array_variables[term_ncvar] = array[...] + + g["parsed_aggregated_data"][ncvar] = out + return out diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 77723ce1db..c325881919 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -3,20 +3,34 @@ import os import re +import dask.array as da import netCDF4 import numpy as np from dask import config as dask_config from dask.array.core import normalize_chunks from dask.utils import parse_bytes from packaging.version import Version +from uritools import uricompose, urisplit +from ...data.dask_utils import cfdm_to_memory from ...decorators import _manage_log_level_via_verbosity +from ...functions import abspath, dirname, integer_dtype from .. import IOWrite from .netcdfread import NetCDFRead logger = logging.getLogger(__name__) +class AggregationError(Exception): + """An error relating to CF-netCDF aggregation. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + pass + + class NetCDFWrite(IOWrite): """A container for writing Fields to a netCDF dataset.""" @@ -341,8 +355,8 @@ def _datatype(self, variable): For example, if variable.dtype is 'float32', then 'f4' will be returned. - For a NETCDF4 or CFA4 format file, numpy string data types will - either return `str` regardless of the numpy string length (and a + For a NETCDF4 format file, numpy string data types will either + return `str` regardless of the numpy string length (and a netCDF4 string type variable will be created) or, if `self.write_vars['string']`` is `False`, ``'S1'`` (see below). @@ -590,26 +604,28 @@ def _write_dimension( except RuntimeError as error: message = ( "Can't create unlimited dimension " - f"in {g['netcdf'].file_format} file ({error})." + f"in {g['netcdf'].data_model} file ({error})." ) error = str(error) if error == "NetCDF: NC_UNLIMITED size already in use": raise RuntimeError( message - + f" In a {g['netcdf'].file_format} file only one " + + f" In a {g['netcdf'].data_model} file only one " "unlimited dimension is allowed. Consider using " "a netCDF4 format." 
) raise RuntimeError(message) + else: + g["unlimited_dimensions"].add(ncdim) else: try: parent_group.createDimension(ncdim, size) except RuntimeError as error: raise RuntimeError( f"Can't create size {size} dimension {ncdim!r} in " - f"{g['netcdf'].file_format} file ({error})" + f"{g['netcdf'].data_model} file ({error})" ) g["dimensions"].add(ncdim) @@ -651,6 +667,8 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): data_axes = self.implementation.get_construct_data_axes(f, key) axis = data_axes[0] + coord = self._change_reference_datetime(coord) + already_in_file = self._already_in_file(coord) create = False @@ -1469,8 +1487,7 @@ def _write_node_coordinates( if ncdim not in ncdim_to_size: size = self.implementation.get_data_size(nodes) logger.info( - f" Writing size {size} netCDF node dimension: " - f"{ncdim}" + f" Writing size {size} netCDF node dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size @@ -2058,6 +2075,8 @@ def _write_scalar_coordinate( g = self.write_vars + coord_1d = self._change_reference_datetime(coord_1d) + scalar_coord = self.implementation.squeeze(coord_1d, axes=0) if not self._already_in_file(scalar_coord, ()): @@ -2124,6 +2143,8 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): # The netCDF dimensions for the auxiliary coordinate variable ncdimensions = self._netcdf_dimensions(f, key, coord) + coord = self._change_reference_datetime(coord) + already_in_file = self._already_in_file(coord, ncdimensions) create = False @@ -2470,9 +2491,7 @@ def _createVariable(self, **kwargs): """ g = self.write_vars - ncvar = kwargs["varname"] - g["nc"][ncvar] = g["netcdf"].createVariable(**kwargs) def _write_grid_mapping(self, f, ref, multiple_grid_mappings): @@ -2586,6 +2605,7 @@ def _write_netcdf_variable( data_variable=False, domain_variable=False, construct_type=None, + chunking=None, ): """Creates a new netCDF variable for a construct. @@ -2629,6 +2649,14 @@ def _write_netcdf_variable( .. versionadded:: (cfdm) 1.10.1.0 + chunking: sequence of `int`, optional + Set `netCDF4.createVariable` 'contiguous' and + `chunksizes` parameters (in that order). If not set + (the default), then these parameters are inferred from + the data. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `None` @@ -2702,11 +2730,17 @@ def _write_netcdf_variable( else: lsd = None - # Set the HDF5 chunk strategy - contiguous, chunksizes = self._chunking_parameters(data, ncdimensions) + # Set the dataset chunk strategy + if chunking: + contiguous, chunksizes = chunking + else: + contiguous, chunksizes = self._chunking_parameters( + data, ncdimensions + ) + logger.debug( - f" HDF5 chunksizes: {chunksizes}\n" - f" HDF5 contiguous: {contiguous}" + f" chunksizes: {chunksizes}\n" + f" contiguous: {contiguous}" ) # pragma: no cover # ------------------------------------------------------------ @@ -2746,13 +2780,54 @@ def _write_netcdf_variable( "fill_value": fill_value, } - # Add compression parameters (but not for vlen strings). - if kwargs["datatype"] != str: + # ------------------------------------------------------------ + # For aggregation variables, create a dictionary containing + # the fragment array variables' data. + # + # E.g. 
{'map': , + # 'location': , + # 'variable': } + # ------------------------------------------------------------ + cfa = None + if self._cfa_write_status(ncvar, cfvar, construct_type, domain_axes): + try: + cfa = self._cfa_fragment_array_variables(data, cfvar) + except AggregationError: + if g["cfa"].get("strict", True): + # Raise the exception in 'strict' mode + if g["mode"] == "w": + os.remove(g["filename"]) + + raise + + # In 'non-strict' mode, write the data to a normal + # non-aggregation variable. + g["cfa_write_status"][ncvar] = False + else: + # We're going to create a scalar aggregation variable, + # so override dimensions and dataset chunking strategy + # keyword arguments. This is necessary because the + # dimensions and dataset chunking strategy will + # otherwise reflect the aggregated data in memory, + # rather than the scalar variable in the file. + kwargs["dimensions"] = () + kwargs["contiguous"] = True + kwargs["chunksizes"] = None + + # Add compression parameters (but not for scalars or vlen + # strings). + # + # From the NUG: + # + # Compression is permitted but may not be effective for VLEN + # data, because the compression is applied to structures + # containing lengths and pointers to the data, rather than + # the actual data. + if kwargs["dimensions"] and kwargs["datatype"] != str: kwargs.update(g["netcdf_compression"]) # Note: this is a trivial assignment in standalone cfdm, but - # required for non-trivial customisation applied by subclasses - # e.g. in cf-python + # allows for non-trivial customisation applied by subclasses. kwargs = self._customise_createVariable( cfvar, construct_type, domain_axes, kwargs ) @@ -2761,26 +2836,49 @@ def _write_netcdf_variable( f" to netCDF variable: {ncvar}({', '.join(ncdimensions)})" ) # pragma: no cover + # Adjust createVariable arguments for contiguous variables + if kwargs["contiguous"]: + if g["netcdf"].data_model.startswith("NETCDF4"): + # NETCDF4 contiguous variables can't span unlimited + # dimensions + unlimited_dimensions = g["unlimited_dimensions"].intersection( + kwargs["dimensions"] + ) + if unlimited_dimensions: + data_model = g["netcdf"].data_model + raise ValueError( + f"Can't create variable {ncvar!r} in {data_model} " + f"file from {cfvar!r}: In {data_model} it is not " + "allowed to write contiguous (as opposed to chunked) " + "data that spans one or more unlimited dimensions: " + f"{unlimited_dimensions}" + ) + + # NETCDF4 contiguous variables can't be compressed + kwargs["compression"] = None + kwargs["complevel"] = 0 + try: self._createVariable(**kwargs) except RuntimeError as error: error = str(error) message = ( - f"Can't create variable in {g['netcdf'].file_format} file " - f"from {cfvar!r} ({error})" + f"Can't create variable in {g['netcdf'].data_model} file " + f"from {cfvar!r}: {error}. " + f"netCDF4.createVariable arguments: {kwargs}" ) if error == ( "NetCDF: Not a valid data type or _FillValue type mismatch" ): raise ValueError( f"Can't write {cfvar.data.dtype.name} data from {cfvar!r} " - f"to a {g['netcdf'].file_format} file. " + f"to a {g['netcdf'].data_model} file. " "Consider using a netCDF4 format, or use the 'datatype' " "parameter, or change the datatype before writing." ) elif error == "NetCDF: NC_UNLIMITED in the wrong index": raise RuntimeError( - f"{message}. In a {g['netcdf'].file_format} file the " + f"{message}. In a {g['netcdf'].data_model} file the " "unlimited dimension must be the first (leftmost) " "dimension of the variable. " "Consider using a netCDF4 format." 
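A minimal sketch of the user-facing configuration that drives this fragment-array machinery, via the ``cfa`` keyword of `cfdm.write` (parsed further below) and the ``cfa_write`` keyword of `cfdm.read`. The construct selection, file names, and URI mode shown here are illustrative choices, not requirements:

    import cfdm

    # Read an aggregation dataset, asking for Dask chunks that each
    # hold exactly one complete fragment, so that the field data can
    # later be written back out as aggregation variables.
    fields = cfdm.read("aggregation.nc", cfa_write=["field"])

    # Write field constructs as CF-netCDF aggregation variables.
    # 'constructs' may be "auto" (the default), a construct type such
    # as "field", a sequence of construct types, or a dict mapping
    # construct types to a required number of data dimensions (or
    # None); 'uri' is one of "default", "absolute" or "relative";
    # 'strict' controls whether an AggregationError is raised (True)
    # or a normal non-aggregation variable is written instead (False)
    # when data can not be written as aggregated data.
    cfdm.write(
        fields,
        "new_aggregation.nc",
        cfa={"constructs": "field", "uri": "relative", "strict": True},
    )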
@@ -2830,6 +2928,7 @@ def _write_netcdf_variable( compressed=self._compressed_data(ncdimensions), attributes=attributes, construct_type=construct_type, + cfa=cfa, ) def _customise_createVariable( @@ -2866,9 +2965,10 @@ def _customise_createVariable( `netCDF4.Dataset.createVariable`. """ - # This method is trivial but the intention is that subclasses will - # override it to perform any desired customisation. Notably see - # the equivalent method in cf-python which is non-trivial. + # This method is trivial but the intention is that subclasses + # will override it to perform any desired + # customisation. Notably see the equivalent method in + # cf-python which is non-trivial. return kwargs def _transform_strings(self, data, ncdimensions): @@ -2897,10 +2997,6 @@ def _transform_strings(self, data, ncdimensions): # is str, so this conversion does not happen. # -------------------------------------------------------- array = self.implementation.get_array(data) - # if np.ma.is_masked(array): - # array = array.compressed() - # else: - # array = array.flatten() array = self._numpy_compressed(array) strlen = len(max(array, key=len)) @@ -2923,6 +3019,7 @@ def _write_data( compressed=False, attributes=None, construct_type=None, + cfa=None, ): """Write a data array to the netCDF file. @@ -2953,61 +3050,68 @@ def _write_data( .. versionadded:: (cfdm) 1.10.1.0 + cfa: `dict`, optional + For aggregation variables, a dictionary containing the + fragment array variables' data. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `None` """ - # To avoid mutable default argument (an anti-pattern) of attributes={} - if attributes is None: - attributes = {} - g = self.write_vars + if cfa: + # -------------------------------------------------------- + # Write the data as an aggregation variable + # -------------------------------------------------------- + self._cfa_create_data(cfa, ncvar, ncdimensions, data, cfvar) + return + + # ------------------------------------------------------------ + # Still here? 
The write a normal (non-aggregation) variable + # ------------------------------------------------------------ if compressed: - # Get the data as a compressed numpy array - array = self.implementation.get_compressed_array(data) - else: - # Get the data as an uncompressed numpy array - array = self.implementation.get_array(data) + # Write data in its compressed form + data = data.source().source() - # Convert data type - new_dtype = g["datatype"].get(array.dtype) - if new_dtype is not None: - array = array.astype(new_dtype) - - # Check that the array doesn't contain any elements - # which are equal to any of the missing data values - if unset_values: - # if np.ma.is_masked(array): - # temp_array = array.compressed() - # else: - # temp_array = array - if np.intersect1d( - unset_values, self._numpy_compressed(array) - ).size: - raise ValueError( - "ERROR: Can't write data that has _FillValue or " - f"missing_value at unmasked point: {ncvar!r}" - ) + # Get the dask array + dx = da.asanyarray(data) - if ( - g["fmt"] == "NETCDF4" - and array.dtype.kind in "SU" - and np.ma.isMA(array) - ): - # VLEN variables can not be assigned to by masked arrays - # https://github.com/Unidata/netcdf4-python/pull/465 - array = array.filled("") + # Convert the data type + new_dtype = g["datatype"].get(dx.dtype) + if new_dtype is not None: + dx = dx.astype(new_dtype) + + # VLEN variables can not be assigned to by masked arrays + # (https://github.com/Unidata/netcdf4-python/pull/465), so + # fill missing data in string (as opposed to char) data types. + if g["fmt"] == "NETCDF4" and dx.dtype.kind in "SU": + dx = dx.map_blocks( + self._filled_string_array, + fill_value="", + meta=np.array((), dx.dtype), + ) + # Check for out-of-range values if g["warn_valid"]: - # Check for out-of-range values - self._check_valid(cfvar, array, attributes) + if construct_type: + var = cfvar + else: + var = None - # Copy the array into the netCDF variable - g["nc"][ncvar][...] = array + dx = dx.map_blocks( + self._check_valid, + cfvar=var, + attributes=attributes, + meta=np.array((), dx.dtype), + ) + + da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) - def _check_valid(self, cfvar, array, attributes): + def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. Specifically, checks array for out-of-range values, as @@ -3017,16 +3121,19 @@ def _check_valid(self, cfvar, array, attributes): :Parameters: - cfvar: Construct + array: `numpy.ndarray` + The array to be checked. - array: `np.ndarray` + cfvar: construct + The CF construct containing the array. attributes: `dict` + The variable's CF properties. :Returns: - `bool` - Whether or not a warning was issued. + `numpy.ndarray` + The input array, unchanged. """ out = 0 @@ -3094,7 +3201,7 @@ def _check_valid(self, cfvar, array, attributes): ) out += 1 - return bool(out) + return array def _convert_to_char(self, data): """Convert string data into character data. @@ -4477,7 +4584,7 @@ def write( datatype=None, least_significant_digit=None, endian="native", - compress=0, + compress=4, fletcher32=False, shuffle=True, scalar=True, @@ -4488,7 +4595,9 @@ def write( group=True, coordinates=False, omit_data=None, - hdf5_chunks="4MiB", + dataset_chunks="4MiB", + cfa="auto", + reference_datetime=None, ): """Write field and domain constructs to a netCDF file. @@ -4715,11 +4824,13 @@ def write( .. versionadded:: (cfdm) 1.10.0.1 - hdf5_chunks: `str`, `int`, or `float`, optional - The HDF5 chunking strategy. 
The default - value is "4MiB". + dataset_chunks: `str`, `int`, or `float`, optional + The dataset chunking strategy. The default value is + "4MiB". See `cfdm.write` for details. - See `cfdm.write` for details. + cfa: `dict` or `None`, optional + Configure the creation of aggregation variables. See + `cfdm.write` for details. .. versionadded:: (cfdm) 1.11.2.0 @@ -4736,8 +4847,9 @@ def write( # Expand file name filename = os.path.expanduser(os.path.expandvars(filename)) + filename = abspath(filename) - # Parse the omit_data parameter + # Parse the 'omit_data' parameter if omit_data is None: omit_data = () elif isinstance(omit_data, str): @@ -4809,6 +4921,7 @@ def write( "geometry_dimensions": set(), "dimensions_with_role": {}, "dimensions": set(), + "unlimited_dimensions": set(), "latest_version": Version(self.implementation.get_cf_version()), "version": {}, # Warn for the presence of out-of-range data with of @@ -4834,8 +4947,22 @@ def write( "post_dry_run": False, # Do not write the data of the named construct types. "omit_data": omit_data, - # HDF5 chunking stategy - "hdf5_chunks": hdf5_chunks, + # Change the units of a reference date-time. + "reference_datetime": reference_datetime, + # -------------------------------------------------------- + # CF Aggregation variables + # -------------------------------------------------------- + # Configuration options for writing aggregation variables + "cfa": None, + # The directory of the aggregation file + "aggregation_file_directory": None, + # Cache the CF aggregation variable write status for each + # netCDF variable + "cfa_write_status": {}, + # -------------------------------------------------------- + # Dataset chunking stategy + # -------------------------------------------------------- + "dataset_chunks": dataset_chunks, } if mode not in ("w", "a", "r+"): @@ -4848,23 +4975,68 @@ def write( self.write_vars["mode"] = mode - # Parse hdf5_chunks - if hdf5_chunks != "contiguous": + # Parse the 'dataset_chunks' parameter + if dataset_chunks != "contiguous": try: - self.write_vars["hdf5_chunks"] = parse_bytes(hdf5_chunks) + self.write_vars["dataset_chunks"] = parse_bytes(dataset_chunks) except (ValueError, AttributeError): raise ValueError( - "Invalid value for the 'hdf5_chunks' keyword: " - f"{hdf5_chunks!r}." + "Invalid value for the 'dataset_chunks' keyword: " + f"{dataset_chunks!r}." + ) + + # ------------------------------------------------------------ + # Parse the 'cfa' keyword + # ------------------------------------------------------------ + if cfa is None: + cfa = {"constructs": None} + elif isinstance(cfa, str): + cfa = {"constructs": cfa} + elif isinstance(cfa, dict): + keys = ("constructs", "uri", "strict") + if not set(cfa).issubset(keys): + raise ValueError( + f"Invalid dictionary key to the 'cfa' keyword: {cfa!r}. " + f"Valid keys are {keys}" ) + valid_uri = ("default", "absolute", "relative") + if cfa.get("uri", "default") not in valid_uri: + raise ValueError( + "Invalid value for the 'uri' keyword of the 'cfa' " + f"parameter: {cfa!r}. Expected one of {valid_uri}" + ) + + cfa = cfa.copy() + else: + raise ValueError( + f"Invalid value for the 'cfa' keyword: {cfa!r}. 
" + "Should be a string, a dictionary, or None" + ) + + cfa.setdefault("constructs", "auto") + cfa.setdefault("uri", "default") + cfa.setdefault("strict", True) + + constructs = cfa["constructs"] + if isinstance(constructs, dict): + cfa["constructs"] = constructs.copy() + elif constructs is not None: + # Convert a (sequence of) `str` to a `dict` + if isinstance(constructs, str): + constructs = (constructs,) + + cfa["constructs"] = {c: None for c in constructs} + + self.write_vars["cfa"] = cfa + effective_mode = mode # actual mode to use for the first IO iteration effective_fields = fields if mode == "a": # First read in the fields from the existing file: effective_fields = self._NetCDFRead(self.implementation).read( - filename + filename, netcdf_backend="netCDF4" ) # Read rather than append for the first iteration to ensure nothing @@ -5022,7 +5194,7 @@ def _file_io_iteration( # ------------------------------------------------------------ # Set possible versions # ------------------------------------------------------------ - for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): + for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"): g["CF-" + version] = Version(version) if extra_write_vars: @@ -5032,7 +5204,10 @@ def _file_io_iteration( self._customise_write_vars() compress = int(compress) - zlib = bool(compress) + if compress: + compression = "zlib" + else: + compression = None netcdf3_fmts = ( "NETCDF3_CLASSIC", @@ -5103,7 +5278,7 @@ def _file_io_iteration( # ------------------------------------------------------- g["netcdf_compression"].update( { - "zlib": zlib, + "compression": compression, "complevel": compress, "fletcher32": bool(fletcher32), "shuffle": bool(shuffle), @@ -5333,8 +5508,8 @@ def _chunking_parameters(self, data, ncdimensions): g = self.write_vars # ------------------------------------------------------------ - # HDF5 chunk strategy: Either use that provided on the data, - # or else work it out. + # Dataset chunk strategy: Either use that provided on the + # data, or else work it out. # ------------------------------------------------------------ # Get the chunking strategy defined by the data itself chunksizes = self.implementation.nc_get_hdf5_chunksizes(data) @@ -5343,25 +5518,25 @@ def _chunking_parameters(self, data, ncdimensions): return True, None # Still here? - hdf5_chunks = g["hdf5_chunks"] + dataset_chunks = g["dataset_chunks"] if isinstance(chunksizes, int): - # Reset hdf_chunks to the integer given by 'data' - hdf5_chunks = chunksizes + # Reset dataset chunks to the integer given by 'data' + dataset_chunks = chunksizes elif chunksizes is not None: # Chunked as defined by the tuple of int given by 'data' return False, chunksizes # Still here? Then work out the chunking strategy from the - # hdf5_chunks - if hdf5_chunks == "contiguous": - # Contiguous as defined by 'hdf_chunks' + # dataset_chunks + if dataset_chunks == "contiguous": + # Contiguous as defined by 'dataset_chunks' return True, None # Still here? Then work out the chunks from both the - # size-in-bytes given by hdf5_chunks (e.g. 1024, or '1 KiB'), - # and the data shape (e.g. (12, 73, 96)). + # size-in-bytes given by dataset_chunks (e.g. 1024, or '1 + # KiB'), and the data shape (e.g. (12, 73, 96)). 
if self._compressed_data(ncdimensions): - # Base the HDF5 chunks on the compressed data that is + # Base the dataset chunks on the compressed data that is # going into the file d = self.implementation.get_compressed_array(data) else: @@ -5370,7 +5545,7 @@ def _chunking_parameters(self, data, ncdimensions): d_dtype = d.dtype dtype = g["datatype"].get(d_dtype, d_dtype) - with dask_config.set({"array.chunk-size": hdf5_chunks}): + with dask_config.set({"array.chunk-size": dataset_chunks}): chunksizes = normalize_chunks("auto", shape=d.shape, dtype=dtype) if chunksizes: @@ -5409,3 +5584,646 @@ def _compressed_data(self, ncdimensions): self.write_vars["sample_ncdim"].values() ) ) + + def _change_reference_datetime(self, coord): + """Change the units of a reference date-time. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + coord: `Coordinate` + The time coordinates. + + :Returns: + + A new coordinate construct with changed units. + + """ + reference_datetime = self.write_vars["reference_datetime"] + if not reference_datetime or not coord.Units.isreftime: + return coord + + if not hasattr(coord, "reference_datetime"): + raise ValueError( + "Can't override time coordinate reference date-time " + f"for {coord.__class__} objects." + ) + + coord = coord.copy() + try: + coord.reference_datetime = reference_datetime + except ValueError: + raise ValueError( + "Can't override time coordinate reference date-time " + f"{coord.reference_datetime!r} with {reference_datetime!r}" + ) + else: + return coord + + def _cfa_write_status(self, ncvar, cfvar, construct_type, domain_axes): + """The aggregation write status of the data. + + A necessary and sufficient condition for writing the data as + aggregated data is that this method returns `True` and + `_cfa_aggregation_instructions` returns a `dict`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + cfvar: + Construct (e.g. `DimensionCoordinate`), or construct + component e.g. (`Bounds`) that contains the data. + + construct_type: `str` + The construct type of the *cfvar*, or of its parent if + *cfvar* is a construct component. + + domain_axes: `None`, or `tuple` of `str` + The domain axis construct identifiers for *cfvar*. + + :Returns: + + `bool` + True if the variable is to be written as an + aggregation variable. + + """ + g = self.write_vars + + cfa_write_status = g["cfa_write_status"].get(ncvar) + if cfa_write_status is not None: + return cfa_write_status + + if construct_type is None: + # This prevents recursion whilst writing fragment array + # variables. 
+ g["cfa_write_status"][ncvar] = False + return False + + data = self.implementation.get_data(cfvar, None) + if data is None: + # Can't write as an aggregation variable when there is no + # data + g["cfa_write_status"][ncvar] = False + return False + + constructs = g["cfa"].get("constructs") + if constructs is None: + # Nothing gets written as an aggregation variable when + # 'constructs' is set to None + g["cfa_write_status"][ncvar] = False + return False + + for c in (construct_type, "all"): + if c in constructs: + ndim = constructs[c] + if ndim is None or ndim == len(domain_axes): + g["cfa_write_status"][ncvar] = True + return True + + g["cfa_write_status"][ncvar] = False + return False + + if "auto" in constructs: + if not data.nc_get_aggregated_data(): + g["cfa_write_status"][ncvar] = False + return False + + ndim = constructs["auto"] + if ndim is None or ndim == len(domain_axes): + g["cfa_write_status"][ncvar] = True + return True + + g["cfa_write_status"][ncvar] = False + return False + + def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): + """Write an aggregation variable to the netCDF file. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + cfa: `dict` + A dictionary containing the fragment array variables' + data. + + ncvar: `str` + The netCDF name for the variable. + + ncdimensions: sequence of `str` + + netcdf_attrs: `dict` + + data: `Data` + + :Returns: + + `True` + + """ + g = self.write_vars + + # ------------------------------------------------------------ + # Write the fragment array variables to the netCDF file + # ------------------------------------------------------------ + aggregated_data = data.nc_get_aggregated_data() + aggregated_data_attr = [] + + all_dimensions = g["dimensions"] + all_unlimited_dimensions = g["unlimited_dimensions"] + + # ------------------------------------------------------------ + # Map + # ------------------------------------------------------------ + feature = "map" + f_map = cfa[feature] + + chunking = None + + # Get the shape netCDF dimensions from the 'map' fragment + # array variable. + map_ncdimensions = [] + dim = "j" + for size in f_map.shape: + cfa_ncdim = f"a_map_{dim}{size}" + if dim == "i" and all_unlimited_dimensions.intersection( + ncdimensions + ): + unlimited = True + # Append a "u" to the dimension name to allow there to + # fixed and unlimited dimensions with the same size + cfa_ncdim += "u" + else: + unlimited = False + + if cfa_ncdim not in all_dimensions: + # Create a new location dimension + self._write_dimension( + cfa_ncdim, None, unlimited=unlimited, size=size + ) + + map_ncdimensions.append(cfa_ncdim) + dim = "i" + + map_ncdimensions = tuple(map_ncdimensions) + + # # Write the fragment array variable to the netCDF dataset + # if ncdimensions[0].startswith('time'): + # chunking=(False, (f_map.shape[0], f_map.shape[1] * 85*12)) + + feature_ncvar = self._cfa_write_fragment_array_variable( + f_map, + aggregated_data.get(feature, f"cfa_{feature}"), + map_ncdimensions, + chunking=chunking, + ) + aggregated_data_attr.append(f"{feature}: {feature_ncvar}") + + if "location" in cfa: + # -------------------------------------------------------- + # Location + # -------------------------------------------------------- + feature = "location" + f_location = cfa[feature] + + chunking = None + + # Get the fragment array netCDF dimensions from the + # 'location' fragment array variable. 
+ location_ncdimensions = [] + for ncdim, size in zip(ncdimensions, f_location.shape): + cfa_ncdim = f"a_{ncdim}" + if cfa_ncdim not in all_dimensions: + # Create a new fragment array dimension + unlimited = ncdim in all_unlimited_dimensions + # unlimited = ncdim in g[ + # "unlimited_dimensions" + # ] and ncdim.startswith("time") + self._write_dimension( + cfa_ncdim, None, unlimited=unlimited, size=size + ) + + location_ncdimensions.append(cfa_ncdim) + + location_ncdimensions = tuple(location_ncdimensions) + + # # Write the fragment array variable to the netCDF dataset + # if ncdimensions[0].startswith('time'): + # chunking = (False, ((85*12,) + f_location.shape[1:])) + # else: + chunking = None + feature_ncvar = self._cfa_write_fragment_array_variable( + f_location, + aggregated_data.get(feature, f"cfa_{feature}"), + location_ncdimensions, + chunking=chunking, + ) + aggregated_data_attr.append(f"{feature}: {feature_ncvar}") + + # -------------------------------------------------------- + # Variable + # -------------------------------------------------------- + feature = "variable" + + # Attempt to reduce variable names to a common scalar + # value + u = cfa[feature].unique().compressed().persist() + if u.size == 1: + cfa[feature] = u.squeeze() + variable_ncdimensions = () + else: + variable_ncdimensions = location_ncdimensions + + f_variable = cfa[feature] + + # Write the fragment array variable to the netCDF dataset + feature_ncvar = self._cfa_write_fragment_array_variable( + f_variable, + aggregated_data.get(feature, f"cfa_{feature}"), + variable_ncdimensions, + ) + aggregated_data_attr.append(f"{feature}: {feature_ncvar}") + else: + # -------------------------------------------------------- + # Unique value + # -------------------------------------------------------- + feature = "unique_value" + f_unique_value = cfa[feature] + + # Get the fragment array netCDF dimensions from the + # 'value' fragment array variable. + unique_value_ncdimensions = [] + for ncdim, size in zip(ncdimensions, f_unique_value.shape): + cfa_ncdim = f"a_{ncdim}" + if cfa_ncdim not in g["dimensions"]: + # Create a new fragment array dimension + self._write_dimension(cfa_ncdim, None, size=size) + + unique_value_ncdimensions.append(cfa_ncdim) + + unique_value_ncdimensions = tuple(unique_value_ncdimensions) + + # Write the fragment array variable to the netCDF dataset + feature_ncvar = self._cfa_write_fragment_array_variable( + f_unique_value, + aggregated_data.get(feature, f"cfa_{feature}"), + unique_value_ncdimensions, + ) + aggregated_data_attr.append(f"{feature}: {feature_ncvar}") + + # ------------------------------------------------------------ + # Add the aggregation variable attributes + # ------------------------------------------------------------ + self._write_attributes( + None, + ncvar, + extra={ + "aggregated_dimensions": " ".join(ncdimensions), + "aggregated_data": " ".join(sorted(aggregated_data_attr)), + }, + ) + + g["cfa_write_status"][ncvar] = True + return True + + def _filled_string_array(self, array, fill_value=""): + """Fill a string array. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + array: `numpy.ndarray` + The `numpy` array with string (byte or unicode) data + type. + + :Returns: + + `numpy.ndarray` + The string array with any missing data replaced + by the fill value. 
+ + """ + if np.ma.isMA(array): + return array.filled(fill_value) + + return array + + def _cfa_write_fragment_array_variable( + self, data, ncvar, ncdimensions, attributes=None, chunking=None + ): + """Write an aggregation fragment array variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + data `Data` + The data to write. + + ncvar: `str` + The netCDF variable name. + + ncdimensions: `tuple` of `str` + The fragment array variable's netCDF dimensions. + + attributes: `dict`, optional + Any attributes to attach to the variable. + + chunking: sequence, optional + Set `netCDF4.createVariable` 'contiguous' and + `chunksizes` parameters (in that order) for the + fragment array variable. If not set (the default), + then these parameters are inferred from the data. + + :Returns: + + `str` + The netCDF variable name of the fragment array + variable. + + """ + create = not self._already_in_file(data, ncdimensions) + + if create: + # Create a new fragment array variable in the file, with + # 'contiguous' chunking + ncvar = self._netcdf_name(ncvar) + self._write_netcdf_variable( + ncvar, + ncdimensions, + data, + None, + extra=attributes, + chunking=chunking, + ) + else: + # This fragment array variable has already been written to + # the file + ncvar = self.write_vars["seen"][id(data)]["ncvar"] + + return ncvar + + @classmethod + def _cfa_unique_value(cls, a, strict=True): + """Return the unique value of an array. + + If there are multiple unique values then missing data is + returned. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + a: `numpy.ndarray` + The array. + + strict: `bool`, optional + If True then raise an exception if there is more than + one unique value. If False then a unique value of + missing data will be returned in this case. + + :Returns: + + `numpy.ndarray` + A size 1 array containing the unique value, or missing + data if there is not a unique value. + + """ + a = cfdm_to_memory(a) + + out_shape = (1,) * a.ndim + a = np.unique(a) + if a.size == 1: + return a.reshape(out_shape) + + if strict: + raise AggregationError(str(a)) + + return np.ma.masked_all(out_shape, dtype=a.dtype) + + def _cfa_fragment_array_variables(self, data, cfvar): + """Convert data to aggregated_data terms. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + data: `Data` + The data to be converted. + + cfvar: construct + The construct that contains the *data*. + + :Returns: + + `dict` + A dictionary whose keys are the standardised + aggregation_data terms, with values of `Data` + instances containing the corresponding variables. + + **Examples** + + >>> n._cfa_fragment_array_variables(data, cfvar) + {'shape': , + 'location': , + 'variable': } + + """ + from os.path import relpath + + g = self.write_vars + + if not data.nc_get_aggregation_write_status(): + raise AggregationError( + f"Can't write {cfvar!r} as a CF-netCDF aggregation variable. " + "This is probably because some fragment values have been " + "changed relative to those in the fragment files, or a " + "rechunking has occured." 
+            )
+
+        # ------------------------------------------------------------
+        # Create the 'map' array: one row per aggregated data
+        # dimension, containing the fragment sizes along that
+        # dimension, padded with missing values
+        # ------------------------------------------------------------
+        a_shape = data.numblocks
+        if a_shape:
+            ndim = data.ndim
+            aggregation_shape = np.ma.masked_all(
+                (ndim, max(a_shape)), dtype=integer_dtype(max(data.chunksize))
+            )
+            for i, chunks in enumerate(data.chunks):
+                aggregation_shape[i, : len(chunks)] = chunks
+        else:
+            # Scalar 'map' fragment array variable
+            aggregation_shape = np.ones((), dtype=np.dtype("int32"))
+
+        out = {"map": type(data)(aggregation_shape)}
+
+        if data.nc_get_aggregation_fragment_type() == "location":
+            # --------------------------------------------------------
+            # Create 'location' and 'variable' arrays
+            # --------------------------------------------------------
+            uri_default = g["cfa"].get("uri", "default") == "default"
+            uri_relative = (
+                not uri_default
+                and g["cfa"].get("uri", "relative") == "relative"
+            )
+            normalise = not uri_default
+
+            if uri_relative:
+                # Get the aggregation file directory as an absolute
+                # URI
+                aggregation_file_directory = g["aggregation_file_directory"]
+                if aggregation_file_directory is None:
+                    uri = urisplit(dirname(g["filename"]))
+                    if uri.isuri():
+                        aggregation_file_scheme = uri.scheme
+                        aggregation_file_directory = uri.geturi()
+                    else:
+                        aggregation_file_scheme = "file"
+                        aggregation_file_directory = uricompose(
+                            scheme=aggregation_file_scheme,
+                            authority="",
+                            path=uri.path,
+                        )
+
+                    g["aggregation_file_directory"] = (
+                        aggregation_file_directory
+                    )
+                    g["aggregation_file_scheme"] = aggregation_file_scheme
+
+                aggregation_file_scheme = g["aggregation_file_scheme"]
+
+            aggregation_location = []
+            aggregation_variable = []
+            for index, position in zip(
+                data.chunk_indices(), data.chunk_positions()
+            ):
+                # Try to get this Dask chunk's data as a reference to
+                # a fragment file
+                fragment = data[index].compute(_force_to_memory=False)
+                try:
+                    filename, address, is_subspace, f_index = (
+                        fragment.get_filename(normalise=normalise),
+                        fragment.get_address(),
+                        fragment.is_subspace(),
+                        fragment.index(),
+                    )
+                except (AttributeError, TypeError):
+                    # This Dask chunk's data is not a reference to a
+                    # fragment file
+                    raise AggregationError(
+                        f"Can't write {cfvar!r} as a CF-netCDF "
+                        "aggregation variable: "
+                        f"The Dask chunk in position {position} "
+                        f"(defined by data index {index!r}) does not "
+                        "reference a unique fragment file. This could be "
+                        "because some fragment values have been changed "
+                        "relative to those in the fragment files, or a "
+                        "Dask rechunking has occurred, etc."
+                    )
+
+                if is_subspace:
+                    # This Dask chunk's data is a reference to a
+                    # fragment file, but only to a subspace of it.
+                    raise AggregationError(
+                        f"Can't write {cfvar!r} as a CF-netCDF "
+                        "aggregation variable: "
+                        f"The Dask chunk in position {position} "
+                        f"(defined by data index {index!r}) references "
+                        f"a subspace ({f_index!r}) of the fragment file "
+                        f"{fragment!r}. This might be fixable by setting "
+                        "the 'cfa_write' parameter to the 'read' function."
+ ) + + uri = urisplit(filename) + if uri_relative and uri.isrelpath(): + filename = abspath(filename) + + if uri.isabspath(): + # File name is an absolute-path URI reference + filename = uricompose( + scheme="file", + authority="", + path=abspath(uri.path), + ) + + if uri_relative: + scheme = uri.scheme + if not scheme: + scheme = "file" + + if scheme != aggregation_file_scheme: + raise AggregationError( + f"Can't write {cfvar!r} as a CF-netCDF " + "aggregation variable: " + "Attempting to create a relative-path URI " + f"reference for the fragment file {fragment}, " + "referenced by the Dask chunk in position " + f"{position} (defined by data index {index!r}), " + "but the aggregation file URI scheme " + f"({aggregation_file_scheme}:) is incompatible." + ) + + filename = relpath( + filename, start=aggregation_file_directory + ) + + aggregation_location.append(filename) + aggregation_variable.append(address) + + # Reshape the 1-d aggregation instruction arrays to span + # the data dimensions, plus the extra trailing dimension + # if there is one. + aggregation_location = np.array(aggregation_location).reshape( + a_shape + ) + aggregation_variable = np.array(aggregation_variable).reshape( + a_shape + ) + + out["location"] = type(data)(aggregation_location) + out["variable"] = type(data)(aggregation_variable) + else: + # ------------------------------------------------------------ + # Create a 'value' array + # ------------------------------------------------------------ + # Transform the data so that it spans the fragment + # dimensions with one value per fragment. If a chunk has + # more than one unique value then the fragment's value is + # missing data. + dx = data.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ) + dx_ind = tuple(range(dx.ndim)) + out_ind = dx_ind + dx = da.blockwise( + self._cfa_unique_value, + out_ind, + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + meta=np.array((), dx.dtype), + strict=g["cfa"]["strict"], + ) + d = type(data)(dx) + + try: + d.persist(inplace=True) + except AggregationError as error: + raise AggregationError( + f"Can't write {cfvar!r} as a CF-netCDF aggregation " + "variable. " + "At least one Dask chunk has more than one unique value: " + f"{error}. " + "Set the 'strict' keyword of the 'cfa' parameter to True " + "to use a unique value of missing data in this case." + ) + + out["unique_value"] = d + + # Return the dictionary of Data objects + return out diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 20c815da4e..d1809c16c7 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -3,28 +3,12 @@ from numpy.ma.core import MaskError from ..cfdmimplementation import implementation +from .abstract import ReadWrite +from .exceptions import DatasetTypeError from .netcdf import NetCDFRead -_implementation = implementation() - - -def read( - filename, - external=None, - extra=None, - verbose=None, - warnings=False, - warn_valid=False, - mask=True, - unpack=True, - domain=False, - netcdf_backend=None, - storage_options=None, - cache=True, - dask_chunks="storage-aligned", - store_hdf5_chunks=True, - _implementation=_implementation, -): + +class read(ReadWrite): """Read field or domain constructs from a dataset. The following file formats are supported: netCDF and CDL. 
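For orientation, the intended round trip through the new aggregation support, as exercised by the `test_CFA.py` tests added below, looks roughly as follows. This is a minimal sketch rather than part of the patch itself: the file names are placeholders, and `cfdm.example_field(0)` is used only as a convenient source of data.

import cfdm

# Write an ordinary netCDF file that will act as the fragment dataset
f = cfdm.example_field(0)
cfdm.write(f, "fragment.nc")

# Re-read it with 'cfa_write' so that the data remain references to
# the fragment file, which is what makes aggregation encoding possible
g = cfdm.read("fragment.nc", cfa_write="field")[0]

# Write the field as a CF-netCDF aggregation variable ...
cfdm.write(g, "aggregation.nc", cfa="field")

# ... and read the aggregation file back
h = cfdm.read("aggregation.nc")[0]
assert h.equals(f)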
@@ -55,9 +39,9 @@ def read( Domain axis constructs that correspond to NetCDF unlimited dimensions may be accessed with the - `~cfdm.DomainAxis.nc_is_unlimited` and - `~cfdm.DomainAxis.nc_set_unlimited` methods of a domain axis - construct. + `~{{package}}.DomainAxis.nc_is_unlimited` and + `~{{package}}.DomainAxis.nc_set_unlimited` methods of a domain + axis construct. **NetCDF hierarchical groups** @@ -88,7 +72,7 @@ def read( variable. Other types of non-compliance are not checked, such whether or not controlled vocabularies have been adhered to. The structural compliance of the dataset may be checked with the - `~cfdm.Field.dataset_compliance` method of the returned + `~{{package}}.Field.dataset_compliance` method of the returned constructs, as well as optionally displayed when the dataset is read by setting the *warnings* parameter. @@ -96,16 +80,13 @@ def read( **Performance** Descriptive properties are always read into memory, but lazy - loading is employed for all data arrays, which means that no data - is read into memory until the data is required for inspection or - to modify the array contents. This maximises the number of field - constructs that may be read within a session, and makes the read - operation fast. + loading is employed for all data arrays, unless the *to_memory* + parameter has been set. .. versionadded:: (cfdm) 1.7.0 - .. seealso:: `cfdm.write`, `cfdm.Field`, `cfdm.Domain`, - `cfdm.unique_constructs` + .. seealso:: `{{package}}.write`, `{{package}}.Field`, + `{{package}}.Domain`, `{{package}}.unique_constructs` :Parameters: @@ -121,424 +102,86 @@ def read( ``'$HOME/file.nc'``, ``'${HOME}/file.nc'``, ``'~/file.nc'``, ``'~/tmp/../file.nc'``. - external: (sequence of) `str`, optional - Read external variables (i.e. variables which are named by - attributes, but are not present, in the parent file given - by the *filename* parameter) from the given external - files. Ignored if the parent file does not contain a - global ``external_variables`` attribute. Multiple external - files may be provided, which are searched in random order - for the required external variables. - - If an external variable is not found in any external - files, or is found in multiple external files, then the - relevant metadata construct is still created, but without - any metadata or data. In this case the construct's - `!is_external` method will return `True`. + {{read external: (sequence of) `str`, optional}} - *Parameter example:* - ``external='cell_measure.nc'`` + {{read extra: (sequence of) `str`, optional}} - *Parameter example:* - ``external=['cell_measure.nc']`` + {{read verbose: `int` or `str` or `None`, optional}} - *Parameter example:* - ``external=('cell_measure_A.nc', 'cell_measure_O.nc')`` + {{read warnings: `bool`, optional}} - extra: (sequence of) `str`, optional - Create extra, independent fields from netCDF variables - that correspond to particular types metadata constructs. - Ignored if *domain* is True. + {{read warn_valid: `bool`, optional}} - The *extra* parameter may be one, or a sequence, of: + .. 
versionadded:: (cfdm) 1.8.3 - ========================== =============================== - *extra* Metadata constructs - ========================== =============================== - ``'field_ancillary'`` Field ancillary constructs - ``'domain_ancillary'`` Domain ancillary constructs - ``'dimension_coordinate'`` Dimension coordinate constructs - ``'auxiliary_coordinate'`` Auxiliary coordinate constructs - ``'cell_measure'`` Cell measure constructs - ========================== =============================== + {{read mask: `bool`, optional}} - *Parameter example:* - To create fields from auxiliary coordinate constructs: - ``extra='auxiliary_coordinate'`` or - ``extra=['auxiliary_coordinate']``. + .. versionadded:: (cfdm) 1.8.2 - *Parameter example:* - To create fields from domain ancillary and cell measure - constructs: ``extra=['domain_ancillary', - 'cell_measure']``. - - An extra field construct created via the *extra* parameter - will have a domain limited to that which can be inferred - from the corresponding netCDF variable, but without the - connections that are defined by the parent netCDF data - variable. It is possible to create independent fields from - metadata constructs that do incorporate as much of the - parent field construct's domain as possible by using the - `~cfdm.Field.convert` method of a returned field - construct, instead of setting the *extra* parameter. - - verbose: `int` or `str` or `None`, optional - If an integer from ``-1`` to ``3``, or an equivalent string - equal ignoring case to one of: - - * ``'DISABLE'`` (``0``) - * ``'WARNING'`` (``1``) - * ``'INFO'`` (``2``) - * ``'DETAIL'`` (``3``) - * ``'DEBUG'`` (``-1``) - - set for the duration of the method call only as the minimum - cut-off for the verboseness level of displayed output (log) - messages, regardless of the globally-configured `cfdm.log_level`. - Note that increasing numerical value corresponds to increasing - verbosity, with the exception of ``-1`` as a special case of - maximal and extreme verbosity. - - Otherwise, if `None` (the default value), output messages will - be shown according to the value of the `cfdm.log_level` setting. - - Overall, the higher a non-negative integer or equivalent string - that is set (up to a maximum of ``3``/``'DETAIL'``) for - increasing verbosity, the more description that is printed to - convey how the contents of the netCDF file were parsed and - mapped to CF data model constructs. - - warnings: `bool`, optional - If True then print warnings when an output field construct - is incomplete due to structural non-compliance of the - dataset. By default such warnings are not displayed. - - warn_valid: `bool`, optional - If True then print a warning for the presence of - ``valid_min``, ``valid_max`` or ``valid_range`` properties - on field constructs and metadata constructs that have - data. By default no such warning is issued. - - "Out-of-range" data values in the file, as defined by any - of these properties, are automatically masked by default, - which may not be as intended. See the *mask* parameter for - turning off all automatic masking. - - See - https://ncas-cms.github.io/cfdm/tutorial.html#data-mask - for details. + {{read unpack: `bool`}} - .. versionadded:: (cfdm) 1.8.3 + .. versionadded:: (cfdm) 1.11.2.0 - mask: `bool`, optional - If True (the default) then mask by convention the data of - field and metadata constructs. 
+ {{read domain: `bool`, optional}} - A netCDF array is masked depending on the values of any of - the netCDF attributes ``_FillValue``, ``missing_value``, - ``_Unsigned``, ``valid_min``, ``valid_max``, and - ``valid_range``. + .. versionadded:: (cfdm) 1.9.0.0 - See - https://ncas-cms.github.io/cfdm/tutorial.html#data-mask - for details. + {{read netcdf_backend: `None` or (sequence of) `str`, optional}} - .. versionadded:: (cfdm) 1.8.2 + .. versionadded:: (cfdm) 1.11.2.0 + + {{read storage_options: `dict` or `None`, optional}} + + .. versionadded:: (cfdm) 1.11.2.0 - unpack: `bool` - If True, the default, then unpack arrays by convention - when the data is read from disk. + {{read cache: `bool`, optional}} - Unpacking is determined by netCDF conventions for the - following variable attributes: ``add_offset``, - ``scale_factor``, and ``_Unsigned``. + .. versionadded:: (cfdm) 1.11.2.0 + + {{read dask_chunks: `str`, `int`, `None`, or `dict`, optional}} + + .. versionadded:: (cfdm) 1.11.2.0 + + {{read store_dataset_chunks: `bool`, optional}} .. versionadded:: (cfdm) 1.11.2.0 - domain: `bool`, optional - If True then return only the domain constructs that are - explicitly defined by CF-netCDF domain variables, ignoring - all CF-netCDF data variables. By default only the field - constructs defined by CF-netCDF data variables are - returned. + {{read cfa: `dict`, optional}} - CF-netCDF domain variables are only defined from CF-1.9, - so older datasets automatically contain no CF-netCDF - domain variables. + .. versionadded:: (cfdm) NEXTVERSION - The unique domain constructs of the dataset are easily - found with the `cfdm.unique_constructs` function. For - example:: + {{read cfa_write: (sequence of) `str`, optional}} - >>> d = cfdm.read('file.nc', domain=True) - >>> ud = cfdm.unique_constructs(d) - >>> f = cfdm.read('file.nc') - >>> ufd = cfdm.unique_constructs(x.domain for x in f) + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.9.0.0 + {{read to_memory: (sequence of) `str`, optional}} - netcdf_eninge: `None` or `str`, optional - Specify which library to use for opening and reading - netCDF files. By default, or if `None`, then the first one - of `netCDF4` and `h5netcdf` to successfully open the - netCDF file is used. Setting *netcdf_backend* to one of - ``'netCDF4'`` and ``'h5netcdf'`` will force the use of - that library. + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.11.2.0 + {{read squeeze: `bool`, optional}} - storage_options: `dict` or `None`, optional - Pass parameters to the backend file system driver, such as - username, password, server, port, etc. How the storage - options are interpreted depends on the location of the - file: - - * **Local File System**: Storage options are ignored for - local files. - - * **HTTP(S)**: Storage options are ignored for files - available across the network via OPeNDAP. - - * **S3-compatible services**: The backend used is `s3fs`, - and the storage options are used to initialise an - `s3fs.S3FileSystem` file system object. By default, or - if `None`, then *storage_options* is taken as ``{}``. - - If the ``'endpoint_url'`` key is not in - *storage_options*, nor in a dictionary defined by the - ``'client_kwargs'`` key (both of which are the case when - *storage_options* is `None`), then one will be - automatically inserted for accessing an S3 file. For - instance, with a file name of - ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key - with value ``'https://store'`` would be created. 
To - disable this, set the ``'endpoint_url'`` key to `None`. - - *Parameter example:* - For a file name of ``'s3://store/data/file.nc'``, the - following are equivalent: ``None``, ``{}``, - ``{'endpoint_url': 'https://store'}``, and - ``{'client_kwargs': {'endpoint_url': - 'https://store'}}`` - - *Parameter example:* - ``{'key': 'scaleway-api-key...', 'secret': - 'scaleway-secretkey...', 'endpoint_url': - 'https://s3.fr-par.scw.cloud', 'client_kwargs': - {'region_name': 'fr-par'}}`` + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.11.2.0 + {{read unsqueeze: `bool`, optional}} - cache: `bool`, optional - If True, the default, then cache the first and last array - elements of metadata constructs (not field constructs) for - fast future access. In addition, the second and - penultimate array elements will be cached from coordinate - bounds when there are two bounds per cell. For remote - data, setting *cache* to False may speed up the parsing of - the file. + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.11.2.0 + {{read file_type: `None` or (sequence of) `str`, optional}} - dask_chunks: `str`, `int`, `None`, or `dict`, optional - Specify the Dask chunking for data. May be one of the - following: - - * ``'storage-aligned'`` - - This is the default. The Dask chunk size in bytes will - be as close as possible to the size given by - `cfdm.chunksize`, favouring square-like chunk shapes, - with the added restriction that the entirety of each - storage chunk must also lie within exactly one Dask - chunk. - - When reading the data from disk, an entire storage chunk - will be read once per Dask storage chunk that contains - any part of it, so ensuring that a storage chunk lies - within only one Dask chunk can increase performance by - reducing the amount of disk access (particularly when - the data are stored remotely to the client). - - For instance, consider a file variable that has an array - of 64-bit floats with shape (400, 300, 60) and a storage - chunk shape of (100, 5, 60), giving 240 storage chunks - each of size 100*5*60*8 bytes = 0.23 MiB. Then: - - * If `cfdm.chunksize` returned 134217728 (i.e. 128 MiB), - then the storage-aligned Dask chunks will have shape - (400, 300, 60), giving 1 Dask chunk with size of 54.93 - MiB (compare with a Dask chunk shape of (400, 300, 60) - and size 54.93 MiB, if *dask_chunks* were ``'auto'``.) - - * If `cfdm.chunksize` returned 33554432 (i.e. 32 MiB), - then the storage-aligned Dask chunks will have shape - (200, 260, 60), giving 4 Dask chunks with a maximum - size of 23.80 MiB (compare with a Dask chunk shape of - (264, 264, 60) and maximum size 31.90 MiB, if - *dask_chunks* were ``'auto'``.) - - * If `cfdm.chunksize` returned 4194304 (i.e. 4 MiB), - then the storage-aligned Dask chunks will have shape - (100, 85, 60), giving 16 Dask chunks with a maximum - size of 3.89 MiB (compare with a Dask chunk shape of - (93, 93, 60) and maximum size 3.96 MiB, if - *dask_chunks* were ``'auto'``.) - - There are, however, some occasions when, for particular - data arrays in the file, the ``'auto'`` option will - automatically be used instead of storage-aligned Dask - chunks. This occurs when: - - * The data array in the file is stored contiguously. - - * The data array in the file is compressed by convention - (e.g. ragged array representations, compression by - gathering, subsampled coordinates, etc.). 
In this case - the Dask chunks are for the uncompressed data, and so - cannot be aligned with the storage chunks of the - compressed array in the file. - - * ``'storage-exact'`` - - Each Dask chunk will contain exactly one storage chunk - and each storage chunk will lie within exactly one Dask - chunk. - - For instance, consider a file variable that has an array - of 64-bit floats with shape (400, 300, 60) and a storage - chunk shape of (100, 5, 60) (i.e. there are 240 storage - chunks, each of size 0.23 MiB). Then the storage-exact - Dask chunks will also have shape (100, 5, 60) giving 240 - Dask chunks with a maximum size of 0.23 MiB. - - There are, however, some occasions when, for particular - data arrays in the file, the ``'auto'`` option will - automatically be used instead of storage-exact Dask - chunks. This occurs when: - - * The data array in the file is stored contiguously. - - * The data array in the file is compressed by convention - (e.g. ragged array representations, compression by - gathering, subsampled coordinates, etc.). In this case - the Dask chunks are for the uncompressed data, and so - cannot be aligned with the storage chunks of the - compressed array in the file. - - * ``auto`` - - The Dask chunk size in bytes will be as close as - possible to the size given by `cfdm.chunksize`, - favouring square-like chunk shapes. This may give - similar Dask chunk shapes as the ``'storage-aligned'`` - option, but without the guarantee that each storage - chunk will lie within exactly one Dask chunk. - - * A byte-size given by a `str` - - The Dask chunk size in bytes will be as close as - possible to the given byte-size, favouring square-like - chunk shapes. Any string value, accepted by the *chunks* - parameter of the `dask.array.from_array` function is - permitted. - - *Example:* - A Dask chunksize of 2 MiB may be specified as - ``'2097152'`` or ``'2 MiB'``. - - * `-1` or `None` - - There is no Dask chunking, i.e. every data array has one - Dask chunk regardless of its size. - - * Positive `int` - - Every dimension of all Dask chunks has this number of - elements. - - *Example:* - For 3-dimensional data, *dask_chunks* of `10` will - give Dask chunks with shape (10, 10, 10). - - * `dict` - - Each of dictionary key identifies a file dimension, with - a value that defines the Dask chunking for that - dimension whenever it is spanned by a data array. A file - dimension is identified in one of three ways: - - 1. the netCDF dimension name, preceded by ``ncdim%`` - (e.g. ``'ncdim%lat'``); - - 2. the value of the "standard name" attribute of a - CF-netCDF coordinate variable that spans the - dimension (e.g. ``'latitude'``); - - 3. the value of the "axis" attribute of a CF-netCDF - coordinate variable that spans the dimension - (e.g. ``'Y'``). - - The dictionary values may be a byte-size string, - ``'auto'``, `int` or `None`, with the same meanings as - those types for the *dask_chunks* parameter itself, but - applying only to the specified dimension. In addition, a - dictionary value may be a `tuple` or `list` of integers - that sum to the dimension size. - - Not specifying a file dimension in the dictionary is - equivalent to it being defined with a value of - ``'auto'``. 
- - *Example:* - ``{'T': '0.5 MiB', 'Z': 'auto', 'Y': [36, 37], 'X': - None}`` - - *Example:* - If a netCDF file contains dimensions ``time``, ``z``, - ``lat`` and ``lon``, then ``{'ncdim%time': 12, - 'ncdim%lat', None, 'ncdim%lon': None}`` will ensure - that, for all applicable data arrays, all ``time`` - axes have a `dask` chunksize of 12; all ``lat`` and - ``lon`` axes are not `dask` chunked; and all ``z`` - axes are `dask` chunked to comply as closely as - possible with the default `dask` chunk size. - - If the netCDF file also contains a ``time`` coordinate - variable with a "standard_name" attribute of - ``'time'`` and an "axis" attribute of ``'T'``, then - the same `dask` chunking could be specified with - either ``{'time': 12, 'ncdim%lat', None, 'ncdim%lon': - None}`` or ``{'T': 12, 'ncdim%lat', None, 'ncdim%lon': - None}``. + Valid files types are: - .. versionadded:: (cfdm) 1.11.2.0 + ============ ============================================ + *file_type* Description + ============ ============================================ + ``'netCDF'`` Binary netCDF-3 or netCDF-4 file + ``'CDL'`` Text CDL representation of a netCDF file + ============ ============================================ - store_hdf5_chunks: `bool`, optional - If True (the default) then store the HDF5 chunking - strategy for each returned data array. The HDF5 chunking - strategy is then accessible via an object's - `nc_hdf5_chunksizes` method. When the HDF5 chunking - strategy is stored, it will be used when the data is - written to a new netCDF4 file with `cfdm.write` (unless - the strategy was modified prior to writing). - - If False, or if the file being read is not in netCDF4 - format, then no HDF5 chunking strategy is stored. - (i.e. an `nc_hdf5_chunksizes` method will return `None` - for all `Data` objects). In this case, when the data is - written to a new netCDF4 file, the HDF5 chunking strategy - will be determined by `cfdm.write`. - - See the `cfdm.write` *hdf5_chunks* parameter for details - on how the HDF5 chunking strategy is determined at the - time of writing. + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.11.2.0 + {{read ignore_unknown_type: `bool`, optional}} - _implementation: (subclass of) `CFDMImplementation`, optional - Define the CF data model implementation that provides the - returned field constructs. + .. 
versionadded:: (cfdm) NEXTVERSION :Returns: @@ -567,38 +210,41 @@ def read( >>> j = cfdm.read('parent.nc', external=['external1.nc', 'external2.nc']) """ - # Initialise a netCDF read object - netcdf = NetCDFRead(_implementation) - - # Parse the field parameter - if extra is None: - extra = () - elif isinstance(extra, str): - extra = (extra,) - - filename = os.path.expanduser(os.path.expandvars(filename)) - - if netcdf.is_dir(filename): - raise IOError(f"Can't read directory {filename}") - - if not netcdf.is_file(filename): - raise IOError(f"Can't read non-existent file {filename}") - - # ---------------------------------------------------------------- - # Read the file into field/domain contructs - # ---------------------------------------------------------------- - cdl = False - if netcdf.is_cdl_file(filename): - # Create a temporary netCDF file from input CDL - cdl = True - cdl_filename = filename - filename = netcdf.cdl_to_netcdf(filename) - - if netcdf.is_netcdf_file(filename): - # See https://github.com/NCAS-CMS/cfdm/issues/128 for context - # on the try/except here, which acts as a temporary fix - # pending decisions on the best way to handle CDL with only - # header or coordinate info. + + implementation = implementation() + + def __new__( + cls, + filename, + external=None, + extra=None, + verbose=None, + warnings=False, + warn_valid=False, + mask=True, + unpack=True, + domain=False, + netcdf_backend=None, + storage_options=None, + cache=True, + dask_chunks="storage-aligned", + store_dataset_chunks=True, + cfa=None, + cfa_write=None, + to_memory=False, + squeeze=False, + unsqueeze=False, + file_type=None, + ignore_unknown_type=False, + extra_read_vars=None, + ): + """Read field or domain constructs from a dataset.""" + # Initialise a netCDF read object + netcdf = NetCDFRead(cls.implementation) + cls.netcdf = netcdf + + filename = os.path.expanduser(os.path.expandvars(filename)) + try: fields = netcdf.read( filename, @@ -614,32 +260,30 @@ def read( netcdf_backend=netcdf_backend, cache=cache, dask_chunks=dask_chunks, - store_hdf5_chunks=store_hdf5_chunks, - extra_read_vars=None, + store_dataset_chunks=store_dataset_chunks, + cfa=cfa, + cfa_write=cfa_write, + to_memory=to_memory, + squeeze=squeeze, + unsqueeze=unsqueeze, + file_type=file_type, + ignore_unknown_type=ignore_unknown_type, + extra_read_vars=extra_read_vars, ) + except DatasetTypeError: + if file_type is None: + raise + + fields = [] except MaskError: - # Some data required for field interpretation is missing, - # manifesting downstream as a NumPy MaskError. - if cdl: - raise ValueError( - "Unable to convert CDL without data to field construct(s) " - "because there is insufficient information provided by " - "the header and/or coordinates alone in this case." - ) - else: - raise ValueError( - "Unable to convert netCDF to field construct(s) because " - "there is missing data." - ) - elif cdl: - raise IOError( - f"Can't determine format of file {filename} " - f"generated from CDL file {cdl_filename}" - ) - else: - raise IOError(f"Can't determine format of file {filename}") - - # ---------------------------------------------------------------- - # Return the field or domain constructs - # ---------------------------------------------------------------- - return fields + # Some data required for field interpretation is + # missing, manifesting downstream as a NumPy + # MaskError. 
+ raise ValueError( + f"Unable to read {filename} because some netCDF " + "variable arrays that are required for construct " + "creation contain missing values when they shouldn't" + ) + + # Return the field or domain constructs + return fields diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 8533df4972..17f8b40d9b 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -1,35 +1,11 @@ +import numpy as np + from ..cfdmimplementation import implementation +from .abstract import ReadWrite from .netcdf import NetCDFWrite -_implementation = implementation() - - -def write( - fields, - filename, - fmt="NETCDF4", - mode="w", - overwrite=True, - global_attributes=None, - variable_attributes=None, - file_descriptors=None, - external=None, - Conventions=None, - datatype=None, - least_significant_digit=None, - endian="native", - compress=0, - fletcher32=False, - shuffle=True, - string=True, - verbose=None, - warn_valid=True, - group=True, - coordinates=False, - omit_data=None, - hdf5_chunks="4 MiB", - _implementation=_implementation, -): + +class write(ReadWrite): """Write field and domain constructs to a netCDF file. **File format** @@ -53,9 +29,9 @@ def write( The domain axis construct has the following methods to get, set and remove a netCDF dimension name: - `~cfdm.DomainAxis.nc_get_dimension`, - `~cfdm.DomainAxis.nc_set_dimension` and - `~cfdm.DomainAxis.nc_del_dimension`. + `~{{package}}.DomainAxis.nc_get_dimension`, + `~{{package}}.DomainAxis.nc_set_dimension` and + `~{{package}}.DomainAxis.nc_del_dimension`. **NetCDF attributes** @@ -75,9 +51,9 @@ def write( Domain axis constructs that correspond to NetCDF unlimited dimensions may be accessed with the - `~cfdm.DomainAxis.nc_is_unlimited` and - `~cfdm.DomainAxis.nc_set_unlimited` methods of a domain axis - construct. + `~{{package}}.DomainAxis.nc_is_unlimited` and + `~{{package}}.DomainAxis.nc_set_unlimited` methods of a domain + axis construct. **NetCDF4 hierarchical groups** @@ -88,18 +64,18 @@ def write( netCDF interface will, by default, be recreated in the output dataset. See the *group* parameter for details. - **NetCDF4 HDF5 chunks** + **NetCDF4 dataset chunks** - HDF5 chunking is configured by the *hdf5_chunks* parameter, which - defines the chunking strategy for all output data, including the - option of no chunking. However, this may be overridden for any - data that defines its own chunking strategy. See - `cfdm.Field.nc_set_hdf5_chunksizes`, - `cfdm.Data.nc_set_hdf5_chunksizes`, and `cfdm.read`. + Dataset chunking is configured by the *dataset_chunks* parameter, + which defines the chunking strategy for all output data, including + the option of no chunking. However, this will be overridden for + any data that defines its own chunking strategy. See + `{{package}}.Field.nc_set_hdf5_chunksizes`, + `{{package}}.Data.nc_set_hdf5_chunksizes`, and `{{package}}.read`. .. versionadded:: (cfdm) 1.7.0 - .. seealso:: `cfdm.read` + .. seealso:: `{{package}}.read` :Parameters: @@ -152,11 +128,12 @@ def write( 2GB) except ``'NETCDF3_CLASSIC'``. ``'NETCDF3_64BIT_DATA'`` is a format that requires version - 4.4.0 or newer of the C library (use `cfdm.environment` to - see which version if the netCDF-C library is in use). It - extends the ``'NETCDF3_64BIT_OFFSET'`` binary format to - allow for unsigned 64 bit integer data types and 64-bit - dimension sizes. + 4.4.0 or newer of the C library (use + `{{package}}.environment` to see which version of the + netCDF-C library is in use). 
It extends the + ``'NETCDF3_64BIT_OFFSET'`` binary format to allow for + unsigned 64 bit integer data types and 64-bit dimension + sizes. ``'NETCDF4_CLASSIC'`` files use the version 4 disk format (HDF5), but omit features not found in the version 3 @@ -261,7 +238,7 @@ def write( attribute, even if it has been specified by the *global_attributes* parameter, or has been flagged as global on any of the field constructs (see - `cfdm.Field.nc_global_attributes` for details). + `{{package}}.Field.nc_global_attributes` for details). Identification of the conventions being adhered to by the file are not specified as a file descriptor, but by the @@ -288,7 +265,7 @@ def write( * properties flagged as global on any of the field constructs being written (see - `cfdm.Field.nc_global_attributes` for details). + `{{package}}.Field.nc_global_attributes` for details). Note that it is not possible to create a netCDF global attribute from a property that has different values for @@ -371,18 +348,14 @@ def write( ``endian='big'`` compress: `int`, optional - Regulate the speed and efficiency of compression. Must be - an integer between ``0`` and ``9``. ``0`` means no - compression; ``1`` is the fastest, but has the lowest - compression ratio; ``9`` is the slowest but best - compression ratio. The default value is ``0``. An error is - raised if compression is requested for a netCDF3 output - file format. See the `netCDF4 package - `_ for more - details. + Regulate the speed and efficiency of zlib + compression. Must be an integer between ``0`` and + ``9``. ``0`` means no compression; ``1`` is the fastest, + but has the lowest compression ratio; ``9`` is the slowest + but best compression ratio. The default value is ``4``. *Parameter example:* - ``compress=4`` + ``compress=0`` least_significant_digit: `int`, optional Truncate the input field construct data arrays, but not @@ -442,15 +415,17 @@ def write( * ``'DETAIL'`` (``3``) * ``'DEBUG'`` (``-1``) - set for the duration of the method call only as the minimum - cut-off for the verboseness level of displayed output (log) - messages, regardless of the globally-configured `cfdm.log_level`. - Note that increasing numerical value corresponds to increasing - verbosity, with the exception of ``-1`` as a special case of - maximal and extreme verbosity. + set for the duration of the method call only as the + minimum cut-off for the verboseness level of displayed + output (log) messages, regardless of the + globally-configured `{{package}}.log_level`. Note that + increasing numerical value corresponds to increasing + verbosity, with the exception of ``-1`` as a special case + of maximal and extreme verbosity. - Otherwise, if `None` (the default value), output messages will - be shown according to the value of the `cfdm.log_level` setting. + Otherwise, if `None` (the default value), output messages + will be shown according to the value of the + `{{package}}.log_level` setting. Overall, the higher a non-negative integer or equivalent string that is set (up to a maximum of ``3``/``'DETAIL'``) for @@ -517,6 +492,8 @@ def write( ``'dimension_coordinate'`` Dimension coordinate constructs ``'auxiliary_coordinate'`` Auxiliary coordinate constructs ``'cell_measure'`` Cell measure constructs + ``'domain_topology'`` Domain topology constructs + ``'cell_connectivity'`` Cell connectivity constructs ``'all'`` All of the above constructs ========================== =============================== @@ -531,69 +508,205 @@ def write( .. 
versionadded:: (cfdm) 1.10.0.1 - hdf5_chunks: `str` or `int` or `float`, optional - The HDF5 chunking strategy for data arrays being written + dataset_chunks: `str` or `int` or `float`, optional + The dataset chunking strategy for data arrays being written to the file. - If any data being written already stores its own chunking - strategy (i.e. its `nc_hdf5_chunksizes` method returns - something other than `None`) then, for that data alone, it - is used in preference to the strategy defined by the - *hdf5_chunks* parameter. - - .. note:: By default, a data array returned by `cfdm.read` - stores its HDF5 chunking strategy from the file - being read. When this happens, that same HDF5 - chunking strategy will be used when the data is - written to an output netCDF4 file (unless the - strategy was modified or removed prior to - writing). To prevent the HDF5 chunking strategy - from the original file being stored, see the - *store_hdf5_chunks* parameter to `cfdm.read`. + By default, *dataset_chunks* is ``'4 MiB'``, i.e. 4194304 + bytes. + + If any `Data` being written already stores its own dataset + chunking strategy (i.e. its `Data.nc_hdf5_chunksizes` + method returns something other than `None`) then, for that + data array alone, it is used in preference to the strategy + defined by the *dataset_chunks* parameter. Ignored for netCDF3 output formats, for which all data is always written out contiguously. - The *hdf5_chunks* parameter may be one of: + .. note:: By default, a data array returned by + `{{package}}.read` stores its dataset chunking + strategy from the file being read. When this + happens that same dataset chunking strategy will + be used when the data is written to a new + netCDF4 file, unless the strategy was modified + or removed prior to writing. To prevent the + dataset chunking strategy from the original file + being stored, see the *store_dataset_chunks* + parameter to `{{package}}.read`. + + The *dataset_chunks* parameter may be one of: - * ``'contiguous'``: The data will written to the file - contiguously, i.e. no chunking. + * ``'contiguous'`` - * `int` or `float` or `str`: The size in bytes of the HDF5 - chunks. A floating point value is rounded down to the - nearest integer, and a string represents a quantity of - byte units. "Square-like" chunk shapes are preferred, + The data will written to the file contiguously, i.e. no + chunking. + + * `int` or `float` or `str` + + The size in bytes of the dataset chunks. A floating + point value is rounded down to the nearest integer, and + a string represents a quantity of byte + units. "Square-like" chunk shapes are preferred, maximising the amount of chunks that are completely filled with data values. For instance a chunksize of 1024 bytes may be specified with any of ``1024``, ``1024.9``, ``'1024'``, ``'1024.9'``, ``'1024 B'``, ``'1 - KiB'``, ``'0.0009765625 MiB'``, etc. Recognised byte - units are (case insensitive): ``B``, ``KiB``, ``MiB``, + KiB'``, ``'0.001024 MB'``, etc. Recognised byte units + are (case insensitive): ``B``, ``KiB``, ``MiB``, ``GiB``, ``TiB``, ``PiB``, ``KB``, ``MB``, ``GB``, ``TB``, and ``PB``. Spaces in strings are optional. - By default, *hdf5_chunks* is ``'4 MiB'`` (i.e. 4194304 - bytes). - - When the HDF5 chunk size is defined by a number of bytes - (taken either from the *hdf5_chunks* parameter, or as stored - by the data itself), "square-like" HDF5 chunk shapes are - preferred that maximise the amount of chunks that are - completely filled with data values. 
For example, with - *hdf_chunks* of ``'4 MiB'``, a data array of 64-bit floats - with shape (400, 300, 60) will be written with 20 HDF5 - chunks, each of which contains 3.9592 MiB: the first axis - is split across 5 chunks containing 93, 93, 93, 93, and 28 - elements; the second axis across 4 chunks containing 93, - 93, 93, and 21 elements; and the third axis across 1 chunk - containing 60 elements. 12 of these chunks are completely - filled with 93*93*60 data values (93*93*60*8 B = 3.9592 - MiB), whilst the remaining 8 chunks at the "edges" of the - array contain only 93*21*60, 28*93*60, or 28*21*60 data - values. The shape of the HDF5 chunks is based only on the - shape of the data aray and its data type. The use of - native compression (see the *compress* parameter) does not - affect the HDF5 chunk size. + .. note:: When the dataset chunk size is defined by a + number of bytes (taken either from the + *dataset_chunks* parameter, or as stored by the + data itself), "square-like" dataset chunk shapes + are preferred that maximise the amount of chunks + that are completely filled with data values. For + example, with *dataset_chunks* of ``'4 MiB'``, a + data array of 64-bit floats with shape (400, + 300, 60) will be written with 20 dataset chunks, + each of which contians (93, 93, 60) + elements. The first axis is split across 5 + chunks, the second axis across 4 chunks, and the + third axis across 1 chunk containing 60 + elements. 12 of these chunks are completely + filled with 93*93*60 data values (93*93*60*8 B = + 3.9592 MiB), whilst the remaining 8 chunks at + the "edges" of the array contain only 93*21*60, + 28*93*60, or 28*21*60 data values. The shape of + the dataset chunks is based on the shape of the + data aray and its data type, and is calculated + internally with the + `dask.array.core.normalize_chunks` function. The + use of native compression (see the *compress* + parameter) does not affect the dataset chunk + size. + + .. versionadded:: (cfdm) NEXTVERSION + + cfa: `str` or `dict` or `None`, optional + Specify which netCDF variables, if any, should be written + as CF-netCDF aggregation variables. + + By default, *cfa* is the string ``'auto'``, meaning that a + construct that was previously read from a CF-netCDF + aggregation variable will be written as an aggregation + variable, provided that its data have not been changed in + ways which prevent the encoding, in which case a normal + non-aggregation variable will be written. This default, as + well as other options, can be configured by setting *cfa* + to a dictionary. + + .. note:: If the intention is to create aggregation + variables from fields read from disk, then it is + strongly recommended to use the *cfa_write* + parameter to `{{package}}.read`, in order to set + up the conditions in which a CF-netCDF + aggregation encoding is possible. + + If *cfa* is `None` or a (sequence of) `str` then it + defines which types of constructs are to be written as CF + aggregation variables: + + ========================== =============================== + *cfa* Constructs + ========================== =============================== + ``'auto'`` **This is the default**. Any + construct whose data is + unchanged from having been + previously read from a + CF-netCDF aggregation + variable. + + `None` No aggregation variables will + be created. 
+ + ``'field'`` Field constructs + ``'field_ancillary'`` Field ancillary constructs + ``'domain_ancillary'`` Domain ancillary constructs + ``'dimension_coordinate'`` Dimension coordinate constructs + ``'auxiliary_coordinate'`` Auxiliary coordinate constructs + ``'cell_measure'`` Cell measure constructs + ``'domain_topology'`` Domain topology constructs + ``'cell_connectivity'`` Cell connectivity constructs + ``'all'`` All constructs + ========================== =============================== + + If *cfa* is a dictionary then it is used to explicitly + configure the writing of aggregation variables. It may + have some or all of the following keys: + + * ``'constructs'``: `None`, `dict` or (sequence of) `str` + + The types of construct to be written as aggregation + variables. + + If the value is `None` or a (sequence of) `str` then the + types are the same as if the *cfa* parameter itself was + set to that value (see the table above). If the + ``'constructs'`` key is missing then ``'auto'`` is + assumed. + + If the value is a `dict` then each of its keys defines a + construct type (see the table above), with a value that + specifies the number of dimensions that a construct of + that type must also have if it is to be written as an + aggregation variable. A value of `None` means no + restriction on the number of dimensions. + + *Example:* + Equivalent ways to only write cell measure constructs + as aggregation variables: ``{'constructs': + 'cell_measure'}``, ``{'constructs': ['cell_measure']}``, + ``{'cell_measure': None}``. + + *Example:* + Equivalent ways to only write field and auxiliary + coordinate constructs as aggregation variables: + ``{'constructs': ('field', 'auxiliary_coordinate')}`` + and ``{'constructs': {'field': None, + 'auxiliary_coordinate': None}}``. + + *Example:* + To only write two-dimensional auxiliary coordinate + constructs as aggregation variables: ``{'constructs': + {'auxiliary_coordinate': 2}}``. + + *Example:* + Write two-dimensional auxiliary coordinate constructs + as aggregation variables, and also all field + constructs: ``{'constructs': {'auxiliary_coordinate': + 2, 'field': None}}``. + + *Example:* + Write any three-dimensionsal construct whose data is + unchanged from having been previously read from a + CF-netCDF aggregation variable: ``{'constructs': + {'auto': 3}}``. + + * ``'uri'``: `str` + + Specify the URI format of the fragment file names. + + If ``'default'`` (the default) then the fragment file + names will be written with the same URI formats that + they had when read from input files (for file names + originating from the reading of normal non-aggregation + variables, this will result in absolute URIs). If + ``'absolute'`` then all fragment file names will be + written as absolute URIs. If ``'relative'`` then all + fragment file names will be written as relative-path URI + references URIs, relative to the location of the + aggregation file. + + * ``'strict'``: `bool` + + If True (the default if this key is missing) then an + exception is raised if it is not possible to create an + aggregation variable from any construct identified by + the ``'constructs'`` option. If False then a normal, + non-aggregation variable will be written in this case. .. 
versionadded:: (cfdm) 1.11.2.0 @@ -607,21 +720,82 @@ def write( **Examples** - >>> cfdm.write(f, 'file.nc') + >>> {{package}}.write(f, 'file.nc') - >>> cfdm.write(f, 'file.nc', fmt='NETCDF3_CLASSIC') + >>> {{package}}.write(f, 'file.nc', fmt='NETCDF3_CLASSIC') - >>> cfdm.write(f, 'file.nc', external='cell_measures.nc') + >>> {{package}}.write(f, 'file.nc', external='cell_measures.nc') - >>> cfdm.write(f, 'file.nc', Conventions='CMIP-6.2') + >>> {{package}}.write(f, 'file.nc', Conventions='CMIP6') """ - # ---------------------------------------------------------------- - # Initialise the netCDF write object - # ---------------------------------------------------------------- - netcdf = NetCDFWrite(_implementation) - if fields: + implementation = implementation() + + def __new__( + cls, + fields, + filename, + fmt="NETCDF4", + mode="w", + overwrite=True, + global_attributes=None, + variable_attributes=None, + file_descriptors=None, + external=None, + Conventions=None, + datatype=None, + single=False, + double=False, + least_significant_digit=None, + endian="native", + compress=4, + fletcher32=False, + shuffle=True, + reference_datetime=None, + string=True, + verbose=None, + warn_valid=True, + group=True, + coordinates=False, + omit_data=None, + dataset_chunks="4 MiB", + cfa="auto", + extra_write_vars=None, + ): + """Write field and domain constructs to a netCDF file.""" + # Flatten the sequence of intput fields + fields = tuple(cls._flat(fields)) + if not fields: + raise ValueError( + "Must provide at least one Field or Domain to be written " + ) + + # Parse double and single + if datatype and (single or double): + raise ValueError( + "Can't set 'datatype' at the same time as " + "'single' or 'double'" + ) + + if single: + if double: + raise ValueError( + "Can't set both the 'single' and 'double' parameters" + ) + + datatype = { + np.dtype(float): np.dtype("float32"), + np.dtype(int): np.dtype("int32"), + } + + if double: + datatype = { + np.dtype("float32"): np.dtype(float), + np.dtype("int32"): np.dtype(int), + } + + netcdf = NetCDFWrite(cls.implementation) netcdf.write( fields, filename, @@ -638,13 +812,15 @@ def write( endian=endian, compress=compress, shuffle=shuffle, + reference_datetime=reference_datetime, fletcher32=fletcher32, string=string, verbose=verbose, warn_valid=warn_valid, group=group, coordinates=coordinates, - extra_write_vars=None, + extra_write_vars=extra_write_vars, omit_data=omit_data, - hdf5_chunks=hdf5_chunks, + dataset_chunks=dataset_chunks, + cfa=cfa, ) diff --git a/cfdm/test/create_test_files.py b/cfdm/test/create_test_files.py index 96e56bf087..d56f15d76b 100644 --- a/cfdm/test/create_test_files.py +++ b/cfdm/test/create_test_files.py @@ -2229,6 +2229,90 @@ def _make_ugrid_2(filename): return filename +def _make_aggregation_value(filename): + """Create an aggregation variable with a 'value' fragment array.""" + n = netCDF4.Dataset(filename, "w") + + n.Conventions = f"CF-{VN}" + n.comment = "A netCDF file with a 'value' aggregation variable." 
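+
+    # The 'time', 'level', 'latitude' and 'longitude' dimensions span
+    # the aggregated data. The 'a_time', 'a_level', 'a_latitude' and
+    # 'a_longitude' dimensions span the fragment arrays (two fragments
+    # along 'time' and one along each of the other axes), and the
+    # 'a_map_*' dimensions are for the 'map' fragment array variables.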
+ + n.createDimension("time", 12) + n.createDimension("level", 1) + n.createDimension("latitude", 73) + n.createDimension("longitude", 144) + n.createDimension("a_time", 2) + n.createDimension("a_level", 1) + n.createDimension("a_latitude", 1) + n.createDimension("a_longitude", 1) + n.createDimension("a_map_i2", 2) + n.createDimension("a_map_j4", 4) + n.createDimension("a_map_j1", 1) + + temperature = n.createVariable("temperature", "f8", ()) + temperature.standard_name = "air_temperature" + temperature.units = "K" + temperature.cell_methods = "time: mean" + temperature.ancillary_variables = "uid" + temperature.aggregated_dimensions = "time level latitude longitude" + temperature.aggregated_data = "location: fragment_location variable: fragment_variable map: fragment_map" + + uid = n.createVariable("uid", str, ()) + uid.long_name = "Fragment dataset unique identifiers" + uid.aggregated_dimensions = "time" + uid.aggregated_data = ( + "unique_value: fragment_value_uid map: fragment_map_uid" + ) + + time = n.createVariable("time", "f4", ("time",)) + time.standard_name = "time" + time.units = "days since 2001-01-01" + time[...] = [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334] + + level = n.createVariable("level", "f4", ("level",)) + level.standard_name = "height_above_mean_sea_level" + level.units = "m" + + latitude = n.createVariable("latitude", "f4", ("latitude",)) + latitude.standard_name = "latitude" + latitude.units = "degrees_north" + + longitude = n.createVariable("longitude", "f4", ("longitude",)) + longitude.standard_name = "longitude" + longitude.units = "degrees_east" + + # Fragment array variables + fragment_location = n.createVariable( + "fragment_location", + str, + ("a_time", "a_level", "a_latitude", "a_longitude"), + ) + fragment_location[0, 0, 0, 0] = "January-March.nc" + fragment_location[1, 0, 0, 0] = "April-December.nc" + + fragment_variable = n.createVariable("fragment_variable", str, ()) + fragment_variable[...] = "temperature" + + fragment_map = n.createVariable( + "fragment_map", "i4", ("a_map_j4", "a_map_i2") + ) + fragment_map[...] = [[3, 9], [1, -1], [73, -1], [144, -1]] + fragment_map[1:, 1] = np.ma.masked + + fragment_value_uid = n.createVariable( + "fragment_value_uid", str, ("a_time",) + ) + fragment_value_uid[0] = "04b9-7eb5-4046-97b-0bf8" + fragment_value_uid[1] = "05ee0-a183-43b3-a67-1eca" + + fragment_map_uid = n.createVariable( + "fragment_map_uid", "i4", ("a_map_j1", "a_map_i2") + ) + fragment_map_uid[...] 
= [3, 9] + + n.close() + return filename + + contiguous_file = _make_contiguous_file("DSG_timeSeries_contiguous.nc") indexed_file = _make_indexed_file("DSG_timeSeries_indexed.nc") indexed_contiguous_file = _make_indexed_contiguous_file( @@ -2259,6 +2343,8 @@ def _make_ugrid_2(filename): ugrid_1 = _make_ugrid_1("ugrid_1.nc") ugrid_2 = _make_ugrid_2("ugrid_2.nc") +aggregation_value = _make_aggregation_value("aggregation_value.nc") + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) cfdm.environment() diff --git a/cfdm/test/setup_create_field.py b/cfdm/test/setup_create_field.py index 4367110a28..3fac6faea3 100644 --- a/cfdm/test/setup_create_field.py +++ b/cfdm/test/setup_create_field.py @@ -227,7 +227,7 @@ def test_create_field(self): "Field f not equal to a copy of itself", ) - cfdm.write(f, self.filename, fmt="NETCDF3_CLASSIC", verbose=verbose) + cfdm.write(f, self.filename, verbose=verbose) g = cfdm.read(self.filename, verbose=1) diff --git a/cfdm/test/test_CFA.py b/cfdm/test/test_CFA.py new file mode 100644 index 0000000000..a6255011a2 --- /dev/null +++ b/cfdm/test/test_CFA.py @@ -0,0 +1,383 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest +from pathlib import PurePath + +import netCDF4 + +faulthandler.enable() # to debug seg faults and timeouts + +import cfdm +from cfdm.read_write.netcdf.netcdfwrite import AggregationError + +n_tmpfiles = 5 +tmpfiles = [ + tempfile.mkstemp("_test_CFA.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +( + tmpfile1, + tmpfile2, + nc_file, + cfa_file, + cfa_file2, +) = tmpfiles + + +def _remove_tmpfiles(): + """Try to remove defined temporary files by deleting their paths.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class CFATest(unittest.TestCase): + """Unit test for aggregation variables.""" + + netcdf3_fmts = [ + "NETCDF3_CLASSIC", + "NETCDF3_64BIT", + "NETCDF3_64BIT_OFFSET", + "NETCDF3_64BIT_DATA", + ] + netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] + netcdf_fmts = netcdf3_fmts + netcdf4_fmts + + aggregation_value = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "aggregation_value.nc" + ) + f0 = cfdm.example_field(0) + + def test_CFA_fmt(self): + """Test the cfdm.read 'fmt' keyword with cfa.""" + f = self.f0 + cfdm.write(f, tmpfile1) + f = cfdm.read(tmpfile1, cfa_write="field")[0] + + for fmt in self.netcdf_fmts: + cfdm.write(f, cfa_file, fmt=fmt, cfa="field") + g = cfdm.read(cfa_file) + self.assertEqual(len(g), 1) + self.assertTrue(f.equals(g[0])) + + def test_CFA_multiple_fragments(self): + """Test aggregation variables with more than one fragment.""" + f = self.f0 + + cfdm.write(f[:2], tmpfile1) + cfdm.write(f[2:], tmpfile2) + + a = cfdm.read(tmpfile1, cfa_write="field")[0] + b = cfdm.read(tmpfile2, cfa_write="field")[0] + a = cfdm.Field.concatenate([a, b], axis=0) + + cfdm.write(a, nc_file) + cfdm.write(a, cfa_file, cfa="field") + + n = cfdm.read(nc_file) + c = cfdm.read(cfa_file) + self.assertEqual(len(n), 1) + self.assertEqual(len(c), 1) + self.assertTrue(c[0].equals(f)) + self.assertTrue(n[0].equals(c[0])) + + def test_CFA_strict(self): + """Test 'strict' option to the cfdm.write 'cfa' keyword.""" + f = self.f0 + + # By default, can't write in-memory arrays as aggregation + # variables + with self.assertRaises(AggregationError): + cfdm.write(f, cfa_file, cfa="field") + + # The previous line should have deleted the output file + self.assertFalse(os.path.exists(cfa_file)) + + 
cfdm.write(f, nc_file, cfa={"constructs": "field", "strict": False}) + g = cfdm.read(nc_file, cfa_write="field") + self.assertEqual(len(g), 1) + self.assertTrue(g[0].equals(f)) + + cfdm.write(g, cfa_file, cfa={"constructs": "field", "strict": True}) + g = cfdm.read(cfa_file) + self.assertEqual(len(g), 1) + self.assertTrue(g[0].equals(f)) + + def test_CFA_uri_0(self): + """Test aggregation 'uri' option to cfdm.write.""" + f = self.f0 + cfdm.write(f, tmpfile1) + f = cfdm.read(tmpfile1, cfa_write="field")[0] + + absuri_filename = PurePath(os.path.abspath(tmpfile1)).as_uri() + reluri_filename = os.path.basename(tmpfile1) + + for uri, filename in zip( + ("absolute", "relative"), (absuri_filename, reluri_filename) + ): + cfdm.write( + f, + cfa_file, + cfa={"constructs": "field", "uri": uri}, + ) + + nc = netCDF4.Dataset(cfa_file, "r") + cfa_location = nc.variables["cfa_location"] + self.assertEqual(cfa_location[...], filename) + nc.close() + + g = cfdm.read(cfa_file) + self.assertEqual(len(g), 1) + g = g[0] + self.assertTrue(f.equals(g)) + self.assertEqual( + g.data.get_filenames(normalise=False), set([filename]) + ) + + def test_CFA_uri_1(self): + """Test aggregation 'uri=default' option to cfdm.write.""" + f = self.f0 + cfdm.write(f, tmpfile1) + f = cfdm.read(tmpfile1, cfa_write="field")[0] + + absuri_filename = PurePath(os.path.abspath(tmpfile1)).as_uri() + reluri_filename = os.path.basename(tmpfile1) + + for uri, filename in zip( + ("absolute", "relative"), (absuri_filename, reluri_filename) + ): + cfdm.write( + f, + cfa_file, + cfa={"constructs": "field", "uri": uri}, + ) + + g = cfdm.read(cfa_file)[0] + cfdm.write( + g, + cfa_file2, + cfa="field", + ) + + nc = netCDF4.Dataset(cfa_file2, "r") + cfa_location = nc.variables["cfa_location"] + self.assertEqual(cfa_location[...], filename) + nc.close() + + def test_CFA_constructs(self): + """Test aggregation 'constructs' option to cfdm.write.""" + f = cfdm.example_field(1) + f.del_construct("time") + f.del_construct("long_name=Grid latitude name") + cfdm.write(f, tmpfile1) + f = cfdm.read(tmpfile1, cfa_write="all")[0] + + # No constructs + cfdm.write(f, tmpfile2, cfa={"constructs": []}) + nc = netCDF4.Dataset(tmpfile2, "r") + for var in nc.variables.values(): + attrs = var.ncattrs() + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # Field construct + cfdm.write(f, tmpfile2, cfa={"constructs": "field"}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ("ta",): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # Dimension construct + for constructs in ( + "dimension_coordinate", + ["dimension_coordinate"], + {"dimension_coordinate": None}, + {"dimension_coordinate": 1}, + ): + cfdm.write(f, tmpfile2, cfa={"constructs": constructs}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ( + "x", + "x_bnds", + "y", + "y_bnds", + "atmosphere_hybrid_height_coordinate", + "atmosphere_hybrid_height_coordinate_bounds", + ): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + # 
Dimension and auxiliary constructs + for constructs in ( + ["dimension_coordinate", "auxiliary_coordinate"], + {"dimension_coordinate": None, "auxiliary_coordinate": 2}, + ): + cfdm.write(f, tmpfile2, cfa={"constructs": constructs}) + nc = netCDF4.Dataset(tmpfile2, "r") + for ncvar, var in nc.variables.items(): + attrs = var.ncattrs() + if ncvar in ( + "x", + "x_bnds", + "y", + "y_bnds", + "atmosphere_hybrid_height_coordinate", + "atmosphere_hybrid_height_coordinate_bounds", + "latitude_1", + "longitude_1", + ): + self.assertFalse(var.ndim) + self.assertIn("aggregated_dimensions", attrs) + self.assertIn("aggregated_data", attrs) + else: + self.assertNotIn("aggregated_dimensions", attrs) + self.assertNotIn("aggregated_data", attrs) + + nc.close() + + def test_CFA_scalar(self): + """Test scalar aggregation variable.""" + f = self.f0 + f = f[0, 0].squeeze() + cfdm.write(f, tmpfile1) + g = cfdm.read(tmpfile1, cfa_write="field")[0] + cfdm.write(g, cfa_file, cfa="field") + h = cfdm.read(cfa_file)[0] + self.assertTrue(h.equals(f)) + + def test_CFA_value(self): + """Test the value fragment array variable.""" + write = True + for aggregation_value_file in (self.aggregation_value, cfa_file): + f = cfdm.read(aggregation_value_file, cfa_write="all") + self.assertEqual(len(f), 1) + f = f[0] + fa = f.field_ancillary() + self.assertEqual(fa.shape, (12,)) + self.assertEqual(fa.data.chunks, ((3, 9),)) + self.assertEqual( + fa.data.nc_get_aggregation_fragment_type(), "unique_value" + ) + self.assertEqual( + fa.data.nc_get_aggregated_data(), + { + "map": "fragment_map_uid", + "unique_value": "fragment_value_uid", + }, + ) + + nc = netCDF4.Dataset(aggregation_value_file, "r") + fragment_value_uid = nc.variables["fragment_value_uid"][...] + nc.close() + + self.assertTrue((fa[:3].array == fragment_value_uid[0]).all()) + self.assertTrue((fa[3:].array == fragment_value_uid[1]).all()) + + if write: + cfdm.write(f, cfa_file) + write = False + + def test_CFA_cfa(self): + """Test the cfdm.write 'cfa' keyword.""" + f = self.f0 + cfdm.write(f, tmpfile1) + f = cfdm.read(tmpfile1, cfa_write="field")[0] + cfdm.write(f, tmpfile2, cfa="field") + g = cfdm.read(tmpfile2, cfa_write="field")[0] + + # Default of cfa="auto" - check that aggregation variable + # gets written + cfdm.write(g, cfa_file) + nc = netCDF4.Dataset(cfa_file, "r") + self.assertIsNotNone( + getattr(nc.variables["q"], "aggregated_data", None) + ) + nc.close() + + cfdm.write(g, cfa_file, cfa={"constructs": {"auto": 2}}) + nc = netCDF4.Dataset(cfa_file, "r") + + self.assertIsNotNone( + getattr(nc.variables["q"], "aggregated_data", None) + ) + nc.close() + + cfdm.write( + g, + cfa_file, + cfa={ + "constructs": ["auto", "dimension_coordinate"], + "strict": False, + }, + ) + nc = netCDF4.Dataset(cfa_file, "r") + for ncvar in ("q", "lat", "lon"): + self.assertIsNotNone( + getattr(nc.variables[ncvar], "aggregated_data", None) + ) + + nc.close() + + # Check bad values of cfa + for cfa in (False, True, (), []): + with self.assertRaises(ValueError): + cfdm.write(g, cfa_file, cfa=cfa) + + def test_CFA_subspace(self): + """Test the writing subspaces of aggregations.""" + f = self.f0 + + cfdm.write(f[:2], tmpfile1) + cfdm.write(f[2:], tmpfile2) + + a = cfdm.read(tmpfile1, cfa_write="field")[0] + b = cfdm.read(tmpfile2, cfa_write="field")[0] + c = cfdm.Field.concatenate([a, b], axis=0) + + cfdm.write(c, cfa_file, cfa="field") + + f = cfdm.read(cfa_file, cfa_write="field")[0] + cfdm.write(f[:2], cfa_file2, cfa="field") + g = cfdm.read(cfa_file2)[0] + 
self.assertTrue(g.equals(a)) + + cfdm.write(f[2:], cfa_file2, cfa="field") + g = cfdm.read(cfa_file2)[0] + self.assertTrue(g.equals(b)) + + # Can't straddle Dask chunks + with self.assertRaises(AggregationError): + cfdm.write(f[1:3], cfa_file2, cfa="field") + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print() + unittest.main(verbosity=2) diff --git a/cfdm/test/test_Data.py b/cfdm/test/test_Data.py index 68efcf70d4..67e67f886d 100644 --- a/cfdm/test/test_Data.py +++ b/cfdm/test/test_Data.py @@ -5,6 +5,7 @@ import os import tempfile import unittest +import warnings import cftime import dask.array as da @@ -51,6 +52,8 @@ def axis_combinations(ndim): class DataTest(unittest.TestCase): """Unit test for the Data class.""" + f0 = cfdm.example_field(0) + def setUp(self): """Preparations called immediately before each test method.""" # Disable log messages to silence expected warnings @@ -66,6 +69,31 @@ def setUp(self): os.path.dirname(os.path.abspath(__file__)), "test_file.nc" ) + expected_warning_msgs = [ + "divide by zero encountered in " + np_method + for np_method in ( + "arctanh", + "log", + "double_scalars", + ) + ] + [ + "invalid value encountered in " + np_method + for np_method in ( + "arcsin", + "arccos", + "arctanh", + "arccosh", + "log", + "sqrt", + "double_scalars", + "true_divide", + ) + ] + for expected_warning in expected_warning_msgs: + warnings.filterwarnings( + "ignore", category=RuntimeWarning, message=expected_warning + ) + def test_Data__init__basic(self): """Test basic Data.__init__""" # Most __init__ parameters are covered by the various other @@ -1229,6 +1257,13 @@ def test_Data_insert_dimension(self): d.insert_dimension(-1, inplace=True) self.assertEqual(d.nc_hdf5_chunksizes(), (1, 1, 4, 3, 1)) + array = np.arange(12).reshape(3, 4) + d = cfdm.Data(array) + for i in (0, 1, 2, -3, -2, -1): + self.assertEqual( + d.insert_dimension(i).shape, np.expand_dims(array, i).shape + ) + def test_Data_get_compressed_dimension(self): """Test Data.get_compressed_dimension.""" d = cfdm.Data([[281, 279, 278, 279]]) @@ -2051,8 +2086,9 @@ def test_Data__getitem__(self): self.assertTrue(e.equals(f)) # Chained subspaces reading from disk - f = cfdm.read(self.filename)[0] + f = cfdm.read(self.filename, netcdf_backend="h5netcdf")[0] d = f.data + a = d[:1, [1, 3, 4], :][:, [True, False, True], ::-2].array b = d.array[:1, [1, 3, 4], :][:, [True, False, True], ::-2] self.assertTrue((a == b).all()) @@ -2375,12 +2411,20 @@ def test_Data_get_filenames(self): d = cfdm.Data.empty((5, 8), float, chunks=4) self.assertEqual(d.get_filenames(), set()) - f = cfdm.example_field(0) + f = self.f0 cfdm.write(f, file_A) - a = cfdm.read(file_A, dask_chunks=4)[0].data - self.assertEqual(a.get_filenames(), set([file_A])) - a.persist(inplace=True) - self.assertEqual(a.data.get_filenames(), set()) + + d = cfdm.read(file_A, dask_chunks=4)[0].data + self.assertEqual(d.get_filenames(), set([file_A])) + d.persist(inplace=True) + self.assertEqual(d.data.get_filenames(), set()) + + # Per chunk + d = cfdm.read(file_A, dask_chunks="128 B")[0].data + self.assertEqual(d.numblocks, (2, 2)) + f = d.get_filenames(per_chunk=True) + self.assertEqual(f.shape, d.numblocks) + self.assertTrue((f == [[file_A, file_A], [file_A, file_A]]).all()) def test_Data_chunk_indices(self): """Test Data.chunk_indices.""" @@ -2400,6 +2444,17 @@ def test_Data_chunk_indices(self): ], ) + def test_Data_chunk_positions(self): + """Test Data.chunk_positions.""" + d = cfdm.Data( + 
np.arange(60).reshape(3, 4, 5), chunks=((1, 2), (4,), (1, 2, 2)) + ) + self.assertEqual(d.npartitions, 6) + self.assertEqual( + list(d.chunk_positions()), + [(0, 0, 0), (0, 0, 1), (0, 0, 2), (1, 0, 0), (1, 0, 1), (1, 0, 2)], + ) + def test_Data_hdf5_chunksizes(self): """Test Data.nc_hdf5_chunksizes.""" d = cfdm.Data(np.arange(24).reshape(2, 3, 4)) @@ -2488,6 +2543,202 @@ def test_Data_all(self): self.assertTrue(d.all()) self.assertFalse(d.all(keepdims=False)) + def test_Data_concatenate(self): + """Test Data.concatenate.""" + # Unitless operation with default axis (axis=0): + d_np = np.arange(120).reshape(30, 4) + e_np = np.arange(120, 280).reshape(40, 4) + d = cfdm.Data(d_np) + e = cfdm.Data(e_np) + f_np = np.concatenate((d_np, e_np), axis=0) + f = cfdm.Data.concatenate((d, e)) + self.assertEqual(f.shape, f_np.shape) + self.assertTrue((f.array == f_np).all()) + + d_np = np.array([[1, 2], [3, 4]]) + e_np = np.array([[5.0, 6.0]]) + d = cfdm.Data(d_np, "km") + e = cfdm.Data(e_np, "km") + f_np = np.concatenate((d_np, e_np), axis=0) + f = cfdm.Data.concatenate([d, e]) + self.assertEqual(f.shape, f_np.shape) + self.assertTrue((f.array == f_np).all()) + + # Check axes equivalency: + self.assertTrue(f.equals(cfdm.Data.concatenate((d, e), axis=-2))) + + # Non-default axis specification: + e_np = np.array([[5.0], [6.0]]) # for compatible shapes with axis=1 + e = cfdm.Data(e_np, "km") + f_np = np.concatenate((d_np, e_np), axis=1) + f = cfdm.Data.concatenate((d, e), axis=1) + self.assertEqual(f.shape, f_np.shape) + self.assertTrue((f.array == f_np).all()) + + # Operation with every data item in sequence being a scalar: + d_np = np.array(1) + e_np = np.array(50.0) + d = cfdm.Data(d_np, "km") + e = cfdm.Data(e_np, "km") + + # Note can't use the following (to compute answer): + # f_np = np.concatenate([d_np, e_np]) + # here since we have different behaviour to NumPy w.r.t + # scalars, where NumPy would error for the above with: + # ValueError: zero-dimensional arrays cannot be concatenated + f_answer = np.array([d_np, e_np]) + f = cfdm.Data.concatenate((d, e)) + self.assertEqual(f.shape, f_answer.shape) + self.assertTrue((f.array == f_answer).all()) + + # Operation with some scalar and some non-scalar data in the + # sequence: + e_np = np.array([50.0, 75.0]) + e = cfdm.Data(e_np, "km") + + # As per above comment, can't use np.concatenate to compute + f_answer = np.array([1.0, 50, 75]) + f = cfdm.Data.concatenate((d, e)) + self.assertEqual(f.shape, f_answer.shape) + self.assertTrue((f.array == f_answer).all()) + + # Check cached elements + cached = f._get_cached_elements() + self.assertEqual(cached[0], d.first_element()) + self.assertEqual(cached[-1], e.last_element()) + + # Check concatenation with one invalid units + d.Units = cfdm.Units("foo") + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e], relaxed_units=True) + + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e], axis=1) + + # Check concatenation with both invalid units + d.Units = cfdm.Units("foo") + e.Units = cfdm.Units("foo") + f = cfdm.Data.concatenate([d, e], relaxed_units=True) + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e]) + + e.Units = cfdm.Units("foobar") + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e], relaxed_units=True) + + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e]) + + e.Units = cfdm.Units("metre") + with self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e], relaxed_units=True) + + with 
self.assertRaises(ValueError): + f = cfdm.Data.concatenate([d, e], axis=1) + + # Test cached elements + d = cfdm.Data([1, 2, 3]) + e = cfdm.Data([4, 5]) + repr(d) + repr(e) + f = cfdm.Data.concatenate([d, e], axis=0) + self.assertEqual( + f._get_cached_elements(), + {0: d.first_element(), -1: e.last_element()}, + ) + + def test_Data_aggregated_data(self): + """Test Data aggregated_data methods.""" + d = cfdm.Data(9) + aggregated_data = { + "location": "location", + "shape": "shape", + "address": "cfa_address", + } + + self.assertFalse(d.nc_has_aggregated_data()) + self.assertIsNone(d.nc_set_aggregated_data(aggregated_data)) + self.assertTrue(d.nc_has_aggregated_data()) + self.assertEqual(d.nc_get_aggregated_data(), aggregated_data) + self.assertEqual(d.nc_del_aggregated_data(), aggregated_data) + self.assertFalse(d.nc_has_aggregated_data()) + self.assertEqual(d.nc_get_aggregated_data(), {}) + self.assertEqual(d.nc_del_aggregated_data(), {}) + + def test_Data_replace_directory(self): + """Test Data.replace_directory.""" + f = self.f0 + + # No files means no stored directories + self.assertEqual(f.data.file_directories(), set()) + + cfdm.write(f, file_A) + d = cfdm.read(file_A, dask_chunks=4)[0].data + self.assertGreater(d.npartitions, 1) + + e = d.copy() + directory = cfdm.dirname(file_A) + + self.assertEqual(d.file_directories(), set([directory])) + self.assertIsNone(d.replace_directory()) + d.replace_directory(directory, "/new/path") + self.assertEqual( + d.file_directories(), + set(["/new/path"]), + ) + self.assertEqual( + d.get_filenames(), set((f"/new/path/{os.path.basename(file_A)}",)) + ) + + # Check that we haven't changed 'e' + self.assertEqual(e.file_directories(), set([directory])) + + d.replace_directory(new="/newer/path", common=True) + self.assertEqual( + d.file_directories(), + set(["/newer/path"]), + ) + self.assertEqual( + d.get_filenames(), + set((f"/newer/path/{os.path.basename(file_A)}",)), + ) + + with self.assertRaises(ValueError): + d.replace_directory(old="something", common=True) + + d.replace_directory("/newer/path") + self.assertEqual( + d.file_directories(), + set([""]), + ) + self.assertEqual( + d.get_filenames(), set((f"{os.path.basename(file_A)}",)) + ) + + def test_Data_replace_filenames(self): + """Test Data.replace_filenames.""" + f = self.f0 + cfdm.write(f[:2], file_A) + cfdm.write(f[2:], file_B) + a = cfdm.read(file_A)[0] + b = cfdm.read(file_B)[0] + d = cfdm.Data.concatenate([a.data, b.data], axis=0) + + self.assertEqual(d.get_filenames(), set([file_A, file_B])) + self.assertEqual(d.numblocks, (2, 1)) + + new_filenames = [["a"], ["b"]] + self.assertIsNone(d.replace_filenames(new_filenames)) + self.assertEqual(d.numblocks, np.shape(new_filenames)) + + self.assertEqual(d.get_filenames(normalise=False), set(["a", "b"])) + self.assertTrue( + ( + d.get_filenames(normalise=False, per_chunk=True) + == new_filenames + ).all() + ) + def test_Data_has_deterministic_name(self): """Test Data.has_deterministic_name.""" d = cfdm.Data([1, 2], "m") diff --git a/cfdm/test/test_DimensionCoordinate.py b/cfdm/test/test_DimensionCoordinate.py index fb3019484f..5b1db8e177 100644 --- a/cfdm/test/test_DimensionCoordinate.py +++ b/cfdm/test/test_DimensionCoordinate.py @@ -45,6 +45,8 @@ class DimensionCoordinateTest(unittest.TestCase): bounds.set_data(cfdm.Data(b)) dim.set_bounds(bounds) + f0 = cfdm.example_field(0) + def setUp(self): """Preparations called immediately before each test method.""" # Disable log messages to silence expected warnings @@ -110,14 +112,14 @@ def 
test_DimensionCoordinate_climatology(self): def test_DimensiconCoordinate_array(self): """Test the `DimensionCoordinate.array` method.""" - f = cfdm.example_field(0) + f = self.f0 t = f.construct("time") self.assertEqual(t.array, 31) self.assertEqual(t.array, t.data.array) def test_DimensiconCoordinate_datetime_array(self): """Test the `DimensionCoordinate.datetime_array` method.""" - f = cfdm.example_field(0) + f = self.f0 t = f.construct("time") self.assertEqual( t.datetime_array, diff --git a/cfdm/test/test_Domain.py b/cfdm/test/test_Domain.py index 2fc46148e2..b7f57c4eff 100644 --- a/cfdm/test/test_Domain.py +++ b/cfdm/test/test_Domain.py @@ -1,11 +1,35 @@ +import atexit import datetime import faulthandler +import os +import tempfile import unittest +import numpy as np + faulthandler.enable() # to debug seg faults and timeouts import cfdm +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_Domain.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile,) = tmpfiles + + +def _remove_tmpfiles(): + """Remove temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + class DomainTest(unittest.TestCase): """Unit test for the Domain class.""" @@ -199,6 +223,37 @@ def test_Domain_get_original_filenames(self): ), ) + def test_Domain_persist(self): + """Test Domain.persist.""" + f = cfdm.example_field(0) + cfdm.write(f, tmpfile) + f = cfdm.read(tmpfile)[0].domain + + on_disk = False + for v in f.coordinate("longitude").data.todict().values(): + if isinstance(v, cfdm.data.abstract.FileArray): + on_disk = True + + self.assertTrue(on_disk) + + g = f.persist() + in_memory = False + for v in g.coordinate("longitude").data.todict().values(): + if isinstance(v, np.ndarray): + in_memory = True + + self.assertTrue(in_memory) + + # In-place and metdata + f = cfdm.read(tmpfile)[0].domain + self.assertIsNone(f.persist(inplace=True)) + in_memory = False + for v in f.coordinate("longitude").data.todict().values(): + if isinstance(v, np.ndarray): + in_memory = True + + self.assertTrue(in_memory) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_Field.py b/cfdm/test/test_Field.py index 1250b91739..2df2f87a6e 100644 --- a/cfdm/test/test_Field.py +++ b/cfdm/test/test_Field.py @@ -79,7 +79,7 @@ def test_Field__init__(self): """Test the Field constructor and source keyword.""" cfdm.Field(source="qwerty") - def test_Field___getitem__(self): + def test_Field__getitem__(self): """Test the access of field subspsaces from Field.""" f = self.f1 f = f.squeeze() @@ -156,57 +156,16 @@ def test_Field___getitem__(self): with self.assertRaises(IndexError): f[..., [False] * f.shape[-1]] - # def test_Field___setitem__(self): - # f = self.f.squeeze() - # - # f[...] = 0 - # self.assertTrue((f.data.array == 0).all()) - # - # f[:, :] = 0 - # self.assertTrue((f.data.array == 0).all()) - # - # - # for indices in [ - # (slice(None) , slice(None)), - # (slice(3, 7) , slice(None)), - # (slice(None) , slice(2, 5)), - # (slice(3, 7) , slice(2, 5)), - # (slice(6, 2, -1), slice(4, 1, -1)), - # (slice(2, 6) , slice(4, 1, -1)), - # ([0, 3, 8] , [1, 7, 8]), - # ([7, 4, 1] , slice(6, 8)), - # ]: - # f[...] 
= 0 - # f[indices] = -1 - # array = f[indices].data.array - # self.assertTrue((array == -1).all()) - # - # values, counts = np.unique(f.data.array, return_counts=True) - # self.assertEqual(counts[0], array.size) - - # def test_Field_get_filenames(self): - # """Test the `get_filenames` Field method.""" - # cfdm.write(self.f0, tmpfile) - # g = cfdm.read(tmpfile)[0] - # - # abspath_tmpfile = os.path.abspath(tmpfile) - # self.assertEqual(g.get_filenames(), set([abspath_tmpfile])) - # - # g.data[...] = -99 - # self.assertEqual(g.get_filenames(), set([abspath_tmpfile])) - # - # for c in g.constructs.filter_by_data().values(): - # c.data[...] = -99 - # - # self.assertEqual(g.get_filenames(), set([abspath_tmpfile])) - # - # for c in g.constructs.filter_by_data().values(): - # if c.has_bounds(): - # c.bounds.data[...] = -99 - # - # self.assertEqual(g.get_filenames(), set()) - # - # os.remove(tmpfile) + def test_Field_get_filenames(self): + """Test Field.get_filenames.""" + cfdm.write(self.f0, tmpfile) + f = cfdm.read(tmpfile)[0] + + abspath_tmpfile = os.path.abspath(tmpfile) + self.assertEqual(f.get_filenames(), set([abspath_tmpfile])) + + f.persist(inplace=True, metadata=True) + self.assertEqual(f.get_filenames(), set()) def test_Field_apply_masking(self): """Test the `apply_masking` Field method.""" @@ -543,23 +502,36 @@ def test_Field_has_construct(self): f.set_construct(cfdm.DomainAxis(0), key="") self.assertTrue(f.has_construct("")) - def test_Field_squeeze_transpose(self): - """Test squeeze and transpose methods.""" + def test_Field_squeeze(self): + """Test Field.squeeze.""" + f = self.f1 + + for axes in (None, "atmosphere_hybrid_height_coordinate"): + g = f.squeeze(axes) + self.assertEqual(g.data.shape, f.data.shape[1:]) + self.assertEqual(g.get_data_axes(), f.get_data_axes()[1:]) + + self.assertIsNone(g.squeeze(inplace=True)) + + def test_Field_transpose(self): + """Test Field.transpose.""" f = self.f1 g = f.transpose() self.assertEqual(g.data.shape, f.data.shape[::-1]) self.assertEqual(g.get_data_axes(), f.get_data_axes()[::-1]) - g = f.squeeze() - self.assertEqual(g.data.shape, f.data.shape[1:]) - self.assertEqual( - g.get_data_axes(), - f.get_data_axes()[1:], - (g.get_data_axes(), f.get_data_axes()), + g = g.transpose( + [ + "atmosphere_hybrid_height_coordinate", + "grid_latitude", + "grid_longitude", + ] ) + self.assertEqual(g.shape, f.shape) + self.assertEqual(g.get_data_axes(), f.get_data_axes()) - def test_Field_insert_dimension(self): + def test_Field_AAAinsert_dimension(self): """Test cfdm.Field.insert_dimension method.""" f = self.f1 g = f.copy() @@ -578,6 +550,14 @@ def test_Field_insert_dimension(self): h = g.insert_dimension(None, constructs=True) self.assertEqual(h.cell_measure().ndim, 3) + f = f.squeeze() + array = f.array + for i in tuple(range(f.ndim + 1)) + tuple(range(-1, -f.ndim - 2, -1)): + self.assertEqual( + f.insert_dimension(None, i).shape, + np.expand_dims(array, i).shape, + ) + def test_Field_compress_uncompress(self): """Test the compress and uncompress Field methods.""" contiguous = os.path.join( @@ -639,8 +619,8 @@ def test_Field_compress_uncompress(self): self.assertTrue(f.equals(c, verbose=3), message) def test_Field_creation_commands(self): - """Test the `creation_commands` Field method.""" - for i in range(7): + """Test the Field.creation_commands.""" + for i in range(12): f = cfdm.example_field(i) f = self.f1 @@ -658,7 +638,7 @@ def test_Field_creation_commands(self): f.creation_commands(namespace=ns) def test_Field_has_geometry(self): - """Test the 
`creation_commands` Field method.""" + """Test Field.has_geometry.""" f = self.f1 self.assertFalse(f.has_geometry()) @@ -667,7 +647,7 @@ def test_Field_has_geometry(self): def test_Field_climatological_time_axes(self): """Test the `climatological_time_axes` method of Field.""" - f = cfdm.example_field(0) + f = self.f0.copy() self.assertEqual(f.climatological_time_axes(), set()) f.set_construct( @@ -685,7 +665,7 @@ def test_Field_climatological_time_axes(self): def test_Field_bounds(self): """Test that Field instances do not have cell bounds.""" - f = cfdm.example_field(0) + f = self.f0 self.assertFalse(f.has_bounds()) def test_Field_auxiliary_coordinate(self): @@ -830,7 +810,7 @@ def test_Field_domain_axis(self): def test_Field_indices(self): """Test Field.indices.""" - f = cfdm.example_field(0) + f = self.f0 g = f[f.indices(longitude=112.5)] self.assertEqual(g.shape, (5, 1)) @@ -866,7 +846,7 @@ def test_Field_indices(self): def test_Field_get_original_filenames(self): """Test Field.orignal_filenames.""" - f = cfdm.example_field(0) + f = self.f0 f._original_filenames(define=["file1.nc", "file2.nc"]) x = f.coordinate("longitude") x._original_filenames(define=["file1.nc", "file3.nc"]) @@ -909,7 +889,7 @@ def test_Field_del_properties(self): def test_Field_hdf5_chunksizes(self): """Test the HDF5 chunk size methods of a Field.""" - f = cfdm.example_field(0) + f = self.f0.copy() f.nc_set_hdf5_chunksizes({"latitude": 1}) self.assertEqual(f.nc_hdf5_chunksizes(), (1, 8)) @@ -1012,6 +992,80 @@ def test_Field_hdf5_chunksizes(self): f.dimension_coordinate("latitude").nc_hdf5_chunksizes() ) + def test_Field_concatenate(self): + """Test Field.concatenate.""" + f = self.f1.copy() + + g = cfdm.Field.concatenate([f.copy()], axis=0) + self.assertEqual(g.shape, (1, 10, 9)) + + x = [f.copy() for i in range(8)] + + g = cfdm.Field.concatenate(x, axis=0) + self.assertEqual(g.shape, (8, 10, 9)) + + key = x[3].construct_key("latitude") + x[3].del_construct(key) + g = cfdm.Field.concatenate(x, axis=0) + self.assertEqual(g.shape, (8, 10, 9)) + + with self.assertRaises(Exception): + g = cfdm.Field.concatenate([], axis=0) + + def test_Field_persist(self): + """Test Field.persist.""" + f = self.f0.copy() + cfdm.write(f, tmpfile) + f = cfdm.read(tmpfile)[0] + + for d in (f.data.todict(), f.coordinate("longitude").data.todict()): + on_disk = False + for v in d.values(): + if isinstance(v, cfdm.data.abstract.FileArray): + on_disk = True + + self.assertTrue(on_disk) + + g = f.persist() + d = g.data.todict() + in_memory = False + for v in d.values(): + if isinstance(v, np.ndarray): + in_memory = True + + self.assertTrue(in_memory) + + d = g.coordinate("longitude").data.todict() + on_disk = False + for v in d.values(): + if isinstance(v, cfdm.data.abstract.FileArray): + on_disk = True + + self.assertTrue(on_disk) + + # In-place and metdata + f = cfdm.read(tmpfile)[0] + self.assertIsNone(f.persist(metadata=True, inplace=True)) + for d in ( + f.data.todict(), + f.coordinate("longitude").data.todict(), + ): + in_memory = False + for v in d.values(): + if isinstance(v, np.ndarray): + in_memory = True + + self.assertTrue(in_memory) + + def test_Field_unsqueeze(self): + """Test Field.unsqueeze.""" + f = self.f0.copy() + self.assertEqual(f.shape, (5, 8)) + g = f.unsqueeze() + self.assertEqual(g.shape, (1, 5, 8)) + self.assertIsNone(g.unsqueeze(inplace=True)) + self.assertEqual(g.shape, (1, 5, 8)) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_LinearSubarray.py 
b/cfdm/test/test_LinearSubarray.py index c10501d498..9c37ef1ada 100644 --- a/cfdm/test/test_LinearSubarray.py +++ b/cfdm/test/test_LinearSubarray.py @@ -40,13 +40,6 @@ def test_LinearSubarray_get_filename(self): with self.assertRaises(AttributeError): x.get_filename() - def test_LinearSubarray_get_filenames(self): - """Test LinearSubarray.get_filenames.""" - x = cfdm.LinearSubarray( - data=123, parameters={}, dependent_tie_points={} - ) - self.assertEqual(x.get_filenames(), ()) - if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_NetCDF4Array.py b/cfdm/test/test_NetCDF4Array.py index c70c327235..02200ec193 100644 --- a/cfdm/test/test_NetCDF4Array.py +++ b/cfdm/test/test_NetCDF4Array.py @@ -9,6 +9,7 @@ faulthandler.enable() # to debug seg faults and timeouts import numpy as np +from dask.base import tokenize import cfdm @@ -35,6 +36,8 @@ def _remove_tmpfiles(): class NetCDF4ArrayTest(unittest.TestCase): """Unit test for the NetCDF4Array class.""" + f0 = cfdm.example_field(0) + def setUp(self): """Preparations called immediately before each test method.""" # Disable log messages to silence expected warnings @@ -48,34 +51,25 @@ def setUp(self): # < ... test code ... > # cfdm.log_level('DISABLE') - def test_NetCDF4Array_get_addresses(self): - """Test `NetCDF4Array.get_addresses`""" + def test_NetCDF4Array_get_address(self): + """Test NetCDF4Array.get_address.""" a = cfdm.NetCDF4Array(address="tas") - self.assertEqual(a.get_addresses(), ("tas",)) - - a = cfdm.NetCDF4Array(address=("tas1", "tas1")) - self.assertEqual(a.get_addresses(), ("tas1", "tas1")) + self.assertEqual(a.get_address(), "tas") a = cfdm.NetCDF4Array() - self.assertEqual(a.get_addresses(), ()) + self.assertIsNone(a.get_address(default=None)) - def test_NetCDF4Array_get_filenames(self): - """Test `NetCDF4Array.get_filenames`""" + def test_NetCDF4Array_get_filename(self): + """Test NetCDF4Array.get_filename.""" a = cfdm.NetCDF4Array("/data1/file1") - self.assertEqual(a.get_filenames(), ("/data1/file1",)) - - a = cfdm.NetCDF4Array(("/data1/file1",)) - self.assertEqual(a.get_filenames(), ("/data1/file1",)) - - a = cfdm.NetCDF4Array(("/data1/file1", "/data2/file2")) - self.assertEqual(a.get_filenames(), ("/data1/file1", "/data2/file2")) + self.assertEqual(a.get_filename(), "/data1/file1") a = cfdm.NetCDF4Array() - self.assertEqual(a.get_filenames(), ()) + self.assertIsNone(a.get_filename(default=None)) def test_NetCDF4Array_mask(self): """Test NetCDF4Array masking.""" - f = cfdm.example_field(0) + f = self.f0 f.data[0] = np.ma.masked cfdm.write(f, tmpfile) array = f.array @@ -97,7 +91,7 @@ def test_NetCDF4Array_unpack(self): add_offset = 10.0 scale_factor = 3.14 - f = cfdm.example_field(0) + f = self.f0.copy() f.data[0] = np.ma.masked array0 = f.array array1 = (array0 - add_offset) / scale_factor @@ -167,13 +161,10 @@ def test_NetCDF4Array_get_storage_options(self): def test_NetCDF4Array_get_attributes(self): """Test NetCDF4Array get_attributes.""" - f = cfdm.example_field(0) + f = self.f0 cfdm.write(f, tmpfile) n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) - self.assertIsNone(n.get_attributes(None)) - - with self.assertRaises(ValueError): - n.get_attributes() + self.assertEqual(n.get_attributes(), {}) # Set attributes via indexing n = n[...] 
@@ -189,6 +180,159 @@ def test_NetCDF4Array_get_attributes(self): }, ) + def test_NetCDF4Array_file_directory(self): + """Test NetCDF4Array.file_directory.""" + a = cfdm.NetCDF4Array("/data1/file1") + self.assertEqual(a.file_directory(), "/data1") + + a = cfdm.NetCDF4Array() + self.assertIsNone(a.file_directory(default=None)) + + def test_NetCDF4Array__dask_tokenize__(self): + """Test NetCDF4Array.__dask_tokenize__""" + a = cfdm.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False) + self.assertEqual(tokenize(a), tokenize(a.copy())) + + b = cfdm.NetCDF4Array("/home/file2", "tas", shape=(12, 2)) + self.assertNotEqual(tokenize(a), tokenize(b)) + + def test_NetCDF4Array_shape(self): + """Test NetCDF4Array.shape.""" + shape = (12, 73, 96) + a = cfdm.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual(a.shape, shape) + self.assertEqual(a.original_shape, shape) + a = a[::2] + self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) + self.assertEqual(a.original_shape, shape) + + def test_NetCDF4Array_index(self): + """Test NetCDF4Array.index.""" + shape = (12, 73, 96) + a = cfdm.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual(list(a.index()), [slice(0, n, 1) for n in shape]) + a = a[8:7:-1, 10:19:3, [15, 1, 4, 12]] + a = a[[0], [True, False, True], ::-2] + self.assertEqual(a.shape, (1, 2, 2)) + self.assertEqual( + a.index(), + (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)), + ) + + index = a.index(conform=False) + self.assertTrue((index[0] == [8]).all()) + self.assertTrue((index[1] == [10, 16]).all()) + self.assertTrue((index[2] == [12, 1]).all()) + + # New dimensions + a = cfdm.NetCDF4Array("/home/file2", "tas", shape=shape) + + b = a[:2, None, ...] + self.assertEqual( + b.index(), (slice(0, 2, 1), None, slice(0, 73, 1), slice(0, 96, 1)) + ) + self.assertEqual(b.shape, (2, 1, 73, 96)) + self.assertEqual(b.reference_shape, (12, 1, 73, 96)) + + b = b[...] 
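+ # Re-indexing with Ellipsis alone should leave the stored index,
+ # shape and reference_shape unchanged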
+ self.assertEqual( + b.index(), (slice(0, 2, 1), None, slice(0, 73, 1), slice(0, 96, 1)) + ) + self.assertEqual(b.shape, (2, 1, 73, 96)) + self.assertEqual(b.reference_shape, (12, 1, 73, 96)) + + b = b[..., :4] + self.assertEqual( + b.index(), (slice(0, 2, 1), None, slice(0, 73, 1), slice(0, 4, 1)) + ) + self.assertEqual(b.shape, (2, 1, 73, 4)) + self.assertEqual(b.reference_shape, (12, 1, 73, 96)) + + b = b[..., None, :] + self.assertEqual( + b.index(), + (slice(0, 2, 1), None, slice(0, 73, 1), None, slice(0, 4, 1)), + ) + self.assertEqual(b.shape, (2, 1, 73, 1, 4)) + self.assertEqual(b.reference_shape, (12, 1, 73, 1, 96)) + + b = b[..., None] + self.assertEqual( + b.index(), + ( + slice(0, 2, 1), + None, + slice(0, 73, 1), + None, + slice(0, 4, 1), + None, + ), + ) + self.assertEqual(b.shape, (2, 1, 73, 1, 4, 1)) + self.assertEqual(b.reference_shape, (12, 1, 73, 1, 96, 1)) + + with self.assertRaises(IndexError): + # index 4 is out of bounds for axis 5 with size 1 + b[..., 4] + + b = b[:, 0, :, -1, :, 0] + self.assertEqual( + b.index(), (slice(0, 2, 1), slice(0, 73, 1), slice(0, 4, 1)) + ) + self.assertEqual(b.shape, (2, 73, 4)) + self.assertEqual(b.original_shape, shape) + self.assertEqual(b.reference_shape, shape) + + def test_NetCDF4Array_replace_directory(self): + """Test NetCDF4Array.replace_directory.""" + cwd = os.getcwd() + + n = cfdm.NetCDF4Array("basename.nc") + + m = n.replace_directory() + self.assertEqual(m.get_filename(), "basename.nc") + m = n.replace_directory(new="data") + self.assertEqual(m.get_filename(), "data/basename.nc") + m = n.replace_directory(normalise=True) + self.assertEqual(m.get_filename(), os.path.join(cwd, "basename.nc")) + + n = cfdm.NetCDF4Array("data/basename.nc") + + m = n.replace_directory() + self.assertEqual(m.get_filename(), "data/basename.nc") + m = n.replace_directory(new="/home") + self.assertEqual(m.get_filename(), "/home/data/basename.nc") + m = n.replace_directory(old="data") + self.assertEqual(m.get_filename(), "basename.nc") + m = n.replace_directory(old="data", new="home") + self.assertEqual(m.get_filename(), "home/basename.nc") + m = n.replace_directory(normalise=True) + self.assertEqual( + m.get_filename(), os.path.join(cwd, "data/basename.nc") + ) + m = n.replace_directory(old=cwd, new="new", normalise=True) + self.assertEqual( + m.get_filename(), os.path.join(cwd, "new/data/basename.nc") + ) + + n = cfdm.NetCDF4Array("path/basename.nc") + m = n.replace_directory("path", "../new_path") + self.assertEqual(m.get_filename(), "../new_path/basename.nc") + + n = cfdm.NetCDF4Array("/data/../new_path/basename.nc") + m = n.replace_directory("/new_path/", normalise=True) + self.assertEqual(m.get_filename(), "basename.nc") + + n = cfdm.NetCDF4Array("/data/basename.nc") + m = n.replace_directory(new="/home") + self.assertEqual(m.get_filename(), "/home/data/basename.nc") + + n = cfdm.NetCDF4Array("/data/basename.nc") + m = n.replace_directory(new="/home/") + self.assertEqual(m.get_filename(), "/home/data/basename.nc") + m = n.replace_directory(old="/data") + self.assertEqual(m.get_filename(), "basename.nc") + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_NumpyArray.py b/cfdm/test/test_SparseArray.py similarity index 52% rename from cfdm/test/test_NumpyArray.py rename to cfdm/test/test_SparseArray.py index 3ae3c11ab3..be9274ad20 100644 --- a/cfdm/test/test_NumpyArray.py +++ b/cfdm/test/test_SparseArray.py @@ -1,17 +1,22 @@ -import copy import datetime import faulthandler import unittest import 
numpy as np +from scipy.sparse import coo_array faulthandler.enable() # to debug seg faults and timeouts import cfdm +row = np.array([0, 3, 1, 0]) +col = np.array([0, 3, 1, 2]) +data = np.array([4, 5, 7, 9]) +s = coo_array((data, (row, col)), shape=(4, 4)) -class NumpyArrayTest(unittest.TestCase): - """Unit test for the NumpyArray class.""" + +class SparseArrayTest(unittest.TestCase): + """Unit test for the SparseArray class.""" def setUp(self): """Preparations called immediately before each test method.""" @@ -25,33 +30,25 @@ def setUp(self): # < ... test code ... > # cfdm.log_level('DISABLE') - def test_NumpyArray_copy(self): - """Test the copy module copying behaviour of NumpyArray.""" - a = np.array([1, 2, 3, 4]) - - x = cfdm.NumpyArray(a) - y = copy.deepcopy(x) - self.assertTrue((x.array == a).all()) - self.assertTrue((x.array == y.array).all()) - - def test_NumpyArray__array__(self): - """Test the NumPy array conversion of NumpyArray.""" - a = np.array([1, 2, 3, 4]) - - x = cfdm.NumpyArray(a) + def test_SparseArray_copy(self): + """Test the copy module copying behaviour of SparseArray.""" + x = cfdm.SparseArray(s) + self.assertTrue((s.toarray() == x.array).all()) - b = np.array(x) - self.assertTrue((b == a).all()) + def test_SparseArray__array__(self): + """Test the numpy array conversion of SparseArray.""" + x = cfdm.SparseArray(s) + self.assertTrue((np.array(x) == x.array).all()) - def test_NumpyArray_get_filename(self): - """Test NumpyArray.get_filename.""" - x = cfdm.NumpyArray() + def test_SparseArray_get_filename(self): + """Test SparseArray.get_filename.""" + x = cfdm.SparseArray() with self.assertRaises(AttributeError): x.get_filename() - def test_NumpyArray_get_filenames(self): - """Test NumpyArray.get_filenames.""" - x = cfdm.NumpyArray() + def test_SparseArray_get_filenames(self): + """Test SparseArray.get_filenames.""" + x = cfdm.SparseArray() with self.assertRaises(AttributeError): x.get_filenames() diff --git a/cfdm/test/test_SubsampledArray.py b/cfdm/test/test_SubsampledArray.py index 05da03777b..9d0495f4c9 100644 --- a/cfdm/test/test_SubsampledArray.py +++ b/cfdm/test/test_SubsampledArray.py @@ -100,18 +100,13 @@ def test_SubsampledArray_compressed_dimensions(self): with self.assertRaises(ValueError): c.compressed_dimensions() + def test_SubsampledArray_get_filename(self): + """Test SubsampledArray.get_filename.""" + x = self.coords + self.assertIsNone(x.get_filename(default=None)) -# def test_SubsampledArray_get_filename(self): -# """Test SubsampledArray.get_filename.""" -# x = self.coords -# self.assertIsNone(x.get_filename(None)) -# -# with self.assertRaises(AttributeError): -# x.get_filename() -# -# def test_SubsampledArray_get_filenames(self): -# """Test `SubsampledArray.get_filenames.""" -# self.assertEqual(self.coords.get_filenames(), set()) + with self.assertRaises(AttributeError): + x.get_filename() if __name__ == "__main__": diff --git a/cfdm/test/test_docstring.py b/cfdm/test/test_docstring.py index 1c81ce477a..aa80e00f65 100644 --- a/cfdm/test/test_docstring.py +++ b/cfdm/test/test_docstring.py @@ -57,9 +57,6 @@ def setUp(self): set( _get_all_abbrev_subclasses(cfdm.data.abstract.array.Array) ), - [ # other key classes not in subclass heirarchy above - cfdm.data.NumpyArray - ], ) ) @@ -187,23 +184,17 @@ def test_docstring(self): if name.startswith("__") and not inspect.isfunction(f): continue - self.assertIsNotNone( - f.__doc__, - f"\n\nCLASS: {klass}\n" - f"METHOD NAME: {name}\n" - f"METHOD: {f}\n__doc__: {f.__doc__}", - ) - - self.assertNotIn( - "{{", - 
f.__doc__, - f"\n\nCLASS: {klass}\n" - f"METHOD NAME: {name}\n" - f"METHOD: {f}", - ) + if f.__doc__ is not None: + self.assertNotIn( + "{{", + f.__doc__, + f"\n\nCLASS: {klass}\n" + f"METHOD NAME: {name}\n" + f"METHOD: {f}", + ) def test_docstring_package(self): - """Test the docstring substitution of the pacakage name.""" + """Test the docstring substitution of the package name.""" string = f">>> f = {self.package}." for klass in self.subclasses_of_Container: for x in (klass, klass()): diff --git a/cfdm/test/test_dsg.py b/cfdm/test/test_dsg.py index 8be4980bb4..b0695820f2 100644 --- a/cfdm/test/test_dsg.py +++ b/cfdm/test/test_dsg.py @@ -212,7 +212,7 @@ def test_DSG_contiguous(self): # Create the contiguous ragged array object array = cfdm.RaggedContiguousArray( - compressed_array=cfdm.NumpyArray(ragged_array), + compressed_array=ragged_array, shape=(2, 4), size=8, ndim=2, diff --git a/cfdm/test/test_functions.py b/cfdm/test/test_functions.py index 0d48aeea50..e5601257c6 100644 --- a/cfdm/test/test_functions.py +++ b/cfdm/test/test_functions.py @@ -256,13 +256,54 @@ def test_example_field(self): self.assertEqual(len(cfdm.example_fields(0, 2, 0)), 3) def test_abspath(self): - """Test the abspath function.""" - filename = "test_file.nc" - self.assertEqual(cfdm.abspath(filename), os.path.abspath(filename)) - filename = "http://test_file.nc" - self.assertEqual(cfdm.abspath(filename), filename) - filename = "https://test_file.nc" - self.assertEqual(cfdm.abspath(filename), filename) + """Test cfdm.abspath.""" + cwd = os.getcwd() + cwd_m1 = os.path.dirname(cwd) + + self.assertEqual(cfdm.abspath(""), cwd) + self.assertEqual(cfdm.abspath("file.nc"), f"{cwd}/file.nc") + self.assertEqual(cfdm.abspath("../file.nc"), f"{cwd_m1}/file.nc") + self.assertEqual(cfdm.abspath("file:///file.nc"), "file:///file.nc") + self.assertEqual(cfdm.abspath("file://file.nc"), f"file://{cwd}") + self.assertEqual(cfdm.abspath("file:/file.nc"), "file:///file.nc") + self.assertEqual(cfdm.abspath("http:///file.nc"), "http:///file.nc") + self.assertEqual(cfdm.abspath("http://file.nc"), "http://") + self.assertEqual(cfdm.abspath("http:/file.nc"), "http:///file.nc") + + self.assertEqual( + cfdm.abspath("file.nc", uri=True), f"file://{cwd}/file.nc" + ) + self.assertEqual( + cfdm.abspath("../file.nc", uri=True), f"file://{cwd_m1}/file.nc" + ) + self.assertEqual( + cfdm.abspath("file:///file.nc", uri=True), "file:///file.nc" + ) + self.assertEqual( + cfdm.abspath("file://file.nc", uri=True), f"file://{cwd}" + ) + self.assertEqual( + cfdm.abspath("file:/file.nc", uri=True), "file:///file.nc" + ) + self.assertEqual( + cfdm.abspath("http:///file.nc", uri=True), "http:///file.nc" + ) + self.assertEqual(cfdm.abspath("http://file.nc", uri=True), "http://") + self.assertEqual( + cfdm.abspath("http:/file.nc", uri=True), "http:///file.nc" + ) + + self.assertEqual(cfdm.abspath("file.nc", uri=False), f"{cwd}/file.nc") + self.assertEqual( + cfdm.abspath("../file.nc", uri=False), f"{cwd_m1}/file.nc" + ) + self.assertEqual( + cfdm.abspath("file:///file.nc", uri=False), "/file.nc" + ) + self.assertEqual(cfdm.abspath("file://file.nc", uri=False), cwd) + self.assertEqual(cfdm.abspath("file:/file.nc", uri=False), "/file.nc") + with self.assertRaises(ValueError): + cfdm.abspath("http:///file.nc", uri=False) def test_configuration(self): """Test the configuration function.""" @@ -593,6 +634,126 @@ def test_Configuration(self): self.assertIsInstance(repr(c), str) self.assertEqual(str(c), str(dict(**c))) + def test_dirname(self): + """Test 
cfdm.dirname.""" + cwd = os.getcwd() + cwd_m1 = os.path.dirname(cwd) + + self.assertEqual(cfdm.dirname("file.nc"), "") + self.assertEqual(cfdm.dirname("file.nc", normalise=True), cwd) + self.assertEqual( + cfdm.dirname("file.nc", normalise=True, uri=True), f"file://{cwd}" + ) + self.assertEqual( + cfdm.dirname("file.nc", normalise=True, uri=False), cwd + ) + self.assertEqual( + cfdm.dirname("file.nc", normalise=True, sep=True), f"{cwd}/" + ) + + self.assertEqual(cfdm.dirname("model/file.nc"), "model") + self.assertEqual( + cfdm.dirname("model/file.nc", normalise=True), f"{cwd}/model" + ) + self.assertEqual( + cfdm.dirname("model/file.nc", normalise=True, uri=True), + f"file://{cwd}/model", + ) + self.assertEqual( + cfdm.dirname("model/file.nc", normalise=True, uri=False), + f"{cwd}/model", + ) + + self.assertEqual(cfdm.dirname("../file.nc"), "..") + self.assertEqual(cfdm.dirname("../file.nc", normalise=True), cwd_m1) + self.assertEqual( + cfdm.dirname("../file.nc", normalise=True, uri=True), + f"file://{cwd_m1}", + ) + self.assertEqual( + cfdm.dirname("../file.nc", normalise=True, uri=False), cwd_m1 + ) + + self.assertEqual(cfdm.dirname("/model/file.nc"), "/model") + self.assertEqual( + cfdm.dirname("/model/file.nc", normalise=True), "/model" + ) + self.assertEqual( + cfdm.dirname("/model/file.nc", normalise=True, uri=True), + "file:///model", + ) + self.assertEqual( + cfdm.dirname("/model/file.nc", normalise=True, uri=False), "/model" + ) + + self.assertEqual(cfdm.dirname(""), "") + self.assertEqual(cfdm.dirname("", normalise=True), cwd) + self.assertEqual( + cfdm.dirname("", normalise=True, uri=True), f"file://{cwd}" + ) + self.assertEqual(cfdm.dirname("", normalise=True, uri=False), cwd) + + self.assertEqual( + cfdm.dirname("https:///data/archive/file.nc"), + "https:///data/archive", + ) + self.assertEqual( + cfdm.dirname("https:///data/archive/file.nc", normalise=True), + "https:///data/archive", + ) + self.assertEqual( + cfdm.dirname( + "https:///data/archive/file.nc", normalise=True, uri=True + ), + "https:///data/archive", + ) + with self.assertRaises(ValueError): + cfdm.dirname( + "https:///data/archive/file.nc", normalise=True, uri=False + ) + + self.assertEqual( + cfdm.dirname("file:///data/archive/file.nc"), + "file:///data/archive", + ) + self.assertEqual( + cfdm.dirname("file:///data/archive/file.nc", normalise=True), + "file:///data/archive", + ) + self.assertEqual( + cfdm.dirname( + "file:///data/archive/file.nc", normalise=True, uri=True + ), + "file:///data/archive", + ) + self.assertEqual( + cfdm.dirname( + "file:///data/archive/file.nc", normalise=True, uri=False + ), + "/data/archive", + ) + + self.assertEqual( + cfdm.dirname("file:///data/archive/../file.nc"), + "file:///data/archive/..", + ) + self.assertEqual( + cfdm.dirname("file:///data/archive/../file.nc", normalise=True), + "file:///data", + ) + self.assertEqual( + cfdm.dirname( + "file:///data/archive/../file.nc", normalise=True, uri=True + ), + "file:///data", + ) + self.assertEqual( + cfdm.dirname( + "file:///data/archive/../file.nc", normalise=True, uri=False + ), + "/data", + ) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index 17772c4268..b56d02bc47 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -46,6 +46,8 @@ def _remove_tmpfiles(): class GroupsTest(unittest.TestCase): """Test treatment of netCDF4 files with hierarchical groups.""" + f0 = cfdm.example_field(0) + def 
_check_h5netcdf_groups(self, h5, nc): """Check that an h5netcdf read gives same results as netCDF4. @@ -400,7 +402,7 @@ def test_groups_compression(self): def test_groups_dimension(self): """Test the dimensions of hierarchical groups.""" - f = cfdm.example_field(0) + f = self.f0.copy() ungrouped_file = ungrouped_file4 grouped_file = grouped_file4 diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py index 571f8cdc57..a230641c7e 100644 --- a/cfdm/test/test_netcdf_indexer.py +++ b/cfdm/test/test_netcdf_indexer.py @@ -37,6 +37,8 @@ def _remove_tmpfiles(): class netcdf_indexerTest(unittest.TestCase): """Test the masking and scaling of netCDF data.""" + f0 = cfdm.example_field(0) + def test_netcdf_indexer_shape(self): """Test netcdf_indexer shape.""" n = np.ma.arange(9) @@ -48,7 +50,7 @@ def test_netcdf_indexer_shape(self): def test_netcdf_indexer_mask(self): """Test netcdf_indexer for masking.""" - f0 = cfdm.example_field(0) + f0 = self.f0.copy() f0.del_property("missing_value", None) f0.del_property("_FillValue", None) fields = [f0.copy()] @@ -97,7 +99,7 @@ def test_netcdf_indexer_mask(self): def test_netcdf_indexer_unpack(self): """Test netcdf_indexer for unpacking.""" - f = cfdm.example_field(0) + f = self.f0.copy() array = np.ma.arange(40, dtype="int32").reshape(f.shape) array[1, :] = np.ma.masked diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 40875fa0ad..fbe39db3ac 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -13,6 +13,7 @@ faulthandler.enable() # to debug seg faults and timeouts import cfdm +from cfdm.read_write.exceptions import DatasetTypeError warnings = False @@ -50,6 +51,9 @@ def _remove_tmpfiles(): class read_writeTest(unittest.TestCase): """Test the reading and writing of field constructs from/to disk.""" + f0 = cfdm.example_field(0) + f1 = cfdm.example_field(1) + def setUp(self): """Preparations called immediately before each test method.""" # Disable log messages to silence expected warnings @@ -80,7 +84,7 @@ def setUp(self): def test_write_filename(self): """Test the writing of a named netCDF file.""" - f = cfdm.example_field(0) + f = self.f0 a = f.data.array cfdm.write(f, tmpfile) @@ -91,39 +95,39 @@ def test_write_filename(self): self.assertTrue((a == g[0].data.array).all()) - def test_read_field(self): - """Test the `extra` keyword argument of the `read` function.""" + def test_read_extra(self): + """Test the cfdm.read 'extra' keyword.""" # Test field keyword of cfdm.read filename = self.filename f = cfdm.read(filename) - self.assertEqual(len(f), 1, "\n" + str(f)) + self.assertEqual(len(f), 1) f = cfdm.read( filename, extra=["dimension_coordinate"], warnings=warnings ) - self.assertEqual(len(f), 4, "\n" + str(f)) + self.assertEqual(len(f), 4) f = cfdm.read( filename, extra=["auxiliary_coordinate"], warnings=warnings ) - self.assertEqual(len(f), 4, "\n" + str(f)) + self.assertEqual(len(f), 4) f = cfdm.read(filename, extra="cell_measure") - self.assertEqual(len(f), 2, "\n" + str(f)) + self.assertEqual(len(f), 2) f = cfdm.read(filename, extra=["field_ancillary"]) - self.assertEqual(len(f), 4, "\n" + str(f)) + self.assertEqual(len(f), 4) f = cfdm.read(filename, extra="domain_ancillary", warnings=warnings) - self.assertEqual(len(f), 4, "\n" + str(f)) + self.assertEqual(len(f), 4) f = cfdm.read( filename, extra=["field_ancillary", "auxiliary_coordinate"], warnings=warnings, ) - self.assertEqual(len(f), 7, "\n" + str(f)) + self.assertEqual(len(f), 7) self.assertEqual( len( @@ -161,10 
+165,10 @@ def test_read_field(self): ), warnings=warnings, ) - self.assertEqual(len(f), 14, "\n" + str(f)) + self.assertEqual(len(f), 14) def test_read_write_format(self): - """Test the `fmt` keyword argument of the `read` function.""" + """Test the cfdm.write 'fmt' keyword.""" f = cfdm.read(self.filename)[0] for fmt in self.netcdf_fmts: cfdm.write(f, tmpfile, fmt=fmt) @@ -443,21 +447,26 @@ def test_write_netcdf_mode(self): f[0].nc_global_attributes(), g_new.nc_global_attributes() ) - def test_read_write_netCDF4_compress_shuffle(self): + def test_read_write_compress_shuffle(self): """Test the `compress` and `shuffle` parameters to `write`.""" - f = cfdm.read(self.filename)[0] - for fmt in self.netcdf4_fmts: - for shuffle in (True,): - for compress in (4,): # range(10): + f = self.f0.copy() + f.data.nc_set_hdf5_chunksizes("contiguous") + y = f.domain_axis("latitude") + y.nc_set_unlimited(True) + + for fmt in ("NETCDF3_CLASSIC", "NETCDF4"): + for shuffle in ( + False, + True, + ): + for compress in (0, 1): cfdm.write( f, tmpfile, fmt=fmt, compress=compress, shuffle=shuffle ) g = cfdm.read(tmpfile)[0] - self.assertTrue( - f.equals(g, verbose=3), - "Bad read/write with lossless compression: " - f"{fmt}, {compress}, {shuffle}", - ) + self.assertTrue(f.equals(g)) + + y.nc_set_unlimited(False) def test_read_write_missing_data(self): """Test reading and writing of netCDF with missing data.""" @@ -471,7 +480,7 @@ def test_read_write_missing_data(self): def test_read_mask(self): """Test reading and writing of netCDF with masked data.""" - f = cfdm.example_field(0) + f = self.f0.copy() N = f.size @@ -520,11 +529,31 @@ def test_write_datatype(self): datatype={np.dtype(float): np.dtype("float32")}, ) g = cfdm.read(tmpfile)[0] - self.assertEqual( - g.data.dtype, - np.dtype("float32"), - "datatype read in is " + str(g.data.dtype), - ) + self.assertEqual(g.data.dtype, np.dtype("float32")) + + # Keyword single + f = cfdm.read(self.filename)[0] + self.assertEqual(f.dtype, np.dtype(float)) + cfdm.write(f, tmpfile, fmt="NETCDF4", single=True) + g = cfdm.read(tmpfile)[0] + self.assertEqual(g.dtype, np.dtype("float32")) + + # Keyword double + f = g + self.assertEqual(f.dtype, np.dtype("float32")) + cfdm.write(f, tmpfile1, fmt="NETCDF4", double=True) + g = cfdm.read(tmpfile1)[0] + self.assertEqual(g.dtype, np.dtype(float)) + + with self.assertRaises(Exception): + cfdm.write(g, double=True, single=True) + + datatype = {np.dtype(float): np.dtype("float32")} + with self.assertRaises(Exception): + cfdm.write(g, datatype=datatype, single=True) + + with self.assertRaises(Exception): + cfdm.write(g, datatype=datatype, double=True) def test_read_write_unlimited(self): """Test reading and writing with an unlimited dimension.""" @@ -533,7 +562,7 @@ def test_read_write_unlimited(self): domain_axes = f.domain_axes() domain_axes["domainaxis0"].nc_set_unlimited(True) - cfdm.write(f, tmpfile, fmt=fmt) + cfdm.write(f, tmpfile, fmt=fmt, cfa=None) f = cfdm.read(tmpfile)[0] domain_axes = f.domain_axes() @@ -545,7 +574,7 @@ def test_read_write_unlimited(self): domain_axes["domainaxis0"].nc_set_unlimited(True) domain_axes["domainaxis2"].nc_set_unlimited(True) - cfdm.write(f, tmpfile, fmt="NETCDF4") + cfdm.write(f, tmpfile, fmt="NETCDF4", cfa=None) f = cfdm.read(tmpfile)[0] domain_axes = f.domain_axes() @@ -554,7 +583,6 @@ def test_read_write_unlimited(self): def test_read_CDL(self): """Test the reading of files in CDL format.""" - tmpfileh2 = "delme.nc" subprocess.run( " ".join(["ncdump", self.filename, ">", tmpfile]), shell=True, 
@@ -618,7 +646,7 @@ def test_read_CDL(self): ) ) - with self.assertRaises(OSError): + with self.assertRaises(DatasetTypeError): cfdm.read("test_read_write.py") # TODO: make portable instead of skipping on Mac OS (see Issue #25): @@ -658,15 +686,15 @@ def test_read_CDL(self): cfdm.read(tmpfileh)[0] # Finally test an invalid CDL input - with open(tmpfilec3, "w") as file: - file.write("netcdf test_file {\n add badness\n}") + with open(tmpfilec3, "w") as fh: + fh.write("netcdf test_file {\n add badness\n}") # TODO: work out (if it is even possible in a farily simple way) how # to suppress the expected error in stderr of the ncdump command # called by cfdm.read under the hood. Note that it can be easily # suppressed at subprocess call-time (but we don't want to do that in # case of genuine errors) and the following doesn't work as it doesn't # influence the subprocess: with contextlib.redirect_stdout(os.devnull) - with self.assertRaises(ValueError): + with self.assertRaises(RuntimeError): cfdm.read(tmpfilec3) def test_read_write_string(self): @@ -779,7 +807,7 @@ def test_read_write_multiple_geometries(self): def test_read_write_domain(self): """Test the reading and writing of domain constucts.""" - f = cfdm.example_field(1) + f = self.f1 d = f.domain.copy() # 1 domain @@ -824,7 +852,7 @@ def test_read_write_domain(self): def test_write_coordinates(self): """Test the `coordinates` keyword argument of `write`.""" - f = cfdm.example_field(0) + f = self.f0 cfdm.write(f, tmpfile, coordinates=True) g = cfdm.read(tmpfile) @@ -834,7 +862,7 @@ def test_write_coordinates(self): def test_write_scalar_domain_ancillary(self): """Test the writing of a file with a scalar domain ancillary.""" - f = cfdm.example_field(1) + f = self.f1.copy() # Create scalar domain ancillary d = f.construct("ncvar%a") @@ -849,7 +877,7 @@ def test_write_scalar_domain_ancillary(self): def test_write_filename_expansion(self): """Test the writing to a file name that requires expansions.""" - f = cfdm.example_field(0) + f = self.f0 filename = os.path.join("$PWD", os.path.basename(tmpfile)) cfdm.write(f, filename) @@ -860,7 +888,7 @@ def test_read_zero_length_file(self): tmpfiles.append(tmpfile) subprocess.run(f"touch {tmpfile}", shell=True, check=True) - with self.assertRaises(OSError): + with self.assertRaises(DatasetTypeError): cfdm.read(tmpfile) def test_read_subsampled_coordinates(self): @@ -918,7 +946,7 @@ def test_read_original_filenames(self): def test_write_omit_data(self): """Test the `omit_data` parameter to `write`.""" - f = cfdm.example_field(1) + f = self.f1 cfdm.write(f, tmpfile) cfdm.write(f, tmpfile, omit_data="all") @@ -951,7 +979,7 @@ def test_write_omit_data(self): def test_read_write_domain_ancillary(self): """Test when domain ancillary equals dimension coordinate.""" - f = cfdm.example_field(1) + f = self.f1 # Check the domain ancillary does indeed equal the dimension # coordinate @@ -1007,19 +1035,19 @@ def test_write_parametric_Z_coordinate(self): """Test write of parametric Z coordinate.""" # Thes write when a parametric Z dimension coordinate does not # have a compute_standard_name attribute - f = cfdm.example_field(1) + f = self.f1.copy() f.coordinate("atmosphere_hybrid_height_coordinate").del_property( "computed_standard_name", None ) cfdm.write(f, tmpfile) - def test_write_hdf5_chunks(self): - """Test the 'hdf5_chunks' parameter to `cfdm.write`.""" + def test_write_dataset_chunks(self): + """Test the 'dataset_chunks' parameter to `cfdm.write`.""" f = cfdm.example_field(5) f.nc_set_variable("data") - # Good 
hdf5_chunks values - for hdf5_chunks, chunking in zip( + # Good dataset_chunks values + for dataset_chunks, chunking in zip( ("4MiB", "8KiB", "5000", 314.159, 1, "contiguous"), ( [118, 5, 8], @@ -1030,41 +1058,41 @@ def test_write_hdf5_chunks(self): "contiguous", ), ): - cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks) + cfdm.write(f, tmpfile, dataset_chunks=dataset_chunks) nc = netCDF4.Dataset(tmpfile, "r") self.assertEqual(nc.variables["data"].chunking(), chunking) nc.close() - # Bad hdf5_chunks values - for hdf5_chunks in ("bad_value", None): + # Bad dataset_chunks values + for dataset_chunks in ("bad_value", None): with self.assertRaises(ValueError): - cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks) + cfdm.write(f, tmpfile, dataset_chunks=dataset_chunks) # Check that user-set chunks are not overridden for chunking in ([5, 4, 3], "contiguous"): f.nc_set_hdf5_chunksizes(chunking) - for hdf5_chunks in ("4MiB", "contiguous"): - cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks) + for dataset_chunks in ("4MiB", "contiguous"): + cfdm.write(f, tmpfile, dataset_chunks=dataset_chunks) nc = netCDF4.Dataset(tmpfile, "r") self.assertEqual(nc.variables["data"].chunking(), chunking) nc.close() f.nc_set_hdf5_chunksizes("120 B") - for hdf5_chunks in ("contiguous", "4MiB"): - cfdm.write(f, tmpfile, hdf5_chunks=hdf5_chunks) + for dataset_chunks in ("contiguous", "4MiB"): + cfdm.write(f, tmpfile, dataset_chunks=dataset_chunks) nc = netCDF4.Dataset(tmpfile, "r") self.assertEqual(nc.variables["data"].chunking(), [2, 2, 2]) nc.close() - # store_hdf5_chunks + # store_dataset_chunks f = cfdm.read(tmpfile)[0] self.assertEqual(f.nc_hdf5_chunksizes(), (2, 2, 2)) - f = cfdm.read(tmpfile, store_hdf5_chunks=False)[0] + f = cfdm.read(tmpfile, store_dataset_chunks=False)[0] self.assertIsNone(f.nc_hdf5_chunksizes()) # Scalar data is written contiguously - f = cfdm.example_field(0) + f = self.f0 f = f[0, 0].squeeze() cfdm.write(f, tmpfile) nc = netCDF4.Dataset(tmpfile, "r") @@ -1073,7 +1101,7 @@ def test_write_hdf5_chunks(self): def test_read_dask_chunks(self): """Test the 'dask_chunks' keyword of cfdm.read.""" - f = cfdm.example_field(0) + f = self.f0.copy() f.coordinate("latitude").axis = "Y" cfdm.write(f, tmpfile) @@ -1136,6 +1164,98 @@ def test_read_dask_chunks(self): g = cfdm.read(tmpfile)[0] self.assertEqual(g.data.chunks, ((7, 7, 7, 7, 7, 1), (5,), (4, 4))) + def test_read_to_memory(self): + """Test the 'to_memory' parameter to cfdm.read.""" + f = self.f0 + cfdm.write(f, tmpfile) + + f = cfdm.read(tmpfile)[0] + for d in (f.data.todict(), f.coordinate("longitude").data.todict()): + on_disk = False + for v in d.values(): + if isinstance(v, cfdm.H5netcdfArray): + on_disk = True + + self.assertTrue(on_disk) + + for to_memory in ("all", ("field", "dimension_coordinate")): + f = cfdm.read(tmpfile, to_memory=to_memory)[0] + for d in ( + f.data.todict(), + f.coordinate("longitude").data.todict(), + ): + in_memory = False + for v in d.values(): + if isinstance(v, np.ndarray): + in_memory = True + + self.assertTrue(in_memory) + + for to_memory in ("metadata", "dimension_coordinate"): + f = cfdm.read(tmpfile, to_memory=to_memory)[0] + for i, d in enumerate( + ( + f.coordinate("longitude").data.todict(), + f.data.todict(), + ) + ): + in_memory = False + for v in d.values(): + if isinstance(v, np.ndarray): + in_memory = True + + if not i: + # Metadata + self.assertTrue(in_memory) + else: + # Field + self.assertFalse(in_memory) + + def test_read_file_type(self): + """Test the cfdm.read 'file_type' keyword.""" + # netCDF file + 
for file_type in ( + None, + "netCDF", + ("netCDF",), + ("netCDF", "CDL"), + ("netCDF", "CDL", "bad value"), + ): + f = cfdm.read(self.filename, file_type=file_type) + self.assertEqual(len(f), 1) + + for file_type in ("CDL", "bad value"): + f = cfdm.read(self.filename, file_type=file_type) + self.assertEqual(len(f), 0) + + # CDL file + subprocess.run( + " ".join(["ncdump", self.filename, ">", tmpfile]), + shell=True, + check=True, + ) + for file_type in ( + None, + "CDL", + ("CDL",), + ("netCDF", "CDL"), + ("netCDF", "CDL", "bad value"), + ): + f = cfdm.read(tmpfile, file_type=file_type) + self.assertEqual(len(f), 1) + + for file_type in ("netCDF", "bad value"): + f = cfdm.read(tmpfile, file_type=file_type) + self.assertEqual(len(f), 0) + + # Not a netCDF or CDL file + with self.assertRaises(DatasetTypeError): + f = cfdm.read("test_read_write.py") + + for file_type in ("netCDF", "CDL", "bad value"): + f = cfdm.read("test_read_write.py", file_type=file_type) + self.assertEqual(len(f), 0) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_subsampling.py b/cfdm/test/test_subsampling.py index 40ea286bab..b0b9b6c7fe 100644 --- a/cfdm/test/test_subsampling.py +++ b/cfdm/test/test_subsampling.py @@ -241,13 +241,6 @@ def test_non_standard(self): with self.assertRaises(ValueError): a_2d.array - def test_SubsampledArray_get_filenames(self): - """Test SubsampledArray.get_filenames.""" - f = cfdm.read(self.linear) - q = f[0] - lat = q.construct("latitude").data - self.assertEqual(lat.get_filenames(), set()) - if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/docs/source/functions.rst b/docs/source/functions.rst index dc09d7d10b..96a4af0ff1 100644 --- a/docs/source/functions.rst +++ b/docs/source/functions.rst @@ -55,6 +55,7 @@ Miscellaneous cfdm.CF cfdm.abspath + cfdm.dirname cfdm.environment cfdm.example_field cfdm.example_fields diff --git a/docs/source/installation.rst b/docs/source/installation.rst index fe9e07201a..a824fc203e 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -166,7 +166,7 @@ The cfdm package requires: * `h5netcdf `_, version 1.3.0 newer. -* `h5py `_, version 3.10.0 or newer. +* `h5py `_, version 3.12.0 or newer. * `s3fs `_, version 2024.6.0 or newer. @@ -178,6 +178,9 @@ The cfdm package requires: * `scipy `_, version 1.10.0 or newer. +* `uritools `_, version 4.0.3 or + newer. + ---- .. 
_Code-repository: diff --git a/requirements.txt b/requirements.txt index 79c6417e2c..53e5cdebed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ numpy>=1.15,<2.0 packaging>=20.0 scipy>=1.10.0 h5netcdf>=1.3.0 -h5py>=3.10.0 +h5py>=3.12.0 s3fs>=2024.6.0 dask>=2024.6.0,<=2024.7.1 +uritools>=4.0.3 diff --git a/setup.py b/setup.py index 9fa3e1c4a1..8d63e44a7c 100755 --- a/setup.py +++ b/setup.py @@ -1,19 +1,7 @@ -import fnmatch import os import re -from setuptools import setup - -# from setuptools import setup - - -def find_package_data_files(directory): - """TODO.""" - for root, dirs, files in os.walk(directory): - for basename in files: - if fnmatch.fnmatch(basename, "*"): - filename = os.path.join(root, basename) - yield filename.replace("cfdm/", "", 1) +from setuptools import find_packages, setup def _read(fname): @@ -178,30 +166,7 @@ def _get_version(): "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", ], - packages=[ - "cfdm", - "cfdm.abstract", - "cfdm.core", - "cfdm.core.abstract", - "cfdm.core.data", - "cfdm.core.data.abstract", - "cfdm.core.docstring", - "cfdm.core.meta", - "cfdm.core.mixin", - "cfdm.docstring", - "cfdm.data", - "cfdm.data.abstract", - "cfdm.data.mixin", - "cfdm.data.subarray", - "cfdm.data.subarray.abstract", - "cfdm.data.subarray.mixin", - "cfdm.mixin", - "cfdm.read_write", - "cfdm.read_write.abstract", - "cfdm.read_write.netcdf", - "cfdm.read_write.netcdf.flatten", - "cfdm.test", - ], + packages=find_packages(), scripts=["scripts/cfdump"], python_requires=">=3.8", install_requires=install_requires,
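
A minimal sketch of the renamed dataset chunking keywords exercised by ``test_write_dataset_chunks`` above (``dataset_chunks`` replacing ``hdf5_chunks`` in `cfdm.write`, and ``store_dataset_chunks`` replacing ``store_hdf5_chunks`` in `cfdm.read`). The output file name is a placeholder, and the printed chunk shape is the one asserted in the test for ``cfdm.example_field(5)``::

    import cfdm
    import netCDF4

    # Write with an explicit dataset chunk size limit. "4MiB" and
    # "contiguous" are among the values exercised in the test above.
    f = cfdm.example_field(5)
    f.nc_set_variable("data")
    cfdm.write(f, "chunked.nc", dataset_chunks="4MiB")

    # Inspect the chunking that was actually written to the file.
    nc = netCDF4.Dataset("chunked.nc", "r")
    print(nc.variables["data"].chunking())  # [118, 5, 8]
    nc.close()

    # Chunk shapes set directly on the field take precedence over the
    # 'dataset_chunks' argument.
    f.nc_set_hdf5_chunksizes([5, 4, 3])
    cfdm.write(f, "chunked.nc", dataset_chunks="contiguous")

    # On read, the stored chunk shapes are kept unless
    # 'store_dataset_chunks' is False.
    g = cfdm.read("chunked.nc", store_dataset_chunks=False)[0]
    print(g.nc_hdf5_chunksizes())  # None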
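
The ``to_memory`` keyword checked by ``test_read_to_memory`` above can be sketched as follows, assuming a file written from ``cfdm.example_field(0)`` (which has a "longitude" coordinate); the file name is a placeholder::

    import numpy as np
    import cfdm

    cfdm.write(cfdm.example_field(0), "example.nc")

    # By default the data stay on disk: the dask graph contains
    # file-backed arrays (H5netcdfArray for files opened via h5netcdf).
    f = cfdm.read("example.nc")[0]
    print(
        any(isinstance(v, cfdm.H5netcdfArray) for v in f.data.todict().values())
    )  # True

    # 'to_memory' converts selected constructs to in-memory numpy
    # arrays at read time: "all" covers everything, while "metadata",
    # or a sequence of construct types such as
    # ("field", "dimension_coordinate"), restricts it.
    g = cfdm.read("example.nc", to_memory="metadata")[0]
    lon = g.coordinate("longitude")
    print(
        any(isinstance(v, np.ndarray) for v in lon.data.todict().values())
    )  # True
    print(
        any(isinstance(v, np.ndarray) for v in g.data.todict().values())
    )  # False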
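
The revised read-time error handling and the new ``file_type`` keyword tested in ``test_read_CDL`` and ``test_read_file_type`` above amount to the following usage pattern. File names are placeholders, ``ncdump`` must be on the PATH, and the exception is assumed here to be importable as ``cfdm.DatasetTypeError`` (its import is not shown in these hunks)::

    import subprocess
    import cfdm

    # Create a small netCDF file and a CDL dump of it.
    cfdm.write(cfdm.example_field(0), "example.nc")
    subprocess.run("ncdump example.nc > example.cdl", shell=True, check=True)

    # With 'file_type' unset the format is detected automatically;
    # restricting it excludes files of other types, which then return
    # zero field constructs.
    print(len(cfdm.read("example.nc")))                    # 1
    print(len(cfdm.read("example.cdl", file_type="CDL")))  # 1
    print(len(cfdm.read("example.nc", file_type="CDL")))   # 0

    # A file that is neither netCDF nor CDL raises DatasetTypeError
    # (previously OSError) when 'file_type' is unset, and returns an
    # empty list when 'file_type' is given.
    with open("not_a_dataset.txt", "w") as fh:
        fh.write("plain text, not a dataset\n")

    try:
        cfdm.read("not_a_dataset.txt")
    except cfdm.DatasetTypeError as error:
        print(error)

    print(len(cfdm.read("not_a_dataset.txt", file_type="netCDF")))  # 0

    # Syntactically invalid CDL now raises RuntimeError (previously
    # ValueError); the CDL text is the same as in the test above.
    with open("bad.cdl", "w") as fh:
        fh.write("netcdf test_file {\n add badness\n}")

    try:
        cfdm.read("bad.cdl")
    except RuntimeError as error:
        print(error)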