-
Notifications
You must be signed in to change notification settings - Fork 286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle fill value on netCDF save #2747
Changes from 2 commits
d6f543b
c5d9dc0
1cfe7f8
1f234b3
4f5dadd
92d5f30
2aa21c4
5d8375a
ca4067d
4e93e34
3ee3e24
274462e
beb8efe
d10294b
745ddd2
f119bf5
703494c
c98585a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -741,6 +741,26 @@ def _setncattr(variable, name, attribute): | |
return variable.setncattr(name, attribute) | ||
|
||
|
||
class _FillValueMaskCheckAndStoreTarget(object): | ||
""" | ||
To be used with da.store. Remembers whether any element was equal to a | ||
given value and whether it was masked, before passing the chunk to the | ||
given target. | ||
|
||
""" | ||
def __init__(self, target, fill_value=None): | ||
self.target = target | ||
self.fill_value = fill_value | ||
self.contains_value = False | ||
self.is_masked = False | ||
|
||
def __setitem__(self, keys, arr): | ||
if self.fill_value is not None: | ||
self.contains_value |= self.fill_value in arr | ||
self.is_masked |= ma.isMaskedArray(arr) | ||
self.target[keys] = arr | ||
|
||
|
||
class Saver(object): | ||
"""A manager for saving netcdf files.""" | ||
|
||
|
@@ -812,7 +832,7 @@ def __exit__(self, type, value, traceback): | |
def write(self, cube, local_keys=None, unlimited_dimensions=None, | ||
zlib=False, complevel=4, shuffle=True, fletcher32=False, | ||
contiguous=False, chunksizes=None, endian='native', | ||
least_significant_digit=None, packing=None): | ||
least_significant_digit=None, packing=None, fill_value=None): | ||
""" | ||
Wrapper for saving cubes to a NetCDF file. | ||
|
||
|
@@ -899,14 +919,20 @@ def write(self, cube, local_keys=None, unlimited_dimensions=None, | |
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values | ||
If this argument is a type (or type string), appropriate values of | ||
scale_factor and add_offset will be automatically calculated based | ||
on `cube.data` and possible masking. For masked data, `fill_value` | ||
is taken from netCDF4.default_fillvals. For more control, pass a | ||
dict with one or more of the following keys: `dtype` (required), | ||
`scale_factor`, `add_offset`, and `fill_value`. Note that automatic | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to ensure this public behaviour change is well documented in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree with @pelson : this needs a whatsnew entry, |
||
calculation of packing parameters will trigger loading of lazy | ||
data; set them manually using a dict to avoid this. The default is | ||
`None`, in which case the datatype is determined from the cube and | ||
no packing will occur. | ||
on `cube.data` and possible masking. For more control, pass a dict | ||
with one or more of the following keys: `dtype` (required), | ||
`scale_factor` and `add_offset`. Note that automatic calculation of | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this the reference you were looking form, saying that automatic packing realises the data ? |
||
packing parameters will trigger loading of lazy data; set them | ||
manually using a dict to avoid this. The default is `None`, in | ||
which case the datatype is determined from the cube and no packing | ||
will occur. | ||
|
||
* fill_value: | ||
The value to use for the `_FillValue` attribute on the netCDF | ||
variable. If `packing` is specified the value of `fill_value` | ||
should be in the domain of the packed data. If this argument is not | ||
supplied and the data is masked, `fill_value` is taken from | ||
netCDF4.default_fillvals. | ||
|
||
Returns: | ||
None. | ||
|
@@ -955,7 +981,8 @@ def write(self, cube, local_keys=None, unlimited_dimensions=None, | |
cube, dimension_names, local_keys, zlib=zlib, complevel=complevel, | ||
shuffle=shuffle, fletcher32=fletcher32, contiguous=contiguous, | ||
chunksizes=chunksizes, endian=endian, | ||
least_significant_digit=least_significant_digit, packing=packing) | ||
least_significant_digit=least_significant_digit, packing=packing, | ||
fill_value=fill_value) | ||
|
||
# Add coordinate variables. | ||
self._add_dim_coords(cube, dimension_names) | ||
|
@@ -1840,7 +1867,7 @@ def add_ellipsoid(ellipsoid): | |
_setncattr(cf_var_cube, 'grid_mapping', cs.grid_mapping_name) | ||
|
||
def _create_cf_data_variable(self, cube, dimension_names, local_keys=None, | ||
**kwargs): | ||
packing=None, fill_value=None, **kwargs): | ||
""" | ||
Create CF-netCDF data variable for the cube and any associated grid | ||
mapping. | ||
|
@@ -1855,8 +1882,11 @@ def _create_cf_data_variable(self, cube, dimension_names, local_keys=None, | |
Kwargs: | ||
|
||
* local_keys (iterable of strings): | ||
An iterable of cube attribute keys. Any cube attributes | ||
with matching keys will become attributes on the data variable. | ||
* see :func:`iris.fileformats.netcdf.Saver.write` | ||
* packing (type or string or dict or list): | ||
* see :func:`iris.fileformats.netcdf.Saver.write` | ||
* fill_value: | ||
* see :func:`iris.fileformats.netcdf.Saver.write` | ||
|
||
All other keywords are passed through to the dataset's `createVariable` | ||
method. | ||
|
@@ -1865,47 +1895,32 @@ def _create_cf_data_variable(self, cube, dimension_names, local_keys=None, | |
The newly created CF-netCDF data variable. | ||
|
||
""" | ||
if 'packing' in kwargs: | ||
packing = kwargs.pop('packing') | ||
if packing: | ||
scale_factor = None | ||
add_offset = None | ||
fill_value = None | ||
if isinstance(packing, dict): | ||
if 'dtype' not in packing: | ||
msg = "The dtype attribute is required for packing." | ||
raise ValueError(msg) | ||
dtype = np.dtype(packing['dtype']) | ||
if 'scale_factor' in packing: | ||
scale_factor = packing['scale_factor'] | ||
if 'add_offset' in packing: | ||
add_offset = packing['add_offset'] | ||
if 'fill_value' in packing: | ||
fill_value = packing['fill_value'] | ||
|
||
if packing: | ||
if isinstance(packing, dict): | ||
if 'dtype' not in packing: | ||
msg = "The dtype attribute is required for packing." | ||
raise ValueError(msg) | ||
dtype = np.dtype(packing['dtype']) | ||
scale_factor = packing.get('scale_factor', None) | ||
add_offset = packing.get('add_offset', None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not your code, but would you mind checking that there aren't other keys in there other than the ones handled? |
||
else: | ||
masked = ma.isMaskedArray(cube.data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would trigger the data to be loaded. Is that what you intended? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry not your code... Perhaps you wouldn't mind adding a comment though:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I hadn't noticed this, it looks like it comes from the old code. It can be fixed, but it would require getting the min and max of the data while storing it. I can do that if you like, but it might be better to do it as a separate PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, just adding the comment is good. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't the docstring at this comment cover it ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not really. It is about making the code readable. Doesn't need to be an essay. |
||
dtype = np.dtype(packing) | ||
cmax = cube.data.max() | ||
cmin = cube.data.min() | ||
n = dtype.itemsize * 8 | ||
if masked: | ||
scale_factor = (cmax - cmin)/(2**n-2) | ||
else: | ||
masked = ma.isMaskedArray(cube.data) | ||
dtype = np.dtype(packing) | ||
cmax = cube.data.max() | ||
cmin = cube.data.min() | ||
n = dtype.itemsize * 8 | ||
scale_factor = (cmax - cmin)/(2**n-1) | ||
if dtype.kind == 'u': | ||
add_offset = cmin | ||
elif dtype.kind == 'i': | ||
if masked: | ||
scale_factor = (cmax - cmin)/(2**n-2) | ||
add_offset = (cmax + cmin)/2 | ||
else: | ||
scale_factor = (cmax - cmin)/(2**n-1) | ||
if dtype.kind == 'u': | ||
add_offset = cmin | ||
elif dtype.kind == 'i': | ||
if masked: | ||
add_offset = (cmax + cmin)/2 | ||
else: | ||
add_offset = cmin + 2**(n-1)*scale_factor | ||
needs_fill_value = (cube.has_lazy_data() or | ||
ma.isMaskedArray(cube.data)) | ||
if needs_fill_value and fill_value is None: | ||
dtstr = dtype.str[1:] | ||
fill_value = netCDF4.default_fillvals[dtstr] | ||
else: | ||
packing = None | ||
add_offset = cmin + 2**(n-1)*scale_factor | ||
|
||
def set_packing_ncattrs(cfvar): | ||
"""Set netCDF packing attributes.""" | ||
|
@@ -1924,36 +1939,53 @@ def set_packing_ncattrs(cfvar): | |
self._dataset.file_format in ('NETCDF3_CLASSIC', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It determines which |
||
'NETCDF3_64BIT')): | ||
|
||
if packing is None: | ||
# Determine whether there is a cube MDI value. | ||
fill_value = get_fill_value(cube.core_data()) | ||
|
||
# Get the values in a form which is valid for the file format. | ||
data = self._ensure_valid_dtype(cube.data, 'cube', cube) | ||
|
||
if packing is None: | ||
dtype = data.dtype.newbyteorder('=') | ||
|
||
# Create the cube CF-netCDF data variable with data payload. | ||
cf_var = self._dataset.createVariable( | ||
cf_name, dtype, dimension_names, | ||
fill_value=fill_value, **kwargs) | ||
set_packing_ncattrs(cf_var) | ||
cf_var[:] = data | ||
|
||
def store(data, cf_var, fill_value): | ||
cf_var[:] = data | ||
is_masked = ma.isMaskedArray(data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here too should be using "ma.is_masked" instead of "ma.isMaskedArray". |
||
contains_value = fill_value is not None and fill_value in data | ||
return is_masked, contains_value | ||
else: | ||
# Create the cube CF-netCDF data variable. | ||
if packing is None: | ||
fill_value = get_fill_value(cube.core_data()) | ||
dtype = cube.dtype.newbyteorder('=') | ||
|
||
cf_var = self._dataset.createVariable( | ||
cf_name, dtype, | ||
dimension_names, fill_value=fill_value, | ||
**kwargs) | ||
set_packing_ncattrs(cf_var) | ||
|
||
da.store([cube.lazy_data()], [cf_var]) | ||
data = cube.lazy_data() | ||
|
||
def store(data, cf_var, fill_value): | ||
# Store lazy data and check whether it is masked and contains | ||
# the fill value | ||
target = _FillValueMaskCheckAndStoreTarget(cf_var, fill_value) | ||
da.store([data], [target]) | ||
return target.is_masked, target.contains_value | ||
|
||
if not packing: | ||
dtype = data.dtype.newbyteorder('=') | ||
|
||
if fill_value is None: | ||
fill_value = netCDF4.default_fillvals[dtype.str[1:]] | ||
# Ensure it is the correct dtype | ||
fill_value = dtype.type(fill_value) | ||
|
||
# Create the cube CF-netCDF data variable with data payload. | ||
cf_var = self._dataset.createVariable(cf_name, dtype, dimension_names, | ||
fill_value=fill_value, | ||
**kwargs) | ||
set_packing_ncattrs(cf_var) | ||
|
||
# If packing attributes are specified, don't bother checking whether | ||
# the fill value is in the data. | ||
fill_value_to_check = None if packing else fill_value | ||
|
||
# Store the data and check if it is masked and contains the fill value | ||
is_masked, contains_fill_value = store(data, cf_var, | ||
fill_value_to_check) | ||
|
||
if is_masked: | ||
if contains_fill_value: | ||
warnings.warn("Cube '{}' contains fill value {}. Some data " | ||
"points will be masked.".format(cube.name(), | ||
fill_value)) | ||
else: | ||
cf_var.delncattr('_FillValue') | ||
|
||
if cube.standard_name: | ||
_setncattr(cf_var, 'standard_name', cube.standard_name) | ||
|
@@ -2040,7 +2072,7 @@ def _increment_name(self, varname): | |
def save(cube, filename, netcdf_format='NETCDF4', local_keys=None, | ||
unlimited_dimensions=None, zlib=False, complevel=4, shuffle=True, | ||
fletcher32=False, contiguous=False, chunksizes=None, endian='native', | ||
least_significant_digit=None, packing=None): | ||
least_significant_digit=None, packing=None, fill_value=None): | ||
""" | ||
Save cube(s) to a netCDF file, given the cube and the filename. | ||
|
||
|
@@ -2144,16 +2176,19 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None, | |
This provides support for netCDF data packing as described in | ||
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values | ||
If this argument is a type (or type string), appropriate values of | ||
scale_factor and add_offset will be automatically calculated based on | ||
`cube.data` and possible masking. For masked data, `fill_value` is | ||
taken from netCDF4.default_fillvals. For more control, pass a dict with | ||
one or more of the following keys: `dtype` (required), `scale_factor`, | ||
`add_offset`, and `fill_value`. To save multiple cubes with different | ||
packing parameters, pass an iterable of types, strings, dicts, or | ||
`None`, one for each cube. Note that automatic calculation of packing | ||
parameters will trigger loading of lazy data; set them manually using a | ||
dict to avoid this. The default is `None`, in which case the datatype | ||
is determined from the cube and no packing will occur. | ||
scale_factor and add_offset will be automatically calculated based | ||
on `cube.data` and possible masking. For more control, pass a dict with | ||
one or more of the following keys: `dtype` (required), `scale_factor` | ||
and `add_offset`. Note that automatic calculation of packing parameters | ||
will trigger loading of lazy data; set them manually using a dict to | ||
avoid this. The default is `None`, in which case the datatype is | ||
determined from the cube and no packing will occur. | ||
|
||
* fill_value: | ||
The value to use for the `_FillValue` attribute on the netCDF variable. | ||
If `packing` is specified the value of `fill_value` should be in the | ||
domain of the packed data. If this argument is not supplied and the | ||
data is masked, `fill_value` is taken from netCDF4.default_fillvals. | ||
|
||
Returns: | ||
None. | ||
|
@@ -2250,7 +2285,8 @@ def is_valid_packspec(p): | |
for cube, packspec in zip(cubes, packspecs): | ||
sman.write(cube, local_keys, unlimited_dimensions, zlib, complevel, | ||
shuffle, fletcher32, contiguous, chunksizes, endian, | ||
least_significant_digit, packing=packspec) | ||
least_significant_digit, packing=packspec, | ||
fill_value=fill_value) | ||
|
||
if iris.config.netcdf.conventions_override: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My Eclipse says this needs an extra import to be truly righteous. |
||
# Set to the default if custom conventions are not available. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -158,12 +158,13 @@ def test_big_endian(self): | |
def test_zlib(self): | ||
cube = self._simple_cube('>f4') | ||
with mock.patch('iris.fileformats.netcdf.netCDF4') as api: | ||
api.default_fillvals = {'f4': 12345.} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've had to modify this test because the |
||
with Saver('/dummy/path', 'NETCDF4') as saver: | ||
saver.write(cube, zlib=True) | ||
dataset = api.Dataset.return_value | ||
create_var_calls = mock.call.createVariable( | ||
'air_pressure_anomaly', np.dtype('float32'), ['dim0', 'dim1'], | ||
fill_value=None, shuffle=True, least_significant_digit=None, | ||
fill_value=mock.ANY, shuffle=True, least_significant_digit=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately I've had to make this test less strict. I could check that there is a |
||
contiguous=False, zlib=True, fletcher32=False, | ||
endian='native', complevel=4, chunksizes=None).call_list() | ||
dataset.assert_has_calls(create_var_calls) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would make sense to short-circuit this check (and the
fill_value in
check). Once we have found them to be true once, we can save ourselves the effort of inspecting the arrays each time.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had assumed
|=
would short circuit but I guess not.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you're going to need to double check that for me now. 😄
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I had already checked. It's because it's bitwise, and in fact it changes the data type of the left hand side.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok. Well, at least you can get away with the boolean operators...