from datetime import datetime
import re
import traceback
import warnings
import numpy as np
import pandas as pd
from collections import defaultdict
from pandas.tslib import OutOfBoundsDatetime
from .core import indexing, ops, utils
from .core.formatting import format_timestamp, first_n_items, last_item
from .core.variable import as_variable, Variable
from .core.pycompat import iteritems, OrderedDict, PY3, basestring
# standard calendars recognized by netcdftime
_STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian'])
def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None,
dtype=float):
"""Scale and mask array values according to CF conventions for packed and
missing values
First, values equal to the fill_value are replaced by NaN. Then, new values
are given by the formula:
original_values * scale_factor + add_offset
Parameters
----------
array : array-like
Original array of values to wrap
fill_value : number, optional
        All values equal to fill_value in the original array are replaced
        by NaN. If an array of multiple fill values is provided, a warning
        is issued and all array elements matching a value in the fill_value
        array are replaced by NaN.
scale_factor : number, optional
Multiply entries in the original array by this number.
add_offset : number, optional
After applying scale_factor, add this number to entries in the
original array.
Returns
-------
scaled : np.ndarray
Array of masked and scaled values.
References
----------
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html
"""
# by default, cast to float to ensure NaN is meaningful
values = np.array(array, dtype=dtype, copy=True)
if fill_value is not None and not np.all(pd.isnull(fill_value)):
if getattr(fill_value, 'size', 1) > 1:
fill_values = fill_value # multiple fill values
else:
fill_values = [fill_value]
for f_value in fill_values:
if values.ndim > 0:
values[values == f_value] = np.nan
elif values == f_value:
values = np.array(np.nan)
if scale_factor is not None:
values *= scale_factor
if add_offset is not None:
values += add_offset
return values
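# A hedged usage sketch (illustrative values; the fill value is masked first,
# then the scale/offset formula is applied):
# >>> mask_and_scale(np.array([-99, 0, 2]), fill_value=-99,
# ...                scale_factor=0.5, add_offset=10)
# array([ nan,  10.,  11.])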
def _netcdf_to_numpy_timeunit(units):
units = units.lower()
if not units.endswith('s'):
units = '%ss' % units
return {'microseconds': 'us', 'milliseconds': 'ms', 'seconds': 's',
'minutes': 'm', 'hours': 'h', 'days': 'D'}[units]
def _unpack_netcdf_time_units(units):
# CF datetime units follow the format: "UNIT since DATE"
# this parses out the unit and date allowing for extraneous
# whitespace.
matches = re.match('(.+) since (.+)', units)
if not matches:
raise ValueError('invalid time units: %s' % units)
delta_units, ref_date = [s.strip() for s in matches.groups()]
return delta_units, ref_date
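# For example (illustrative units string):
# >>> _unpack_netcdf_time_units('days since 1900-01-01')
# ('days', '1900-01-01')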
def _decode_datetime_with_netcdf4(num_dates, units, calendar):
import netCDF4 as nc4
dates = np.asarray(nc4.num2date(num_dates, units, calendar))
if (dates[np.nanargmin(num_dates)].year < 1678 or
dates[np.nanargmax(num_dates)].year >= 2262):
warnings.warn('Unable to decode time axis into full '
'numpy.datetime64 objects, continuing using dummy '
'netCDF4.datetime objects instead, reason: dates out'
' of range', RuntimeWarning, stacklevel=3)
else:
try:
dates = nctime_to_nptime(dates)
except ValueError as e:
            warnings.warn('Unable to decode time axis into full '
                          'numpy.datetime64 objects, continuing using '
                          'dummy netCDF4.datetime objects instead, reason: '
                          '{0}'.format(e), RuntimeWarning, stacklevel=3)
return dates
def decode_cf_datetime(num_dates, units, calendar=None):
"""Given an array of numeric dates in netCDF format, convert it into a
numpy array of date time objects.
For standard (Gregorian) calendars, this function uses vectorized
operations, which makes it much faster than netCDF4.num2date. In such a
case, the returned array will be of type np.datetime64.
See also
--------
netCDF4.num2date
"""
num_dates = np.asarray(num_dates, dtype=float)
flat_num_dates = num_dates.ravel()
if calendar is None:
calendar = 'standard'
delta, ref_date = _unpack_netcdf_time_units(units)
try:
if calendar not in _STANDARD_CALENDARS:
raise OutOfBoundsDatetime
delta = _netcdf_to_numpy_timeunit(delta)
try:
ref_date = pd.Timestamp(ref_date)
except ValueError:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using netCDF4
raise OutOfBoundsDatetime
        # fixes: https://github.com/pydata/pandas/issues/14068
        # these lines check whether the lowest or the highest value in dates
        # causes an OutOfBoundsDatetime (Overflow) error
pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
dates = (pd.to_timedelta(flat_num_dates, delta) + ref_date).values
except (OutOfBoundsDatetime, OverflowError):
dates = _decode_datetime_with_netcdf4(flat_num_dates, units, calendar)
return dates.reshape(num_dates.shape)
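# An illustrative call on the fast vectorized path (standard calendar);
# exact repr formatting depends on the numpy version:
# >>> decode_cf_datetime([0, 1], 'days since 2000-01-01')
# array(['2000-01-01T00:00:00.000000000', '2000-01-02T00:00:00.000000000'],
#       dtype='datetime64[ns]')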
def decode_cf_timedelta(num_timedeltas, units):
"""Given an array of numeric timedeltas in netCDF format, convert it into a
numpy timedelta64[ns] array.
"""
num_timedeltas = np.asarray(num_timedeltas)
units = _netcdf_to_numpy_timeunit(units)
shape = num_timedeltas.shape
num_timedeltas = num_timedeltas.ravel()
result = pd.to_timedelta(num_timedeltas, unit=units, box=False)
# NaT is returned unboxed with wrong units; this should be fixed in pandas
if result.dtype != 'timedelta64[ns]':
result = result.astype('timedelta64[ns]')
return result.reshape(shape)
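# For example (illustrative values; results are nanosecond-precision
# timedeltas):
# >>> decode_cf_timedelta(np.array([1, 2]), 'hours')
# array([3600000000000, 7200000000000], dtype='timedelta64[ns]')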
TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds',
'milliseconds', 'microseconds'])
def _infer_time_units_from_diff(unique_timedeltas):
for time_unit, delta in [('days', 86400), ('hours', 3600),
('minutes', 60), ('seconds', 1)]:
unit_delta = np.timedelta64(10 ** 9 * delta, 'ns')
diffs = unique_timedeltas / unit_delta
if np.all(diffs == diffs.astype(int)):
return time_unit
return 'seconds'
def infer_datetime_units(dates):
"""Given an array of datetimes, returns a CF compatible time-unit string of
the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
unique time deltas in `dates`)
"""
dates = pd.to_datetime(np.asarray(dates).ravel(), box=False)
dates = dates[pd.notnull(dates)]
unique_timedeltas = np.unique(np.diff(dates))
units = _infer_time_units_from_diff(unique_timedeltas)
reference_date = dates[0] if len(dates) > 0 else '1970-01-01'
return '%s since %s' % (units, pd.Timestamp(reference_date))
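# For example, spacings that are whole numbers of days infer 'days'
# (illustrative dates):
# >>> infer_datetime_units(pd.to_datetime(['2000-01-01', '2000-01-02',
# ...                                      '2000-01-04']))
# 'days since 2000-01-01 00:00:00'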
def infer_timedelta_units(deltas):
"""Given an array of timedeltas, returns a CF compatible time-unit from
    {'days', 'hours', 'minutes', 'seconds'} (the first one that can evenly
divide all unique time deltas in `deltas`)
"""
deltas = pd.to_timedelta(np.asarray(deltas).ravel(), box=False)
unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
units = _infer_time_units_from_diff(unique_timedeltas)
return units
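# For example, 1h and 90min are not both whole hours, so 'minutes' is
# inferred (illustrative deltas):
# >>> infer_timedelta_units(pd.to_timedelta(['1h', '90m']))
# 'minutes'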
def nctime_to_nptime(times):
"""Given an array of netCDF4.datetime objects, return an array of
numpy.datetime64 objects of the same size"""
times = np.asarray(times)
new = np.empty(times.shape, dtype='M8[ns]')
for i, t in np.ndenumerate(times):
dt = datetime(t.year, t.month, t.day, t.hour, t.minute, t.second)
new[i] = np.datetime64(dt)
return new
def _cleanup_netcdf_time_units(units):
delta, ref_date = _unpack_netcdf_time_units(units)
try:
units = '%s since %s' % (delta, format_timestamp(ref_date))
except OutOfBoundsDatetime:
# don't worry about reifying the units if they're out of bounds
pass
return units
def _encode_datetime_with_netcdf4(dates, units, calendar):
"""Fallback method for encoding dates using netCDF4-python.
This method is more flexible than xarray's parsing using datetime64[ns]
arrays but also slower because it loops over each element.
"""
import netCDF4 as nc4
if np.issubdtype(dates.dtype, np.datetime64):
# numpy's broken datetime conversion only works for us precision
dates = dates.astype('M8[us]').astype(datetime)
def encode_datetime(d):
return np.nan if d is None else nc4.date2num(d, units, calendar)
return np.vectorize(encode_datetime)(dates)
def cast_to_int_if_safe(num):
int_num = np.array(num, dtype=np.int64)
if (num == int_num).all():
num = int_num
return num
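# For example:
# >>> cast_to_int_if_safe(np.array([1.0, 2.0]))
# array([1, 2])
# >>> cast_to_int_if_safe(np.array([0.5, 2.0]))  # not safe; left as float
# array([ 0.5,  2. ])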
def encode_cf_datetime(dates, units=None, calendar=None):
"""Given an array of datetime objects, returns the tuple `(num, units,
calendar)` suitable for a CF compliant time variable.
Unlike `date2num`, this function can handle datetime64 arrays.
See also
--------
netCDF4.date2num
"""
dates = np.asarray(dates)
if units is None:
units = infer_datetime_units(dates)
else:
units = _cleanup_netcdf_time_units(units)
if calendar is None:
calendar = 'proleptic_gregorian'
delta, ref_date = _unpack_netcdf_time_units(units)
try:
if calendar not in _STANDARD_CALENDARS or dates.dtype.kind == 'O':
# parse with netCDF4 instead
raise OutOfBoundsDatetime
assert dates.dtype == 'datetime64[ns]'
delta_units = _netcdf_to_numpy_timeunit(delta)
time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]')
ref_date = np.datetime64(pd.Timestamp(ref_date))
num = (dates - ref_date) / time_delta
except (OutOfBoundsDatetime, OverflowError):
num = _encode_datetime_with_netcdf4(dates, units, calendar)
num = cast_to_int_if_safe(num)
return (num, units, calendar)
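# A hedged sketch of the common case where units and calendar are inferred
# (illustrative dates):
# >>> encode_cf_datetime(pd.to_datetime(['2000-01-01', '2000-01-02']))
# (array([0, 1]), 'days since 2000-01-01 00:00:00', 'proleptic_gregorian')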
def encode_cf_timedelta(timedeltas, units=None):
if units is None:
units = infer_timedelta_units(timedeltas)
np_unit = _netcdf_to_numpy_timeunit(units)
num = 1.0 * timedeltas / np.timedelta64(1, np_unit)
num = np.where(pd.isnull(timedeltas), np.nan, num)
num = cast_to_int_if_safe(num)
return (num, units)
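# For example, with units inferred from the data (illustrative deltas):
# >>> encode_cf_timedelta(pd.to_timedelta(['1h', '2h']))
# (array([1, 2]), 'hours')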
class MaskedAndScaledArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically scaled and masked according to
CF conventions for packed and missing data values.
New values are given by the formula:
original_values * scale_factor + add_offset
Values can only be accessed via `__getitem__`:
>>> x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1)
>>> x
MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99,
scale_factor=0.01, add_offset=1)
>>> x[:]
    array([ nan, 0.99, 1. , 1.01, 1.02])
References
----------
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html
"""
def __init__(self, array, fill_value=None, scale_factor=None,
add_offset=None, dtype=float):
"""
Parameters
----------
array : array-like
Original array of values to wrap
fill_value : number, optional
All values equal to fill_value in the original array are replaced
by NaN.
scale_factor : number, optional
Multiply entries in the original array by this number.
add_offset : number, optional
After applying scale_factor, add this number to entries in the
original array.
"""
self.array = array
self.fill_value = fill_value
self.scale_factor = scale_factor
self.add_offset = add_offset
self._dtype = dtype
@property
def dtype(self):
return np.dtype(self._dtype)
def __getitem__(self, key):
return mask_and_scale(self.array[key], self.fill_value,
self.scale_factor, self.add_offset, self._dtype)
def __repr__(self):
return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r, "
"dtype=%r)" %
(type(self).__name__, self.array, self.fill_value,
self.scale_factor, self.add_offset, self._dtype))
class DecodedCFDatetimeArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically converted into datetime objects
using decode_cf_datetime.
"""
def __init__(self, array, units, calendar=None):
self.array = array
self.units = units
self.calendar = calendar
# Verify that at least the first and last date can be decoded
# successfully. Otherwise, tracebacks end up swallowed by
# Dataset.__repr__ when users try to view their lazily decoded array.
example_value = np.concatenate([first_n_items(array, 1),
last_item(array), [0]])
try:
result = decode_cf_datetime(example_value, units, calendar)
except Exception:
calendar_msg = ('the default calendar' if calendar is None
else 'calendar %r' % calendar)
msg = ('unable to decode time units %r with %s. Try '
'opening your dataset with decode_times=False.'
% (units, calendar_msg))
if not PY3:
msg += ' Full traceback:\n' + traceback.format_exc()
raise ValueError(msg)
else:
self._dtype = getattr(result, 'dtype', np.dtype('object'))
@property
def dtype(self):
return self._dtype
def __getitem__(self, key):
return decode_cf_datetime(self.array[key], units=self.units,
calendar=self.calendar)
class DecodedCFTimedeltaArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically converted into timedelta objects
using decode_cf_timedelta.
"""
def __init__(self, array, units):
self.array = array
self.units = units
@property
def dtype(self):
return np.dtype('timedelta64[ns]')
def __getitem__(self, key):
return decode_cf_timedelta(self.array[key], units=self.units)
class CharToStringArray(utils.NDArrayMixin):
"""Wrapper around array-like objects to create a new indexable object where
values, when accessed, are automatically concatenated along the last
dimension.
>>> CharToStringArray(np.array(['a', 'b', 'c']))[:]
array('abc',
dtype='|S3')
"""
def __init__(self, array):
"""
Parameters
----------
array : array-like
Original array of values to wrap.
"""
self.array = array
@property
def dtype(self):
return np.dtype('S' + str(self.array.shape[-1]))
@property
def shape(self):
return self.array.shape[:-1]
def __str__(self):
if self.ndim == 0:
# always return a unicode str if it's a single item for py3 compat
return self[...].item().decode('utf-8')
else:
return repr(self)
def __repr__(self):
return '%s(%r)' % (type(self).__name__, self.array)
def __getitem__(self, key):
if self.array.ndim == 0:
values = self.array[key]
else:
# require slicing the last dimension completely
key = indexing.expanded_indexer(key, self.array.ndim)
if key[-1] != slice(None):
raise IndexError('too many indices')
values = char_to_string(self.array[key])
return values
class NativeEndiannessArray(utils.NDArrayMixin):
"""Decode arrays on the fly from non-native to native endianness
This is useful for decoding arrays from netCDF3 files (which are all
big endian) into native endianness, so they can be used with Cython
functions, such as those found in bottleneck and pandas.
>>> x = np.arange(5, dtype='>i2')
>>> x.dtype
dtype('>i2')
    >>> NativeEndiannessArray(x).dtype
    dtype('int16')
    >>> NativeEndiannessArray(x)[:].dtype
    dtype('int16')
"""
def __init__(self, array):
self.array = array
@property
def dtype(self):
return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))
def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)
class BoolTypeArray(utils.NDArrayMixin):
"""Decode arrays on the fly from integer to boolean datatype
This is useful for decoding boolean arrays from integer typed netCDF
variables.
>>> x = np.array([1, 0, 1, 1, 0], dtype='i1')
>>> x.dtype
    dtype('int8')
>>> BoolTypeArray(x).dtype
dtype('bool')
>>> BoolTypeArray(x)[:].dtype
dtype('bool')
"""
def __init__(self, array):
self.array = array
@property
def dtype(self):
return np.dtype('bool')
def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)
def string_to_char(arr):
"""Like netCDF4.stringtochar, but faster and more flexible.
"""
# ensure the array is contiguous
arr = np.array(arr, copy=False, order='C')
kind = arr.dtype.kind
if kind not in ['U', 'S']:
raise ValueError('argument must be a string')
return arr.reshape(arr.shape + (1,)).view(kind + '1')
def char_to_string(arr):
"""Like netCDF4.chartostring, but faster and more flexible.
"""
# based on: http://stackoverflow.com/a/10984878/809705
arr = np.array(arr, copy=False, order='C')
kind = arr.dtype.kind
if kind not in ['U', 'S']:
raise ValueError('argument must be a string')
return arr.view(kind + str(arr.shape[-1]))[..., 0]
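# string_to_char and char_to_string are inverses of each other; a quick
# illustration (the result dtype is '<U3' for unicode input, '|S3' for bytes):
# >>> chars = string_to_char(np.array(['abc', 'def']))
# >>> chars.shape
# (2, 3)
# >>> char_to_string(chars)
# array(['abc', 'def'], dtype='<U3')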
def safe_setitem(dest, key, value):
if key in dest:
raise ValueError('Failed hard to prevent overwriting key %r' % key)
dest[key] = value
def pop_to(source, dest, key, default=None):
"""
A convenience function which pops a key k from source to dest.
None values are not passed on. If k already exists in dest an
error is raised.
"""
value = source.pop(key, None)
if value is not None:
safe_setitem(dest, key, value)
return value
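# For example, moving 'units' from an attrs dict into an encoding dict:
# >>> attrs = {'units': 'm'}
# >>> encoding = {}
# >>> pop_to(attrs, encoding, 'units')
# 'm'
# >>> attrs, encoding
# ({}, {'units': 'm'})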
def _var_as_tuple(var):
return var.dims, var.data, var.attrs.copy(), var.encoding.copy()
def maybe_encode_datetime(var):
if np.issubdtype(var.dtype, np.datetime64):
dims, data, attrs, encoding = _var_as_tuple(var)
(data, units, calendar) = encode_cf_datetime(
data, encoding.pop('units', None), encoding.pop('calendar', None))
safe_setitem(attrs, 'units', units)
safe_setitem(attrs, 'calendar', calendar)
var = Variable(dims, data, attrs, encoding)
return var
def maybe_encode_timedelta(var):
if np.issubdtype(var.dtype, np.timedelta64):
dims, data, attrs, encoding = _var_as_tuple(var)
data, units = encode_cf_timedelta(
data, encoding.pop('units', None))
safe_setitem(attrs, 'units', units)
var = Variable(dims, data, attrs, encoding)
return var
def maybe_encode_offset_and_scale(var, needs_copy=True):
if any(k in var.encoding for k in ['add_offset', 'scale_factor']):
dims, data, attrs, encoding = _var_as_tuple(var)
data = data.astype(dtype=float, copy=needs_copy)
needs_copy = False
if 'add_offset' in encoding:
data -= pop_to(encoding, attrs, 'add_offset')
if 'scale_factor' in encoding:
data /= pop_to(encoding, attrs, 'scale_factor')
var = Variable(dims, data, attrs, encoding)
return var, needs_copy
def maybe_encode_fill_value(var, needs_copy=True):
# replace NaN with the fill value
if '_FillValue' in var.encoding:
dims, data, attrs, encoding = _var_as_tuple(var)
fill_value = pop_to(encoding, attrs, '_FillValue')
if not pd.isnull(fill_value):
data = ops.fillna(data, fill_value)
needs_copy = False
var = Variable(dims, data, attrs, encoding)
return var, needs_copy
def maybe_encode_dtype(var, name=None):
if 'dtype' in var.encoding:
dims, data, attrs, encoding = _var_as_tuple(var)
dtype = np.dtype(encoding.pop('dtype'))
if dtype != var.dtype:
if np.issubdtype(dtype, np.integer):
if (np.issubdtype(var.dtype, np.floating) and
'_FillValue' not in var.attrs):
warnings.warn('saving variable %s with floating '
'point data as an integer dtype without '
'any _FillValue to use for NaNs' % name,
RuntimeWarning, stacklevel=3)
data = ops.around(data)[...]
if dtype == 'S1' and data.dtype != 'S1':
data = string_to_char(np.asarray(data, 'S'))
dims = dims + ('string%s' % data.shape[-1],)
data = data.astype(dtype=dtype)
var = Variable(dims, data, attrs, encoding)
return var
def maybe_encode_bools(var):
if ((var.dtype == np.bool) and
('dtype' not in var.encoding) and ('dtype' not in var.attrs)):
dims, data, attrs, encoding = _var_as_tuple(var)
attrs['dtype'] = 'bool'
data = data.astype(dtype='i1', copy=True)
var = Variable(dims, data, attrs, encoding)
return var
def _infer_dtype(array):
"""Given an object array with no missing values, infer its dtype from its
first element
"""
if array.size == 0:
dtype = np.dtype(float)
else:
dtype = np.array(array[(0,) * array.ndim]).dtype
if dtype.kind in ['S', 'U']:
# don't just use inferred dtype to avoid truncating arrays to
# the length of their first element
dtype = np.dtype(dtype.kind)
elif dtype.kind == 'O':
raise ValueError('unable to infer dtype; xarray cannot '
'serialize arbitrary Python objects')
return dtype
def ensure_dtype_not_object(var):
# TODO: move this from conventions to backends? (it's not CF related)
if var.dtype.kind == 'O':
dims, data, attrs, encoding = _var_as_tuple(var)
missing = pd.isnull(data)
if missing.any():
# nb. this will fail for dask.array data
non_missing_values = data[~missing]
inferred_dtype = _infer_dtype(non_missing_values)
if inferred_dtype.kind in ['S', 'U']:
# There is no safe bit-pattern for NA in typical binary string
                # formats, so we can't set a fill_value. Unfortunately, this
# means we won't be able to restore string arrays with missing
# values.
fill_value = ''
else:
# insist on using float for numeric values
if not np.issubdtype(inferred_dtype, float):
inferred_dtype = np.dtype(float)
fill_value = np.nan
data = np.array(data, dtype=inferred_dtype, copy=True)
data[missing] = fill_value
else:
data = data.astype(dtype=_infer_dtype(data))
var = Variable(dims, data, attrs, encoding)
return var
def encode_cf_variable(var, needs_copy=True, name=None):
"""
    Converts a Variable into a Variable which follows some
    of the CF conventions:
        - NaNs are masked using _FillValue (or the deprecated missing_value)
        - rescaling is applied via scale_factor and add_offset
        - datetimes are converted to the CF 'units since time' format
        - dtype encodings are enforced.
Parameters
----------
var : xarray.Variable
A variable holding un-encoded data.
Returns
-------
out : xarray.Variable
A variable which has been encoded as described above.
"""
var = maybe_encode_datetime(var)
var = maybe_encode_timedelta(var)
var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
var, needs_copy = maybe_encode_fill_value(var, needs_copy)
var = maybe_encode_dtype(var, name)
var = maybe_encode_bools(var)
var = ensure_dtype_not_object(var)
return var
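# A minimal sketch for a datetime-valued Variable (illustrative values; the
# units string is inferred from the data):
# >>> v = Variable(('time',), pd.to_datetime(['2000-01-01', '2000-01-02']))
# >>> encoded = encode_cf_variable(v)
# >>> encoded.attrs['units']
# 'days since 2000-01-01 00:00:00'
# >>> encoded.values
# array([0, 1])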
def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
decode_times=True, decode_endianness=True):
"""
Decodes a variable which may hold CF encoded information.
This includes variables that have been masked and scaled, which
hold CF style time variables (this is almost always the case if
the dataset has been serialized) and which have strings encoded
as character arrays.
Parameters
----------
var : Variable
A variable holding potentially CF encoded information.
concat_characters : bool
Should character arrays be concatenated to strings, for
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
decode_times : bool
Decode cf times ('hours since 2000-01-01') to np.datetime64.
decode_endianness : bool
Decode arrays from non-native to native endianness.
Returns
-------
out : Variable
A variable holding the decoded equivalent of var
"""
# use _data instead of data so as not to trigger loading data
var = as_variable(var)
data = var._data
dimensions = var.dims
attributes = var.attrs.copy()
encoding = var.encoding.copy()
original_dtype = data.dtype
if concat_characters:
if data.dtype.kind == 'S' and data.dtype.itemsize == 1:
dimensions = dimensions[:-1]
data = CharToStringArray(data)
if mask_and_scale:
if 'missing_value' in attributes:
# missing_value is deprecated, but we still want to support it as
# an alias for _FillValue.
if ('_FillValue' in attributes and
not utils.equivalent(attributes['_FillValue'],
attributes['missing_value'])):
raise ValueError("Discovered conflicting _FillValue "
"and missing_value. Considering "
"opening the offending dataset using "
"decode_cf=False, corrected the attributes",
"and decoding explicitly using "
"xarray.conventions.decode_cf(ds)")
attributes['_FillValue'] = attributes.pop('missing_value')
fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
if fill_value.size > 1:
warnings.warn("variable has multiple fill values {0}, decoding "
"all values to NaN.".format(str(fill_value)),
RuntimeWarning, stacklevel=3)
scale_factor = pop_to(attributes, encoding, 'scale_factor')
add_offset = pop_to(attributes, encoding, 'add_offset')
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
scale_factor is not None or add_offset is not None):
if fill_value.dtype.kind in ['U', 'S']:
dtype = object
else:
dtype = float
data = MaskedAndScaledArray(data, fill_value, scale_factor,
add_offset, dtype)
if decode_times and 'units' in attributes:
if 'since' in attributes['units']:
# datetime
units = pop_to(attributes, encoding, 'units')
calendar = pop_to(attributes, encoding, 'calendar')
data = DecodedCFDatetimeArray(data, units, calendar)
elif attributes['units'] in TIME_UNITS:
# timedelta
units = pop_to(attributes, encoding, 'units')
data = DecodedCFTimedeltaArray(data, units)
if decode_endianness and not data.dtype.isnative:
# do this last, so it's only done if we didn't already unmask/scale
data = NativeEndiannessArray(data)
original_dtype = data.dtype
if 'dtype' in encoding:
if original_dtype != encoding['dtype']:
warnings.warn("CF decoding is overwriting dtype")
else:
encoding['dtype'] = original_dtype
if 'dtype' in attributes and attributes['dtype'] == 'bool':
del attributes['dtype']
data = BoolTypeArray(data)
return Variable(dimensions, indexing.LazilyIndexedArray(data),
attributes, encoding=encoding)
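# A minimal sketch for a packed integer variable with CF attributes
# (illustrative attribute values):
# >>> raw = Variable(('x',), np.array([-99, 10, 20], dtype='i2'),
# ...                {'_FillValue': -99, 'scale_factor': 0.1})
# >>> decode_cf_variable(raw).values
# array([ nan,   1.,   2.])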
def decode_cf_variables(variables, attributes, concat_characters=True,
mask_and_scale=True, decode_times=True,
decode_coords=True, drop_variables=None):
"""
    Decode several CF encoded variables.
See: decode_cf_variable
"""
dimensions_used_by = defaultdict(list)
for v in variables.values():
for d in v.dims:
dimensions_used_by[d].append(v)
def stackable(dim):
# figure out if a dimension can be concatenated over
if dim in variables:
return False
for v in dimensions_used_by[dim]:
if v.dtype.kind != 'S' or dim != v.dims[-1]:
return False
return True
coord_names = set()
if isinstance(drop_variables, basestring):
drop_variables = [drop_variables]
elif drop_variables is None:
drop_variables = []
drop_variables = set(drop_variables)
new_vars = OrderedDict()
for k, v in iteritems(variables):
if k in drop_variables:
continue
concat = (concat_characters and v.dtype.kind == 'S' and v.ndim > 0 and
stackable(v.dims[-1]))
new_vars[k] = decode_cf_variable(
v, concat_characters=concat, mask_and_scale=mask_and_scale,
decode_times=decode_times)
if decode_coords:
var_attrs = new_vars[k].attrs
if 'coordinates' in var_attrs:
coord_str = var_attrs['coordinates']
var_coord_names = coord_str.split()
if all(k in variables for k in var_coord_names):
new_vars[k].encoding['coordinates'] = coord_str
del var_attrs['coordinates']
coord_names.update(var_coord_names)
if decode_coords and 'coordinates' in attributes:
attributes = OrderedDict(attributes)
coord_names.update(attributes.pop('coordinates').split())
return new_vars, attributes, coord_names
def decode_cf(obj, concat_characters=True, mask_and_scale=True,
decode_times=True, decode_coords=True, drop_variables=None):
"""Decode the given Dataset or Datastore according to CF conventions into
a new Dataset.
Parameters
----------
obj : Dataset or DataStore
Object to decode.
concat_characters : bool, optional
Should character arrays be concatenated to strings, for
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool, optional
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
decode_times : bool, optional
Decode cf times (e.g., integers since 'hours since 2000-01-01') to
np.datetime64.
decode_coords : bool, optional
Use the 'coordinates' attribute on variable (or the dataset itself) to
identify coordinates.
drop_variables: string or iterable, optional
A variable or list of variables to exclude from being parsed from the
        dataset. This may be useful to drop variables with problems or
inconsistent values.
Returns
-------
decoded : Dataset
"""
from .core.dataset import Dataset
from .backends.common import AbstractDataStore
if isinstance(obj, Dataset):
vars = obj._variables
attrs = obj.attrs
extra_coords = set(obj.coords)
file_obj = obj._file_obj
elif isinstance(obj, AbstractDataStore):
vars, attrs = obj.load()
extra_coords = set()
file_obj = obj
else:
raise TypeError('can only decode Dataset or DataStore objects')
vars, attrs, coord_names = decode_cf_variables(
vars, attrs, concat_characters, mask_and_scale, decode_times,
decode_coords, drop_variables=drop_variables)
ds = Dataset(vars, attrs=attrs)
ds = ds.set_coords(coord_names.union(extra_coords))
ds._file_obj = file_obj
return ds
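# A hedged usage sketch on an in-memory Dataset (illustrative names and
# attributes):
# >>> from xarray import Dataset
# >>> ds = Dataset({'t': ('x', [0, 1],
# ...                     {'units': 'days since 2000-01-01'})})
# >>> decode_cf(ds)['t'].dtype
# dtype('<M8[ns]')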
def cf_decoder(variables, attributes,
concat_characters=True, mask_and_scale=True,
decode_times=True):
"""
Decode a set of CF encoded variables and attributes.
    See also: decode_cf_variable
Parameters
----------
variables : dict
A dictionary mapping from variable name to xarray.Variable
attributes : dict
A dictionary mapping from attribute name to value
concat_characters : bool
Should character arrays be concatenated to strings, for
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
decode_times : bool
Decode cf times ('hours since 2000-01-01') to np.datetime64.
Returns
-------
decoded_variables : dict
A dictionary mapping from variable name to xarray.Variable objects.
decoded_attributes : dict
A dictionary mapping from attribute name to values.
"""
variables, attributes, _ = decode_cf_variables(
variables, attributes, concat_characters, mask_and_scale, decode_times)
return variables, attributes
def _encode_coordinates(variables, attributes, non_dim_coord_names):
# calculate global and variable specific coordinates
non_dim_coord_names = set(non_dim_coord_names)
global_coordinates = non_dim_coord_names.copy()
variable_coordinates = defaultdict(set)
for coord_name in non_dim_coord_names:
target_dims = variables[coord_name].dims
for k, v in variables.items():
if (k not in non_dim_coord_names and k not in v.dims and
any(d in target_dims for d in v.dims)):
variable_coordinates[k].add(coord_name)
global_coordinates.discard(coord_name)
variables = OrderedDict((k, v.copy(deep=False))
for k, v in variables.items())
# These coordinates are saved according to CF conventions
for var_name, coord_names in variable_coordinates.items():
attrs = variables[var_name].attrs
if 'coordinates' in attrs:
raise ValueError('cannot serialize coordinates because variable '
"%s already has an attribute 'coordinates'"