Use getitem_with_mask in reindex_variables #1847

Merged · 9 commits · Feb 14, 2018
Changes from 7 commits
2 changes: 1 addition & 1 deletion asv_bench/asv.conf.json
@@ -63,7 +63,7 @@
"netcdf4": [""],
"scipy": [""],
"bottleneck": ["", null],
"dask": ["", null],
"dask": [""],
Member Author:

I don't think we need to run benchmarks separately with/without dask. This just makes the whole suite take twice as long as necessary. We already need to write separate benchmark cases, given that the syntax for using dask is different.

},


45 changes: 45 additions & 0 deletions asv_bench/benchmarks/reindexing.py
@@ -0,0 +1,45 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import xarray as xr

from . import requires_dask


class Reindex(object):
    def setup(self):
        data = np.random.RandomState(0).randn(1000, 100, 100)
        self.ds = xr.Dataset({'temperature': (('time', 'x', 'y'), data)},
                             coords={'time': np.arange(1000),
                                     'x': np.arange(100),
                                     'y': np.arange(100)})

    def time_1d_coarse(self):
        self.ds.reindex(time=np.arange(0, 1000, 5)).load()

    def time_1d_fine_all_found(self):
        self.ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest').load()

    def time_1d_fine_some_missing(self):
        self.ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest',
                        tolerance=0.1).load()

    def time_2d_coarse(self):
        self.ds.reindex(x=np.arange(0, 100, 2), y=np.arange(0, 100, 2)).load()

    def time_2d_fine_all_found(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5), y=np.arange(0, 100, 0.5),
                        method='nearest').load()

    def time_2d_fine_some_missing(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5), y=np.arange(0, 100, 0.5),
                        method='nearest', tolerance=0.1).load()
Member:

Do we have unit test coverage for this test with dask? I see it was failing in the ASV benchmark before the changes in alignment.py, so my guess is we don't have test coverage here.

Member Author:

I'll double check, but I think the test failure was just ASV timing out. When I ran a similar test case before (see my comment on the other PR) it was extremely slow.



class ReindexDask(Reindex):
    def setup(self):
        requires_dask()
        super(ReindexDask, self).setup()
        self.ds = self.ds.chunk({'time': 100})
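
For context: `requires_dask()` is xarray's benchmark helper that skips the case when dask is unavailable, and chunking makes the dataset dask-backed, so `reindex` only builds a task graph until `.load()` forces the computation being timed. A minimal sketch of the same operation outside ASV (assuming dask is installed):

import numpy as np
import xarray as xr

ds = xr.Dataset({'temperature': (('time', 'x', 'y'),
                                 np.zeros((1000, 100, 100)))},
                coords={'time': np.arange(1000),
                        'x': np.arange(100),
                        'y': np.arange(100)})
ds = ds.chunk({'time': 100})  # dask-backed from here on
# reindex is lazy on dask arrays: this line only builds the graph ...
result = ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest',
                    tolerance=0.1)
result.load()  # ... and load() is what actually runs the computation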
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -79,6 +79,9 @@ Enhancements
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
Correspondingly, encoded variables may also be saved with a smaller dtype.
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
- Reindexing/alignment with a dask array is orders of magnitude faster when
  inserting missing values (:issue:`1847`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _Zarr: http://zarr.readthedocs.io/

@@ -135,6 +138,9 @@ Bug fixes
``parse_coordinates`` kwarg has been added to :py:func:`~open_rasterio`
(set to ``True`` by default).
By `Fabien Maussion <https://github.com/fmaussion>`_.
- :py:meth:`DataArray.where` with string data now fills with ``np.NaN`` like
pandas, not the string ``'nan'`` (:issue:`1847`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _whats-new.0.10.0:

101 changes: 34 additions & 67 deletions xarray/core/alignment.py
@@ -8,13 +8,11 @@

import numpy as np

from . import duck_array_ops
from . import dtypes
from . import utils
from .indexing import get_indexer_nd
from .pycompat import iteritems, OrderedDict, suppress
from .utils import is_full_slice, is_dict_like
from .variable import Variable, IndexVariable
from .variable import IndexVariable


def _get_joiner(join):
@@ -306,59 +304,51 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None,
    from .dataarray import DataArray

    # build up indexers for assignment along each dimension
    to_indexers = {}
    from_indexers = {}
    int_indexers = {}
    targets = {}
    masked_dims = set()
    unchanged_dims = set()

    # size of reindexed dimensions
    new_sizes = {}

    for name, index in iteritems(indexes):
        if name in indexers:
            target = utils.safe_cast_to_index(indexers[name])
            if not index.is_unique:
                raise ValueError(
                    'cannot reindex or align along dimension %r because the '
                    'index has duplicate values' % name)
            indexer = get_indexer_nd(index, target, method, tolerance)

            target = utils.safe_cast_to_index(indexers[name])
            new_sizes[name] = len(target)
            # Note pandas uses negative values from get_indexer_nd to signify
            # values that are missing in the index
            # The non-negative values thus indicate the non-missing values
            to_indexers[name] = indexer >= 0
            if to_indexers[name].all():
                # If an indexer includes no negative values, then the
                # assignment can be to a full-slice (which is much faster,
                # and means we won't need to fill in any missing values)
                to_indexers[name] = slice(None)

            from_indexers[name] = indexer[to_indexers[name]]
            if np.array_equal(from_indexers[name], np.arange(len(index))):
                # If the indexer is equal to the original index, use a full
                # slice object to speed up selection and so we can avoid
                # unnecessary copies
                from_indexers[name] = slice(None)

            int_indexer = get_indexer_nd(index, target, method, tolerance)

            # We use negative values from get_indexer_nd to signify
            # values that are missing in the index.
            if (int_indexer < 0).any():
                masked_dims.add(name)
            elif np.array_equal(int_indexer, np.arange(len(index))):
                unchanged_dims.add(name)

            int_indexers[name] = int_indexer
            targets[name] = target

    for dim in sizes:
        if dim not in indexes and dim in indexers:
            existing_size = sizes[dim]
            new_size = utils.safe_cast_to_index(indexers[dim]).size
            new_size = indexers[dim].size
            if existing_size != new_size:
                raise ValueError(
                    'cannot reindex or align along dimension %r without an '
                    'index because its size %r is different from the size of '
                    'the new index %r' % (dim, existing_size, new_size))

    def any_not_full_slices(indexers):
        return any(not is_full_slice(idx) for idx in indexers)

    def var_indexers(var, indexers):
        return tuple(indexers.get(d, slice(None)) for d in var.dims)

    # create variables for the new dataset
    reindexed = OrderedDict()

    for dim, indexer in indexers.items():
        if isinstance(indexer, DataArray) and indexer.dims != (dim, ):
        if isinstance(indexer, DataArray) and indexer.dims != (dim,):
            warnings.warn(
                "Indexer has dimensions {0:s} that are different "
                "from that to be indexed along {1:s}. "
@@ -375,47 +365,24 @@ def var_indexers(var, indexers):

    for name, var in iteritems(variables):
        if name not in indexers:
            assign_to = var_indexers(var, to_indexers)
            assign_from = var_indexers(var, from_indexers)

            if any_not_full_slices(assign_to):
                # there are missing values to in-fill
                data = var[assign_from].data
                dtype, fill_value = dtypes.maybe_promote(var.dtype)

                if isinstance(data, np.ndarray):
                    shape = tuple(new_sizes.get(dim, size)
                                  for dim, size in zip(var.dims, var.shape))
                    new_data = np.empty(shape, dtype=dtype)
                    new_data[...] = fill_value
                    # create a new Variable so we can use orthogonal indexing
                    # use fastpath=True to avoid dtype inference
                    new_var = Variable(var.dims, new_data, var.attrs,
                                       fastpath=True)
                    new_var[assign_to] = data

                else:  # dask array
                    data = data.astype(dtype, copy=False)
                    for axis, indexer in enumerate(assign_to):
                        if not is_full_slice(indexer):
                            indices = np.cumsum(indexer)[~indexer]
                            data = duck_array_ops.insert(
                                data, indices, fill_value, axis=axis)
                    new_var = Variable(var.dims, data, var.attrs,
                                       fastpath=True)

            elif any_not_full_slices(assign_from):
                # type coercion is not necessary as there are no missing
                # values
                new_var = var[assign_from]

            else:
                # no reindexing is necessary
            key = tuple(slice(None)
                        if d in unchanged_dims
                        else int_indexers.get(d, slice(None))
                        for d in var.dims)
            needs_masking = any(d in masked_dims for d in var.dims)

            if needs_masking:
                new_var = var._getitem_with_mask(key)
            elif all(is_full_slice(k) for k in key):
                # no reindexing necessary
                # here we need to manually deal with copying data, since
                # we neither created a new ndarray nor used fancy indexing
                new_var = var.copy(deep=copy)
            else:
                new_var = var[key]

            reindexed[name] = new_var

    return reindexed
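
The key idea in the new code path: pandas' get_indexer returns -1 for target labels missing from the index, and `_getitem_with_mask` turns those positions into fill values in a single indexing pass rather than allocating an array and assigning into it. A rough NumPy/pandas sketch of that idea (an illustration only, not xarray's actual implementation; the array and labels are made up):

import numpy as np
import pandas as pd

index = pd.Index([10, 20, 30])
target = pd.Index([20, 99, 10])
indexer = index.get_indexer(target)  # array([ 1, -1,  0]); -1 marks missing labels

data = np.array([1.0, 2.0, 3.0])
# take with the negative entries clamped to a valid position, then mask them
result = data[np.where(indexer >= 0, indexer, 0)]
result[indexer < 0] = np.nan  # -> array([ 2., nan,  1.])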


23 changes: 23 additions & 0 deletions xarray/core/dtypes.py
@@ -60,3 +60,26 @@ def is_datetime_like(dtype):
"""
return (np.issubdtype(dtype, np.datetime64) or
np.issubdtype(dtype, np.timedelta64))


def result_type(*arrays_and_dtypes):
    """Like np.result_type, but number + string -> object (not string).
Member:

I think we should also take care of the bool-float pair:

In [1]: import numpy as np
   ...: import xarray as xr
   ...: 
   ...: x = xr.DataArray([True, False])
   ...: x.where(x)
   ...: 
Out[1]: 
<xarray.DataArray (dim_0: 2)>
array([ 1., nan])
Dimensions without coordinates: dim_0


    Parameters
    ----------
    *arrays_and_dtypes : list of arrays and dtypes
        The dtype is extracted from both numpy and dask arrays.

    Returns
    -------
    numpy.dtype for the result.
    """
    types = [np.result_type(t).type for t in arrays_and_dtypes]

    # For reference, see the NumPy type hierarchy:
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
    if (any(issubclass(t, (np.number, np.bool_)) for t in types) and
            any(issubclass(t, np.character) for t in types)):
        return np.dtype(object)
    else:
        return np.result_type(*arrays_and_dtypes)
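
A quick sketch of how this differs from plain np.result_type (the same cases the new tests below exercise):

import numpy as np
from xarray.core import dtypes

dtypes.result_type(np.unicode_, np.int64)    # dtype('O'): number + string -> object
dtypes.result_type(np.float32, np.float64)   # dtype('float64'): defers to np.result_type
np.result_type(np.unicode_, np.int64)        # a string dtype, which is what we avoid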
16 changes: 15 additions & 1 deletion xarray/core/duck_array_ops.py
@@ -82,7 +82,7 @@ def isnull(data):


transpose = _dask_or_eager_func('transpose')
where = _dask_or_eager_func('where', n_array_args=3)
raw_where = _dask_or_eager_func('where', n_array_args=3)
insert = _dask_or_eager_func('insert')
take = _dask_or_eager_func('take')
broadcast_to = _dask_or_eager_func('broadcast_to')
@@ -151,6 +151,20 @@ def count(data, axis=None):
    return sum(~isnull(data), axis=axis)


def where(condition, x, y):
    """Three argument where() with better dtype promotion rules."""
    x = asarray(x)
    y = asarray(y)
    # Pass arrays directly instead of dtypes to result_type so Python scalars
    # get handled properly.
    # Note that result_type() safely gets the dtype from dask arrays without
    # evaluating them.
    out_type = dtypes.result_type(x, y)
    x = x.astype(out_type, copy=False)
    y = y.astype(out_type, copy=False)
    return raw_where(condition, x, y)


def where_method(data, cond, other=dtypes.NA):
    if other is dtypes.NA:
        other = dtypes.get_fill_value(data.dtype)
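
For illustration, the promotion behavior in practice (these mirror the new unit test below; a sketch, not additional API):

import numpy as np
from xarray.core.duck_array_ops import where

where([True, False], [1, 2], ['a', 'b'])
# -> array([1, 'b'], dtype=object), instead of casting the numbers to strings

where([True, False], np.array([1, 2], np.float32), np.nan)
# -> array([ 1., nan], dtype=float32); the NaN scalar no longer upcasts to float64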
6 changes: 6 additions & 0 deletions xarray/tests/test_dataarray.py
@@ -1717,6 +1717,12 @@ def test_where(self):
        actual = arr.where(arr.x < 2, drop=True)
        assert_identical(actual, expected)

    def test_where_string(self):
        array = DataArray(['a', 'b'])
        expected = DataArray(np.array(['a', np.nan], dtype=object))
        actual = array.where([True, False])
        assert_identical(actual, expected)

    def test_cumops(self):
        coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
                  'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
47 changes: 47 additions & 0 deletions xarray/tests/test_dtypes.py
@@ -0,0 +1,47 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pytest

from xarray.core import dtypes


@pytest.mark.parametrize("args, expected", [
    ([np.bool], np.bool),
    ([np.bool, np.string_], np.object),
    ([np.float32, np.float64], np.float64),
    ([np.float32, np.string_], np.object),
    ([np.unicode_, np.int64], np.object),
    ([np.unicode_, np.unicode_], np.unicode_),
])
def test_result_type(args, expected):
    actual = dtypes.result_type(*args)
    assert actual == expected


def test_result_type_scalar():
    actual = dtypes.result_type(np.arange(3, dtype=np.float32), np.nan)
    assert actual == np.float32


def test_result_type_dask_array():
    # verify it works without evaluating dask arrays
    da = pytest.importorskip('dask.array')
    dask = pytest.importorskip('dask')

    def error():
        raise RuntimeError

    array = da.from_delayed(dask.delayed(error)(), (), np.float64)
    with pytest.raises(RuntimeError):
        array.compute()

    actual = dtypes.result_type(array)
    assert actual == np.float64

    # note that this differs from the behavior for scalar numpy arrays, which
    # would get promoted to float32
    actual = dtypes.result_type(array, np.array([0.5, 1.0], dtype=np.float32))
    assert actual == np.float64
10 changes: 9 additions & 1 deletion xarray/tests/test_duck_array_ops.py
@@ -6,7 +6,7 @@
from numpy import array, nan
from . import assert_array_equal
from xarray.core.duck_array_ops import (
    first, last, count, mean, array_notnull_equiv,
    first, last, count, mean, array_notnull_equiv, where
)

from . import TestCase, raises_regex
@@ -76,6 +76,14 @@ def test_count(self):
        expected = array([[1, 2, 3], [3, 2, 1]])
        assert_array_equal(expected, count(self.x, axis=-1))

    def test_where_type_promotion(self):
        result = where([True, False], [1, 2], ['a', 'b'])
        assert_array_equal(result, np.array([1, 'b'], dtype=object))

        result = where([True, False], np.array([1, 2], np.float32), np.nan)
        assert result.dtype == np.float32
        assert_array_equal(result, np.array([1, np.nan], dtype=np.float32))

    def test_all_nan_arrays(self):
        assert np.isnan(mean([np.nan, np.nan]))
