Use getitem_with_mask in reindex_variables #1847

Merged · 9 commits · Feb 14, 2018
Changes from 7 commits
2 changes: 1 addition & 1 deletion asv_bench/asv.conf.json
@@ -63,7 +63,7 @@
"netcdf4": [""],
"scipy": [""],
"bottleneck": ["", null],
"dask": ["", null],
"dask": [""],
Member Author:

I don't think we need to run benchmarks separately with/without dask. This just makes the whole suite take twice as long as necessary. We already need to write separate benchmark cases, given that the syntax for using dask is different.

},


45 changes: 45 additions & 0 deletions asv_bench/benchmarks/reindexing.py
@@ -0,0 +1,45 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import xarray as xr

from . import requires_dask


class Reindex(object):
    def setup(self):
        data = np.random.RandomState(0).randn(1000, 100, 100)
        self.ds = xr.Dataset({'temperature': (('time', 'x', 'y'), data)},
                             coords={'time': np.arange(1000),
                                     'x': np.arange(100),
                                     'y': np.arange(100)})

    def time_1d_coarse(self):
        self.ds.reindex(time=np.arange(0, 1000, 5)).load()

    def time_1d_fine_all_found(self):
        self.ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest').load()

    def time_1d_fine_some_missing(self):
        self.ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest',
                        tolerance=0.1).load()

    def time_2d_coarse(self):
        self.ds.reindex(x=np.arange(0, 100, 2), y=np.arange(0, 100, 2)).load()

    def time_2d_fine_all_found(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5), y=np.arange(0, 100, 0.5),
                        method='nearest').load()

    def time_2d_fine_some_missing(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5), y=np.arange(0, 100, 0.5),
                        method='nearest', tolerance=0.1).load()
Member:

Do we have unit test coverage for this test with dask? I see it was failing in the ASV benchmark before the changes in alignment.py, so my guess is we don't have test coverage here.

Member Author:

I'll double check, but I think the test failure was just ASV timing out. When I ran a similar test case before (see my comment on the other PR) it was extremely slow.



class ReindexDask(Reindex):
    def setup(self):
        requires_dask()
        super(ReindexDask, self).setup()
        self.ds = self.ds.chunk({'time': 100})
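
For context: `requires_dask()` is xarray's benchmark helper that skips the case when dask is unavailable, and chunking makes the dataset dask-backed, so `reindex` only builds a task graph until `.load()` forces the computation being timed. A minimal sketch of the same operation outside ASV (assuming dask is installed):

import numpy as np
import xarray as xr

ds = xr.Dataset({'temperature': (('time', 'x', 'y'),
                                 np.zeros((1000, 100, 100)))},
                coords={'time': np.arange(1000),
                        'x': np.arange(100),
                        'y': np.arange(100)})
ds = ds.chunk({'time': 100})  # dask-backed from here on
# reindex is lazy on dask arrays: this line only builds the graph ...
result = ds.reindex(time=np.arange(0, 1000, 0.5), method='nearest',
                    tolerance=0.1)
result.load()  # ... and load() is what actually runs the computation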
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -79,6 +79,9 @@ Enhancements
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
Correspondingly, encoded variables may also be saved with a smaller dtype.
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
- Reindexing/alignment with a dask array is orders of magnitude faster when
  inserting missing values (:issue:`1847`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _Zarr: http://zarr.readthedocs.io/

@@ -135,6 +138,9 @@ Bug fixes
``parse_coordinates`` kwarg has been added to :py:func:`~open_rasterio`
(set to ``True`` by default).
By `Fabien Maussion <https://github.com/fmaussion>`_.
- :py:meth:`DataArray.where` with string data now fills with ``np.NaN`` like
pandas, not the string ``'nan'`` (:issue:`1847`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _whats-new.0.10.0:

101 changes: 34 additions & 67 deletions xarray/core/alignment.py
@@ -8,13 +8,11 @@

import numpy as np

from . import duck_array_ops
from . import dtypes
from . import utils
from .indexing import get_indexer_nd
from .pycompat import iteritems, OrderedDict, suppress
from .utils import is_full_slice, is_dict_like
from .variable import Variable, IndexVariable
from .variable import IndexVariable


def _get_joiner(join):
@@ -306,59 +304,51 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None,
    from .dataarray import DataArray

    # build up indexers for assignment along each dimension
    to_indexers = {}
    from_indexers = {}
    int_indexers = {}
    targets = {}
    masked_dims = set()
    unchanged_dims = set()

    # size of reindexed dimensions
    new_sizes = {}

    for name, index in iteritems(indexes):
        if name in indexers:
            target = utils.safe_cast_to_index(indexers[name])
            if not index.is_unique:
                raise ValueError(
                    'cannot reindex or align along dimension %r because the '
                    'index has duplicate values' % name)
            indexer = get_indexer_nd(index, target, method, tolerance)

            target = utils.safe_cast_to_index(indexers[name])
            new_sizes[name] = len(target)
            # Note pandas uses negative values from get_indexer_nd to signify
            # values that are missing in the index
            # The non-negative values thus indicate the non-missing values
            to_indexers[name] = indexer >= 0
            if to_indexers[name].all():
                # If an indexer includes no negative values, then the
                # assignment can be to a full-slice (which is much faster,
                # and means we won't need to fill in any missing values)
                to_indexers[name] = slice(None)

            from_indexers[name] = indexer[to_indexers[name]]
            if np.array_equal(from_indexers[name], np.arange(len(index))):
                # If the indexer is equal to the original index, use a full
                # slice object to speed up selection and so we can avoid
                # unnecessary copies
                from_indexers[name] = slice(None)

            int_indexer = get_indexer_nd(index, target, method, tolerance)

            # We use negative values from get_indexer_nd to signify
            # values that are missing in the index.
            if (int_indexer < 0).any():
                masked_dims.add(name)
            elif np.array_equal(int_indexer, np.arange(len(index))):
                unchanged_dims.add(name)

            int_indexers[name] = int_indexer
            targets[name] = target

    for dim in sizes:
        if dim not in indexes and dim in indexers:
            existing_size = sizes[dim]
            new_size = utils.safe_cast_to_index(indexers[dim]).size
            new_size = indexers[dim].size
            if existing_size != new_size:
                raise ValueError(
                    'cannot reindex or align along dimension %r without an '
                    'index because its size %r is different from the size of '
                    'the new index %r' % (dim, existing_size, new_size))

    def any_not_full_slices(indexers):
        return any(not is_full_slice(idx) for idx in indexers)

    def var_indexers(var, indexers):
        return tuple(indexers.get(d, slice(None)) for d in var.dims)

    # create variables for the new dataset
    reindexed = OrderedDict()

    for dim, indexer in indexers.items():
        if isinstance(indexer, DataArray) and indexer.dims != (dim, ):
        if isinstance(indexer, DataArray) and indexer.dims != (dim,):
            warnings.warn(
                "Indexer has dimensions {0:s} that are different "
                "from that to be indexed along {1:s}. "
@@ -375,47 +365,24 @@ def var_indexers(var, indexers):

    for name, var in iteritems(variables):
        if name not in indexers:
            assign_to = var_indexers(var, to_indexers)
            assign_from = var_indexers(var, from_indexers)

            if any_not_full_slices(assign_to):
                # there are missing values to in-fill
                data = var[assign_from].data
                dtype, fill_value = dtypes.maybe_promote(var.dtype)

                if isinstance(data, np.ndarray):
                    shape = tuple(new_sizes.get(dim, size)
                                  for dim, size in zip(var.dims, var.shape))
                    new_data = np.empty(shape, dtype=dtype)
                    new_data[...] = fill_value
                    # create a new Variable so we can use orthogonal indexing
                    # use fastpath=True to avoid dtype inference
                    new_var = Variable(var.dims, new_data, var.attrs,
                                       fastpath=True)
                    new_var[assign_to] = data

                else:  # dask array
                    data = data.astype(dtype, copy=False)
                    for axis, indexer in enumerate(assign_to):
                        if not is_full_slice(indexer):
                            indices = np.cumsum(indexer)[~indexer]
                            data = duck_array_ops.insert(
                                data, indices, fill_value, axis=axis)
                    new_var = Variable(var.dims, data, var.attrs,
                                       fastpath=True)

            elif any_not_full_slices(assign_from):
                # type coercion is not necessary as there are no missing
                # values
                new_var = var[assign_from]

            else:
                # no reindexing is necessary
            key = tuple(slice(None)
                        if d in unchanged_dims
                        else int_indexers.get(d, slice(None))
                        for d in var.dims)
            needs_masking = any(d in masked_dims for d in var.dims)

            if needs_masking:
                new_var = var._getitem_with_mask(key)
            elif all(is_full_slice(k) for k in key):
                # no reindexing necessary
                # here we need to manually deal with copying data, since
                # we neither created a new ndarray nor used fancy indexing
                new_var = var.copy(deep=copy)
            else:
                new_var = var[key]

            reindexed[name] = new_var

    return reindexed
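
The key idea in the new code path: pandas' get_indexer returns -1 for target labels missing from the index, and `_getitem_with_mask` turns those positions into fill values in a single indexing pass rather than allocating an array and assigning into it. A rough NumPy/pandas sketch of that idea (an illustration only, not xarray's actual implementation; the array and labels are made up):

import numpy as np
import pandas as pd

index = pd.Index([10, 20, 30])
target = pd.Index([20, 99, 10])
indexer = index.get_indexer(target)  # array([ 1, -1,  0]); -1 marks missing labels

data = np.array([1.0, 2.0, 3.0])
# take with the negative entries clamped to a valid position, then mask them
result = data[np.where(indexer >= 0, indexer, 0)]
result[indexer < 0] = np.nan  # -> array([ 2., nan,  1.])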


23 changes: 23 additions & 0 deletions xarray/core/dtypes.py
@@ -60,3 +60,26 @@ def is_datetime_like(dtype):
"""
return (np.issubdtype(dtype, np.datetime64) or
np.issubdtype(dtype, np.timedelta64))


def result_type(*arrays_and_dtypes):
    """Like np.result_type, but number + string -> object (not string).
Member:

I think we should also take care of the bool-float pair:

In [1]: import numpy as np
   ...: import xarray as xr
   ...: 
   ...: x = xr.DataArray([True, False])
   ...: x.where(x)
   ...: 
Out[1]: 
<xarray.DataArray (dim_0: 2)>
array([ 1., nan])
Dimensions without coordinates: dim_0


    Parameters
    ----------
    *arrays_and_dtypes : list of arrays and dtypes
        The dtype is extracted from both numpy and dask arrays.

    Returns
    -------
    numpy.dtype for the result.
    """
    types = [np.result_type(t).type for t in arrays_and_dtypes]

    # For reference, see the NumPy type hierarchy:
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html
    if (any(issubclass(t, (np.number, np.bool_)) for t in types) and
            any(issubclass(t, np.character) for t in types)):
        return np.dtype(object)
    else:
        return np.result_type(*arrays_and_dtypes)
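
A quick sketch of how this differs from plain np.result_type (the same cases the new tests below exercise):

import numpy as np
from xarray.core import dtypes

dtypes.result_type(np.unicode_, np.int64)    # dtype('O'): number + string -> object
dtypes.result_type(np.float32, np.float64)   # dtype('float64'): defers to np.result_type
np.result_type(np.unicode_, np.int64)        # a string dtype, which is what we avoid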
16 changes: 15 additions & 1 deletion xarray/core/duck_array_ops.py
@@ -82,7 +82,7 @@ def isnull(data):


transpose = _dask_or_eager_func('transpose')
where = _dask_or_eager_func('where', n_array_args=3)
raw_where = _dask_or_eager_func('where', n_array_args=3)
insert = _dask_or_eager_func('insert')
take = _dask_or_eager_func('take')
broadcast_to = _dask_or_eager_func('broadcast_to')
@@ -151,6 +151,20 @@ def count(data, axis=None):
    return sum(~isnull(data), axis=axis)


def where(condition, x, y):
    """Three argument where() with better dtype promotion rules."""
    x = asarray(x)
    y = asarray(y)
    # Pass arrays directly instead of dtypes to result_type so Python scalars
    # get handled properly.
    # Note that result_type() safely gets the dtype from dask arrays without
    # evaluating them.
    out_type = dtypes.result_type(x, y)
    x = x.astype(out_type, copy=False)
    y = y.astype(out_type, copy=False)
    return raw_where(condition, x, y)


def where_method(data, cond, other=dtypes.NA):
    if other is dtypes.NA:
        other = dtypes.get_fill_value(data.dtype)
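
For illustration, the promotion behavior in practice (these mirror the new unit test below; a sketch, not additional API):

import numpy as np
from xarray.core.duck_array_ops import where

where([True, False], [1, 2], ['a', 'b'])
# -> array([1, 'b'], dtype=object), instead of casting the numbers to strings

where([True, False], np.array([1, 2], np.float32), np.nan)
# -> array([ 1., nan], dtype=float32); the NaN scalar no longer upcasts to float64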
6 changes: 6 additions & 0 deletions xarray/tests/test_dataarray.py
@@ -1717,6 +1717,12 @@ def test_where(self):
        actual = arr.where(arr.x < 2, drop=True)
        assert_identical(actual, expected)

    def test_where_string(self):
        array = DataArray(['a', 'b'])
        expected = DataArray(np.array(['a', np.nan], dtype=object))
        actual = array.where([True, False])
        assert_identical(actual, expected)

    def test_cumops(self):
        coords = {'x': [-1, -2], 'y': ['ab', 'cd', 'ef'],
                  'lat': (['x', 'y'], [[1, 2, 3], [-1, -2, -3]]),
47 changes: 47 additions & 0 deletions xarray/tests/test_dtypes.py
@@ -0,0 +1,47 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pytest

from xarray.core import dtypes


@pytest.mark.parametrize("args, expected", [
    ([np.bool], np.bool),
    ([np.bool, np.string_], np.object),
    ([np.float32, np.float64], np.float64),
    ([np.float32, np.string_], np.object),
    ([np.unicode_, np.int64], np.object),
    ([np.unicode_, np.unicode_], np.unicode_),
])
def test_result_type(args, expected):
    actual = dtypes.result_type(*args)
    assert actual == expected


def test_result_type_scalar():
    actual = dtypes.result_type(np.arange(3, dtype=np.float32), np.nan)
    assert actual == np.float32


def test_result_type_dask_array():
    # verify it works without evaluating dask arrays
    da = pytest.importorskip('dask.array')
    dask = pytest.importorskip('dask')

    def error():
        raise RuntimeError

    array = da.from_delayed(dask.delayed(error)(), (), np.float64)
    with pytest.raises(RuntimeError):
        array.compute()

    actual = dtypes.result_type(array)
    assert actual == np.float64

    # note that this differs from the behavior for scalar numpy arrays, which
    # would get promoted to float32
    actual = dtypes.result_type(array, np.array([0.5, 1.0], dtype=np.float32))
    assert actual == np.float64
10 changes: 9 additions & 1 deletion xarray/tests/test_duck_array_ops.py
@@ -6,7 +6,7 @@
from numpy import array, nan
from . import assert_array_equal
from xarray.core.duck_array_ops import (
    first, last, count, mean, array_notnull_equiv,
    first, last, count, mean, array_notnull_equiv, where
)

from . import TestCase, raises_regex
@@ -76,6 +76,14 @@ def test_count(self):
        expected = array([[1, 2, 3], [3, 2, 1]])
        assert_array_equal(expected, count(self.x, axis=-1))

    def test_where_type_promotion(self):
        result = where([True, False], [1, 2], ['a', 'b'])
        assert_array_equal(result, np.array([1, 'b'], dtype=object))

        result = where([True, False], np.array([1, 2], np.float32), np.nan)
        assert result.dtype == np.float32
        assert_array_equal(result, np.array([1, np.nan], dtype=np.float32))

    def test_all_nan_arrays(self):
        assert np.isnan(mean([np.nan, np.nan]))
