Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use getitem_with_mask in reindex_variables #1847

Merged
merged 9 commits into from
Feb 14, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
"netcdf4": [""],
"scipy": [""],
"bottleneck": ["", null],
"dask": ["", null],
"dask": [""],
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to run benchmarks separately with/without dask. This just makes the whole suite take twice as long as necessary. We already need to write separate benchmark cases, given that the syntax for using dask is different.

},


Expand Down
45 changes: 45 additions & 0 deletions asv_bench/benchmarks/reindexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import xarray as xr

from . import requires_dask


class Reindex(object):
    """ASV benchmarks for reindexing an in-memory Dataset.

    Covers coarsening (all target labels present), nearest-neighbour
    upsampling where every label is matched, and upsampling with a
    tolerance so some labels come back as missing (NaN-filled).
    """

    def setup(self):
        # Fixed seed keeps the data identical across benchmark runs.
        rng = np.random.RandomState(0)
        values = rng.randn(1000, 100, 100)
        coords = {'time': np.arange(1000),
                  'x': np.arange(100),
                  'y': np.arange(100)}
        self.ds = xr.Dataset(
            {'temperature': (('time', 'x', 'y'), values)},
            coords=coords)

    def time_1d_coarse(self):
        # Every fifth time label; all targets exist in the index.
        target = np.arange(0, 1000, 5)
        self.ds.reindex(time=target).load()

    def time_1d_fine_all_found(self):
        # Half-integer labels, matched via nearest-neighbour lookup.
        target = np.arange(0, 1000, 0.5)
        self.ds.reindex(time=target, method='nearest').load()

    def time_1d_fine_some_missing(self):
        # Tight tolerance leaves the half-integer labels unmatched.
        target = np.arange(0, 1000, 0.5)
        self.ds.reindex(time=target, method='nearest',
                        tolerance=0.1).load()

    def time_2d_coarse(self):
        self.ds.reindex(x=np.arange(0, 100, 2),
                        y=np.arange(0, 100, 2)).load()

    def time_2d_fine_all_found(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5),
                        y=np.arange(0, 100, 0.5),
                        method='nearest').load()

    def time_2d_fine_some_missing(self):
        self.ds.reindex(x=np.arange(0, 100, 0.5),
                        y=np.arange(0, 100, 0.5),
                        method='nearest', tolerance=0.1).load()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have unit test coverage for this test w/ dask? I see it was failing in the ASV benchmark before the changes in alignment.py so my guess is we don't have test coverage here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll double check but I think the test failure was just ASV timing out. When I run a similar test case before (see my comment on the other PR) it was extremely slow.



class ReindexDask(Reindex):
    """Run the same reindexing benchmarks against a dask-backed Dataset."""

    def setup(self):
        # Skips the benchmark (via the project helper) when dask is absent;
        # must run before touching self.ds.
        requires_dask()
        super(ReindexDask, self).setup()
        # Chunk along time so reindexing exercises the dask code path.
        chunked = self.ds.chunk({'time': 100})
        self.ds = chunked
97 changes: 33 additions & 64 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,59 +306,51 @@ def reindex_variables(variables, sizes, indexes, indexers, method=None,
from .dataarray import DataArray

# build up indexers for assignment along each dimension
to_indexers = {}
from_indexers = {}
int_indexers = {}
targets = {}
masked_dims = set()
unchanged_dims = set()

# size of reindexed dimensions
new_sizes = {}

for name, index in iteritems(indexes):
if name in indexers:
target = utils.safe_cast_to_index(indexers[name])
if not index.is_unique:
raise ValueError(
'cannot reindex or align along dimension %r because the '
'index has duplicate values' % name)
indexer = get_indexer_nd(index, target, method, tolerance)

target = utils.safe_cast_to_index(indexers[name])
new_sizes[name] = len(target)
# Note pandas uses negative values from get_indexer_nd to signify
# values that are missing in the index
# The non-negative values thus indicate the non-missing values
to_indexers[name] = indexer >= 0
if to_indexers[name].all():
# If an indexer includes no negative values, then the
# assignment can be to a full-slice (which is much faster,
# and means we won't need to fill in any missing values)
to_indexers[name] = slice(None)

from_indexers[name] = indexer[to_indexers[name]]
if np.array_equal(from_indexers[name], np.arange(len(index))):
# If the indexer is equal to the original index, use a full
# slice object to speed up selection and so we can avoid
# unnecessary copies
from_indexers[name] = slice(None)

int_indexer = get_indexer_nd(index, target, method, tolerance)

# We use negative values from get_indexer_nd to signify
# values that are missing in the index.
if (int_indexer < 0).any():
masked_dims.add(name)
elif np.array_equal(int_indexer, np.arange(len(index))):
unchanged_dims.add(name)

int_indexers[name] = int_indexer
targets[name] = target

for dim in sizes:
if dim not in indexes and dim in indexers:
existing_size = sizes[dim]
new_size = utils.safe_cast_to_index(indexers[dim]).size
new_size = indexers[dim].size
if existing_size != new_size:
raise ValueError(
'cannot reindex or align along dimension %r without an '
'index because its size %r is different from the size of '
'the new index %r' % (dim, existing_size, new_size))

def any_not_full_slices(indexers):
return any(not is_full_slice(idx) for idx in indexers)

def var_indexers(var, indexers):
return tuple(indexers.get(d, slice(None)) for d in var.dims)

# create variables for the new dataset
reindexed = OrderedDict()

for dim, indexer in indexers.items():
if isinstance(indexer, DataArray) and indexer.dims != (dim, ):
if isinstance(indexer, DataArray) and indexer.dims != (dim,):
warnings.warn(
"Indexer has dimensions {0:s} that are different "
"from that to be indexed along {1:s}. "
Expand All @@ -375,47 +367,24 @@ def var_indexers(var, indexers):

for name, var in iteritems(variables):
if name not in indexers:
assign_to = var_indexers(var, to_indexers)
assign_from = var_indexers(var, from_indexers)

if any_not_full_slices(assign_to):
# there are missing values to in-fill
data = var[assign_from].data
dtype, fill_value = dtypes.maybe_promote(var.dtype)

if isinstance(data, np.ndarray):
shape = tuple(new_sizes.get(dim, size)
for dim, size in zip(var.dims, var.shape))
new_data = np.empty(shape, dtype=dtype)
new_data[...] = fill_value
# create a new Variable so we can use orthogonal indexing
# use fastpath=True to avoid dtype inference
new_var = Variable(var.dims, new_data, var.attrs,
fastpath=True)
new_var[assign_to] = data

else: # dask array
data = data.astype(dtype, copy=False)
for axis, indexer in enumerate(assign_to):
if not is_full_slice(indexer):
indices = np.cumsum(indexer)[~indexer]
data = duck_array_ops.insert(
data, indices, fill_value, axis=axis)
new_var = Variable(var.dims, data, var.attrs,
fastpath=True)

elif any_not_full_slices(assign_from):
# type coercion is not necessary as there are no missing
# values
new_var = var[assign_from]

else:
# no reindexing is necessary
key = tuple(slice(None)
if d in unchanged_dims
else int_indexers.get(d, slice(None))
for d in var.dims)
needs_masking = any(d in masked_dims for d in var.dims)

if needs_masking:
new_var = var._getitem_with_mask(key)
elif all(is_full_slice(k) for k in key):
# no reindexing necessary
# here we need to manually deal with copying data, since
# we neither created a new ndarray nor used fancy indexing
new_var = var.copy(deep=copy)
else:
new_var = var[key]

reindexed[name] = new_var

return reindexed


Expand Down