Manually specify chunks in open_zarr #2530

Merged: 31 commits merged on Apr 18, 2019
Commits
c7c990b
added manual chunks for open_zarr
Oct 31, 2018
d37d9e1
updated whats-new
Oct 31, 2018
f3c829e
fixed pep8 issues
Oct 31, 2018
36f253f
removed whitespace
Oct 31, 2018
ae4cf0a
added deprecation warning
Nov 7, 2018
cccfd04
fixed pep8 issues
Nov 7, 2018
da45d77
added warning for bad chunks
lilyminium Jan 30, 2019
7618c08
fixed lingering rebase conflicts
lilyminium Jan 30, 2019
8571131
fixed pep8 issues
lilyminium Jan 30, 2019
a70205a
added stacklevel
lilyminium Apr 4, 2019
17fa557
fixed pep8 issues
lilyminium Apr 4, 2019
31619d7
Various fixes for explicit Dataset.indexes (#2858)
shoyer Apr 4, 2019
aa6abb5
0.12.1 release
shoyer Apr 5, 2019
23d54a8
revert to 0.12.2 dev
shoyer Apr 5, 2019
e7ec087
update links to https (#2872)
lpmi-13 Apr 7, 2019
3435b03
Fix mypy typing error in cftime_offsets.py (#2878)
shoyer Apr 8, 2019
2c10d14
decreased pytest verbosity (#2881)
rabernat Apr 9, 2019
f063f55
added manual chunks for open_zarr
Oct 31, 2018
c02a1c7
updated whats-new
Oct 31, 2018
c361f70
fixed pep8 issues
Oct 31, 2018
447af8c
removed whitespace
Oct 31, 2018
cdd23d4
added deprecation warning
Nov 7, 2018
7099e70
fixed pep8 issues
Nov 7, 2018
301953a
added warning for bad chunks
lilyminium Jan 30, 2019
8e61e7e
fixed lingering rebase conflicts
lilyminium Jan 30, 2019
8fd65ea
fixed pep8 issues
lilyminium Jan 30, 2019
4bb164d
added stacklevel
lilyminium Apr 4, 2019
485717d
fixed pep8 issues
lilyminium Apr 4, 2019
b0e1e1e
disallow unicode again
lilyminium Apr 12, 2019
c0cfa18
Merge branch 'zarr-open-chunks' of github.com:lilyminium/xarray into …
lilyminium Apr 12, 2019
f17cb5e
disallow unicode again
lilyminium Apr 12, 2019
2 changes: 1 addition & 1 deletion .travis.yml
@@ -65,7 +65,7 @@ script:
elif [[ "$CONDA_ENV" == "py36-hypothesis" ]]; then
pytest properties ;
else
py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing --verbose $EXTRA_FLAGS;
py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing $EXTRA_FLAGS;
fi

after_success:
24 changes: 12 additions & 12 deletions README.rst
@@ -8,9 +8,9 @@ xarray: N-D labeled arrays and datasets
.. image:: https://coveralls.io/repos/pydata/xarray/badge.svg
:target: https://coveralls.io/r/pydata/xarray
.. image:: https://readthedocs.org/projects/xray/badge/?version=latest
:target: http://xarray.pydata.org/
.. image:: http://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat
:target: http://pandas.pydata.org/speed/xarray/
:target: https://xarray.pydata.org/
.. image:: https://img.shields.io/badge/benchmarked%20by-asv-green.svg?style=flat
:target: https://pandas.pydata.org/speed/xarray/
.. image:: https://img.shields.io/pypi/v/xarray.svg
:target: https://pypi.python.org/pypi/xarray/

@@ -30,10 +30,10 @@ It is particularly tailored to working with netCDF_ files, which were the
source of xarray's data model, and integrates tightly with dask_ for parallel
computing.

.. _NumPy: http://www.numpy.org
.. _pandas: http://pandas.pydata.org
.. _dask: http://dask.org
.. _netCDF: http://www.unidata.ucar.edu/software/netcdf
.. _NumPy: https://www.numpy.org
.. _pandas: https://pandas.pydata.org
.. _dask: https://dask.org
.. _netCDF: https://www.unidata.ucar.edu/software/netcdf

Why xarray?
-----------
@@ -66,12 +66,12 @@ powerful and concise interface. For example:
Documentation
-------------

Learn more about xarray in its official documentation at http://xarray.pydata.org/
Learn more about xarray in its official documentation at https://xarray.pydata.org/

Contributing
------------

You can find information about contributing to xarray at our `Contributing page <http://xarray.pydata.org/en/latest/contributing.html#>`_.
You can find information about contributing to xarray at our `Contributing page <https://xarray.pydata.org/en/latest/contributing.html#>`_.

Get in touch
------------
@@ -81,9 +81,9 @@ Get in touch
- For less well defined questions or ideas, or to announce other projects of
interest to xarray users, use the `mailing list`_.

.. _StackOverFlow: http://stackoverflow.com/questions/tagged/python-xarray
.. _StackOverFlow: https://stackoverflow.com/questions/tagged/python-xarray
.. _mailing list: https://groups.google.com/forum/#!forum/xarray
.. _on GitHub: http://github.com/pydata/xarray
.. _on GitHub: https://github.com/pydata/xarray

NumFOCUS
--------
@@ -120,7 +120,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0
https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
25 changes: 21 additions & 4 deletions doc/whats-new.rst
@@ -13,25 +13,37 @@ What's New
import xarray as xr
np.random.seed(123456)

.. _whats-new.0.12.1:
.. _whats-new.0.12.2:

v0.12.1 (unreleased)
v0.12.2 (unreleased)
--------------------

Enhancements
~~~~~~~~~~~~

Bug fixes
~~~~~~~~~

.. _whats-new.0.12.1:

v0.12.1 (4 April 2019)
----------------------

Enhancements
~~~~~~~~~~~~

- Allow ``expand_dims`` method to support inserting/broadcasting dimensions
with size > 1. (:issue:`2710`)
By `Martin Pletcher <https://github.com/pletchm>`_.


Bug fixes
~~~~~~~~~

- Dataset.copy(deep=True) now creates a deep copy of the attrs (:issue:`2835`).
By `Andras Gefferth <https://github.com/kefirbandi>`_.
- ``swap_dims`` would create incorrect ``indexes`` (:issue:`2842`).
- Fix incorrect ``indexes`` resulting from various ``Dataset`` operations
(e.g., ``swap_dims``, ``isel``, ``reindex``, ``[]``) (:issue:`2842`,
:issue:`2856`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

.. _whats-new.0.12.0:
@@ -121,6 +133,11 @@ Other enhancements
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
By `Kevin Squire <https://github.com/kmsquire>`_.
- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=``
parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for
backwards compatibility. The ``overwrite_encoded_chunks`` parameter is
added to remove the original zarr chunk encoding.
By `Lily Wang <https://github.com/lilyminium>`_.
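(Not part of the diff.) A minimal usage sketch of the new keyword, assuming an existing store at the hypothetical path 'example.zarr' whose arrays have a 'time' dimension:

    import xarray as xr

    # Open lazily without creating dask chunks (previously auto_chunk=False)
    ds_lazy = xr.open_zarr('example.zarr', chunks=None)

    # Create dask chunks that mirror the encoded zarr chunks
    # (the default; previously auto_chunk=True)
    ds_auto = xr.open_zarr('example.zarr', chunks='auto')

    # Request 10 steps per dask chunk along 'time' and drop the original
    # zarr chunk encoding from each variable's .encoding
    ds = xr.open_zarr('example.zarr', chunks={'time': 10},
                      overwrite_encoded_chunks=True)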

Bug fixes
~~~~~~~~~
110 changes: 87 additions & 23 deletions xarray/backends/zarr.py
@@ -1,3 +1,4 @@
import warnings
from collections import OrderedDict
from distutils.version import LooseVersion

@@ -352,10 +353,11 @@ def close(self):
zarr.consolidate_metadata(self.ds.store)


def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
def open_zarr(store, group=None, synchronizer=None, chunks='auto',
decode_cf=True, mask_and_scale=True, decode_times=True,
concat_characters=True, decode_coords=True,
drop_variables=None, consolidated=False):
drop_variables=None, consolidated=False,
overwrite_encoded_chunks=False, **kwargs):
"""Load and decode a dataset from a Zarr store.

.. note:: Experimental
@@ -375,10 +377,15 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
Array synchronizer provided to zarr
group : str, optional
Group path. (a.k.a. `path` in zarr terminology.)
auto_chunk : bool, optional
Whether to automatically create dask chunks corresponding to each
variable's zarr chunks. If False, zarr array data will lazily convert
to numpy arrays upon access.
chunks : int or dict or tuple or {None, 'auto'}, optional
Chunk sizes along each dimension, e.g., ``5`` or
``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created
based on the variable's zarr chunks. If `chunks=None`, zarr array
data will lazily convert to numpy arrays upon access. This accepts
all the chunk specifications as Dask does.
overwrite_encoded_chunks: bool, optional
Whether to drop the zarr chunks encoded for each variable when a
dataset is loaded with specified chunk sizes (default: False)
decode_cf : bool, optional
Whether to decode these variables, assuming they were saved according
to CF conventions.
@@ -422,6 +429,24 @@ def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
----------
http://zarr.readthedocs.io/
"""
if 'auto_chunk' in kwargs:
auto_chunk = kwargs.pop('auto_chunk')
if auto_chunk:
chunks = 'auto' # maintain backwards compatibility
else:
chunks = None

warnings.warn("auto_chunk is deprecated. Use chunks='auto' instead.",
FutureWarning, stacklevel=2)

if kwargs:
raise TypeError("open_zarr() got unexpected keyword arguments " +
",".join(kwargs.keys()))

if not isinstance(chunks, (int, dict)):
if chunks != 'auto' and chunks is not None:
raise ValueError("chunks must be an int, dict, 'auto', or None. "
"Instead found %s. " % chunks)

if not decode_cf:
mask_and_scale = False
@@ -449,21 +474,60 @@ def maybe_decode_store(store, lock=False):

# auto chunking needs to be here and not in ZarrStore because variable
# chunks do not survive decode_cf
if auto_chunk:
# adapted from Dataset.Chunk()
def maybe_chunk(name, var):
from dask.base import tokenize
chunks = var.encoding.get('chunks')
if (var.ndim > 0) and (chunks is not None):
# does this cause any data to be read?
token2 = tokenize(name, var._data)
name2 = 'zarr-%s' % token2
return var.chunk(chunks, name=name2, lock=None)
else:
return var

variables = OrderedDict([(k, maybe_chunk(k, v))
for k, v in ds.variables.items()])
return ds._replace_vars_and_dims(variables)
else:
# return trivial case
if not chunks:
return ds

# adapted from Dataset.Chunk()
if isinstance(chunks, int):
chunks = dict.fromkeys(ds.dims, chunks)

if isinstance(chunks, tuple) and len(chunks) == len(ds.dims):
chunks = dict(zip(ds.dims, chunks))

def get_chunk(name, var, chunks):
chunk_spec = dict(zip(var.dims, var.encoding.get('chunks')))

# Coordinate labels aren't chunked
if var.ndim == 1 and var.dims[0] == name:
return chunk_spec

if chunks == 'auto':
return chunk_spec

for dim in var.dims:
if dim in chunks:
spec = chunks[dim]
if isinstance(spec, int):
spec = (spec,)
if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
if any(s % chunk_spec[dim] for s in spec):
warnings.warn("Specified Dask chunks %r would "
"separate Zarr chunk shape %r for "
"dimension %r. This significantly "
"degrades performance. Consider "
"rechunking after loading instead."
% (chunks[dim], chunk_spec[dim], dim),
stacklevel=2)
chunk_spec[dim] = chunks[dim]
return chunk_spec

def maybe_chunk(name, var, chunks):
from dask.base import tokenize

chunk_spec = get_chunk(name, var, chunks)

if (var.ndim > 0) and (chunk_spec is not None):
# does this cause any data to be read?
token2 = tokenize(name, var._data)
name2 = 'zarr-%s' % token2
var = var.chunk(chunk_spec, name=name2, lock=None)
if overwrite_encoded_chunks and var.chunks is not None:
var.encoding['chunks'] = tuple(x[0] for x in var.chunks)
return var
else:
return var

variables = OrderedDict([(k, maybe_chunk(k, v, chunks))
for k, v in ds.variables.items()])
return ds._replace_vars_and_dims(variables)
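(Not part of the diff.) The get_chunk helper above keeps the encoded zarr chunks unless the caller overrides them, and warns when a requested dask chunk size is not an exact multiple of the underlying zarr chunk size, because misaligned boundaries force dask tasks to read parts of several zarr chunks. A standalone sketch of that divisibility check, with hypothetical sizes:

    def splits_zarr_chunks(requested, encoded_size):
        """Re-statement of the check in get_chunk: True if any requested
        dask chunk size does not line up with the encoded zarr chunk size."""
        if isinstance(requested, int):
            requested = (requested,)
        return any(size % encoded_size for size in requested)

    print(splits_zarr_chunks(200, 100))        # False: 200 is a multiple of 100
    print(splits_zarr_chunks(150, 100))        # True: would trigger the warning
    print(splits_zarr_chunks((100, 50), 100))  # True: the trailing 50 splits a zarr chunk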
2 changes: 1 addition & 1 deletion xarray/coding/cftime_offsets.py
@@ -79,7 +79,7 @@ def get_date_type(calendar):

class BaseCFTimeOffset(object):
_freq = None # type: ClassVar[str]
_day_option = None
_day_option = None # type: ClassVar[str]

def __init__(self, n=1):
if not isinstance(n, int):
63 changes: 28 additions & 35 deletions xarray/core/alignment.py
@@ -315,36 +315,51 @@ def reindex_variables(
"""
from .dataarray import DataArray

# create variables for the new dataset
reindexed = OrderedDict() # type: OrderedDict[Any, Variable]

# build up indexers for assignment along each dimension
int_indexers = {}
targets = OrderedDict() # type: OrderedDict[Any, pd.Index]
new_indexes = OrderedDict(indexes)
masked_dims = set()
unchanged_dims = set()

# size of reindexed dimensions
new_sizes = {}
for dim, indexer in indexers.items():
if isinstance(indexer, DataArray) and indexer.dims != (dim,):
warnings.warn(
"Indexer has dimensions {0:s} that are different "
"from that to be indexed along {1:s}. "
"This will behave differently in the future.".format(
str(indexer.dims), dim),
FutureWarning, stacklevel=3)

target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim])

if dim in indexes:
index = indexes[dim]

for name, index in indexes.items():
if name in indexers:
if not index.is_unique:
raise ValueError(
'cannot reindex or align along dimension %r because the '
'index has duplicate values' % name)

target = utils.safe_cast_to_index(indexers[name])
new_sizes[name] = len(target)
'index has duplicate values' % dim)

int_indexer = get_indexer_nd(index, target, method, tolerance)

# We uses negative values from get_indexer_nd to signify
# values that are missing in the index.
if (int_indexer < 0).any():
masked_dims.add(name)
masked_dims.add(dim)
elif np.array_equal(int_indexer, np.arange(len(index))):
unchanged_dims.add(name)
unchanged_dims.add(dim)

int_indexers[name] = int_indexer
targets[name] = target
int_indexers[dim] = int_indexer

if dim in variables:
var = variables[dim]
args = (var.attrs, var.encoding) # type: tuple
else:
args = ()
reindexed[dim] = IndexVariable((dim,), target, *args)

for dim in sizes:
if dim not in indexes and dim in indexers:
@@ -356,25 +371,6 @@ def reindex_variables(
'index because its size %r is different from the size of '
'the new index %r' % (dim, existing_size, new_size))

# create variables for the new dataset
reindexed = OrderedDict() # type: OrderedDict[Any, Variable]

for dim, indexer in indexers.items():
if isinstance(indexer, DataArray) and indexer.dims != (dim,):
warnings.warn(
"Indexer has dimensions {0:s} that are different "
"from that to be indexed along {1:s}. "
"This will behave differently in the future.".format(
str(indexer.dims), dim),
FutureWarning, stacklevel=3)

if dim in variables:
var = variables[dim]
args = (var.attrs, var.encoding) # type: tuple
else:
args = ()
reindexed[dim] = IndexVariable((dim,), indexers[dim], *args)

for name, var in variables.items():
if name not in indexers:
key = tuple(slice(None)
@@ -395,9 +391,6 @@

reindexed[name] = new_var

new_indexes = OrderedDict(indexes)
new_indexes.update(targets)

return reindexed, new_indexes


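(Not part of the diff.) The FutureWarning kept in this refactor fires when an indexer DataArray has a dimension name different from the dimension being reindexed. A hedged illustration of the case it guards against, using a small made-up dataset:

    import xarray as xr

    ds = xr.Dataset({'a': ('x', [1, 2, 3])}, coords={'x': [10, 20, 30]})

    # The indexer's dimension ('points') differs from the reindexed
    # dimension ('x'), so this emits the FutureWarning shown above.
    indexer = xr.DataArray([10, 30], dims='points')
    reindexed = ds.reindex(x=indexer)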
3 changes: 0 additions & 3 deletions xarray/core/dataarray.py
@@ -231,9 +231,6 @@ def __init__(self, data, coords=None, dims=None, name=None,
coords, dims = _infer_coords_and_dims(data.shape, coords, dims)
variable = Variable(dims, data, attrs, encoding, fastpath=True)

# uncomment for a useful consistency check:
# assert all(isinstance(v, Variable) for v in coords.values())

# These fully describe a DataArray
self._variable = variable
self._coords = coords