diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4300f1b188a..b59ed6363dc 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -60,6 +60,10 @@ New Features By `Maximilian Roos `_. - Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. + By `Deepak Cherian `_ +- add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg + (:issue:`4749`, :pull:`4827`). + By `Justus Magin `_. By `Deepak Cherian `_. - :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims in the form of kwargs as well as a dict, like most similar methods. diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 86ed1870302..573247937b7 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -412,14 +412,16 @@ def combine_nested( - "override": if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \ - default: "drop" + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, default: "drop" String indicating how to combine attrs of the objects being merged: - "drop": empty attrs on returned Dataset. - "identical": all attrs must be the same on every object. - "no_conflicts": attrs from all objects are combined, any that have the same name must also have the same value. + - "drop_conflicts": attrs from all objects are combined, any that have + the same name but different values are dropped. - "override": skip comparing and copy attrs from the first dataset to the result. @@ -625,14 +627,16 @@ def combine_by_coords( - "override": if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \ - default: "drop" + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, default: "drop" String indicating how to combine attrs of the objects being merged: - "drop": empty attrs on returned Dataset. - "identical": all attrs must be the same on every object. - "no_conflicts": attrs from all objects are combined, any that have the same name must also have the same value. + - "drop_conflicts": attrs from all objects are combined, any that have + the same name but different values are dropped. - "override": skip comparing and copy attrs from the first dataset to the result. diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5cda5aa903c..7a958eb1404 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -142,14 +142,16 @@ def concat( - "override": if indexes are of same size, rewrite indexes to be those of the first object with that dimension. Indexes for the same dimension must have the same size in all objects. - combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \ - default: "override" + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, default: "override" String indicating how to combine attrs of the objects being merged: - "drop": empty attrs on returned Dataset. - "identical": all attrs must be the same on every object. - "no_conflicts": attrs from all objects are combined, any that have the same name must also have the same value. + - "drop_conflicts": attrs from all objects are combined, any that have + the same name but different values are dropped. - "override": skip comparing and copy attrs from the first dataset to the result. diff --git a/xarray/core/merge.py b/xarray/core/merge.py index d29a9e1ff02..14beeff3db5 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -20,7 +20,7 @@ from . import dtypes, pdcompat from .alignment import deep_align from .duck_array_ops import lazy_array_equiv -from .utils import Frozen, compat_dict_union, dict_equiv +from .utils import Frozen, compat_dict_union, dict_equiv, equivalent from .variable import Variable, as_variable, assert_unique_multiindex_level_names if TYPE_CHECKING: @@ -513,6 +513,24 @@ def merge_attrs(variable_attrs, combine_attrs): "the same. Merging %s with %s" % (str(result), str(attrs)) ) return result + elif combine_attrs == "drop_conflicts": + result = {} + dropped_keys = set() + for attrs in variable_attrs: + result.update( + { + key: value + for key, value in attrs.items() + if key not in result and key not in dropped_keys + } + ) + result = { + key: value + for key, value in result.items() + if key not in attrs or equivalent(attrs[key], value) + } + dropped_keys |= {key for key in attrs if key not in result} + return result elif combine_attrs == "identical": result = dict(variable_attrs[0]) for attrs in variable_attrs[1:]: @@ -556,7 +574,8 @@ def merge_core( Compatibility checks to use when merging variables. join : {"outer", "inner", "left", "right"}, optional How to combine objects with different indexes. - combine_attrs : {"drop", "identical", "no_conflicts", "override"}, optional + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, optional How to combine attributes of objects priority_arg : int, optional Optional argument in `objects` that takes precedence over the others. @@ -668,14 +687,16 @@ def merge( Value to use for newly missing values. If a dict-like, maps variable names to fill values. Use a data array's name to refer to its values. - combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \ - default: "drop" + combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \ + "override"}, default: "drop" String indicating how to combine attrs of the objects being merged: - "drop": empty attrs on returned Dataset. - "identical": all attrs must be the same on every object. - "no_conflicts": attrs from all objects are combined, any that have the same name must also have the same value. + - "drop_conflicts": attrs from all objects are combined, any that have + the same name but different values are dropped. - "override": skip comparing and copy attrs from the first dataset to the result. diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 109b78f05a9..522b98cf864 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -732,6 +732,17 @@ def test_combine_coords_combine_attrs_identical(self): objs, concat_dim="x", join="outer", combine_attrs="identical" ) + def test_combine_nested_combine_attrs_drop_conflicts(self): + objs = [ + Dataset({"x": [0], "y": [0]}, attrs={"a": 1, "b": 2, "c": 3}), + Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 0, "d": 3}), + ] + expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "c": 3, "d": 3}) + actual = combine_nested( + objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts" + ) + assert_identical(expected, actual) + def test_infer_order_from_coords(self): data = create_test_data() objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))] diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 7416cab13ed..beed48a35fc 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -258,27 +258,118 @@ def test_concat_join_kwarg(self): ) assert_identical(actual, expected) - def test_concat_combine_attrs_kwarg(self): - ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs={"b": 42}) - ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs={"b": 42, "c": 43}) - - expected = {} - expected["drop"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}) - expected["no_conflicts"] = Dataset( - {"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42, "c": 43} - ) - expected["override"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42}) - - with raises_regex(ValueError, "combine_attrs='identical'"): - actual = concat([ds1, ds2], dim="x", combine_attrs="identical") - with raises_regex(ValueError, "combine_attrs='no_conflicts'"): - ds3 = ds2.copy(deep=True) - ds3.attrs["b"] = 44 - actual = concat([ds1, ds3], dim="x", combine_attrs="no_conflicts") + @pytest.mark.parametrize( + "combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception", + [ + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 1, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + False, + ), + ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False), + ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False), + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 4, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + True, + ), + ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True), + ( + "override", + {"a": 1, "b": 2}, + {"a": 4, "b": 5, "c": 3}, + {"a": 1, "b": 2}, + False, + ), + ( + "drop_conflicts", + {"a": 41, "b": 42, "c": 43}, + {"b": 2, "c": 43, "d": 44}, + {"a": 41, "c": 43, "d": 44}, + False, + ), + ], + ) + def test_concat_combine_attrs_kwarg( + self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception + ): + ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs) + ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs) + + if expect_exception: + with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"): + concat([ds1, ds2], dim="x", combine_attrs=combine_attrs) + else: + actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs) + expected = Dataset( + {"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs + ) - for combine_attrs in expected: + assert_identical(actual, expected) + + @pytest.mark.skip(reason="not implemented, yet (see #4827)") + @pytest.mark.parametrize( + "combine_attrs, attrs1, attrs2, expected_attrs, expect_exception", + [ + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 1, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + False, + ), + ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False), + ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False), + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 4, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + True, + ), + ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True), + ( + "override", + {"a": 1, "b": 2}, + {"a": 4, "b": 5, "c": 3}, + {"a": 1, "b": 2}, + False, + ), + ( + "drop_conflicts", + {"a": 41, "b": 42, "c": 43}, + {"b": 2, "c": 43, "d": 44}, + {"a": 41, "c": 43, "d": 44}, + False, + ), + ], + ) + def test_concat_combine_attrs_kwarg_variables( + self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception + ): + """check that combine_attrs is used on data variables and coords""" + ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)}) + ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)}) + + if expect_exception: + with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"): + concat([ds1, ds2], dim="x", combine_attrs=combine_attrs) + else: actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs) - assert_identical(actual, expected[combine_attrs]) + expected = Dataset( + {"a": ("x", [0, 0], expected_attrs)}, + {"x": ("x", [0, 1], expected_attrs)}, + ) + + assert_identical(actual, expected) def test_concat_promote_shape(self): # mixed dims within variables diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 34b138e1f6a..27e2b10dcbc 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -92,6 +92,20 @@ def test_merge_arrays_attrs_default(self): {"a": 1, "b": 2}, False, ), + ( + "drop_conflicts", + {"a": 1, "b": 2, "c": 3}, + {"b": 1, "c": 3, "d": 4}, + {"a": 1, "c": 3, "d": 4}, + False, + ), + ( + "drop_conflicts", + {"a": 1, "b": np.array([2]), "c": np.array([3])}, + {"b": 1, "c": np.array([3]), "d": 4}, + {"a": 1, "c": np.array([3]), "d": 4}, + False, + ), ], ) def test_merge_arrays_attrs( @@ -109,6 +123,68 @@ def test_merge_arrays_attrs( expected.attrs = expected_attrs assert_identical(actual, expected) + @pytest.mark.skip(reason="not implemented, yet (see #4827)") + @pytest.mark.parametrize( + "combine_attrs, attrs1, attrs2, expected_attrs, expect_exception", + [ + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 1, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + False, + ), + ("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False), + ("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False), + ( + "no_conflicts", + {"a": 1, "b": 2}, + {"a": 4, "c": 3}, + {"a": 1, "b": 2, "c": 3}, + True, + ), + ("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False), + ("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True), + ( + "override", + {"a": 1, "b": 2}, + {"a": 4, "b": 5, "c": 3}, + {"a": 1, "b": 2}, + False, + ), + ( + "drop_conflicts", + {"a": 1, "b": 2, "c": 3}, + {"b": 1, "c": 3, "d": 4}, + {"a": 1, "c": 3, "d": 4}, + False, + ), + ], + ) + def test_merge_arrays_attrs_variables( + self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception + ): + """check that combine_attrs is used on data variables and coords""" + data = create_test_data() + data1 = data.copy() + data1.var1.attrs = attrs1 + data1.dim1.attrs = attrs1 + data2 = data.copy() + data2.var1.attrs = attrs2 + data2.dim1.attrs = attrs2 + + if expect_exception: + with raises_regex(MergeError, "combine_attrs"): + actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + else: + actual = xr.merge([data1, data2], combine_attrs=combine_attrs) + expected = data.copy() + expected.var1.attrs = expected_attrs + expected.dim1.attrs = expected_attrs + + assert_identical(actual, expected) + def test_merge_attrs_override_copy(self): ds1 = xr.Dataset(attrs={"x": 0}) ds2 = xr.Dataset(attrs={"x": 1}) @@ -116,6 +192,15 @@ def test_merge_attrs_override_copy(self): ds3.attrs["x"] = 2 assert ds1.x == 0 + def test_merge_attrs_drop_conflicts(self): + ds1 = xr.Dataset(attrs={"a": 0, "b": 0, "c": 0}) + ds2 = xr.Dataset(attrs={"b": 0, "c": 1, "d": 0}) + ds3 = xr.Dataset(attrs={"a": 0, "b": 1, "c": 0, "e": 0}) + + actual = xr.merge([ds1, ds2, ds3], combine_attrs="drop_conflicts") + expected = xr.Dataset(attrs={"a": 0, "d": 0, "e": 0}) + assert_identical(actual, expected) + def test_merge_dicts_simple(self): actual = xr.merge([{"foo": 0}, {"bar": "one"}, {"baz": 3.5}]) expected = xr.Dataset({"foo": 0, "bar": "one", "baz": 3.5})