Add drop duplicates #5089

Closed · wants to merge 32 commits

Commits (32)

d84dae7 Add drop duplicates; wip need to fix tests (Mar 29, 2021)
1494966 Comments (Mar 30, 2021)
adfafc0 Replace apostrophes with quotations (Mar 30, 2021)
322ad9a Add whats new, fix linting, and bug (Mar 30, 2021)
81d4002 Add to api, pre commit (Mar 30, 2021)
28aa96a Revise based on comments (Mar 31, 2021)
4b1dab7 Merge branch 'master' of https://github.com/pydata/xarray into drop_d… (Mar 31, 2021)
f9ee3fe iPrecommit (Mar 31, 2021)
daa6e42 Rewrite function to support coords (Apr 4, 2021)
cc94bbe Merge branch 'master' into drop_duplicates (ahuang11, Apr 4, 2021)
f7dcdd4 Lint (Apr 4, 2021)
915dcf5 Add back >>> for what's new (Apr 4, 2021)
344a7d8 Revert new line (Apr 4, 2021)
596ec7a Update test_dataarray tests (Apr 4, 2021)
1698990 Fix formatting (Apr 4, 2021)
e307041 Fix test (Apr 4, 2021)
d33586e Fix based on sugggestion (Apr 6, 2021)
8c27afb Black (Apr 6, 2021)
8a168ce Replace drop with drop_vars (Apr 6, 2021)
a1ce19d Fix tests (Apr 6, 2021)
e1e24bc Revert "iPrecommit" (May 1, 2021)
d7cf3c4 Revert "Merge branch 'master' into drop_duplicates" (May 1, 2021)
1c8a4ae Revert "Fix tests" (May 1, 2021)
c2cc15f Revert "Replace drop with drop_vars" (May 1, 2021)
966a420 Revert "Black" (May 1, 2021)
d9fde90 Revert "Fix based on sugggestion" (May 1, 2021)
b9ee4ca Revert "Fix test" (May 1, 2021)
3b9b7e3 Revert "Fix formatting" (May 1, 2021)
61352f9 Revert "Update test_dataarray tests" (May 1, 2021)
25949b0 Revert "Revert new line" (May 1, 2021)
5c4fc82 Revert "Add back >>> for what's new" (May 1, 2021)
a77f78d Revert "Lint" (May 1, 2021)
26 changes: 25 additions & 1 deletion xarray/core/dataarray.py

@@ -4418,10 +4418,34 @@ def query(
        )
        return ds[self.name]

    def drop_duplicates(
        self,
        subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
        keep: Union[str, bool] = "first",
    ) -> "DataArray":
        """Returns a new data array with duplicate dimension values removed.

        Parameters
        ----------
        subset : dimension label or sequence of labels, optional
            Only consider certain dimensions for identifying duplicates; by
            default all dimensions are used.
        keep : {"first", "last", False}, default: "first"
            Determines which duplicates (if any) to keep.
            - ``"first"`` : Drop duplicates except for the first occurrence.
            - ``"last"`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.

        Returns
        -------
        DataArray
        """
        ds = self._to_temp_dataset().drop_duplicates(subset=subset, keep=keep)
        return self._from_temp_dataset(ds)

    # this needs to be at the end, or mypy will confuse with `str`
    # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
    str = utils.UncachedAccessor(StringAccessor)


# priority must be higher than Variable to properly work with binary ufuncs
ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
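
For reference, a minimal usage sketch of the method added above (not part of this diff; it assumes a build of this branch, and the `temperature` name and values are illustrative):

```python
import xarray as xr

# A DataArray whose "time" coordinate contains a duplicated value (0 twice).
da = xr.DataArray(
    [10, 11, 12, 13],
    dims="time",
    coords={"time": [0, 0, 1, 2]},
    name="temperature",
)

# Keep only the first occurrence of each duplicated "time" value.
deduped = da.drop_duplicates("time", keep="first")
print(deduped["time"].values)  # [0 1 2]
print(deduped.values)          # [10 12 13]
```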
46 changes: 46 additions & 0 deletions xarray/core/dataset.py

@@ -7074,5 +7074,51 @@ def query(
        # apply the selection
        return self.isel(indexers, missing_dims=missing_dims)

    def drop_duplicates(
        self,
        subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
        keep: Union[str, bool] = "first",
    ) -> "Dataset":
        """Returns a new dataset with duplicate dimension values removed.

        Parameters
        ----------
        subset : dimension label or sequence of labels, optional
            Only consider certain dimensions for identifying duplicates; by
            default all dimensions are used.
        keep : {"first", "last", False}, default: "first"
            Determines which duplicates (if any) to keep.
            - ``"first"`` : Drop duplicates except for the first occurrence.
            - ``"last"`` : Drop duplicates except for the last occurrence.
            - False : Drop all duplicates.

        Returns
        -------
        Dataset
        """
        if subset is None:
            # Default to all dimensions; every entry is validated below, so
            # non-dimension coordinates must not be included here.
            subset = list(self.dims)
        elif isinstance(subset, str):
            subset = [subset]

        for dim in subset:
            if dim not in self.dims:
                raise ValueError(f"{dim!r} must be a single dataset dimension")

        new = self.copy(deep=False)
        if len(subset) > 1:
            # Stack the requested dimensions into one temporary dimension so
            # duplicates are identified across their combined values.
            new = new.stack({"tmp_dim": subset})
            subset = "tmp_dim"
        else:
            subset = subset[0]

        # Boolean mask marking the duplicated entries along the dimension.
        duplicated = new.get_index(subset).duplicated(keep=keep)
        new = new.isel(**{subset: ~duplicated})

        if "tmp_dim" in new.dims:
            new = new.unstack("tmp_dim")

        return new


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
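
The implementation above delegates duplicate detection to `pandas.Index.duplicated` and handles multiple dimensions by stacking them into a temporary MultiIndex dimension. A standalone sketch of that mechanism, using only existing pandas/xarray calls (the names here are illustrative, not part of the diff):

```python
import pandas as pd
import xarray as xr

# duplicated() returns a boolean mask; True marks the entries to drop.
idx = pd.Index([0, 0, 1, 2])
print(idx.duplicated(keep="first"))  # [False  True False False]
print(idx.duplicated(keep="last"))   # [ True False False False]
print(idx.duplicated(keep=False))    # [ True  True False False]

# With several dimensions, stack() builds a MultiIndex whose entries are
# (lat, lon) tuples, so duplicated() compares the combined coordinate values.
ds = xr.Dataset(coords={"lat": [0, 1, 1], "lon": [5, 5, 6]})
stacked = ds.stack({"tmp_dim": ["lat", "lon"]})
print(stacked.get_index("tmp_dim").duplicated(keep="first"))
```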
48 changes: 48 additions & 0 deletions xarray/tests/test_dataset.py

@@ -6845,3 +6845,51 @@ def test_deepcopy_obj_array():
    x0 = Dataset(dict(foo=DataArray(np.array([object()]))))
    x1 = deepcopy(x0)
    assert x0["foo"].values[0] is not x1["foo"].values[0]


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates(keep):
    ds = xr.DataArray(
        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
    ).to_dataset()

    if keep == "first":
        data = [0, 6, 7]
        time = [0, 1, 2]
    elif keep == "last":
        data = [5, 6, 7]
        time = [0, 1, 2]
    else:
        data = [6, 7]
        time = [1, 2]

    expected = xr.DataArray(
        data, dims="time", coords={"time": time}, name="test"
    ).to_dataset()
    result = ds.drop_duplicates("time", keep=keep)
    assert_equal(expected, result)


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_multi_dim(keep):
    base_data = np.stack([np.arange(0, 5) * i for i in np.arange(0, 5)])
    ds = xr.DataArray(
        base_data,
        coords={"lat": [0, 1, 2, 2, 3], "lon": [0, 1, 3, 3, 4]},
        dims=["lat", "lon"],
        name="test",
    ).to_dataset()

    if keep == "first":
        data = base_data[[0, 1, 2, 4]][:, [0, 1, 2, 4]]
        lat = [0, 1, 2, 3]
        lon = [0, 1, 3, 4]
    elif keep == "last":
        data = base_data[[0, 1, 3, 4]][:, [0, 1, 3, 4]]
        lat = [0, 1, 2, 3]
        lon = [0, 1, 3, 4]
    else:
        data = base_data[[0, 1, 4]][:, [0, 1, 4]]
        lat = [0, 1, 3]
        lon = [0, 1, 4]

    expected = xr.DataArray(
        data, dims=["lat", "lon"], coords={"lat": lat, "lon": lon}, name="test"
    ).to_dataset()
    result = ds.drop_duplicates(["lat", "lon"], keep=keep)
    assert_equal(expected, result)