-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: multi-column explode #40770
ENH: multi-column explode #40770
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -8151,16 +8151,27 @@ def stack(self, level: Level = -1, dropna: bool = True): | |||||
|
||||||
return result.__finalize__(self, method="stack") | ||||||
|
||||||
def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: | ||||||
def explode( | ||||||
self, | ||||||
column: str | tuple | list[str | tuple], | ||||||
ignore_index: bool = False, | ||||||
) -> DataFrame: | ||||||
""" | ||||||
Transform each element of a list-like to a row, replicating index values. | ||||||
|
||||||
.. versionadded:: 0.25.0 | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
column : str or tuple | ||||||
Column to explode. | ||||||
column : str or tuple or list thereof | ||||||
Column(s) to explode. | ||||||
For multiple columns, specify a non-empty list with each element | ||||||
being a str or tuple; the list-like data of all specified columns | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
on the same row of the frame must have matching length. | ||||||
|
||||||
.. versionadded:: 1.3.0 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
Multi-column explode | ||||||
|
||||||
ignore_index : bool, default False | ||||||
If True, the resulting index will be labeled 0, 1, …, n - 1. | ||||||
|
||||||
|
@@ -8175,7 +8186,10 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: | |||||
Raises | ||||||
------ | ||||||
ValueError : | ||||||
if columns of the frame are not unique. | ||||||
* If columns of the frame are not unique. | ||||||
* If the list of columns to explode is empty. | ||||||
* If the specified columns to explode do not have matching counts of | ||||||
elements row-wise in the frame. | ||||||
|
||||||
See Also | ||||||
-------- | ||||||
|
@@ -8194,32 +8208,69 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: | |||||
|
||||||
Examples | ||||||
-------- | ||||||
>>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) | ||||||
>>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], | ||||||
... 'B': 1, | ||||||
... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) | ||||||
>>> df | ||||||
A B | ||||||
0 [1, 2, 3] 1 | ||||||
1 foo 1 | ||||||
2 [] 1 | ||||||
3 [3, 4] 1 | ||||||
A B C | ||||||
0 [0, 1, 2] 1 [a, b, c] | ||||||
1 foo 1 NaN | ||||||
2 [] 1 [] | ||||||
3 [3, 4] 1 [d, e] | ||||||
|
||||||
Single-column explode. | ||||||
|
||||||
>>> df.explode('A') | ||||||
A B | ||||||
0 1 1 | ||||||
0 2 1 | ||||||
0 3 1 | ||||||
1 foo 1 | ||||||
2 NaN 1 | ||||||
3 3 1 | ||||||
3 4 1 | ||||||
""" | ||||||
if not (is_scalar(column) or isinstance(column, tuple)): | ||||||
raise ValueError("column must be a scalar") | ||||||
A B C | ||||||
0 0 1 [a, b, c] | ||||||
0 1 1 [a, b, c] | ||||||
0 2 1 [a, b, c] | ||||||
1 foo 1 NaN | ||||||
2 NaN 1 [] | ||||||
3 3 1 [d, e] | ||||||
3 4 1 [d, e] | ||||||
|
||||||
Multi-column explode. | ||||||
|
||||||
>>> df.explode(list('AC')) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a "Multi-column explode" heading here (and add a comment noting that the example above is a single-column explode). |
||||||
A B C | ||||||
0 0 1 a | ||||||
0 1 1 b | ||||||
0 2 1 c | ||||||
1 foo 1 NaN | ||||||
2 NaN 1 NaN | ||||||
3 3 1 d | ||||||
3 4 1 e | ||||||
""" | ||||||
if not self.columns.is_unique: | ||||||
raise ValueError("columns must be unique") | ||||||
|
||||||
columns: list[str | tuple] | ||||||
if is_scalar(column) or isinstance(column, tuple): | ||||||
assert isinstance(column, (str, tuple)) | ||||||
columns = [column] | ||||||
elif isinstance(column, list) and all( | ||||||
map(lambda c: is_scalar(c) or isinstance(c, tuple), column) | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
): | ||||||
if not column: | ||||||
raise ValueError("column must be nonempty") | ||||||
if len(column) > len(set(column)): | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
raise ValueError("column must be unique") | ||||||
columns = column | ||||||
else: | ||||||
raise ValueError("column must be a scalar, tuple, or list thereof") | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
df = self.reset_index(drop=True) | ||||||
result = df[column].explode() | ||||||
result = df.drop([column], axis=1).join(result) | ||||||
if len(columns) == 1: | ||||||
result = df[columns[0]].explode() | ||||||
else: | ||||||
mylen = lambda x: len(x) if is_list_like(x) else -1 | ||||||
counts0 = self[columns[0]].apply(mylen) | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
for c in columns[1:]: | ||||||
if not all(counts0 == self[c].apply(mylen)): | ||||||
raise ValueError("columns must have matching element counts") | ||||||
result = DataFrame({c: df[c].explode() for c in columns}) | ||||||
result = df.drop(columns, axis=1).join(result) | ||||||
if ignore_index: | ||||||
result.index = ibase.default_index(len(result)) | ||||||
else: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,14 +9,50 @@ def test_error(): | |
df = pd.DataFrame( | ||
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} | ||
) | ||
with pytest.raises(ValueError, match="column must be a scalar"): | ||
with pytest.raises( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is getting big; can you split it into multiple tests (OK to rename the original)? Parametrizing is good as well, if possible. |
||
ValueError, match="column must be a scalar, tuple, or list thereof" | ||
): | ||
df.explode([list("AA")]) | ||
|
||
with pytest.raises(ValueError, match="column must be unique"): | ||
df.explode(list("AA")) | ||
|
||
df.columns = list("AA") | ||
with pytest.raises(ValueError, match="columns must be unique"): | ||
df.explode("A") | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"input_subset, error_message", | ||
[ | ||
( | ||
list("AC"), | ||
"columns must have matching element counts", | ||
), | ||
( | ||
[], | ||
"column must be nonempty", | ||
), | ||
( | ||
list("AC"), | ||
"columns must have matching element counts", | ||
), | ||
], | ||
) | ||
def test_error_multi_columns(input_subset, error_message): | ||
# GH 39240 | ||
df = pd.DataFrame( | ||
{ | ||
"A": [[0, 1, 2], np.nan, [], (3, 4)], | ||
"B": 1, | ||
"C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], | ||
}, | ||
index=list("abcd"), | ||
) | ||
with pytest.raises(ValueError, match=error_message): | ||
df.explode(input_subset) | ||
|
||
|
||
def test_basic(): | ||
df = pd.DataFrame( | ||
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} | ||
|
@@ -180,3 +216,58 @@ def test_explode_sets(): | |
result = df.explode(column="a").sort_values(by="a") | ||
expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"input_subset, expected_dict, expected_index", | ||
[ | ||
( | ||
list("AC"), | ||
{ | ||
"A": pd.Series( | ||
[0, 1, 2, np.nan, np.nan, 3, 4, np.nan], | ||
index=list("aaabcdde"), | ||
dtype=object, | ||
), | ||
"B": 1, | ||
"C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], | ||
}, | ||
list("aaabcdde"), | ||
), | ||
( | ||
list("A"), | ||
{ | ||
"A": pd.Series( | ||
[0, 1, 2, np.nan, np.nan, 3, 4, np.nan], | ||
index=list("aaabcdde"), | ||
dtype=object, | ||
), | ||
"B": 1, | ||
"C": [ | ||
["a", "b", "c"], | ||
["a", "b", "c"], | ||
["a", "b", "c"], | ||
"foo", | ||
[], | ||
["d", "e"], | ||
["d", "e"], | ||
np.nan, | ||
], | ||
}, | ||
list("aaabcdde"), | ||
), | ||
], | ||
) | ||
def test_multi_columns(input_subset, expected_dict, expected_index): | ||
# GH 39240 | ||
df = pd.DataFrame( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a GH issue reference. |
||
{ | ||
"A": [[0, 1, 2], np.nan, [], (3, 4), np.nan], | ||
"B": 1, | ||
"C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], | ||
}, | ||
index=list("abcde"), | ||
) | ||
result = df.explode(input_subset) | ||
expected = pd.DataFrame(expected_dict, expected_index) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should consider deprecating the `column` argument and renaming it to `subset` to match other methods; can you open an issue?