Skip to content

Commit

Permalink
multi-column explode
Browse files Browse the repository at this point in the history
  • Loading branch information
iynehz committed Apr 7, 2021
1 parent 5d20815 commit 527a587
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 24 deletions.
91 changes: 68 additions & 23 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7910,16 +7910,23 @@ def stack(self, level: Level = -1, dropna: bool = True):

return result.__finalize__(self, method="stack")

def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
def explode(
self,
column: str | tuple | list[str | tuple],
ignore_index: bool = False,
) -> DataFrame:
"""
Transform each element of a list-like to a row, replicating index values.
.. versionadded:: 0.25.0
Parameters
----------
column : str or tuple
Column to explode.
column : str or tuple or list thereof
Column(s) to explode.
For multiple columns, specify a non-empty list in which each element
is a str or tuple. The list-like data of all specified columns must
have matching length on each row of the frame.
ignore_index : bool, default False
If True, the resulting index will be labeled 0, 1, …, n - 1.
Expand All @@ -7934,7 +7941,10 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
Raises
------
ValueError :
if columns of the frame are not unique.
* If columns of the frame are not unique.
* If the list of columns to explode is empty.
* If the specified columns to explode do not have matching counts of
elements row-wise in the frame.
See Also
--------
Expand All @@ -7953,32 +7963,67 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame:
Examples
--------
>>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
>>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
... 'B': 1,
... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
>>> df
A B
0 [1, 2, 3] 1
1 foo 1
2 [] 1
3 [3, 4] 1
A B C
0 [0, 1, 2] 1 [a, b, c]
1 foo 1 NaN
2 [] 1 []
3 [3, 4] 1 [d, e]
>>> df.explode('A')
A B
0 1 1
0 2 1
0 3 1
1 foo 1
2 NaN 1
3 3 1
3 4 1
"""
if not (is_scalar(column) or isinstance(column, tuple)):
raise ValueError("column must be a scalar")
A B C
0 0 1 [a, b, c]
0 1 1 [a, b, c]
0 2 1 [a, b, c]
1 foo 1 NaN
2 NaN 1 []
3 3 1 [d, e]
3 4 1 [d, e]
>>> df.explode(list('AC'))
A B C
0 0 1 a
0 1 1 b
0 2 1 c
1 foo 1 NaN
2 NaN 1 NaN
3 3 1 d
3 4 1 e
"""
if not self.columns.is_unique:
raise ValueError("columns must be unique")

columns: list[str | tuple]
if is_scalar(column) or isinstance(column, tuple):
# mypy: List item 0 has incompatible type "Union[str, Tuple[Any, ...],
# List[Union[str, Tuple[Any, ...]]]]"; expected
# "Union[str, Tuple[Any, ...]]"
columns = [column] # type: ignore[list-item]
elif isinstance(column, list) and all(
map(lambda c: is_scalar(c) or isinstance(c, tuple), column)
):
if len(column) == 0:
raise ValueError("column must be nonempty")
if len(column) > len(set(column)):
raise ValueError("column must be unique")
columns = column
else:
raise ValueError("column must be a scalar, tuple, or list thereof")

df = self.reset_index(drop=True)
result = df[column].explode()
result = df.drop([column], axis=1).join(result)
if len(columns) == 1:
result = df[column].explode()
else:
mylen = lambda x: len(x) if is_list_like(x) else -1
counts0 = self[columns[0]].apply(mylen)
for c in columns[1:]:
if not all(counts0 == self[c].apply(mylen)):
raise ValueError("columns must have matching element counts")
result = DataFrame({c: df[c].explode() for c in columns})
result = df.drop(columns, axis=1).join(result)
if ignore_index:
result.index = ibase.default_index(len(result))
else:
Expand Down
45 changes: 44 additions & 1 deletion pandas/tests/frame/methods/test_explode.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,34 @@ def test_error():
df = pd.DataFrame(
{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
)
with pytest.raises(ValueError, match="column must be a scalar"):
with pytest.raises(
ValueError, match="column must be a scalar, tuple, or list thereof"
):
df.explode([list("AA")])

with pytest.raises(ValueError, match="column must be unique"):
df.explode(list("AA"))

df.columns = list("AA")
with pytest.raises(ValueError, match="columns must be unique"):
df.explode("A")

# GH 39240
df1 = df.assign(C=[["a", "b", "c"], "foo", [], ["d", "e", "f"]])
df1.columns = list("ABC")
with pytest.raises(ValueError, match="columns must have matching element counts"):
df1.explode(list("AC"))

# GH 39240
with pytest.raises(ValueError, match="column must be nonempty"):
df1.explode([])

# GH 39240
df2 = df.assign(C=[["a", "b", "c"], "foo", [], "d"])
df2.columns = list("ABC")
with pytest.raises(ValueError, match="columns must have matching element counts"):
df2.explode(list("AC"))


def test_basic():
df = pd.DataFrame(
Expand Down Expand Up @@ -180,3 +201,25 @@ def test_explode_sets():
result = df.explode(column="a").sort_values(by="a")
expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
tm.assert_frame_equal(result, expected)


def test_multi_columns():
    # GH 39240
    # Columns "A" and "C" hold list-likes of equal length on every row:
    # row "a" has three elements, row "b" holds scalars (NaN / "foo"),
    # row "c" holds empty lists and row "d" has two elements.
    row_labels = list("abcd")
    col_a = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=row_labels)
    col_c = [["a", "b", "c"], "foo", [], ["d", "e"]]
    frame = pd.DataFrame({"A": col_a, "B": 1, "C": col_c})

    exploded = frame.explode(list("AC"))

    # Each original label is repeated once per exploded element; scalar
    # rows stay as a single row and empty lists turn into a single NaN row.
    want_a = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    want_c = ["a", "b", "c", "foo", np.nan, "d", "e"]
    want = pd.DataFrame({"A": want_a, "B": 1, "C": want_c})

    tm.assert_frame_equal(exploded, want)

0 comments on commit 527a587

Please sign in to comment.