Skip to content

Commit

Permalink
[ENH] Add DataFrame method to explode a list-like column (GH #16538)
Browse files Browse the repository at this point in the history
Sometimes a values column holds list-like values in a single row.
Instead we may want to split each individual value onto its own row,
keeping the same mapping to the other key columns. While it's possible
to do this by chaining together existing pandas operations (in fact,
that is exactly what this implementation does), the sequence of operations
is not obvious. By contrast, this is available as a built-in operation
in, say, Spark, and it is a fairly common use case.
  • Loading branch information
changhiskhan committed Dec 20, 2018
1 parent 14c33b0 commit 9e76b75
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 0 deletions.
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,22 @@ def time_qcut_datetime(self, bins):
pd.qcut(self.datetime_series, bins)


class Explode(object):
    """Benchmark ``DataFrame.explode`` on a comma-separated string column.

    Parametrized over the number of rows and the maximum number of
    comma-separated elements a single row may hold.
    """

    param_names = ['n_rows', 'max_list_length']
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        """Build ``self.frame`` with a 'key' column of comma-joined random
        letters and a 'value' column of random floats.
        """
        import string

        # The alphabet never changes, so build the list once instead of
        # once per drawn letter.
        letters = list(string.ascii_letters)
        # ``randint``'s upper bound is exclusive; add one so rows can
        # actually contain up to ``max_list_length`` elements.
        num_letters = np.random.randint(0, max_list_length + 1, n_rows)
        key_column = [','.join([np.random.choice(letters)
                                for _ in range(k)])
                      for k in num_letters]
        value_column = np.random.randn(n_rows)
        self.frame = pd.DataFrame({'key': key_column,
                                   'value': value_column})

    def time_explode(self, n_rows, max_list_length):
        """Time splitting 'key' on commas and exploding it onto rows."""
        self.frame.explode('key', sep=',')


from .pandas_vb_common import setup # noqa: F401
37 changes: 37 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -801,3 +801,40 @@ Note to subdivide over multiple columns we can pass in a list to the
df.pivot_table(
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
.. _reshaping.explode:

Exploding a List-like Column
----------------------------

.. ipython:: python
   :suppress:

   keys = ['panda1', 'panda2', 'panda3']
   values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
   df = pd.DataFrame({'keys': keys, 'values': values})
   exploded = df.explode('values')
   values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves']
   df2 = pd.DataFrame({'keys': keys, 'values': values})

Sometimes the value column is list-like:

.. ipython:: python

   df

But we actually want to put each value onto its own row.
For this purpose we can use ``DataFrame.explode``:

.. ipython:: python

   df.explode('values')

For convenience, we can use the optional keyword ``sep`` to automatically
split a string column before exploding:

.. ipython:: python

   df2
   df2.explode('values', sep=',')
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ New features
- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
- :func:`DataFrame.explode` to split list-like values onto individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information (:issue:`16538`)

.. _whatsnew_0240.values_api:

Expand Down
43 changes: 43 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5980,6 +5980,49 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
var_name=var_name, value_name=value_name,
col_level=col_level)

def explode(self, col_name, sep=None, dtype=None):
    """
    Expand each element of a list-like column onto its own row.

    Create a new DataFrame where each element in each row of the
    list-like column `col_name` is moved to its own row. The values of
    the other columns and the index label of the originating row are
    repeated for every element produced from that row.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    col_name : str
        Name of the column to be exploded.
    sep : str, default None
        Convenience to split a string `col_name` on `sep` before
        exploding. A falsy value (``None`` or ``''``) means the column
        is exploded as-is, without splitting.
    dtype : str or dtype, default None
        Optionally coerce the dtype of the exploded column.

    Returns
    -------
    DataFrame
        A new frame with the same columns, in the same order, as the
        original. An empty frame is returned as a copy.

    Raises
    ------
    KeyError
        If `col_name` is not a column of the frame.

    Notes
    -----
    Missing values and empty list-likes in `col_name` produce no output
    rows. Entries that are not list-like are kept as a single row.

    Examples
    --------
    >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
    >>> df.explode('k', sep=',')
       k  v
    0  a  0
    0  b  0
    1  c  1
    1  d  1
    """
    # Look the column up first so a bad name raises KeyError even for an
    # empty frame.
    col = self[col_name]
    if len(self) == 0:
        return self.copy()

    # Work on row positions rather than index labels: joining on labels
    # would multiply rows whenever the original index has duplicates.
    col = col.reset_index(drop=True)
    if sep:
        col_expanded = col.str.split(sep, expand=True)
    else:
        col_expanded = col.apply(Series)

    # One row per element; the explicit dropna removes the padding
    # introduced by rows of unequal length (and explicit missing values)
    # regardless of the default behaviour of ``stack``.
    col_stacked = (col_expanded
                   .stack()
                   .dropna()
                   .reset_index(level=-1, drop=True)
                   .rename(col_name))
    # ``is not None`` rather than truthiness: dtype objects may be falsy.
    if dtype is not None:
        col_stacked = col_stacked.astype(dtype)

    others = self.drop(col_name, axis=1).reset_index(drop=True)
    result = (col_stacked.to_frame()
              .join(others)
              .reindex(self.columns, axis=1))
    # Restore the original index labels for each originating row.
    result.index = self.index.take(col_stacked.index.values)
    return result

# ----------------------------------------------------------------------
# Time series-related

Expand Down
84 changes: 84 additions & 0 deletions pandas/tests/frame/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,90 @@ def test_unstack_swaplevel_sortlevel(self, level):
tm.assert_frame_equal(result, expected)


def test_explode():
    """Exercise ``DataFrame.explode`` across its supported inputs."""
    # GH 16538
    columns = ['a', 'b', 'c']

    # A string column is split on ``sep`` automatically
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=columns)
    result = frame.explode('a', sep=',')
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=[0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # The exploded column can be coerced to a requested dtype
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [[2, 3], 'y', 43]],
                         columns=columns)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'),
                             'b': ['x', 'x', 'x', 'y', 'y'],
                             'c': [42, 42, 42, 43, 43]},
                            index=[0, 0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

    # NaN's and empty lists are omitted
    # TODO: option to preserve explicit NAs instead
    frame = pd.DataFrame([[[], 'x', 42],
                          [[2.0, np.nan], 'y', 43]],
                         columns=columns)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [2.0],
                             'b': ['y'],
                             'c': [43]},
                            index=[1])
    tm.assert_frame_equal(result, expected)

    # A mix of list and scalar entries
    frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
                          [3, 'y', 43]],
                         columns=columns)
    result = frame.explode('a', dtype='int')
    expected = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'),
                             'b': ['x', 'x', 'x', 'y'],
                             'c': [42, 42, 42, 43]},
                            index=[0, 0, 0, 1])
    tm.assert_frame_equal(result, expected)

    # Only scalar entries: the frame comes back unchanged
    frame = pd.DataFrame([[0, 'x', 42],
                          [3, 'y', 43]],
                         columns=columns)
    result = frame.explode('a')
    expected = pd.DataFrame({'a': [0, 3],
                             'b': ['x', 'y'],
                             'c': [42, 43]},
                            index=[0, 1])
    tm.assert_frame_equal(result, expected)

    # Empty frame
    result = pd.DataFrame(columns=['a', 'b']).explode('a')
    expected = pd.DataFrame(columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)

    # Unknown column name (checked against the all-scalar frame above)
    with pytest.raises(KeyError):
        frame.explode('badcolumnname')

    # MultiIndex rows keep their full index tuples
    columns = ['a', 'b', 'c']
    row_index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
    frame = pd.DataFrame([['foo,bar', 'x', 42],
                          ['fizz,buzz', 'y', 43]],
                         columns=columns,
                         index=row_index)
    result = frame.explode('a', sep=',')
    row_index = pd.MultiIndex.from_tuples(
        [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
    expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
                             'b': ['x', 'x', 'y', 'y'],
                             'c': [42, 42, 43, 43]},
                            index=row_index)
    tm.assert_frame_equal(result, expected)


def test_unstack_fill_frame_object():
# GH12815 Test unstacking with object.
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
Expand Down

0 comments on commit 9e76b75

Please sign in to comment.