Skip to content

Commit

Permalink
Fixed issue with read_json and partially missing MI names (#19177)
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd authored and jreback committed Jan 16, 2018
1 parent eee83e2 commit c271d4d
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 21 deletions.
7 changes: 4 additions & 3 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2228,9 +2228,10 @@ round-trippable manner.
new_df
new_df.dtypes
Please note that the string `index` is not supported with the round trip
format, as it is used by default in ``write_json`` to indicate a missing index
name.
Please note that the literal string 'index' as the name of an :class:`Index`
is not round-trippable, nor are any names beginning with 'level_' within a
:class:`MultiIndex`. These are used by default in :func:`DataFrame.to_json` to
indicate missing index names, so the subsequent read cannot distinguish an
intentional use of these names from the defaults.

.. ipython:: python
Expand Down
14 changes: 8 additions & 6 deletions pandas/io/json/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,12 +341,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
Notes
-----
Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index``
name of `index` gets written with ``write_json``, the subsequent read
operation will incorrectly set the ``Index`` name to ``None``. This is
because `index` is also used by ``write_json`` to denote a missing
``Index`` name, and the subsequent ``read_json`` operation cannot
distinguish between the two.
Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
:class:`Index` name of `index` gets written with
:func:`DataFrame.to_json`, the subsequent read operation will incorrectly
set the :class:`Index` name to ``None``. This is because `index` is also
used by :func:`DataFrame.to_json` to denote a missing :class:`Index` name,
and the subsequent :func:`read_json` operation cannot distinguish between
the two. The same limitation is encountered with a :class:`MultiIndex` and
any names beginning with 'level_'.
See Also
--------
Expand Down
28 changes: 20 additions & 8 deletions pandas/io/json/table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
http://specs.frictionlessdata.io/json-table-schema/
"""
import warnings

import pandas._libs.json as json
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
Expand Down Expand Up @@ -68,6 +70,12 @@ def as_json_table_type(x):
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if _all_not_none(*data.index.names):
nms = data.index.names
if len(nms) == 1 and data.index.name == 'index':
warnings.warn("Index name of 'index' is not round-trippable")
elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
warnings.warn("Index names beginning with 'level_' are not "
"round-trippable")
return data

data = data.copy()
Expand Down Expand Up @@ -273,10 +281,13 @@ def parse_table_schema(json, precise_float):
Notes
-----
Because ``write_json`` uses the string `index` to denote a name-less
``Index``, this function sets the name of the returned ``DataFrame`` to
``None`` when said string is encountered. Therefore, intentional usage
of `index` as the ``Index`` name is not supported.
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
name-less :class:`Index`, this function sets the index name of the
returned :class:`DataFrame` to ``None`` when said string is encountered
with a normal :class:`Index`. For a :class:`MultiIndex`, the same
limitation applies to any strings beginning with 'level_'. Therefore, an
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
with 'level_' are not supported.
See Also
--------
Expand All @@ -303,10 +314,11 @@ def parse_table_schema(json, precise_float):
df = df.astype(dtypes)

df = df.set_index(table['schema']['primaryKey'])
if len(df.index.names) == 1 and df.index.name == 'index':
df.index.name = None
if len(df.index.names) == 1:
if df.index.name == 'index':
df.index.name = None
else:
if all(x.startswith('level_') for x in df.index.names):
df.index.names = [None] * len(df.index.names)
df.index.names = [None if x.startswith('level_') else x for x in
df.index.names]

return df
25 changes: 21 additions & 4 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,20 @@ def test_set_names_unset(self, idx, nm, prop):
result = set_default_names(data)
assert getattr(result.index, prop) == nm

# Each parametrized index carries a name that set_default_names reports as
# not round-trippable: a single Index named 'index', or a MultiIndex with
# at least one name starting with 'level_'.
@pytest.mark.parametrize("idx", [
pd.Index([], name='index'),
pd.MultiIndex.from_arrays([['foo'], ['bar']],
names=('level_0', 'level_1')),
# only one of the names needs the 'level_' prefix to trigger the warning
pd.MultiIndex.from_arrays([['foo'], ['bar']],
names=('foo', 'level_1'))
])
def test_warns_non_roundtrippable_names(self, idx):
# GH 19130
df = pd.DataFrame([[]], index=idx)
# NOTE(review): this assigns 'index' in every case, including the MultiIndex
# ones; presumably harmless there, but confirm the line is needed at all.
df.index.name = 'index'
# Expects set_default_names to emit a warning inside the block.
with tm.assert_produces_warning():
set_default_names(df)

def test_timestamp_in_columns(self):
df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
pd.Timedelta(10, unit='s')])
Expand Down Expand Up @@ -481,7 +495,8 @@ def test_mi_falsey_name(self):
class TestTableOrientReader(object):

@pytest.mark.parametrize("index_nm", [
None, "idx", pytest.param("index", marks=pytest.mark.xfail)])
None, "idx", pytest.param("index", marks=pytest.mark.xfail),
'level_0'])
@pytest.mark.parametrize("vals", [
{'ints': [1, 2, 3, 4]},
{'objects': ['a', 'b', 'c', 'd']},
Expand All @@ -492,7 +507,7 @@ class TestTableOrientReader(object):
pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
{'floats': [1.1, 2.2, 3.3, 4.4]},
{'bools': [True, False, False, True]}])
def test_read_json_table_orient(self, index_nm, vals):
def test_read_json_table_orient(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
Expand All @@ -504,7 +519,7 @@ def test_read_json_table_orient(self, index_nm, vals):
{'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
{'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
tz='US/Central')}])
def test_read_json_table_orient_raises(self, index_nm, vals):
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
Expand All @@ -530,7 +545,9 @@ def test_comprehensive(self):
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)

@pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
@pytest.mark.parametrize("index_names", [
[None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
['index', 'foo']])
def test_multiindex(self, index_names):
# GH 18912
df = pd.DataFrame(
Expand Down

0 comments on commit c271d4d

Please sign in to comment.