From c271d4dd0b014c41bd91048fa6e8d33c5bf27b44 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 15 Jan 2018 16:47:23 -0800 Subject: [PATCH] Fixed issue with read_json and partially missing MI names (#19177) --- doc/source/io.rst | 7 +++-- pandas/io/json/json.py | 14 ++++++---- pandas/io/json/table_schema.py | 28 +++++++++++++------ .../tests/io/json/test_json_table_schema.py | 25 ++++++++++++++--- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index b15d3918eb569..2f29e390c0ba1 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2228,9 +2228,10 @@ round-trippable manner. new_df new_df.dtypes -Please note that the string `index` is not supported with the round trip -format, as it is used by default in ``write_json`` to indicate a missing index -name. +Please note that the literal string 'index' as the name of an :class:`Index` +is not round-trippable, nor are any names beginning with 'level_' within a +:class:`MultiIndex`. These are used by default in :func:`DataFrame.to_json` to +indicate missing values and the subsequent read cannot distinguish the intent. .. ipython:: python diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index d1c83ad57f59d..6d35fc5769331 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -341,12 +341,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Notes ----- - Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index`` - name of `index` gets written with ``write_json``, the subsequent read - operation will incorrectly set the ``Index`` name to ``None``. This is - because `index` is also used by ``write_json`` to denote a missing - ``Index`` name, and the subsequent ``read_json`` operation cannot - distinguish between the two. + Specific to ``orient='table'``, if a :class:`DataFrame` with a literal + :class:`Index` name of `index` gets written with :func:`to_json`, the + subsequent read operation will incorrectly set the :class:`Index` name to + ``None``. This is because `index` is also used by :func:`DataFrame.to_json` + to denote a missing :class:`Index` name, and the subsequent + :func:`read_json` operation cannot distinguish between the two. The same + limitation is encountered with a :class:`MultiIndex` and any names + beginning with 'level_'. See Also -------- diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 8da36b64b0914..89b7a1de8acfc 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -3,6 +3,8 @@ http://specs.frictionlessdata.io/json-table-schema/ """ +import warnings + import pandas._libs.json as json from pandas import DataFrame from pandas.api.types import CategoricalDtype @@ -68,6 +70,12 @@ def as_json_table_type(x): def set_default_names(data): """Sets index names to 'index' for regular, or 'level_x' for Multi""" if _all_not_none(*data.index.names): + nms = data.index.names + if len(nms) == 1 and data.index.name == 'index': + warnings.warn("Index name of 'index' is not round-trippable") + elif len(nms) > 1 and any(x.startswith('level_') for x in nms): + warnings.warn("Index names beginning with 'level_' are not " + "round-trippable") return data data = data.copy() @@ -273,10 +281,13 @@ def parse_table_schema(json, precise_float): Notes ----- - Because ``write_json`` uses the string `index` to denote a name-less - ``Index``, this function sets the name of the returned ``DataFrame`` to - ``None`` when said string is encountered. Therefore, intentional usage - of `index` as the ``Index`` name is not supported. + Because :func:`DataFrame.to_json` uses the string 'index' to denote a + name-less :class:`Index`, this function sets the name of the returned + :class:`DataFrame` to ``None`` when said string is encountered with a + normal :class:`Index`. For a :class:`MultiIndex`, the same limitation + applies to any strings beginning with 'level_'. Therefore, an + :class:`Index` name of 'index' and :class:`MultiIndex` names starting + with 'level_' are not supported. See also -------- @@ -303,10 +314,11 @@ def parse_table_schema(json, precise_float): df = df.astype(dtypes) df = df.set_index(table['schema']['primaryKey']) - if len(df.index.names) == 1 and df.index.name == 'index': - df.index.name = None + if len(df.index.names) == 1: + if df.index.name == 'index': + df.index.name = None else: - if all(x.startswith('level_') for x in df.index.names): - df.index.names = [None] * len(df.index.names) + df.index.names = [None if x.startswith('level_') else x for x in + df.index.names] return df diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index ccccdc9b0863e..49b39c17238ae 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -451,6 +451,20 @@ def test_set_names_unset(self, idx, nm, prop): result = set_default_names(data) assert getattr(result.index, prop) == nm + @pytest.mark.parametrize("idx", [ + pd.Index([], name='index'), + pd.MultiIndex.from_arrays([['foo'], ['bar']], + names=('level_0', 'level_1')), + pd.MultiIndex.from_arrays([['foo'], ['bar']], + names=('foo', 'level_1')) + ]) + def test_warns_non_roundtrippable_names(self, idx): + # GH 19130 + df = pd.DataFrame([[]], index=idx) + df.index.name = 'index' + with tm.assert_produces_warning(): + set_default_names(df) + def test_timestamp_in_columns(self): df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'), pd.Timedelta(10, unit='s')]) @@ -481,7 +495,8 @@ def test_mi_falsey_name(self): class TestTableOrientReader(object): @pytest.mark.parametrize("index_nm", [ - None, "idx", pytest.param("index", marks=pytest.mark.xfail)]) + None, "idx", pytest.param("index", marks=pytest.mark.xfail), + 'level_0']) @pytest.mark.parametrize("vals", [ {'ints': [1, 2, 3, 4]}, {'objects': ['a', 'b', 'c', 'd']}, @@ -492,7 +507,7 @@ class TestTableOrientReader(object): pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail), {'floats': [1.1, 2.2, 3.3, 4.4]}, {'bools': [True, False, False, True]}]) - def test_read_json_table_orient(self, index_nm, vals): + def test_read_json_table_orient(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) out = df.to_json(orient="table") result = pd.read_json(out, orient="table") @@ -504,7 +519,7 @@ def test_read_json_table_orient(self, index_nm, vals): {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')}, {'timezones': pd.date_range('2016-01-01', freq='d', periods=4, tz='US/Central')}]) - def test_read_json_table_orient_raises(self, index_nm, vals): + def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) out = df.to_json(orient="table") with tm.assert_raises_regex(NotImplementedError, 'can not yet read '): @@ -530,7 +545,9 @@ def test_comprehensive(self): result = pd.read_json(out, orient="table") tm.assert_frame_equal(df, result) - @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']]) + @pytest.mark.parametrize("index_names", [ + [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'], + ['index', 'foo']]) def test_multiindex(self, index_names): # GH 18912 df = pd.DataFrame(