Fixed issue with read_json and partially missing MI names (#19177)

pandas-dev · Jan 16, 2018 · c271d4d · c271d4d
1 parent eee83e2
commit c271d4d
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 21 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2228,9 +2228,10 @@ round-trippable manner.
    new_df
    new_df.dtypes
 
-Please note that the string `index` is not supported with the round trip
-format, as it is used by default in ``write_json`` to indicate a missing index
-name.
+Please note that the literal string 'index' as the name of an :class:`Index`
+is not round-trippable, nor are any names beginning with 'level_' within a
+:class:`MultiIndex`. These are used by default in :func:`DataFrame.to_json` to
+indicate missing names, and the subsequent read cannot distinguish the intent.
 
 .. ipython:: python
 

diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -341,12 +341,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
     Notes
     -----
-    Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index``
-    name of `index` gets written with ``write_json``, the subsequent read
-    operation will incorrectly set the ``Index`` name to ``None``. This is
-    because `index` is also used by ``write_json`` to denote a missing
-    ``Index`` name, and the subsequent ``read_json`` operation cannot
-    distinguish between the two.
+    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
+    :class:`Index` name of `index` gets written with :func:`DataFrame.to_json`,
+    the subsequent read operation will incorrectly set the :class:`Index` name
+    to ``None``. This is because `index` is also used by
+    :func:`DataFrame.to_json` to denote a missing :class:`Index` name, and the
+    subsequent :func:`read_json` operation cannot distinguish between the two.
+    The same limitation is encountered with a :class:`MultiIndex` and any names
+    beginning with 'level_'.
 
     See Also
     --------

diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
@@ -3,6 +3,8 @@
 
 http://specs.frictionlessdata.io/json-table-schema/
 """
+import warnings
+
 import pandas._libs.json as json
 from pandas import DataFrame
 from pandas.api.types import CategoricalDtype
@@ -68,6 +70,12 @@ def as_json_table_type(x):
 def set_default_names(data):
     """Sets index names to 'index' for regular, or 'level_x' for Multi"""
     if _all_not_none(*data.index.names):
+        nms = data.index.names
+        if len(nms) == 1 and data.index.name == 'index':
+            warnings.warn("Index name of 'index' is not round-trippable")
+        elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
+            warnings.warn("Index names beginning with 'level_' are not "
+                          "round-trippable")
         return data
 
     data = data.copy()
@@ -273,10 +281,13 @@ def parse_table_schema(json, precise_float):
 
     Notes
     -----
-        Because ``write_json`` uses the string `index` to denote a name-less
-        ``Index``, this function sets the name of the returned ``DataFrame`` to
-        ``None`` when said string is encountered. Therefore, intentional usage
-        of `index` as the ``Index`` name is not supported.
+        Because :func:`DataFrame.to_json` uses the string 'index' to denote a
+        name-less :class:`Index`, this function sets the name of the returned
+        :class:`DataFrame` to ``None`` when said string is encountered with a
+        normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
+        applies to any strings beginning with 'level_'. Therefore, an
+        :class:`Index` name of 'index' and :class:`MultiIndex` names starting
+        with 'level_' are not supported.
 
     See also
     --------
@@ -303,10 +314,11 @@ def parse_table_schema(json, precise_float):
     df = df.astype(dtypes)
 
     df = df.set_index(table['schema']['primaryKey'])
-    if len(df.index.names) == 1 and df.index.name == 'index':
-        df.index.name = None
+    if len(df.index.names) == 1:
+        if df.index.name == 'index':
+            df.index.name = None
     else:
-        if all(x.startswith('level_') for x in df.index.names):
-            df.index.names = [None] * len(df.index.names)
+        df.index.names = [None if x.startswith('level_') else x for x in
+                          df.index.names]
 
     return df
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
@@ -451,6 +451,20 @@ def test_set_names_unset(self, idx, nm, prop):
         result = set_default_names(data)
         assert getattr(result.index, prop) == nm
 
+    @pytest.mark.parametrize("idx", [
+        pd.Index([], name='index'),
+        pd.MultiIndex.from_arrays([['foo'], ['bar']],
+                                  names=('level_0', 'level_1')),
+        pd.MultiIndex.from_arrays([['foo'], ['bar']],
+                                  names=('foo', 'level_1'))
+    ])
+    def test_warns_non_roundtrippable_names(self, idx):
+        # GH 19130
+        df = pd.DataFrame([[]], index=idx)
+        df.index.name = 'index'
+        with tm.assert_produces_warning():
+            set_default_names(df)
+
     def test_timestamp_in_columns(self):
         df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
                                              pd.Timedelta(10, unit='s')])
@@ -481,7 +495,8 @@ def test_mi_falsey_name(self):
 class TestTableOrientReader(object):
 
     @pytest.mark.parametrize("index_nm", [
-        None, "idx", pytest.param("index", marks=pytest.mark.xfail)])
+        None, "idx", pytest.param("index", marks=pytest.mark.xfail),
+        'level_0'])
     @pytest.mark.parametrize("vals", [
         {'ints': [1, 2, 3, 4]},
         {'objects': ['a', 'b', 'c', 'd']},
@@ -492,7 +507,7 @@ class TestTableOrientReader(object):
         pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
         {'floats': [1.1, 2.2, 3.3, 4.4]},
         {'bools': [True, False, False, True]}])
-    def test_read_json_table_orient(self, index_nm, vals):
+    def test_read_json_table_orient(self, index_nm, vals, recwarn):
         df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
         out = df.to_json(orient="table")
         result = pd.read_json(out, orient="table")
@@ -504,7 +519,7 @@ def test_read_json_table_orient(self, index_nm, vals):
         {'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
         {'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
                                     tz='US/Central')}])
-    def test_read_json_table_orient_raises(self, index_nm, vals):
+    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
         df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
         out = df.to_json(orient="table")
         with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
@@ -530,7 +545,9 @@ def test_comprehensive(self):
         result = pd.read_json(out, orient="table")
         tm.assert_frame_equal(df, result)
 
-    @pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
+    @pytest.mark.parametrize("index_names", [
+        [None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
+        ['index', 'foo']])
     def test_multiindex(self, index_names):
         # GH 18912
         df = pd.DataFrame(