Skip to content

Commit

Permalink
API: to_msgpack and read_msgpack encoding defaults to utf-8
Browse files Browse the repository at this point in the history
closes pandas-dev#12170

Author: Ka Wo Chen <[email protected]>

Closes pandas-dev#12277 from kawochen/API-12170 and squashes the following commits:

5adcf3b [Ka Wo Chen] API: to_msgpack and read_msgpack encoding defaults to utf-8
  • Loading branch information
kawochen authored and cldy committed Feb 11, 2016
1 parent a4843fb commit cb7ba66
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ Backwards incompatible API changes
- ``DataFrame.round()`` leaves non-numeric columns unchanged in its return, rather than raises. (:issue:`11885`)
- ``DataFrame.head(0)`` and ``DataFrame.tail(0)`` return empty frames, rather than ``self``. (:issue:`11937`)
- ``Series.head(0)`` and ``Series.tail(0)`` return empty series, rather than ``self``. (:issue:`11937`)
- ``to_msgpack`` and ``read_msgpack`` encoding now defaults to ``'utf-8'``. (:issue:`12170`)

NaT and Timedelta operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ def to_hdf(self, path_or_buf, key, **kwargs):
from pandas.io import pytables
return pytables.to_hdf(path_or_buf, key, self, **kwargs)

def to_msgpack(self, path_or_buf=None, **kwargs):
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
"""
msgpack (serialize) object to input file path
Expand All @@ -957,7 +957,8 @@ def to_msgpack(self, path_or_buf=None, **kwargs):
"""

from pandas.io import packers
return packers.to_msgpack(path_or_buf, self, **kwargs)
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)

def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail',
index=True, index_label=None, chunksize=None, dtype=None):
Expand Down
14 changes: 8 additions & 6 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def to_msgpack(path_or_buf, *args, **kwargs):
path_or_buf : string File path, buffer-like, or None
if None, return generated string
args : an object or objects to serialize
encoding: encoding for unicode objects
append : boolean whether to append to an existing msgpack
(default is False)
compress : type of compressor (zlib or blosc), default to None (no
Expand Down Expand Up @@ -103,7 +104,7 @@ def writer(fh):
writer(path_or_buf)


def read_msgpack(path_or_buf, iterator=False, **kwargs):
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path
Expand All @@ -114,6 +115,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
Parameters
----------
path_or_buf : string File path, BytesIO like or string
encoding: Encoding for decoding msgpack str type
iterator : boolean, if True, return an iterator to the unpacker
(default is False)
Expand All @@ -127,7 +129,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
return Iterator(path_or_buf)

def read(fh):
l = list(unpack(fh, **kwargs))
l = list(unpack(fh, encoding=encoding, **kwargs))
if len(l) == 1:
return l[0]
return l
Expand Down Expand Up @@ -573,7 +575,7 @@ def create_block(b):


def pack(o, default=encode,
encoding='latin1', unicode_errors='strict', use_single_float=False,
encoding='utf-8', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
"""
Pack an object and return the packed bytes.
Expand All @@ -587,7 +589,7 @@ def pack(o, default=encode,


def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='latin1',
list_hook=None, use_list=False, encoding='utf-8',
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Expand All @@ -607,7 +609,7 @@ def unpack(packed, object_hook=decode,
class Packer(_Packer):

def __init__(self, default=encode,
encoding='latin1',
encoding='utf-8',
unicode_errors='strict',
use_single_float=False,
autoreset=1,
Expand All @@ -624,7 +626,7 @@ class Unpacker(_Unpacker):

def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='latin1',
object_pairs_hook=None, list_hook=None, encoding='utf-8',
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
Expand Down
21 changes: 15 additions & 6 deletions pandas/io/tests/test_packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,8 @@ def test_multi_index(self):
def test_unicode(self):
    """Round-trip a unicode Index through msgpack (GH 12170 / GH 12277).

    With the encoding default now 'utf-8', serializing a unicode index
    no longer raises UnicodeEncodeError, so we assert a successful
    round trip instead of an expected failure.
    """
    i = tm.makeUnicodeIndex(100)

    # encode then decode and verify the index survives unchanged
    i_rec = self.encode_decode(i)
    self.assertTrue(i.equals(i_rec))


class TestSeries(TestPackers):
Expand Down Expand Up @@ -615,6 +612,14 @@ def test_utf(self):
result = self.encode_decode(frame, encoding=encoding)
assert_frame_equal(result, frame)

def test_default_encoding(self):
    """Default msgpack encoding matches an explicit utf8 (GH 12277)."""
    for frame in compat.itervalues(self.frame):
        # payload produced with the default must equal an explicit utf8
        packed_default = frame.to_msgpack()
        packed_utf8 = frame.to_msgpack(encoding='utf8')
        self.assertEqual(packed_default, packed_utf8)
        # and the default round trip must reproduce the frame exactly
        assert_frame_equal(self.encode_decode(frame), frame)


class TestMsgpack():
"""
Expand Down Expand Up @@ -652,7 +657,11 @@ def check_min_structure(self, data):
typ], '"{0}" not found in data["{1}"]'.format(kind, typ)

def compare(self, vf, version):
data = read_msgpack(vf)
# GH12277 encoding default used to be latin-1, now utf-8
if LooseVersion(version) < '0.18.0':
data = read_msgpack(vf, encoding='latin-1')
else:
data = read_msgpack(vf)
self.check_min_structure(data)
for typ, dv in data.items():
assert typ in self.all_data, ('unpacked data contains '
Expand Down

0 comments on commit cb7ba66

Please sign in to comment.