Skip to content

Commit

Permalink
API: to_msgpack and read_msgpack encoding defaults to utf-8
Browse files Browse the repository at this point in the history
closes pandas-dev#12170

Author: Ka Wo Chen <[email protected]>

Closes pandas-dev#12277 from kawochen/API-12170 and squashes the following commits:

5adcf3b [Ka Wo Chen] API: to_msgpack and read_msgpack encoding defaults to utf-8
  • Loading branch information
kawochen authored and cldy committed Feb 11, 2016
1 parent a4843fb commit cb7ba66
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ Backwards incompatible API changes
- ``DataFrame.round()`` leaves non-numeric columns unchanged in its return, rather than raises. (:issue:`11885`)
- ``DataFrame.head(0)`` and ``DataFrame.tail(0)`` return empty frames, rather than ``self``. (:issue:`11937`)
- ``Series.head(0)`` and ``Series.tail(0)`` return empty series, rather than ``self``. (:issue:`11937`)
- ``to_msgpack`` and ``read_msgpack`` encoding now defaults to ``'utf-8'``. (:issue:`12170`)

NaT and Timedelta operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ def to_hdf(self, path_or_buf, key, **kwargs):
from pandas.io import pytables
return pytables.to_hdf(path_or_buf, key, self, **kwargs)

def to_msgpack(self, path_or_buf=None, **kwargs):
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
"""
msgpack (serialize) object to input file path
Expand All @@ -957,7 +957,8 @@ def to_msgpack(self, path_or_buf=None, **kwargs):
"""

from pandas.io import packers
return packers.to_msgpack(path_or_buf, self, **kwargs)
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
**kwargs)

def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail',
index=True, index_label=None, chunksize=None, dtype=None):
Expand Down
14 changes: 8 additions & 6 deletions pandas/io/packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def to_msgpack(path_or_buf, *args, **kwargs):
path_or_buf : string File path, buffer-like, or None
if None, return generated string
args : an object or objects to serialize
encoding: encoding for unicode objects
append : boolean whether to append to an existing msgpack
(default is False)
compress : type of compressor (zlib or blosc), default to None (no
Expand Down Expand Up @@ -103,7 +104,7 @@ def writer(fh):
writer(path_or_buf)


def read_msgpack(path_or_buf, iterator=False, **kwargs):
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path
Expand All @@ -114,6 +115,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
Parameters
----------
path_or_buf : string File path, BytesIO like or string
encoding: Encoding for decoding msgpack str type
iterator : boolean, if True, return an iterator to the unpacker
(default is False)
Expand All @@ -127,7 +129,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
return Iterator(path_or_buf)

def read(fh):
l = list(unpack(fh, **kwargs))
l = list(unpack(fh, encoding=encoding, **kwargs))
if len(l) == 1:
return l[0]
return l
Expand Down Expand Up @@ -573,7 +575,7 @@ def create_block(b):


def pack(o, default=encode,
encoding='latin1', unicode_errors='strict', use_single_float=False,
encoding='utf-8', unicode_errors='strict', use_single_float=False,
autoreset=1, use_bin_type=1):
"""
Pack an object and return the packed bytes.
Expand All @@ -587,7 +589,7 @@ def pack(o, default=encode,


def unpack(packed, object_hook=decode,
list_hook=None, use_list=False, encoding='latin1',
list_hook=None, use_list=False, encoding='utf-8',
unicode_errors='strict', object_pairs_hook=None,
max_buffer_size=0, ext_hook=ExtType):
"""
Expand All @@ -607,7 +609,7 @@ def unpack(packed, object_hook=decode,
class Packer(_Packer):

def __init__(self, default=encode,
encoding='latin1',
encoding='utf-8',
unicode_errors='strict',
use_single_float=False,
autoreset=1,
Expand All @@ -624,7 +626,7 @@ class Unpacker(_Unpacker):

def __init__(self, file_like=None, read_size=0, use_list=False,
object_hook=decode,
object_pairs_hook=None, list_hook=None, encoding='latin1',
object_pairs_hook=None, list_hook=None, encoding='utf-8',
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
super(Unpacker, self).__init__(file_like=file_like,
read_size=read_size,
Expand Down
21 changes: 15 additions & 6 deletions pandas/io/tests/test_packers.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,11 +299,8 @@ def test_multi_index(self):
def test_unicode(self):
    """Round-trip a unicode Index through msgpack (GH 12170 / GH 12277).

    With the encoding default now 'utf-8', serializing a unicode index
    no longer raises UnicodeEncodeError, so we assert a successful
    round trip instead of an expected failure.
    """
    i = tm.makeUnicodeIndex(100)

    # encode then decode and verify the index survives unchanged
    i_rec = self.encode_decode(i)
    self.assertTrue(i.equals(i_rec))


class TestSeries(TestPackers):
Expand Down Expand Up @@ -615,6 +612,14 @@ def test_utf(self):
result = self.encode_decode(frame, encoding=encoding)
assert_frame_equal(result, frame)

def test_default_encoding(self):
    """Default msgpack encoding matches an explicit utf8 (GH 12277)."""
    for frame in compat.itervalues(self.frame):
        # payload produced with the default must equal an explicit utf8
        packed_default = frame.to_msgpack()
        packed_utf8 = frame.to_msgpack(encoding='utf8')
        self.assertEqual(packed_default, packed_utf8)
        # and the default round trip must reproduce the frame exactly
        assert_frame_equal(self.encode_decode(frame), frame)


class TestMsgpack():
"""
Expand Down Expand Up @@ -652,7 +657,11 @@ def check_min_structure(self, data):
typ], '"{0}" not found in data["{1}"]'.format(kind, typ)

def compare(self, vf, version):
data = read_msgpack(vf)
# GH12277 encoding default used to be latin-1, now utf-8
if LooseVersion(version) < '0.18.0':
data = read_msgpack(vf, encoding='latin-1')
else:
data = read_msgpack(vf)
self.check_min_structure(data)
for typ, dv in data.items():
assert typ in self.all_data, ('unpacked data contains '
Expand Down

0 comments on commit cb7ba66

Please sign in to comment.