From f75ca6e5cbe17f99fbad98f1d52ab5d87200fa7b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 21:31:18 -0700 Subject: [PATCH 1/7] Added test case --- pandas/tests/io/test_pytables.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index a6a38e005b9b6..3c2d09663df15 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1462,6 +1462,17 @@ def test_to_hdf_with_min_itemsize(self): tm.assert_series_equal(pd.read_hdf(path, 'ss4'), pd.concat([df['B'], df2['B']])) + @pytest.mark.parametrize("format", ['fixed', 'table']) + def test_to_hdf_errors(self, format): + + ser = pd.Series(['\ud800foo']) + with ensure_clean_path(self.path) as path: + # GH 20835 + ser.to_hdf(path, 'table', format=format, errors='surrogatepass') + + result = pd.read_hdf(path, 'table') + tm.assert_series_equal(result, ser) + def test_append_with_data_columns(self): with ensure_clean_store(self.path) as store: From 97f6a54a18d9c82f9a9031a632b8cb6dc7e6c18b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 22:27:40 -0700 Subject: [PATCH 2/7] Round trippable read/write with errors --- pandas/io/pytables.py | 38 ++++++++++++++++++-------------- pandas/tests/io/test_pytables.py | 2 +- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4004a6ea8f6ff..d0dd7a58e4475 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1566,14 +1566,14 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding): + def convert(self, values, nan_rep, encoding, errors='strict'): """ set the values from this selection: take = take ownership """ # values is a recarray if values.dtype.fields is not None: values = values[self.cname] - values = _maybe_convert(values, self.kind, encoding) + values = _maybe_convert(values, self.kind, encoding, errors=errors) kwargs = dict() if self.freq is not None: @@ -1922,7 +1922,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, existing_col, min_itemsize, nan_rep, - encoding) + encoding, + **kwargs) # set as a data block else: @@ -1932,7 +1933,7 @@ def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) def set_atom_string(self, block, block_items, existing_col, min_itemsize, - nan_rep, encoding): + nan_rep, encoding, errors='strict'): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False) @@ -1958,7 +1959,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, ) # itemsize is the maximum length of a string (along any dimension) - data_converted = _convert_string_array(data, encoding) + data_converted = _convert_string_array(data, encoding, errors=errors) itemsize = data_converted.itemsize # specified min_itemsize? @@ -2089,7 +2090,7 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing " "items dtype in table!") - def convert(self, values, nan_rep, encoding): + def convert(self, values, nan_rep, encoding, errors='strict'): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -2163,7 +2164,7 @@ def convert(self, values, nan_rep, encoding): # convert nans / decode if _ensure_decoded(self.kind) == u('string'): self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding) + self.data, nan_rep=nan_rep, encoding=encoding, errors=errors) return self @@ -3340,9 +3341,11 @@ def read_axes(self, where, **kwargs): values = self.selection.select() # convert the data + errors = kwargs.get('errors', 'strict') for a in self.axes: a.set_info(self.info) - a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding) + a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, + errors=errors) return True @@ -4540,7 +4543,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): return index -def _convert_string_array(data, encoding, itemsize=None): +def _convert_string_array(data, encoding, itemsize=None, errors='strict'): """ we take a string-like that is object dtype and coerce to a fixed size string type @@ -4550,6 +4553,7 @@ def _convert_string_array(data, encoding, itemsize=None): data : a numpy array of object dtype encoding : None or string-encoding itemsize : integer, optional, defaults to the max length of the strings + errors : handler for encoding errors, default 'strict' Returns ------- @@ -4559,7 +4563,7 @@ def _convert_string_array(data, encoding, itemsize=None): # encode if needed if encoding is not None and len(data): data = Series(data.ravel()).str.encode( - encoding).values.reshape(data.shape) + encoding, errors).values.reshape(data.shape) # create the sized dtype if itemsize is None: @@ -4570,7 +4574,7 @@ def _convert_string_array(data, encoding, itemsize=None): return data -def _unconvert_string_array(data, nan_rep=None, encoding=None): +def _unconvert_string_array(data, nan_rep=None, encoding=None, errors='strict'): """ inverse of _convert_string_array @@ -4579,6 +4583,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): data : fixed length string dtyped array nan_rep : the storage repr of NaN, optional encoding : the encoding of the data, optional + errors : handler for encoding errors, default 'strict' Returns ------- @@ -4600,7 +4605,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): dtype = "S{0}".format(itemsize) if isinstance(data[0], compat.binary_type): - data = Series(data).str.decode(encoding).values + data = Series(data).str.decode(encoding, errors=errors).values else: data = data.astype(dtype, copy=False).astype(object, copy=False) @@ -4611,22 +4616,23 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): return data.reshape(shape) -def _maybe_convert(values, val_kind, encoding): +def _maybe_convert(values, val_kind, encoding, errors='strict'): if _need_convert(val_kind): - conv = _get_converter(val_kind, encoding) + conv = _get_converter(val_kind, encoding, errors=errors) # conv = np.frompyfunc(conv, 1, 1) values = conv(values) return values -def _get_converter(kind, encoding): +def _get_converter(kind, encoding, errors='strict'): kind = _ensure_decoded(kind) if kind == 'datetime64': return lambda x: np.asarray(x, dtype='M8[ns]') elif kind == 'datetime': return lambda x: to_datetime(x, cache=True).to_pydatetime() elif kind == 'string': - return lambda x: _unconvert_string_array(x, encoding=encoding) + return lambda x: _unconvert_string_array(x, encoding=encoding, + errors=errors) else: # pragma: no cover raise ValueError('invalid kind %s' % kind) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 3c2d09663df15..b0efb5b4b6c14 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1470,7 +1470,7 @@ def test_to_hdf_errors(self, format): # GH 20835 ser.to_hdf(path, 'table', format=format, errors='surrogatepass') - result = pd.read_hdf(path, 'table') + result = pd.read_hdf(path, 'table', errors='surrogatepass') tm.assert_series_equal(result, ser) def test_append_with_data_columns(self): From 9ae2ea08f948538c8f94e8cac33f36edb9a5e9b1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 22:39:06 -0700 Subject: [PATCH 3/7] Added index to test case --- pandas/tests/io/test_pytables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index b0efb5b4b6c14..366d1645089f3 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1465,7 +1465,8 @@ def test_to_hdf_with_min_itemsize(self): @pytest.mark.parametrize("format", ['fixed', 'table']) def test_to_hdf_errors(self, format): - ser = pd.Series(['\ud800foo']) + data = ['\ud800foo'] + ser = pd.Series(data, index=pd.Index(data)) with ensure_clean_path(self.path) as path: # GH 20835 ser.to_hdf(path, 'table', format=format, errors='surrogatepass') From cfe09d14cb0602572a5cb530df9171ad899f73ad Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 23:09:10 -0700 Subject: [PATCH 4/7] Mirrored encoding impl --- pandas/io/pytables.py | 79 +++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d0dd7a58e4475..9ac954e6ead06 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -705,7 +705,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, def func(_start, _stop, _where): return s.read(start=_start, stop=_stop, where=_where, - columns=columns, **kwargs) + columns=columns) # create the iterator it = TableIterator(self, s, func, where=where, nrows=s.nrows, @@ -1566,14 +1566,14 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors='strict'): + def convert(self, values, nan_rep, encoding, errors): """ set the values from this selection: take = take ownership """ # values is a recarray if values.dtype.fields is not None: values = values[self.cname] - values = _maybe_convert(values, self.kind, encoding, errors=errors) + values = _maybe_convert(values, self.kind, encoding, errors) kwargs = dict() if self.freq is not None: @@ -1748,7 +1748,7 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding): + def convert(self, values, nan_rep, encoding, errors): """ set the values from this selection: take = take ownership """ self.values = Int64Index(np.arange(self.table.nrows)) @@ -1877,7 +1877,7 @@ def set_kind(self): self.typ = getattr(self.description, self.cname, None) def set_atom(self, block, block_items, existing_col, min_itemsize, - nan_rep, info, encoding=None, **kwargs): + nan_rep, info, encoding=None, errors='strict'): """ create and setup my atom from the block b """ self.values = list(block_items) @@ -1923,7 +1923,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, min_itemsize, nan_rep, encoding, - **kwargs) + errors) # set as a data block else: @@ -1933,7 +1933,7 @@ def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) def set_atom_string(self, block, block_items, existing_col, min_itemsize, - nan_rep, encoding, errors='strict'): + nan_rep, encoding, errors): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False) @@ -1959,7 +1959,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, ) # itemsize is the maximum length of a string (along any dimension) - data_converted = _convert_string_array(data, encoding, errors=errors) + data_converted = _convert_string_array(data, encoding, errors) itemsize = data_converted.itemsize # specified min_itemsize? @@ -2090,7 +2090,7 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing " "items dtype in table!") - def convert(self, values, nan_rep, encoding, errors='strict'): + def convert(self, values, nan_rep, encoding, errors): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -2230,10 +2230,11 @@ class Fixed(StringMixin): ndim = None is_table = False - def __init__(self, parent, group, encoding=None, **kwargs): + def __init__(self, parent, group, encoding=None, errors='strict', **kwargs): self.parent = parent self.group = group self.encoding = _ensure_encoding(encoding) + self.errors = errors self.set_version() @property @@ -2437,10 +2438,12 @@ def is_exists(self): def set_attrs(self): """ set our object attributes """ self.attrs.encoding = self.encoding + self.attrs.errors = self.errors def get_attrs(self): """ retrieve our attributes """ self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) + self.errors = getattr(self.attrs, 'errors', 'strict') for n in self.attributes: setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) @@ -2507,7 +2510,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index, self.encoding, + converted = _convert_index(index, self.encoding, self.errors, self.format_type).set_name('index') self.write_array(key, converted.values) @@ -2553,7 +2556,7 @@ def write_multi_index(self, key, index): index.names)): # write the level level_key = '%s_level%d' % (key, i) - conv_level = _convert_index(lev, self.encoding, + conv_level = _convert_index(lev, self.encoding, self.errors, self.format_type).set_name(level_key) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) @@ -2614,11 +2617,13 @@ def read_index_node(self, node, start=None, stop=None): if kind in (u('date'), u('datetime')): index = factory(_unconvert_index(data, kind, - encoding=self.encoding), + encoding=self.encoding, + errors=self.errors), dtype=object, **kwargs) else: index = factory(_unconvert_index(data, kind, - encoding=self.encoding), **kwargs) + encoding=self.encoding, + errors=self.errors), **kwargs) index.name = name @@ -2731,7 +2736,8 @@ def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) data = node[start:stop] kind = node._v_attrs.kind - return _unconvert_index_legacy(data, kind, encoding=self.encoding) + return _unconvert_index_legacy(data, kind, encoding=self.encoding, + errors=self.errors) class LegacySeriesFixed(LegacyFixed): @@ -3150,7 +3156,8 @@ def write_metadata(self, key, values): """ values = Series(values) self.parent.put(self._get_metadata_path(key), values, format='table', - encoding=self.encoding, nan_rep=self.nan_rep) + encoding=self.encoding, errors=self.errors, + nan_rep=self.nan_rep) def read_metadata(self, key): """ return the meta data array for this key """ @@ -3171,6 +3178,7 @@ def set_attrs(self): self.attrs.data_columns = self.data_columns self.attrs.nan_rep = self.nan_rep self.attrs.encoding = self.encoding + self.attrs.errors = self.errors self.attrs.levels = self.levels self.attrs.metadata = self.metadata self.set_info() @@ -3186,6 +3194,7 @@ def get_attrs(self): self.nan_rep = getattr(self.attrs, 'nan_rep', None) self.encoding = _ensure_encoding( getattr(self.attrs, 'encoding', None)) + self.errors = getattr(self.attrs, 'errors', 'strict') self.levels = getattr( self.attrs, 'levels', None) or [] self.index_axes = [ @@ -3341,11 +3350,10 @@ def read_axes(self, where, **kwargs): values = self.selection.select() # convert the data - errors = kwargs.get('errors', 'strict') for a in self.axes: a.set_info(self.info) a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=errors) + errors=self.errors) return True @@ -3427,6 +3435,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns = existing_table.data_columns nan_rep = existing_table.nan_rep self.encoding = existing_table.encoding + self.errors = existing_table.errors self.info = copy.copy(existing_table.info) else: existing_table = None @@ -3453,7 +3462,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if i in axes: name = obj._AXIS_NAMES[i] index_axes_map[i] = _convert_index( - a, self.encoding, self.format_type + a, self.encoding, self.errors, self.format_type ).set_name(name).set_axis(i) else: @@ -3572,8 +3581,8 @@ def get_blk_items(mgr, blocks): min_itemsize=min_itemsize, nan_rep=nan_rep, encoding=self.encoding, - info=self.info, - **kwargs) + errors=self.errors, + info=self.info) col.set_pos(j) self.values_axes.append(col) @@ -3737,7 +3746,8 @@ def read_column(self, column, where=None, start=None, stop=None, **kwargs): a.set_info(self.info) return Series(_set_tz(a.convert(c[start:stop], nan_rep=self.nan_rep, - encoding=self.encoding + encoding=self.encoding, + errors=self.errors ).take_data(), a.tz, True), name=column) @@ -4418,7 +4428,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): return values -def _convert_index(index, encoding=None, format_type=None): +def _convert_index(index, encoding=None, errors='strict', format_type=None): index_name = getattr(index, 'name', None) if isinstance(index, DatetimeIndex): @@ -4472,7 +4482,7 @@ def _convert_index(index, encoding=None, format_type=None): # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom - converted = _convert_string_array(values, encoding) + converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( converted, 'string', _tables().StringCol(itemsize), @@ -4503,7 +4513,7 @@ def _convert_index(index, encoding=None, format_type=None): index_name=index_name) -def _unconvert_index(data, kind, encoding=None): +def _unconvert_index(data, kind, encoding=None, errors='strict'): kind = _ensure_decoded(kind) if kind == u('datetime64'): index = DatetimeIndex(data) @@ -4522,7 +4532,8 @@ def _unconvert_index(data, kind, encoding=None): elif kind in (u('integer'), u('float')): index = np.asarray(data) elif kind in (u('string')): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, + errors=errors) elif kind == u('object'): index = np.asarray(data[0]) else: # pragma: no cover @@ -4530,20 +4541,22 @@ def _unconvert_index(data, kind, encoding=None): return index -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, + errors='strict'): kind = _ensure_decoded(kind) if kind == u('datetime'): index = to_datetime(data) elif kind in (u('integer')): index = np.asarray(data, dtype=object) elif kind in (u('string')): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding) + index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, + errors=errors) else: # pragma: no cover raise ValueError('unrecognized index type %s' % kind) return index -def _convert_string_array(data, encoding, itemsize=None, errors='strict'): +def _convert_string_array(data, encoding, errors, itemsize=None): """ we take a string-like that is object dtype and coerce to a fixed size string type @@ -4552,8 +4565,8 @@ def _convert_string_array(data, encoding, itemsize=None, errors='strict'): ---------- data : a numpy array of object dtype encoding : None or string-encoding + errors : handler for encoding errors itemsize : integer, optional, defaults to the max length of the strings - errors : handler for encoding errors, default 'strict' Returns ------- @@ -4616,15 +4629,15 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors='strict'): return data.reshape(shape) -def _maybe_convert(values, val_kind, encoding, errors='strict'): +def _maybe_convert(values, val_kind, encoding, errors): if _need_convert(val_kind): - conv = _get_converter(val_kind, encoding, errors=errors) + conv = _get_converter(val_kind, encoding, errors) # conv = np.frompyfunc(conv, 1, 1) values = conv(values) return values -def _get_converter(kind, encoding, errors='strict'): +def _get_converter(kind, encoding, errors): kind = _ensure_decoded(kind) if kind == 'datetime64': return lambda x: np.asarray(x, dtype='M8[ns]') From 3973ef795dd405a4f82ec22ac879d37d6f73f8c8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 23:17:41 -0700 Subject: [PATCH 5/7] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c128058858c17..d9d89993568b7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -450,6 +450,7 @@ Other Enhancements - Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ library. (:issue:`20564`) +- :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) .. _whatsnew_0230.api_breaking: From 61a0c6b54b0fc762bdc5fb9deb842c309570cc05 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 29 Apr 2018 23:19:20 -0700 Subject: [PATCH 6/7] LINT fixup --- pandas/io/pytables.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9ac954e6ead06..f27292eec36f2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2230,7 +2230,8 @@ class Fixed(StringMixin): ndim = None is_table = False - def __init__(self, parent, group, encoding=None, errors='strict', **kwargs): + def __init__(self, parent, group, encoding=None, errors='strict', + **kwargs): self.parent = parent self.group = group self.encoding = _ensure_encoding(encoding) @@ -4587,7 +4588,8 @@ def _convert_string_array(data, encoding, errors, itemsize=None): return data -def _unconvert_string_array(data, nan_rep=None, encoding=None, errors='strict'): +def _unconvert_string_array(data, nan_rep=None, encoding=None, + errors='strict'): """ inverse of _convert_string_array From 0fe838afb519dfcaab39f525446ba05ea4789f7f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 May 2018 06:28:06 -0500 Subject: [PATCH 7/7] Document errors --- pandas/core/generic.py | 4 ++++ pandas/io/pytables.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6d55f92167d3b..31a33c6f464c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1946,6 +1946,10 @@ def to_hdf(self, path_or_buf, key, **kwargs): If applying compression use the fletcher32 checksum. dropna : bool, default False If true, ALL nan rows will not be written to store. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. See Also -------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f27292eec36f2..5b06bca19cf13 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -308,6 +308,10 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): return columns iterator : optional, boolean, return an iterator, default False chunksize : optional, nrows to include in iteration, return an iterator + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. Returns -------