Skip to content

Commit fb6131d

Browse files
lithomas1TLouf
authored andcommitted
DEPR: error_bad_lines and warn_bad_lines for read_csv (pandas-dev#40413)
1 parent 0b9c558 commit fb6131d

14 files changed

+241
-70
lines changed

doc/source/user_guide/io.rst

+20-3
Original file line numberDiff line numberDiff line change
@@ -344,16 +344,33 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
344344
Error handling
345345
++++++++++++++
346346

347-
error_bad_lines : boolean, default ``True``
347+
error_bad_lines : boolean, default ``None``
348348
Lines with too many fields (e.g. a csv line with too many commas) will by
349349
default cause an exception to be raised, and no ``DataFrame`` will be
350350
returned. If ``False``, then these "bad lines" will be dropped from the
351351
``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>`
352352
below.
353-
warn_bad_lines : boolean, default ``True``
353+
354+
.. deprecated:: 1.3
355+
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
356+
encountering a bad line.
357+
warn_bad_lines : boolean, default ``None``
354358
If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
355359
each "bad line" will be output.
356360

361+
.. deprecated:: 1.3
362+
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
363+
encountering a bad line.
364+
on_bad_lines : {'error', 'warn', 'skip'}, default 'error'
365+
Specifies what to do upon encountering a bad line (a line with too many fields).
366+
Allowed values are:
367+
368+
- 'error', raise a ParserError when a bad line is encountered.
369+
- 'warn', print a warning when a bad line is encountered and skip that line.
370+
- 'skip', skip bad lines without raising or warning when they are encountered.
371+
372+
.. versionadded:: 1.3
373+
357374
.. _io.dtypes:
358375

359376
Specifying column data types
@@ -1245,7 +1262,7 @@ You can elect to skip bad lines:
12451262

12461263
.. code-block:: ipython
12471264
1248-
In [29]: pd.read_csv(StringIO(data), error_bad_lines=False)
1265+
In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
12491266
Skipping line 3: expected 3 fields, saw 4
12501267
12511268
Out[29]:

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ Deprecations
669669
- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`)
670670
- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`)
671671
- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
672+
- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`)
672673
- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
673674
- Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`)
674675
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)

pandas/_libs/parsers.pyx

+9-12
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ cdef extern from "parser/tokenizer.h":
146146

147147
enum: ERROR_OVERFLOW
148148

149+
ctypedef enum BadLineHandleMethod:
150+
ERROR,
151+
WARN,
152+
SKIP
153+
149154
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
150155
int *status, const char *encoding_errors)
151156
ctypedef int (*io_cleanup)(void *src)
@@ -198,8 +203,7 @@ cdef extern from "parser/tokenizer.h":
198203
int usecols
199204

200205
int expected_fields
201-
int error_bad_lines
202-
int warn_bad_lines
206+
BadLineHandleMethod on_bad_lines
203207

204208
# floating point options
205209
char decimal
@@ -351,8 +355,7 @@ cdef class TextReader:
351355
thousands=None, # bytes | str
352356
dtype=None,
353357
usecols=None,
354-
bint error_bad_lines=True,
355-
bint warn_bad_lines=True,
358+
on_bad_lines = ERROR,
356359
bint na_filter=True,
357360
na_values=None,
358361
na_fvalues=None,
@@ -435,9 +438,7 @@ cdef class TextReader:
435438
raise ValueError('Only length-1 comment characters supported')
436439
self.parser.commentchar = ord(comment)
437440

438-
# error handling of bad lines
439-
self.parser.error_bad_lines = int(error_bad_lines)
440-
self.parser.warn_bad_lines = int(warn_bad_lines)
441+
self.parser.on_bad_lines = on_bad_lines
441442

442443
self.skiprows = skiprows
443444
if skiprows is not None:
@@ -454,8 +455,7 @@ cdef class TextReader:
454455

455456
# XXX
456457
if skipfooter > 0:
457-
self.parser.error_bad_lines = 0
458-
self.parser.warn_bad_lines = 0
458+
self.parser.on_bad_lines = SKIP
459459

460460
self.delimiter = delimiter
461461

@@ -570,9 +570,6 @@ cdef class TextReader:
570570
kh_destroy_str_starts(self.false_set)
571571
self.false_set = NULL
572572

573-
def set_error_bad_lines(self, int status) -> None:
574-
self.parser.error_bad_lines = status
575-
576573
def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
577574
if not isinstance(quoting, int):
578575
raise TypeError('"quoting" must be an integer')

pandas/_libs/src/parser/tokenizer.c

+3-4
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) {
9393
self->allow_embedded_newline = 1;
9494

9595
self->expected_fields = -1;
96-
self->error_bad_lines = 0;
97-
self->warn_bad_lines = 0;
96+
self->on_bad_lines = ERROR;
9897

9998
self->commentchar = '#';
10099
self->thousands = '\0';
@@ -457,7 +456,7 @@ static int end_line(parser_t *self) {
457456
self->line_fields[self->lines] = 0;
458457

459458
// file_lines is now the actual file line number (starting at 1)
460-
if (self->error_bad_lines) {
459+
if (self->on_bad_lines == ERROR) {
461460
self->error_msg = malloc(bufsize);
462461
snprintf(self->error_msg, bufsize,
463462
"Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n",
@@ -468,7 +467,7 @@ static int end_line(parser_t *self) {
468467
return -1;
469468
} else {
470469
// simply skip bad lines
471-
if (self->warn_bad_lines) {
470+
if (self->on_bad_lines == WARN) {
472471
// pass up error message
473472
msg = malloc(bufsize);
474473
snprintf(msg, bufsize,

pandas/_libs/src/parser/tokenizer.h

+7-2
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ typedef enum {
8484
QUOTE_NONE
8585
} QuoteStyle;
8686

87+
typedef enum {
88+
ERROR,
89+
WARN,
90+
SKIP
91+
} BadLineHandleMethod;
92+
8793
typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
8894
int *status, const char *encoding_errors);
8995
typedef int (*io_cleanup)(void *src);
@@ -136,8 +142,7 @@ typedef struct parser_t {
136142
int usecols; // Boolean: 1: usecols provided, 0: none provided
137143

138144
int expected_fields;
139-
int error_bad_lines;
140-
int warn_bad_lines;
145+
BadLineHandleMethod on_bad_lines;
141146

142147
// floating point options
143148
char decimal;

pandas/io/parsers/base_parser.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from collections import defaultdict
44
import csv
55
import datetime
6+
from enum import Enum
67
import itertools
78
from typing import (
89
Any,
@@ -108,10 +109,16 @@
108109
"infer_datetime_format": False,
109110
"skip_blank_lines": True,
110111
"encoding_errors": "strict",
112+
"on_bad_lines": "error",
111113
}
112114

113115

114116
class ParserBase:
117+
class BadLineHandleMethod(Enum):
118+
ERROR = 0
119+
WARN = 1
120+
SKIP = 2
121+
115122
_implicit_index: bool = False
116123
_first_chunk: bool
117124

@@ -203,9 +210,13 @@ def __init__(self, kwds):
203210

204211
self.handles: IOHandles | None = None
205212

213+
# Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
214+
# Normally, this arg would get pre-processed earlier on
215+
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
216+
206217
def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None:
207218
"""
208-
Let the readers open IOHanldes after they are done with their potential raises.
219+
Let the readers open IOHandles after they are done with their potential raises.
209220
"""
210221
self.handles = get_handle(
211222
src,

pandas/io/parsers/c_parser_wrapper.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,18 @@ def __init__(self, src: FilePathOrBuffer, **kwds):
5050
# open handles
5151
self._open_handles(src, kwds)
5252
assert self.handles is not None
53-
for key in ("storage_options", "encoding", "memory_map", "compression"):
53+
54+
# Have to pass int, would break tests using TextReader directly otherwise :(
55+
kwds["on_bad_lines"] = self.on_bad_lines.value
56+
57+
for key in (
58+
"storage_options",
59+
"encoding",
60+
"memory_map",
61+
"compression",
62+
"error_bad_lines",
63+
"warn_bad_lines",
64+
):
5465
kwds.pop(key, None)
5566

5667
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
@@ -206,9 +217,6 @@ def _set_noconvert_columns(self):
206217
for col in noconvert_columns:
207218
self._reader.set_noconvert(col)
208219

209-
def set_error_bad_lines(self, status):
210-
self._reader.set_error_bad_lines(int(status))
211-
212220
def read(self, nrows=None):
213221
try:
214222
if self.low_memory:

pandas/io/parsers/python_parser.py

+15-11
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,6 @@ def __init__(self, f: Union[FilePathOrBuffer, list], **kwds):
7474
self.quoting = kwds["quoting"]
7575
self.skip_blank_lines = kwds["skip_blank_lines"]
7676

77-
self.warn_bad_lines = kwds["warn_bad_lines"]
78-
self.error_bad_lines = kwds["error_bad_lines"]
79-
8077
self.names_passed = kwds["names"] or None
8178

8279
self.has_index_names = False
@@ -707,10 +704,11 @@ def _next_line(self):
707704

708705
def _alert_malformed(self, msg, row_num):
709706
"""
710-
Alert a user about a malformed row.
707+
Alert a user about a malformed row, depending on value of
708+
`self.on_bad_lines` enum.
711709
712-
If `self.error_bad_lines` is True, the alert will be `ParserError`.
713-
If `self.warn_bad_lines` is True, the alert will be printed out.
710+
If `self.on_bad_lines` is ERROR, the alert will be `ParserError`.
711+
If `self.on_bad_lines` is WARN, the alert will be printed out.
714712
715713
Parameters
716714
----------
@@ -719,9 +717,9 @@ def _alert_malformed(self, msg, row_num):
719717
Because this row number is displayed, we 1-index,
720718
even though we 0-index internally.
721719
"""
722-
if self.error_bad_lines:
720+
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
723721
raise ParserError(msg)
724-
elif self.warn_bad_lines:
722+
elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
725723
base = f"Skipping line {row_num}: "
726724
sys.stderr.write(base + msg + "\n")
727725

@@ -742,7 +740,10 @@ def _next_iter_line(self, row_num):
742740
assert self.data is not None
743741
return next(self.data)
744742
except csv.Error as e:
745-
if self.warn_bad_lines or self.error_bad_lines:
743+
if (
744+
self.on_bad_lines == self.BadLineHandleMethod.ERROR
745+
or self.on_bad_lines == self.BadLineHandleMethod.WARN
746+
):
746747
msg = str(e)
747748

748749
if "NULL byte" in msg or "line contains NUL" in msg:
@@ -947,11 +948,14 @@ def _rows_to_cols(self, content):
947948
actual_len = len(l)
948949

949950
if actual_len > col_len:
950-
if self.error_bad_lines or self.warn_bad_lines:
951+
if (
952+
self.on_bad_lines == self.BadLineHandleMethod.ERROR
953+
or self.on_bad_lines == self.BadLineHandleMethod.WARN
954+
):
951955
row_num = self.pos - (content_len - i + footers)
952956
bad_lines.append((row_num, actual_len))
953957

954-
if self.error_bad_lines:
958+
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
955959
break
956960
else:
957961
content.append(l)

0 commit comments

Comments
 (0)