diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 7594478ada41a3..2e56fe6cb00852 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -36,6 +36,7 @@ Bug Fixes
 
 
 - Compat with Cython 0.25 for building (:issue:`14496`)
+- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
 - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
 - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
 - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)
diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py
index c9f50dec6c01e1..9f01adb6fabcbe 100644
--- a/pandas/io/tests/parser/skiprows.py
+++ b/pandas/io/tests/parser/skiprows.py
@@ -190,3 +190,19 @@ def test_skiprows_lineterminator(self):
                                skiprows=1, delim_whitespace=True,
                                names=['date', 'time', 'var', 'flag', 'oflag'])
             tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_infield_quote(self):
+        # see gh-14459
+        data = 'a"\nb"\na\n1'
+        expected = DataFrame({'a': [1]})
+
+        df = self.read_csv(StringIO(data), skiprows=2)
+        tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_leading_delimiter_quoted_field(self):
+        # a skipped row may start with an empty field followed by a quoted
+        # field that embeds a line terminator; it is still a single row
+        data = ',"a\nb"\na\n1'
+        expected = DataFrame({'a': [1]})
+
+        df = self.read_csv(StringIO(data), skiprows=1)
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index af85b7b894d260..748edc7fcacc5f 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -478,9 +478,10 @@ static int end_line(parser_t *self) {
         }
     }
 
-    if (self->state == SKIP_LINE || \
-        self->state == QUOTE_IN_SKIP_LINE || \
-        self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE
+    if (self->state == START_FIELD_IN_SKIP_LINE || \
+        self->state == IN_FIELD_IN_SKIP_LINE || \
+        self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \
+        self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE
         ) {
         TRACE(("end_line: Skipping row %d\n", self->file_lines));
         // increment file line count
@@ -761,38 +762,54 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
 
         switch(self->state) {
 
-        case SKIP_LINE:
-            TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state));
+        case START_FIELD_IN_SKIP_LINE:
             if (IS_TERMINATOR(c)) {
                 END_LINE();
             } else if (IS_CARRIAGE(c)) {
                 self->file_lines++;
                 self->state = EAT_CRNL_NOP;
             } else if (IS_QUOTE(c)) {
-                self->state = QUOTE_IN_SKIP_LINE;
+                self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+            } else if (IS_DELIMITER(c)) {
+                // Do nothing, we're starting a new field again.
+            } else {
+                self->state = IN_FIELD_IN_SKIP_LINE;
+            }
+            break;
+
+        case IN_FIELD_IN_SKIP_LINE:
+            if (IS_TERMINATOR(c)) {
+                END_LINE();
+            } else if (IS_CARRIAGE(c)) {
+                self->file_lines++;
+                self->state = EAT_CRNL_NOP;
+            } else if (IS_DELIMITER(c)) {
+                self->state = START_FIELD_IN_SKIP_LINE;
             }
             break;
 
-        case QUOTE_IN_SKIP_LINE:
+        case IN_QUOTED_FIELD_IN_SKIP_LINE:
             if (IS_QUOTE(c)) {
                 if (self->doublequote) {
-                    self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE;
+                    self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
                 } else {
-                    self->state = SKIP_LINE;
+                    self->state = IN_FIELD_IN_SKIP_LINE;
                 }
             }
             break;
 
-        case QUOTE_IN_QUOTE_IN_SKIP_LINE:
+        case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
             if (IS_QUOTE(c)) {
-                self->state = QUOTE_IN_SKIP_LINE;
+                self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
             } else if (IS_TERMINATOR(c)) {
                 END_LINE();
             } else if (IS_CARRIAGE(c)) {
                 self->file_lines++;
                 self->state = EAT_CRNL_NOP;
+            } else if (IS_DELIMITER(c)) {
+                self->state = START_FIELD_IN_SKIP_LINE;
             } else {
-                self->state = SKIP_LINE;
+                self->state = IN_FIELD_IN_SKIP_LINE;
             }
             break;
 
@@ -846,9 +863,13 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
             // start of record
             if (skip_this_line(self, self->file_lines)) {
                 if (IS_QUOTE(c)) {
-                    self->state = QUOTE_IN_SKIP_LINE;
+                    self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
+                } else if (IS_DELIMITER(c)) {
+                    // Empty leading field: the next character starts a new
+                    // field, so a quote there must open a quoted field.
+                    self->state = START_FIELD_IN_SKIP_LINE;
                 } else {
-                    self->state = SKIP_LINE;
+                    self->state = IN_FIELD_IN_SKIP_LINE;
 
                     if (IS_TERMINATOR(c)) {
                         END_LINE();
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
index 8f7ae436bb7b7c..487c1265d93583 100644
--- a/pandas/src/parser/tokenizer.h
+++ b/pandas/src/parser/tokenizer.h
@@ -123,9 +123,10 @@ typedef enum {
     EAT_COMMENT,
     EAT_LINE_COMMENT,
     WHITESPACE_LINE,
-    SKIP_LINE,
-    QUOTE_IN_SKIP_LINE,
-    QUOTE_IN_QUOTE_IN_SKIP_LINE,
+    START_FIELD_IN_SKIP_LINE,
+    IN_FIELD_IN_SKIP_LINE,
+    IN_QUOTED_FIELD_IN_SKIP_LINE,
+    QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
     FINISHED
 } ParserState;
 