Skip to content

Commit

Permalink
BUG: Don't parse inline quotes in skipped lines
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Oct 30, 2016
1 parent 7f5a45c commit 2e41dab
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 17 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Bug Fixes
- Compat with Cython 0.25 for building (:issue:`14496`)


- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
- Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
- Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)
Expand Down
8 changes: 8 additions & 0 deletions pandas/io/tests/parser/skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,11 @@ def test_skiprows_lineterminator(self):
skiprows=1, delim_whitespace=True,
names=['date', 'time', 'var', 'flag', 'oflag'])
tm.assert_frame_equal(df, expected)

def test_skiprows_infield_quote(self):
    # see gh-14459
    #
    # An unmatched quote character inside a skipped row must not put
    # the tokenizer into a quoted-field state that swallows later rows.
    csv_data = 'a"\nb"\na\n1'
    expected = DataFrame({'a': [1]})

    result = self.read_csv(StringIO(csv_data), skiprows=2)
    tm.assert_frame_equal(result, expected)
45 changes: 31 additions & 14 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -478,9 +478,10 @@ static int end_line(parser_t *self) {
}
}

if (self->state == SKIP_LINE || \
self->state == QUOTE_IN_SKIP_LINE || \
self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE
if (self->state == START_FIELD_IN_SKIP_LINE || \
self->state == IN_FIELD_IN_SKIP_LINE || \
self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \
self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE
) {
TRACE(("end_line: Skipping row %d\n", self->file_lines));
// increment file line count
Expand Down Expand Up @@ -761,38 +762,54 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

switch(self->state) {

case SKIP_LINE:
TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state));
case START_FIELD_IN_SKIP_LINE:
if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_QUOTE(c)) {
self->state = QUOTE_IN_SKIP_LINE;
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else if (IS_DELIMITER(c)) {
// Do nothing, we're starting a new field again.
} else {
self->state = IN_FIELD_IN_SKIP_LINE;
}
break;

case IN_FIELD_IN_SKIP_LINE:
if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_DELIMITER(c)) {
self->state = START_FIELD_IN_SKIP_LINE;
}
break;

case QUOTE_IN_SKIP_LINE:
case IN_QUOTED_FIELD_IN_SKIP_LINE:
if (IS_QUOTE(c)) {
if (self->doublequote) {
self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE;
self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
} else {
self->state = SKIP_LINE;
self->state = IN_FIELD_IN_SKIP_LINE;
}
}
break;

case QUOTE_IN_QUOTE_IN_SKIP_LINE:
case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
if (IS_QUOTE(c)) {
self->state = QUOTE_IN_SKIP_LINE;
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else if (IS_TERMINATOR(c)) {
END_LINE();
} else if (IS_CARRIAGE(c)) {
self->file_lines++;
self->state = EAT_CRNL_NOP;
} else if (IS_DELIMITER(c)) {
self->state = START_FIELD_IN_SKIP_LINE;
} else {
self->state = SKIP_LINE;
self->state = IN_FIELD_IN_SKIP_LINE;
}
break;

Expand Down Expand Up @@ -846,9 +863,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
// start of record
if (skip_this_line(self, self->file_lines)) {
if (IS_QUOTE(c)) {
self->state = QUOTE_IN_SKIP_LINE;
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
} else {
self->state = SKIP_LINE;
self->state = IN_FIELD_IN_SKIP_LINE;

if (IS_TERMINATOR(c)) {
END_LINE();
Expand Down
7 changes: 4 additions & 3 deletions pandas/src/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,10 @@ typedef enum {
EAT_COMMENT,
EAT_LINE_COMMENT,
WHITESPACE_LINE,
SKIP_LINE,
QUOTE_IN_SKIP_LINE,
QUOTE_IN_QUOTE_IN_SKIP_LINE,
START_FIELD_IN_SKIP_LINE,
IN_FIELD_IN_SKIP_LINE,
IN_QUOTED_FIELD_IN_SKIP_LINE,
QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
FINISHED
} ParserState;

Expand Down

0 comments on commit 2e41dab

Please sign in to comment.