Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: more flexible iso8601 parsing #12060

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 12 additions & 18 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,33 +1059,27 @@ class timeseries_to_datetime_iso8601(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
for x in self.rng]

def time_timeseries_to_datetime_iso8601(self):
to_datetime(self.strings)


class timeseries_to_datetime_iso8601_format(object):
goal_time = 0.2

def setup(self):
self.N = 100000
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
if hasattr(Series, 'convert'):
Series.resample = Series.convert
self.ts = Series(np.random.randn(self.N), index=self.rng)
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
def time_timeseries_to_datetime_iso8601_nosep(self):
to_datetime(self.strings_nosep)

def time_timeseries_to_datetime_iso8601_format(self):
to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')

def time_timeseries_to_datetime_iso8601_format_no_sep(self):
to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')

def time_timeseries_to_datetime_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)


class timeseries_with_format_no_exact(object):
goal_time = 0.2
Expand Down Expand Up @@ -1160,4 +1154,4 @@ def setup(self):
self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)

def time_timeseries_year_incr(self):
(self.date + self.year)
(self.date + self.year)
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ Performance Improvements




- Improved performance of ISO 8601 date parsing for dates without separators (:issue:`11899`), without leading zeros (:issue:`11871`), and with whitespace preceding the time zone (:issue:`9714`)



Expand Down
125 changes: 102 additions & 23 deletions pandas/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc,
/*
* Parses (almost) standard ISO 8601 date strings. The differences are:
*
* + The date "20100312" is parsed as the year 20100312, not as
* equivalent to "2010-03-12". The '-' in the dates are not optional.
* + Only seconds may have a decimal point, with up to 18 digits after it
* (maximum attoseconds precision).
* + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
Expand Down Expand Up @@ -396,6 +394,16 @@ parse_iso_8601_datetime(char *str, int len,
char *substr, sublen;
PANDAS_DATETIMEUNIT bestunit;

/* If the date components are separated by one of the valid separators,
* months/days without leading 0s will be parsed
* (though this is not ISO 8601). If the components aren't separated,
* an error code will be returned because the date is ambiguous.
*/
int has_sep = 0;
char sep;
char valid_sep[] = {'-', '.', '/', '\\', ' '};
int valid_sep_len = 5;

/* Initialize the output to all zeros */
memset(out, 0, sizeof(pandas_datetimestruct));
out->month = 1;
Expand Down Expand Up @@ -523,12 +531,16 @@ parse_iso_8601_datetime(char *str, int len,
goto parse_error;
}

/* PARSE THE YEAR (digits until the '-' character) */
/* PARSE THE YEAR (4 digits) */
out->year = 0;
while (sublen > 0 && isdigit(*substr)) {
out->year = 10 * out->year + (*substr - '0');
++substr;
--sublen;
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
isdigit(substr[2]) && isdigit(substr[3])) {

out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
10 * (substr[2] - '0') + (substr[3] - '0');

substr += 4;
sublen -= 4;;
}

/* Negate the year if necessary */
Expand All @@ -538,29 +550,49 @@ parse_iso_8601_datetime(char *str, int len,
/* Check whether it's a leap-year */
year_leap = is_leapyear(out->year);

/* Next character must be a '-' or the end of the string */
/* Next character must be a separator, start of month or end */
if (sublen == 0) {
if (out_local != NULL) {
*out_local = 0;
}
bestunit = PANDAS_FR_Y;
goto finish;
}
else if (*substr == '-') {
++substr;
--sublen;
}
else {
goto parse_error;
else if (!isdigit(*substr)) {
for (i = 0; i < valid_sep_len; ++i) {
if (*substr == valid_sep[i]) {
has_sep = 1;
sep = valid_sep[i];
++substr;
--sublen;
break;
}
}
if (i == valid_sep_len) {
goto parse_error;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this validate all seps are the same?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - the first check (sep between year and month) validates that it's in the list; the second check (between month/day) is required to match that sep.

}
}

/* Can't have a trailing '-' */
/* Can't have a trailing sep */
if (sublen == 0) {
goto parse_error;
}


/* PARSE THE MONTH (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->month = (substr[0] - '0');

if (out->month < 1) {
PyErr_Format(PyExc_ValueError,
"Month out of range in datetime string \"%s\"", str);
goto error;
}
++substr;
--sublen;
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->month = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->month < 1 || out->month > 12) {
Expand All @@ -577,18 +609,22 @@ parse_iso_8601_datetime(char *str, int len,

/* Next character must be the separator, the start of the day, or the end of the string */
if (sublen == 0) {
/* dates of form YYYYMM are not valid */
if (!has_sep) {
goto parse_error;
}
if (out_local != NULL) {
*out_local = 0;
}
bestunit = PANDAS_FR_M;
goto finish;
}
else if (*substr == '-') {
else if (has_sep && *substr == sep) {
++substr;
--sublen;
}
else {
goto parse_error;
else if (!isdigit(*substr)) {
goto parse_error;
}

/* Can't have a trailing sep */
Expand All @@ -597,7 +633,19 @@ parse_iso_8601_datetime(char *str, int len,
}

/* PARSE THE DAY (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->day = (substr[0] - '0');

if (out->day < 1) {
PyErr_Format(PyExc_ValueError,
"Day out of range in datetime string \"%s\"", str);
goto error;
}
++substr;
--sublen;
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->day = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->day < 1 ||
Expand Down Expand Up @@ -633,14 +681,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->hour = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->hour < 0 || out->hour >= 24) {
if (out->hour >= 24) {
PyErr_Format(PyExc_ValueError,
"Hours out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->hour = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these negative checks still needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kawochen pointed this out - unless I'm misunderstanding something they will always be false.

}
Expand All @@ -664,14 +717,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->min = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->hour < 0 || out->min >= 60) {
if (out->min >= 60) {
PyErr_Format(PyExc_ValueError,
"Minutes out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->min = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand All @@ -695,14 +753,19 @@ parse_iso_8601_datetime(char *str, int len,
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->sec = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->sec < 0 || out->sec >= 60) {
if (out->sec >= 60) {
PyErr_Format(PyExc_ValueError,
"Seconds out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->sec = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand Down Expand Up @@ -781,6 +844,12 @@ parse_iso_8601_datetime(char *str, int len,
}

parse_timezone:
/* trim any whitespace between time/timezone */
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
}

if (sublen == 0) {
// Unlike NumPy, treating no time zone as naive
goto finish;
Expand Down Expand Up @@ -832,6 +901,11 @@ parse_iso_8601_datetime(char *str, int len,
goto error;
}
}
else if (sublen >= 1 && isdigit(substr[0])) {
offset_hour = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand All @@ -856,6 +930,11 @@ parse_iso_8601_datetime(char *str, int len,
goto error;
}
}
else if (sublen >= 1 && isdigit(substr[0])) {
offset_minute = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}
Expand Down
13 changes: 11 additions & 2 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2454,7 +2454,7 @@ def test_constructor_datetime64_tzformat(self):
idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00',
freq=freq)
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
freq=freq, tz=tzoffset(None, -18000))
freq=freq, tz=pytz.FixedOffset(-300))
tm.assert_index_equal(idx, expected)
# Unable to use `US/Eastern` because of DST
expected_i8 = date_range('2013-01-01T00:00:00',
Expand All @@ -2465,7 +2465,7 @@ def test_constructor_datetime64_tzformat(self):
idx = date_range('2013/1/1 0:00:00+9:00',
'2016/1/1 23:59:59+09:00', freq=freq)
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
freq=freq, tz=tzoffset(None, 32400))
freq=freq, tz=pytz.FixedOffset(540))
tm.assert_index_equal(idx, expected)
expected_i8 = date_range('2013-01-01T00:00:00',
'2016-01-01T23:59:59', freq=freq,
Expand Down Expand Up @@ -4833,6 +4833,15 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self):
pd.to_datetime(test_series, infer_datetime_format=True)
)

def test_to_datetime_iso8601_noleading_0s(self):
# GH 11871
# Month and day are written without leading zeros (e.g. '2014-1-1').
# Both the default to_datetime call and the call with an explicit
# format='%Y-%m-%d' must produce the same Timestamps as the zero-padded
# dates listed in `expected`.
test_series = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3'])
expected = pd.Series([pd.Timestamp('2014-01-01'),
pd.Timestamp('2014-02-02'),
pd.Timestamp('2015-03-03')])
tm.assert_series_equal(pd.to_datetime(test_series), expected)
tm.assert_series_equal(pd.to_datetime(test_series, format='%Y-%m-%d'),
expected)

class TestGuessDatetimeFormat(tm.TestCase):
def test_guess_datetime_format_with_parseable_formats(self):
Expand Down
26 changes: 26 additions & 0 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,32 @@ def test_parsers_timezone_minute_offsets_roundtrip(self):
converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
self.assertEqual(dt_string_repr, repr(converted_time))

def test_parsers_iso8601(self):
# GH 12060
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

give a comment about what we are testing & doing here (and that all seps must be the same)

# test only the iso parser - flexibility to different
# separators and leading 0s
# Timestamp construction falls back to dateutil
cases = {'2011-01-02': datetime.datetime(2011, 1, 2),
'2011-1-2': datetime.datetime(2011, 1, 2),
'2011-01': datetime.datetime(2011, 1, 1),
'2011-1': datetime.datetime(2011, 1, 1),
'2011 01 02': datetime.datetime(2011, 1, 2),
'2011.01.02': datetime.datetime(2011, 1, 2),
'2011/01/02': datetime.datetime(2011, 1, 2),
'2011\\01\\02': datetime.datetime(2011, 1, 2),
'2013-01-01 05:30:00': datetime.datetime(2013, 1, 1, 5, 30),
'2013-1-1 5:30:00': datetime.datetime(2013, 1, 1, 5, 30)}
for date_str, exp in compat.iteritems(cases):
actual = tslib._test_parse_iso8601(date_str)
self.assertEqual(actual, exp)

# separators must all match - YYYYMM not valid
invalid_cases = ['2011-01/02', '2011^11^11', '201401',
'201111', '200101']
for date_str in invalid_cases:
with tm.assertRaises(ValueError):
tslib._test_parse_iso8601(date_str)


class TestArrayToDatetime(tm.TestCase):
def test_parsing_valid_dates(self):
Expand Down
Loading