Skip to content

Commit 6b32cb3

Browse files
committed
PERF: more flexible iso8601 parsing
1 parent 5b5b2fe commit 6b32cb3

File tree

7 files changed

+188
-48
lines changed

7 files changed

+188
-48
lines changed

asv_bench/benchmarks/timeseries.py

+12-18
Original file line numberDiff line numberDiff line change
@@ -1059,33 +1059,27 @@ class timeseries_to_datetime_iso8601(object):
10591059
goal_time = 0.2
10601060

10611061
def setup(self):
1062-
self.N = 100000
1063-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
1064-
if hasattr(Series, 'convert'):
1065-
Series.resample = Series.convert
1066-
self.ts = Series(np.random.randn(self.N), index=self.rng)
10671062
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
10681063
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
1064+
self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng]
1065+
self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
1066+
for x in self.rng]
10691067

10701068
def time_timeseries_to_datetime_iso8601(self):
10711069
to_datetime(self.strings)
10721070

1073-
1074-
class timeseries_to_datetime_iso8601_format(object):
1075-
goal_time = 0.2
1076-
1077-
def setup(self):
1078-
self.N = 100000
1079-
self.rng = date_range(start='1/1/2000', periods=self.N, freq='T')
1080-
if hasattr(Series, 'convert'):
1081-
Series.resample = Series.convert
1082-
self.ts = Series(np.random.randn(self.N), index=self.rng)
1083-
self.rng = date_range(start='1/1/2000', periods=20000, freq='H')
1084-
self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng]
1071+
def time_timeseries_to_datetime_iso8601_nosep(self):
1072+
to_datetime(self.strings_nosep)
10851073

10861074
def time_timeseries_to_datetime_iso8601_format(self):
10871075
to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
10881076

1077+
def time_timeseries_to_datetime_iso8601_format_no_sep(self):
1078+
to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')
1079+
1080+
def time_timeseries_to_datetime_iso8601_tz_spaceformat(self):
1081+
to_datetime(self.strings_tz_space)
1082+
10891083

10901084
class timeseries_with_format_no_exact(object):
10911085
goal_time = 0.2
@@ -1160,4 +1154,4 @@ def setup(self):
11601154
self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal)
11611155

11621156
def time_timeseries_year_incr(self):
1163-
(self.date + self.year)
1157+
(self.date + self.year)

doc/source/whatsnew/v0.18.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ Performance Improvements
461461

462462

463463

464-
464+
- Improved performance of ISO 8601 date parsing for dates without separators (:issue:`11899`), without leading zeros (:issue:`11871`), and with whitespace preceding the time zone (:issue:`9714`)
465465

466466

467467

pandas/src/datetime/np_datetime_strings.c

+102-23
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc,
346346
/*
347347
* Parses (almost) standard ISO 8601 date strings. The differences are:
348348
*
349-
* + The date "20100312" is parsed as the year 20100312, not as
350-
* equivalent to "2010-03-12". The '-' in the dates are not optional.
351349
* + Only seconds may have a decimal point, with up to 18 digits after it
352350
* (maximum attoseconds precision).
353351
* + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
@@ -396,6 +394,16 @@ parse_iso_8601_datetime(char *str, int len,
396394
char *substr, sublen;
397395
PANDAS_DATETIMEUNIT bestunit;
398396

397+
/* if date components are separated by one of the valid separators
398+
* months/days without leading 0s will be parsed
399+
* (though not iso8601). If the components aren't separated,
400+
* an error code will be returned because the date is ambiguous
401+
*/
402+
int has_sep = 0;
403+
char sep;
404+
char valid_sep[] = {'-', '.', '/', '\\', ' '};
405+
int valid_sep_len = 5;
406+
399407
/* Initialize the output to all zeros */
400408
memset(out, 0, sizeof(pandas_datetimestruct));
401409
out->month = 1;
@@ -523,12 +531,16 @@ parse_iso_8601_datetime(char *str, int len,
523531
goto parse_error;
524532
}
525533

526-
/* PARSE THE YEAR (digits until the '-' character) */
534+
/* PARSE THE YEAR (4 digits) */
527535
out->year = 0;
528-
while (sublen > 0 && isdigit(*substr)) {
529-
out->year = 10 * out->year + (*substr - '0');
530-
++substr;
531-
--sublen;
536+
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
537+
isdigit(substr[2]) && isdigit(substr[3])) {
538+
539+
out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
540+
10 * (substr[2] - '0') + (substr[3] - '0');
541+
542+
substr += 4;
543+
sublen -= 4;;
532544
}
533545

534546
/* Negate the year if necessary */
@@ -538,29 +550,49 @@ parse_iso_8601_datetime(char *str, int len,
538550
/* Check whether it's a leap-year */
539551
year_leap = is_leapyear(out->year);
540552

541-
/* Next character must be a '-' or the end of the string */
553+
/* Next character must be a separator, start of month or end */
542554
if (sublen == 0) {
543555
if (out_local != NULL) {
544556
*out_local = 0;
545557
}
546558
bestunit = PANDAS_FR_Y;
547559
goto finish;
548560
}
549-
else if (*substr == '-') {
550-
++substr;
551-
--sublen;
552-
}
553-
else {
554-
goto parse_error;
561+
else if (!isdigit(*substr)) {
562+
for (i = 0; i < valid_sep_len; ++i) {
563+
if (*substr == valid_sep[i]) {
564+
has_sep = 1;
565+
sep = valid_sep[i];
566+
++substr;
567+
--sublen;
568+
break;
569+
}
570+
}
571+
if (i == valid_sep_len) {
572+
goto parse_error;
573+
}
555574
}
556575

557-
/* Can't have a trailing '-' */
576+
/* Can't have a trailing sep */
558577
if (sublen == 0) {
559578
goto parse_error;
560579
}
561580

581+
562582
/* PARSE THE MONTH (2 digits) */
563-
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
583+
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
584+
|| (sublen == 1 && isdigit(substr[0])))) {
585+
out->month = (substr[0] - '0');
586+
587+
if (out->month < 1) {
588+
PyErr_Format(PyExc_ValueError,
589+
"Month out of range in datetime string \"%s\"", str);
590+
goto error;
591+
}
592+
++substr;
593+
--sublen;
594+
}
595+
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
564596
out->month = 10 * (substr[0] - '0') + (substr[1] - '0');
565597

566598
if (out->month < 1 || out->month > 12) {
@@ -577,18 +609,22 @@ parse_iso_8601_datetime(char *str, int len,
577609

578610
/* Next character must be a '-' or the end of the string */
579611
if (sublen == 0) {
612+
/* dates of form YYYYMM are not valid */
613+
if (!has_sep) {
614+
goto parse_error;
615+
}
580616
if (out_local != NULL) {
581617
*out_local = 0;
582618
}
583619
bestunit = PANDAS_FR_M;
584620
goto finish;
585621
}
586-
else if (*substr == '-') {
622+
else if (has_sep && *substr == sep) {
587623
++substr;
588624
--sublen;
589625
}
590-
else {
591-
goto parse_error;
626+
else if (!isdigit(*substr)) {
627+
goto parse_error;
592628
}
593629

594630
/* Can't have a trailing '-' */
@@ -597,7 +633,19 @@ parse_iso_8601_datetime(char *str, int len,
597633
}
598634

599635
/* PARSE THE DAY (2 digits) */
600-
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
636+
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
637+
|| (sublen == 1 && isdigit(substr[0])))) {
638+
out->day = (substr[0] - '0');
639+
640+
if (out->day < 1) {
641+
PyErr_Format(PyExc_ValueError,
642+
"Day out of range in datetime string \"%s\"", str);
643+
goto error;
644+
}
645+
++substr;
646+
--sublen;
647+
}
648+
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
601649
out->day = 10 * (substr[0] - '0') + (substr[1] - '0');
602650

603651
if (out->day < 1 ||
@@ -633,14 +681,19 @@ parse_iso_8601_datetime(char *str, int len,
633681
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
634682
out->hour = 10 * (substr[0] - '0') + (substr[1] - '0');
635683

636-
if (out->hour < 0 || out->hour >= 24) {
684+
if (out->hour >= 24) {
637685
PyErr_Format(PyExc_ValueError,
638686
"Hours out of range in datetime string \"%s\"", str);
639687
goto error;
640688
}
641689
substr += 2;
642690
sublen -= 2;
643691
}
692+
else if (sublen >= 1 && isdigit(substr[0])) {
693+
out->hour = substr[0] - '0';
694+
++substr;
695+
--sublen;
696+
}
644697
else {
645698
goto parse_error;
646699
}
@@ -664,14 +717,19 @@ parse_iso_8601_datetime(char *str, int len,
664717
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
665718
out->min = 10 * (substr[0] - '0') + (substr[1] - '0');
666719

667-
if (out->hour < 0 || out->min >= 60) {
720+
if (out->min >= 60) {
668721
PyErr_Format(PyExc_ValueError,
669722
"Minutes out of range in datetime string \"%s\"", str);
670723
goto error;
671724
}
672725
substr += 2;
673726
sublen -= 2;
674727
}
728+
else if (sublen >= 1 && isdigit(substr[0])) {
729+
out->min = substr[0] - '0';
730+
++substr;
731+
--sublen;
732+
}
675733
else {
676734
goto parse_error;
677735
}
@@ -695,14 +753,19 @@ parse_iso_8601_datetime(char *str, int len,
695753
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
696754
out->sec = 10 * (substr[0] - '0') + (substr[1] - '0');
697755

698-
if (out->sec < 0 || out->sec >= 60) {
756+
if (out->sec >= 60) {
699757
PyErr_Format(PyExc_ValueError,
700758
"Seconds out of range in datetime string \"%s\"", str);
701759
goto error;
702760
}
703761
substr += 2;
704762
sublen -= 2;
705763
}
764+
else if (sublen >= 1 && isdigit(substr[0])) {
765+
out->sec = substr[0] - '0';
766+
++substr;
767+
--sublen;
768+
}
706769
else {
707770
goto parse_error;
708771
}
@@ -781,6 +844,12 @@ parse_iso_8601_datetime(char *str, int len,
781844
}
782845

783846
parse_timezone:
847+
/* trim any whitespace between time/timezone */
848+
while (sublen > 0 && isspace(*substr)) {
849+
++substr;
850+
--sublen;
851+
}
852+
784853
if (sublen == 0) {
785854
// Unlike NumPy, treating no time zone as naive
786855
goto finish;
@@ -832,6 +901,11 @@ parse_iso_8601_datetime(char *str, int len,
832901
goto error;
833902
}
834903
}
904+
else if (sublen >= 1 && isdigit(substr[0])) {
905+
offset_hour = substr[0] - '0';
906+
++substr;
907+
--sublen;
908+
}
835909
else {
836910
goto parse_error;
837911
}
@@ -856,6 +930,11 @@ parse_iso_8601_datetime(char *str, int len,
856930
goto error;
857931
}
858932
}
933+
else if (sublen >= 1 && isdigit(substr[0])) {
934+
offset_minute = substr[0] - '0';
935+
++substr;
936+
--sublen;
937+
}
859938
else {
860939
goto parse_error;
861940
}

pandas/tseries/tests/test_timeseries.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -2454,7 +2454,7 @@ def test_constructor_datetime64_tzformat(self):
24542454
idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00',
24552455
freq=freq)
24562456
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
2457-
freq=freq, tz=tzoffset(None, -18000))
2457+
freq=freq, tz=pytz.FixedOffset(-300))
24582458
tm.assert_index_equal(idx, expected)
24592459
# Unable to use `US/Eastern` because of DST
24602460
expected_i8 = date_range('2013-01-01T00:00:00',
@@ -2465,7 +2465,7 @@ def test_constructor_datetime64_tzformat(self):
24652465
idx = date_range('2013/1/1 0:00:00+9:00',
24662466
'2016/1/1 23:59:59+09:00', freq=freq)
24672467
expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59',
2468-
freq=freq, tz=tzoffset(None, 32400))
2468+
freq=freq, tz=pytz.FixedOffset(540))
24692469
tm.assert_index_equal(idx, expected)
24702470
expected_i8 = date_range('2013-01-01T00:00:00',
24712471
'2016-01-01T23:59:59', freq=freq,
@@ -4833,6 +4833,15 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self):
48334833
pd.to_datetime(test_series, infer_datetime_format=True)
48344834
)
48354835

4836+
def test_to_datetime_iso8601_noleading_0s(self):
4837+
# GH 11871
4838+
test_series = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3'])
4839+
expected = pd.Series([pd.Timestamp('2014-01-01'),
4840+
pd.Timestamp('2014-02-02'),
4841+
pd.Timestamp('2015-03-03')])
4842+
tm.assert_series_equal(pd.to_datetime(test_series), expected)
4843+
tm.assert_series_equal(pd.to_datetime(test_series, format='%Y-%m-%d'),
4844+
expected)
48364845

48374846
class TestGuessDatetimeFormat(tm.TestCase):
48384847
def test_guess_datetime_format_with_parseable_formats(self):

pandas/tseries/tests/test_tslib.py

+26
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,32 @@ def test_parsers_timezone_minute_offsets_roundtrip(self):
688688
converted_time = dt_time.tz_localize('UTC').tz_convert(tz)
689689
self.assertEqual(dt_string_repr, repr(converted_time))
690690

691+
def test_parsers_iso8601(self):
692+
# GH 12060
693+
# test only the iso parser - flexibility to different
694+
# separators and leadings 0s
695+
# Timestamp construction falls back to dateutil
696+
cases = {'2011-01-02': datetime.datetime(2011, 1, 2),
697+
'2011-1-2': datetime.datetime(2011, 1, 2),
698+
'2011-01': datetime.datetime(2011, 1, 1),
699+
'2011-1': datetime.datetime(2011, 1, 1),
700+
'2011 01 02': datetime.datetime(2011, 1, 2),
701+
'2011.01.02': datetime.datetime(2011, 1, 2),
702+
'2011/01/02': datetime.datetime(2011, 1, 2),
703+
'2011\\01\\02': datetime.datetime(2011, 1, 2),
704+
'2013-01-01 05:30:00': datetime.datetime(2013, 1, 1, 5, 30),
705+
'2013-1-1 5:30:00': datetime.datetime(2013, 1, 1, 5, 30)}
706+
for date_str, exp in compat.iteritems(cases):
707+
actual = tslib._test_parse_iso8601(date_str)
708+
self.assertEqual(actual, exp)
709+
710+
# separators must all match - YYYYMM not valid
711+
invalid_cases = ['2011-01/02', '2011^11^11', '201401',
712+
'201111', '200101']
713+
for date_str in invalid_cases:
714+
with tm.assertRaises(ValueError):
715+
tslib._test_parse_iso8601(date_str)
716+
691717

692718
class TestArrayToDatetime(tm.TestCase):
693719
def test_parsing_valid_dates(self):

0 commit comments

Comments
 (0)