From 1e9af2ad91f21300f319f0997511f6d32834c213 Mon Sep 17 00:00:00 2001 From: Phil Elson Date: Tue, 22 Jan 2019 14:52:01 +0000 Subject: [PATCH] Unicode support (#135) * Always treat units as unicode. Closes #133. Also, allow a file encoding in the coding standards test, so that we can have some literal unicode characters for testing with. * Fix date2num test which is incorrectly using repr when str was intended. * Tidy up the Unit constructor for the py2 case, so that it is easier to see that the code can be deleted when the codebase becomes py3 only. * Handle unicode object in py2 specially, and always ensure that py2k returns a non-unicode for __str__ (unless sys.getdefaultencoding says otherwise). * Ensure that the error raised in Unit constructor handles unicode too. --- cf_units/__init__.py | 29 +++++++++++++++++---- cf_units/tests/integration/test_date2num.py | 4 +-- cf_units/tests/test_coding_standards.py | 8 +++--- cf_units/tests/test_unit.py | 29 +++++++++++++++++++++ 4 files changed, 59 insertions(+), 11 deletions(-) diff --git a/cf_units/__init__.py b/cf_units/__init__.py index ad71097c..0f86af93 100644 --- a/cf_units/__init__.py +++ b/cf_units/__init__.py @@ -804,10 +804,21 @@ def __init__(self, unit, calendar=None): ut_unit = _ud.NULL_UNIT calendar_ = None + encoding = UT_UTF8 + if unit is None: unit = '' + + if six.PY2: + if not isinstance(unit, six.text_type): + # Cast everything that isn't a unicode object to a str. + unit = str(unit) + if isinstance(unit, str): + # All str in py2 should be treated as ASCII. + encoding = UT_ASCII else: - unit = str(unit).strip() + unit = str(unit) + unit = unit.strip() if unit.lower().endswith(' utc'): unit = unit[:unit.lower().rfind(' utc')] @@ -830,10 +841,15 @@ def __init__(self, unit, calendar=None): unit = _NO_UNIT_STRING else: category = _CATEGORY_UDUNIT + if six.PY2: + str_unit = unit.encode(sys.getdefaultencoding(), 'replace') + else: + str_unit = unit try: - ut_unit = _ud.parse(_ud_system, unit.encode('ascii'), UT_ASCII) + ut_unit = _ud.parse(_ud_system, unit.encode('utf8'), encoding) except _ud.UdunitsError as e: - self._propogate_error('Failed to parse unit "%s"' % unit, e) + self._propogate_error( + 'Failed to parse unit "%s"' % str_unit, e) if _OP_SINCE in unit.lower(): if calendar is None: calendar_ = CALENDAR_GREGORIAN @@ -1446,7 +1462,10 @@ def __str__(self): 'miles/hour' """ - return self.origin or self.symbol + r = self.origin or self.symbol + if six.PY2 and sys.getdefaultencoding() == 'ascii': + r = r.encode('ascii', 'replace') + return r def __repr__(self): """ @@ -1893,7 +1912,7 @@ def utime(self): if self.is_long_time_interval(): interval = self.origin.split(' ')[0] emsg = ('Time units with interval of "months", "years" ' - '(or singular of these) cannot be processed, got {!r}.') + '(or singular of these) cannot be processed, got "{!s}".') raise ValueError(emsg.format(interval)) # diff --git a/cf_units/tests/integration/test_date2num.py b/cf_units/tests/integration/test_date2num.py index bdbd6336..16d01a01 100644 --- a/cf_units/tests/integration/test_date2num.py +++ b/cf_units/tests/integration/test_date2num.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2016 - 2018, Met Office +# (C) British Crown Copyright 2016 - 2019, Met Office # # This file is part of cf-units. # @@ -73,7 +73,7 @@ def test_long_time_interval(self): # This test should fail with an error that we need to catch properly. unit = 'years since 1970-01-01' date = datetime.datetime(1970, 1, 1, 0, 0, 5) - exp_emsg = 'interval of "months", "years" .* got \'years\'.' + exp_emsg = 'interval of "months", "years" .* got "years".' with six.assertRaisesRegex(self, ValueError, exp_emsg): date2num(date, unit, self.calendar) diff --git a/cf_units/tests/test_coding_standards.py b/cf_units/tests/test_coding_standards.py index 74907332..9d62568b 100644 --- a/cf_units/tests/test_coding_standards.py +++ b/cf_units/tests/test_coding_standards.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2013 - 2018, Met Office +# (C) British Crown Copyright 2013 - 2019, Met Office # # This file is part of cf-units. # @@ -52,9 +52,9 @@ LICENSE_RE_PATTERN = re.escape(LICENSE_TEMPLATE).replace(r'\{YEARS\}', '(.*?)') -# Add shebang possibility to the LICENSE_RE_PATTERN -LICENSE_RE_PATTERN = r'(\#\!.*\n)?' + LICENSE_RE_PATTERN -LICENSE_RE = re.compile(LICENSE_RE_PATTERN, re.MULTILINE) +SHEBANG = r'(\#\!.*\n)?' +ENCODING = r'(\# \-\*\- coding\: .* \-\*\-\n)?' +LICENSE_RE = re.compile(SHEBANG + ENCODING + LICENSE_RE_PATTERN, re.MULTILINE) # Guess cf_units repo directory of cf_units - realpath is used to mitigate diff --git a/cf_units/tests/test_unit.py b/cf_units/tests/test_unit.py index ebf42fc5..a8070542 100644 --- a/cf_units/tests/test_unit.py +++ b/cf_units/tests/test_unit.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # (C) British Crown Copyright 2010 - 2019, Met Office # # This file is part of cf-units. @@ -74,6 +75,34 @@ def test_unsupported_calendar(self): with six.assertRaisesRegex(self, ValueError, 'unsupported calendar'): Unit('hours since 1970-01-01 00:00:00', calendar='wibble') + def test_calendar_w_unicode(self): + calendar = unit.CALENDAR_365_DAY + u = Unit(u'hours\xb2 hours-1 since epoch', calendar=calendar) + self.assertEqual(u.calendar, calendar) + if six.PY2: + # Python 2 str MUST return an ascii string, yet the input + # was a unicode. We therefore return the ASCII encoded form. + expected = 'hours? hours-1 since 1970-01-01 00:00:00' + else: + expected = 'hours\xb2 hours-1 since 1970-01-01 00:00:00' + self.assertEqual(str(u), expected) + + @unittest.skipIf(six.PY2, "Unicode literals in str aren't a thing") + def test_unicode_valid(self): + # Some unicode characters are allowed. + u = Unit('m²') + assert u.symbol == 'm2' + + def test_py2k_unicode(self): + u = Unit(u'm\xb2') + assert u.symbol == 'm2' + + def test_unicode_invalid(self): + # Not all unicode characters are allowed. + msg = '[UT_UNKNOWN] Failed to parse unit "ø"' + with self.assertRaises(ValueError, msg=msg): + Unit('ø') + class Test_modulus(unittest.TestCase):