From 1e9af2ad91f21300f319f0997511f6d32834c213 Mon Sep 17 00:00:00 2001
From: Phil Elson <pelson.pub@gmail.com>
Date: Tue, 22 Jan 2019 14:52:01 +0000
Subject: [PATCH] Unicode support (#135)

* Always treat units as unicode. Closes #133.

Also, allow a file-encoding declaration in the coding standards test, so that test modules can contain literal unicode characters.

* Fix the long-time-interval error message, which was using repr where str was intended, and update the date2num test to match.

* Tidy up the Unit constructor for the py2 case, so that it is easier to see that the code can be deleted when the codebase becomes py3 only.

* Handle unicode objects in py2 specially, and ensure that __str__ returns a non-unicode str in py2 (unless sys.getdefaultencoding() is something other than 'ascii').

* Ensure that the error raised in Unit constructor handles unicode too.
---
 cf_units/__init__.py                        | 29 +++++++++++++++++----
 cf_units/tests/integration/test_date2num.py |  4 +--
 cf_units/tests/test_coding_standards.py     |  8 +++---
 cf_units/tests/test_unit.py                 | 29 +++++++++++++++++++++
 4 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/cf_units/__init__.py b/cf_units/__init__.py
index ad71097c..0f86af93 100644
--- a/cf_units/__init__.py
+++ b/cf_units/__init__.py
@@ -804,10 +804,21 @@ def __init__(self, unit, calendar=None):
         ut_unit = _ud.NULL_UNIT
         calendar_ = None
 
+        encoding = UT_UTF8
+
         if unit is None:
             unit = ''
+
+        if six.PY2:
+            if not isinstance(unit, six.text_type):
+                # Cast everything that isn't a unicode object to a str.
+                unit = str(unit)
+            if isinstance(unit, str):
+                # All str in py2 should be treated as ASCII.
+                encoding = UT_ASCII
         else:
-            unit = str(unit).strip()
+            unit = str(unit)
+        unit = unit.strip()
 
         if unit.lower().endswith(' utc'):
             unit = unit[:unit.lower().rfind(' utc')]
@@ -830,10 +841,15 @@ def __init__(self, unit, calendar=None):
             unit = _NO_UNIT_STRING
         else:
             category = _CATEGORY_UDUNIT
+            if six.PY2:
+                str_unit = unit.encode(sys.getdefaultencoding(), 'replace')
+            else:
+                str_unit = unit
             try:
-                ut_unit = _ud.parse(_ud_system, unit.encode('ascii'), UT_ASCII)
+                ut_unit = _ud.parse(_ud_system, unit.encode('utf8'), encoding)
             except _ud.UdunitsError as e:
-                self._propogate_error('Failed to parse unit "%s"' % unit, e)
+                self._propogate_error(
+                    'Failed to parse unit "%s"' % str_unit, e)
             if _OP_SINCE in unit.lower():
                 if calendar is None:
                     calendar_ = CALENDAR_GREGORIAN
@@ -1446,7 +1462,10 @@ def __str__(self):
             'miles/hour'
 
         """
-        return self.origin or self.symbol
+        r = self.origin or self.symbol
+        if six.PY2 and sys.getdefaultencoding() == 'ascii':
+            r = r.encode('ascii', 'replace')
+        return r
 
     def __repr__(self):
         """
@@ -1893,7 +1912,7 @@ def utime(self):
         if self.is_long_time_interval():
             interval = self.origin.split(' ')[0]
             emsg = ('Time units with interval of "months", "years" '
-                    '(or singular of these) cannot be processed, got {!r}.')
+                    '(or singular of these) cannot be processed, got "{!s}".')
             raise ValueError(emsg.format(interval))
 
         #
diff --git a/cf_units/tests/integration/test_date2num.py b/cf_units/tests/integration/test_date2num.py
index bdbd6336..16d01a01 100644
--- a/cf_units/tests/integration/test_date2num.py
+++ b/cf_units/tests/integration/test_date2num.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2016 - 2018, Met Office
+# (C) British Crown Copyright 2016 - 2019, Met Office
 #
 # This file is part of cf-units.
 #
@@ -73,7 +73,7 @@ def test_long_time_interval(self):
         # This test should fail with an error that we need to catch properly.
         unit = 'years since 1970-01-01'
         date = datetime.datetime(1970, 1, 1, 0, 0, 5)
-        exp_emsg = 'interval of "months", "years" .* got \'years\'.'
+        exp_emsg = 'interval of "months", "years" .* got "years".'
         with six.assertRaisesRegex(self, ValueError, exp_emsg):
             date2num(date, unit, self.calendar)
 
diff --git a/cf_units/tests/test_coding_standards.py b/cf_units/tests/test_coding_standards.py
index 74907332..9d62568b 100644
--- a/cf_units/tests/test_coding_standards.py
+++ b/cf_units/tests/test_coding_standards.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2013 - 2018, Met Office
+# (C) British Crown Copyright 2013 - 2019, Met Office
 #
 # This file is part of cf-units.
 #
@@ -52,9 +52,9 @@
 
 
 LICENSE_RE_PATTERN = re.escape(LICENSE_TEMPLATE).replace(r'\{YEARS\}', '(.*?)')
-# Add shebang possibility to the LICENSE_RE_PATTERN
-LICENSE_RE_PATTERN = r'(\#\!.*\n)?' + LICENSE_RE_PATTERN
-LICENSE_RE = re.compile(LICENSE_RE_PATTERN, re.MULTILINE)
+SHEBANG = r'(\#\!.*\n)?'
+ENCODING = r'(\# \-\*\- coding\: .* \-\*\-\n)?'
+LICENSE_RE = re.compile(SHEBANG + ENCODING + LICENSE_RE_PATTERN, re.MULTILINE)
 
 
 # Guess cf_units repo directory of cf_units - realpath is used to mitigate
diff --git a/cf_units/tests/test_unit.py b/cf_units/tests/test_unit.py
index ebf42fc5..a8070542 100644
--- a/cf_units/tests/test_unit.py
+++ b/cf_units/tests/test_unit.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # (C) British Crown Copyright 2010 - 2019, Met Office
 #
 # This file is part of cf-units.
@@ -74,6 +75,34 @@ def test_unsupported_calendar(self):
         with six.assertRaisesRegex(self, ValueError, 'unsupported calendar'):
             Unit('hours since 1970-01-01 00:00:00', calendar='wibble')
 
+    def test_calendar_w_unicode(self):
+        calendar = unit.CALENDAR_365_DAY
+        u = Unit(u'hours\xb2 hours-1 since epoch', calendar=calendar)
+        self.assertEqual(u.calendar, calendar)
+        if six.PY2:
+            # Python 2 str MUST return an ascii string, yet the input
+            # was a unicode. We therefore return the ASCII encoded form.
+            expected = 'hours? hours-1 since 1970-01-01 00:00:00'
+        else:
+            expected = 'hours\xb2 hours-1 since 1970-01-01 00:00:00'
+        self.assertEqual(str(u), expected)
+
+    @unittest.skipIf(six.PY2, "Unicode literals in str aren't a thing")
+    def test_unicode_valid(self):
+        # Some unicode characters are allowed.
+        u = Unit('m²')
+        assert u.symbol == 'm2'
+
+    def test_py2k_unicode(self):
+        u = Unit(u'm\xb2')
+        assert u.symbol == 'm2'
+
+    def test_unicode_invalid(self):
+        # Not all unicode characters are allowed.
+        msg = '[UT_UNKNOWN] Failed to parse unit "ø"'
+        with self.assertRaises(ValueError, msg=msg):
+            Unit('ø')
+
 
 class Test_modulus(unittest.TestCase):