From 78c92ca9051963bfd72f90a37665f9425c3b1381 Mon Sep 17 00:00:00 2001 From: Mike Edmunds Date: Wed, 31 Jul 2024 17:12:14 -0700 Subject: [PATCH] Stop incorrectly RFC 2047 encoding non-ASCII email addresses Email generators had been incorrectly flattening non-ASCII email addresses to RFC 2047 encoded-word format, leaving them undeliverable. (RFC 2047 prohibits use of encoded-word in an addr-spec.) This change raises a ValueError when attempting to flatten an EmailMessage with a non-ASCII addr-spec and a policy with utf8=False. (Exception: If the non-ASCII address originated from parsing a message, it will be flattened as originally parsed, without error.) Non-ASCII email addresses are supported when using a policy with utf8=True (such as email.policy.SMTPUTF8) under RFCs 6531 and 6532. Non-ASCII email address domains (but not localparts) can also be used with non-SMTPUTF8 policies by encoding the domain as an IDNA A-label. (The email package does not perform this encoding, because it cannot know whether the caller wants IDNA 2003, IDNA 2008, or some other variant such as UTS #46.) --- Doc/library/email.policy.rst | 10 ++++- Lib/email/_header_value_parser.py | 11 +++++ Lib/test/test_email/test_generator.py | 58 ++++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst index 314767d0802a088..d5be4d6625176a9 100644 --- a/Doc/library/email.policy.rst +++ b/Doc/library/email.policy.rst @@ -406,11 +406,17 @@ added matters. To illustrate:: .. attribute:: utf8 If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in - headers by encoding them as "encoded words". If ``True``, follow - :rfc:`6532` and use ``utf-8`` encoding for headers. Messages + headers by encoding them as :rfc:`2047` "encoded words". If ``True``, + follow :rfc:`6532` and use ``utf-8`` encoding for headers. 
Messages formatted in this way may be passed to SMTP servers that support the ``SMTPUTF8`` extension (:rfc:`6531`). + .. versionchanged:: 3.13 + If ``False``, the generator will raise a ``ValueError`` if any email + address contains non-ASCII characters. To send to a non-ASCII domain + with ``utf8=False``, encode the domain using the third-party + :pypi:`idna` module or :mod:`encodings.idna`. No RFC allows a non-ASCII + username ("localpart") in an email address with ``utf8=False``. .. attribute:: refold_source diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index ec2215a5e5f33c6..ff75b9acd81fd8c 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2829,6 +2829,17 @@ def _refold_parse_tree(parse_tree, *, policy): _fold_mime_parameters(part, lines, maxlen, encoding) continue + if want_encoding and part.token_type == 'addr-spec': + # RFC2047 forbids encoded-word in any part of an addr-spec. + if charset == 'unknown-8bit': + # Non-ASCII addr-spec came from parsed message; leave unchanged. + want_encoding = False + else: + raise ValueError( + "Non-ASCII address requires policy with utf8=True:" + " '{}'".format(part) + ) + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index c75a842c33578e9..66379129abb1a25 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -1,4 +1,5 @@ import io +import re import textwrap import unittest from email import message_from_string, message_from_bytes @@ -288,6 +289,28 @@ def test_keep_long_encoded_newlines(self): g.flatten(msg) self.assertEqual(s.getvalue(), self.typ(expected)) + def test_non_ascii_addr_spec_raises(self): + # RFC2047 encoded-word is not permitted in any part of an addr-spec. + # (See also test_non_ascii_addr_spec_preserved below.) 
+ g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False)) + cases = [ + 'wők@example.com', + 'wok@exàmple.com', + 'wők@exàmple.com', + '"Name, for display" ', + 'Näyttönimi ', + ] + for address in cases: + with self.subTest(address=address): + msg = EmailMessage() + msg['To'] = address + expected_error = re.escape( + "Non-ASCII address requires policy with utf8=True:" + " '{}'".format(msg['To'].addresses[0].addr_spec) + ) + with self.assertRaisesRegex(ValueError, expected_error): + g.flatten(msg) + class TestGenerator(TestGeneratorBase, TestEmailBase): @@ -432,12 +455,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self): def test_smtputf8_policy(self): msg = EmailMessage() - msg['From'] = "Páolo " + msg['From'] = "Páolo " msg['To'] = 'Dinsdale' msg['Subject'] = 'Nudge nudge, wink, wink \u1F609' msg.set_content("oh là là, know what I mean, know what I mean?") expected = textwrap.dedent("""\ - From: Páolo + From: Páolo To: Dinsdale Subject: Nudge nudge, wink, wink \u1F609 Content-Type: text/plain; charset="utf-8" @@ -472,6 +495,37 @@ def test_smtp_policy(self): g.flatten(msg) self.assertEqual(s.getvalue(), expected) + def test_non_ascii_addr_spec_preserved(self): + # A defective non-ASCII addr-spec parsed from the original + # message is left unchanged when flattening. + # (See also test_non_ascii_addr_spec_raises above.) + source = ( + 'To: jörg@example.com, "But a long name still works with refold_source" ' + ).encode() + expected = ( + b'To: j\xc3\xb6rg@example.com,\n' + b' "But a long name still works with refold_source" \n' + b'\n' + ) + msg = message_from_bytes(source, policy=policy.default) + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_idna_encoding_preserved(self): + # Nothing tries to decode a pre-encoded IDNA domain. 
+ msg = EmailMessage() + msg["To"] = Address( + username='jörg', + domain='☕.example'.encode('idna').decode() # IDNA 2003 + ) + expected = 'To: jörg@xn--53h.example\n\n'.encode() + s = io.BytesIO() + g = BytesGenerator(s, policy=policy.default.clone(utf8=True)) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + if __name__ == '__main__': unittest.main()