From 78c92ca9051963bfd72f90a37665f9425c3b1381 Mon Sep 17 00:00:00 2001
From: Mike Edmunds <medmunds@gmail.com>
Date: Wed, 31 Jul 2024 17:12:14 -0700
Subject: [PATCH] Stop incorrectly RFC 2047 encoding non-ASCII email addresses

Email generators had been incorrectly flattening non-ASCII email
addresses to RFC 2047 encoded-word format, leaving them undeliverable.
(RFC 2047 prohibits use of encoded-word in an addr-spec.)
This change raises a ValueError when attempting to flatten an
EmailMessage with a non-ASCII addr-spec and a policy with utf8=False.
(Exception: If the non-ASCII address originated from parsing a message,
it will be flattened as originally parsed, without error.)

Non-ASCII email addresses are supported when using a policy with
utf8=True (such as email.policy.SMTPUTF8) under RFCs 6531 and 6532.

Non-ASCII email address domains (but not localparts) can also be used
with non-SMTPUTF8 policies by encoding the domain as an IDNA A-label.
(The email package does not perform this encoding, because it cannot
know whether the caller wants IDNA 2003, IDNA 2008, or some other
variant such as UTS #46.)
---
 Doc/library/email.policy.rst          | 10 ++++-
 Lib/email/_header_value_parser.py     | 11 +++++
 Lib/test/test_email/test_generator.py | 58 ++++++++++++++++++++++++++-
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst
index 314767d0802a088..d5be4d6625176a9 100644
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -406,11 +406,17 @@ added matters.  To illustrate::
    .. attribute:: utf8
 
       If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
-      headers by encoding them as "encoded words".  If ``True``, follow
-      :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
+      headers by encoding them as :rfc:`2047` "encoded words".  If ``True``,
+      follow :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
       formatted in this way may be passed to SMTP servers that support
       the ``SMTPUTF8`` extension (:rfc:`6531`).
 
+      .. versionchanged:: 3.13
+         If ``False``, the generator will raise :exc:`ValueError` if any email
+         address contains non-ASCII characters. To send to a non-ASCII domain
+         with ``utf8=False``, encode the domain using the third-party
+         :pypi:`idna` module or :mod:`encodings.idna`. No RFC allows a non-ASCII
+         username ("localpart") in an email address with ``utf8=False``.
 
    .. attribute:: refold_source
 
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index ec2215a5e5f33c6..ff75b9acd81fd8c 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -2829,6 +2829,17 @@ def _refold_parse_tree(parse_tree, *, policy):
             _fold_mime_parameters(part, lines, maxlen, encoding)
             continue
 
+        if want_encoding and part.token_type == 'addr-spec':
+            # RFC2047 forbids encoded-word in any part of an addr-spec.
+            if charset == 'unknown-8bit':
+                # Non-ASCII addr-spec came from parsed message; leave unchanged.
+                want_encoding = False
+            else:
+                raise ValueError(
+                    "Non-ASCII address requires policy with utf8=True:"
+                    " '{}'".format(part)
+                )
+
         if want_encoding and not wrap_as_ew_blocked:
             if not part.as_ew_allowed:
                 want_encoding = False
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index c75a842c33578e9..66379129abb1a25 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -1,4 +1,5 @@
 import io
+import re
 import textwrap
 import unittest
 from email import message_from_string, message_from_bytes
@@ -288,6 +289,28 @@ def test_keep_long_encoded_newlines(self):
         g.flatten(msg)
         self.assertEqual(s.getvalue(), self.typ(expected))
 
+    def test_non_ascii_addr_spec_raises(self):
+        # RFC 2047 bans encoded-word in an addr-spec, so a non-ASCII address
+        # raises with utf8=False. (Cf. test_non_ascii_addr_spec_preserved.)
+        g = self.genclass(self.ioclass(), policy=self.policy.clone(utf8=False))
+        cases = [
+            'wők@example.com',
+            'wok@exàmple.com',
+            'wők@exàmple.com',
+            '"Name, for display" <wők@example.com>',
+            'Näyttönimi <wők@example.com>',
+        ]
+        for address in cases:
+            with self.subTest(address=address):
+                msg = EmailMessage()
+                msg['To'] = address
+                expected_error = re.escape(
+                    "Non-ASCII address requires policy with utf8=True:"
+                    " '{}'".format(msg['To'].addresses[0].addr_spec)
+                )
+                with self.assertRaisesRegex(ValueError, expected_error):
+                    g.flatten(msg)
+
 
 class TestGenerator(TestGeneratorBase, TestEmailBase):
 
@@ -432,12 +455,12 @@ def test_cte_type_7bit_transforms_8bit_cte(self):
 
     def test_smtputf8_policy(self):
         msg = EmailMessage()
-        msg['From'] = "Páolo <főo@bar.com>"
+        msg['From'] = "Páolo <főo@bàr.com>"
         msg['To'] = 'Dinsdale'
         msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
         msg.set_content("oh là là, know what I mean, know what I mean?")
         expected = textwrap.dedent("""\
-            From: Páolo <főo@bar.com>
+            From: Páolo <főo@bàr.com>
             To: Dinsdale
             Subject: Nudge nudge, wink, wink \u1F609
             Content-Type: text/plain; charset="utf-8"
@@ -472,6 +495,37 @@ def test_smtp_policy(self):
         g.flatten(msg)
         self.assertEqual(s.getvalue(), expected)
 
+    def test_non_ascii_addr_spec_preserved(self):
+        # A defective non-ASCII addr-spec parsed from a source message
+        # is flattened with its bytes intact; only line folding is applied.
+        # (See also test_non_ascii_addr_spec_raises above.)
+        source = (
+            'To: jörg@example.com, "But a long name still works with refold_source" <jörg@example.com>'
+        ).encode()
+        expected = (
+            b'To: j\xc3\xb6rg@example.com,\n'
+            b' "But a long name still works with refold_source" <j\xc3\xb6rg@example.com>\n'
+            b'\n'
+        )
+        msg = message_from_bytes(source, policy=policy.default)
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.default)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
+    def test_idna_encoding_preserved(self):
+        # A domain already encoded as an IDNA A-label is flattened unchanged.
+        msg = EmailMessage()
+        msg["To"] = Address(
+            username='jörg',
+            domain='☕.example'.encode('idna').decode()  # IDNA 2003
+        )
+        expected = 'To: jörg@xn--53h.example\n\n'.encode()
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.default.clone(utf8=True))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
 
 if __name__ == '__main__':
     unittest.main()