diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 5fa37963e07eff..e654412965a727 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -992,19 +992,11 @@ These are the UTF-8 codec APIs: As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size. - Raise an exception if the *unicode* string contains embedded null - characters. To accept embedded null characters and truncate on purpose - at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be - used instead. - .. versionadded:: 3.3 .. versionchanged:: 3.7 The return type is now ``const char *`` rather of ``char *``. - .. versionchanged:: 3.13 - Raise an exception if the string contains embedded null characters. - UTF-32 Codecs """"""""""""" diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 84d50a68eace4b..8db8a798caf0a7 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -1222,12 +1222,6 @@ Porting to Python 3.13 Note that ``Py_TRASHCAN_BEGIN`` has a second argument which should be the deallocation function it is in. -* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string - contains embedded null characters. To accept embedded null characters and - truncate on purpose at the first null byte, - ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead. - (Contributed by Victor Stinner in :gh:`111089`.) - * On Windows, ``Python.h`` no longer includes the ``<stddef.h>`` standard header file. If needed, it should now be included explicitly. For example, it provides ``offsetof()`` function, and ``size_t`` and ``ptrdiff_t`` types. 
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index d200fa0622cef5..cd56a6a74acf51 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -440,6 +440,22 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( const void *buffer, Py_ssize_t size); +/* --- Manage the default encoding ---------------------------------------- */ + +/* Returns a pointer to the default encoding (UTF-8) of the + Unicode object unicode. + + Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation + in the unicodeobject. + + _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to + support the previous internal function with the same behaviour. + + Use of this API is DEPRECATED since no size information can be + extracted from the returned data. +*/ + +PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); /* === Characters Type APIs =============================================== */ diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index ee7b769ce5a6fc..dee00715b3c51d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -443,25 +443,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( PyObject *unicode /* Unicode object */ ); -// Returns a pointer to the UTF-8 encoding of the Unicode object unicode. -// -// Raise an exception if the string contains embedded null characters. -// Use PyUnicode_AsUTF8AndSize() to accept embedded null characters. -// -// This function caches the UTF-8 encoded string in the Unicode object -// and subsequent calls will return the same string. The memory is released -// when the Unicode object is deallocated. -PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); - -// Returns a pointer to the UTF-8 encoding of the -// Unicode object unicode and the size of the encoded representation -// in bytes stored in `*size` (if size is not NULL). -// -// On error, `*size` is set to 0 (if size is not NULL). 
-// -// This function caches the UTF-8 encoded string in the Unicode object -// and subsequent calls will return the same string. The memory is released -// when the Unicode object is deallocated. +/* Returns a pointer to the default encoding (UTF-8) of the + Unicode object unicode and the size of the encoded representation + in bytes stored in *size. + + In case of an error, no *size is set. + + This function caches the UTF-8 encoded string in the unicodeobject + and subsequent calls will return the same string. The memory is released + when the unicodeobject is deallocated. +*/ + #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( PyObject *unicode, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index d8537244b39555..bb6161abf4da81 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -914,10 +914,7 @@ def test_asutf8(self): self.assertEqual(unicode_asutf8('abc', 4), b'abc\0') self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0') self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0') - - # disallow embedded null characters - self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0) - self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0) + self.assertEqual(unicode_asutf8('abc\0def', 8), b'abc\0def\0') self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0) self.assertRaises(TypeError, unicode_asutf8, b'abc', 0) diff --git a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst deleted file mode 100644 index 2008dd5438d2b5..00000000000000 --- a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst +++ /dev/null @@ -1,2 +0,0 @@ -The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the -string contains embedded null characters. Patch by Victor Stinner. 
diff --git a/Objects/typeobject.c b/Objects/typeobject.c index f44e30cf0446a5..557464c6740c18 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -3501,14 +3501,13 @@ type_new_set_doc(PyTypeObject *type) return 0; } - Py_ssize_t doc_size; - const char *doc_str = PyUnicode_AsUTF8AndSize(doc, &doc_size); + const char *doc_str = PyUnicode_AsUTF8(doc); if (doc_str == NULL) { return -1; } // Silently truncate the docstring if it contains a null byte - Py_ssize_t size = doc_size + 1; + Py_ssize_t size = strlen(doc_str) + 1; char *tp_doc = (char *)PyObject_Malloc(size); if (tp_doc == NULL) { PyErr_NoMemory(); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 87636efcfca050..53e1e56babf952 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3844,13 +3844,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) const char * PyUnicode_AsUTF8(PyObject *unicode) { - Py_ssize_t size; - const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size); - if (utf8 != NULL && strlen(utf8) != (size_t)size) { - PyErr_SetString(PyExc_ValueError, "embedded null character"); - return NULL; - } - return utf8; + return PyUnicode_AsUTF8AndSize(unicode, NULL); } /*