From e72dbec2d372cbd9f278fa68dc0df34950f09e10 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 14 Nov 2022 15:32:02 +0200 Subject: [PATCH] [3.11] gh-78453: Move Unicode C API tests from test_unicode to test_capi.test_unicode (GH-99431). (cherry picked from commit 06d4e02c3b3526b5d90e41a0a0befa8663e08f27) Co-authored-by: Serhiy Storchaka --- Lib/test/test_capi/test_unicode.py | 474 +++++++++++++++++++++++++++++ Lib/test/test_unicode.py | 461 ---------------------------- 2 files changed, 474 insertions(+), 461 deletions(-) create mode 100644 Lib/test/test_capi/test_unicode.py diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py new file mode 100644 index 00000000000000..77a180a30693dc --- /dev/null +++ b/Lib/test/test_capi/test_unicode.py @@ -0,0 +1,474 @@ +import unittest +import sys +from test import support +from test.support import import_helper + +try: + import _testcapi +except ImportError: + _testcapi = None + + +class CAPITest(unittest.TestCase): + + # Test PyUnicode_FromFormat() + def test_from_format(self): + import_helper.import_module('ctypes') + from ctypes import ( + c_char_p, + pythonapi, py_object, sizeof, + c_int, c_long, c_longlong, c_ssize_t, + c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) + name = "PyUnicode_FromFormat" + _PyUnicode_FromFormat = getattr(pythonapi, name) + _PyUnicode_FromFormat.argtypes = (c_char_p,) + _PyUnicode_FromFormat.restype = py_object + + def PyUnicode_FromFormat(format, *args): + cargs = tuple( + py_object(arg) if isinstance(arg, str) else arg + for arg in args) + return _PyUnicode_FromFormat(format, *cargs) + + def check_format(expected, format, *args): + text = PyUnicode_FromFormat(format, *args) + self.assertEqual(expected, text) + + # ascii format, non-ascii argument + check_format('ascii\x7f=unicode\xe9', + b'ascii\x7f=%U', 'unicode\xe9') + + # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() + # raises an error + self.assertRaisesRegex(ValueError, + r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' + 'string, got a non-ASCII byte: 0xe9$', + PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + + # test "%c" + check_format('\uabcd', + b'%c', c_int(0xabcd)) + check_format('\U0010ffff', + b'%c', c_int(0x10ffff)) + with self.assertRaises(OverflowError): + PyUnicode_FromFormat(b'%c', c_int(0x110000)) + # Issue #18183 + check_format('\U00010000\U00100000', + b'%c%c', c_int(0x10000), c_int(0x100000)) + + # test "%" + check_format('%', + b'%') + check_format('%', + b'%%') + check_format('%s', + b'%%s') + check_format('[%]', + b'[%%]') + check_format('%abc', + b'%%%s', b'abc') + + # truncated string + check_format('abc', + b'%.3s', b'abcdef') + check_format('abc[\ufffd', + b'%.5s', 'abc[\u20ac]'.encode('utf8')) + check_format("'\\u20acABC'", + b'%A', '\u20acABC') + check_format("'\\u20", + b'%.5A', '\u20acABCDEF') + check_format("'\u20acABC'", + b'%R', '\u20acABC') + check_format("'\u20acA", + b'%.3R', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3S', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3U', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3V', '\u20acABCDEF', None) + check_format('abc[\ufffd', + b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + + # following tests comes from #7330 + # test width modifier and precision modifier with %S + check_format("repr= abc", + b'repr=%5S', 'abc') + check_format("repr=ab", + b'repr=%.2S', 'abc') + check_format("repr= ab", + b'repr=%5.2S', 'abc') + + # test width modifier and precision modifier with %R + check_format("repr= 'abc'", + b'repr=%8R', 'abc') + check_format("repr='ab", + b'repr=%.3R', 'abc') + check_format("repr= 'ab", + b'repr=%5.3R', 'abc') + + # test width modifier and precision modifier with %A + check_format("repr= 'abc'", + b'repr=%8A', 'abc') + check_format("repr='ab", + b'repr=%.3A', 'abc') + check_format("repr= 'ab", + b'repr=%5.3A', 'abc') + + # test width modifier and precision modifier with %s + check_format("repr= abc", + b'repr=%5s', b'abc') + check_format("repr=ab", + b'repr=%.2s', b'abc') + check_format("repr= ab", + b'repr=%5.2s', b'abc') + + # test width modifier and precision modifier with %U + check_format("repr= abc", + b'repr=%5U', 'abc') + check_format("repr=ab", + b'repr=%.2U', 'abc') + check_format("repr= ab", + b'repr=%5.2U', 'abc') + + # test width modifier and precision modifier with %V + check_format("repr= abc", + b'repr=%5V', 'abc', b'123') + check_format("repr=ab", + b'repr=%.2V', 'abc', b'123') + check_format("repr= ab", + b'repr=%5.2V', 'abc', b'123') + check_format("repr= 123", + b'repr=%5V', None, b'123') + check_format("repr=12", + b'repr=%.2V', None, b'123') + check_format("repr= 12", + b'repr=%5.2V', None, b'123') + + # test integer formats (%i, %d, %u) + check_format('010', + b'%03i', c_int(10)) + check_format('0010', + b'%0.4i', c_int(10)) + check_format('-123', + b'%i', c_int(-123)) + check_format('-123', + b'%li', c_long(-123)) + check_format('-123', + b'%lli', c_longlong(-123)) + check_format('-123', + b'%zi', c_ssize_t(-123)) + + check_format('-123', + b'%d', c_int(-123)) + check_format('-123', + b'%ld', c_long(-123)) + check_format('-123', + b'%lld', c_longlong(-123)) + check_format('-123', + b'%zd', c_ssize_t(-123)) + + check_format('123', + b'%u', c_uint(123)) + check_format('123', + b'%lu', c_ulong(123)) + check_format('123', + b'%llu', c_ulonglong(123)) + check_format('123', + b'%zu', c_size_t(123)) + + # test long output + min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) + max_longlong = -min_longlong - 1 + check_format(str(min_longlong), + b'%lld', c_longlong(min_longlong)) + check_format(str(max_longlong), + b'%lld', c_longlong(max_longlong)) + max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 + check_format(str(max_ulonglong), + b'%llu', c_ulonglong(max_ulonglong)) + PyUnicode_FromFormat(b'%p', c_void_p(-1)) + + # test padding (width and/or precision) + check_format('123'.rjust(10, '0'), + b'%010i', c_int(123)) + check_format('123'.rjust(100), + b'%100i', c_int(123)) + check_format('123'.rjust(100, '0'), + b'%.100i', c_int(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80i', c_int(123)) + + check_format('123'.rjust(10, '0'), + b'%010u', c_uint(123)) + check_format('123'.rjust(100), + b'%100u', c_uint(123)) + check_format('123'.rjust(100, '0'), + b'%.100u', c_uint(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80u', c_uint(123)) + + check_format('123'.rjust(10, '0'), + b'%010x', c_int(0x123)) + check_format('123'.rjust(100), + b'%100x', c_int(0x123)) + check_format('123'.rjust(100, '0'), + b'%.100x', c_int(0x123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80x', c_int(0x123)) + + # test %A + check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", + b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') + + # test %V + check_format('repr=abc', + b'repr=%V', 'abc', b'xyz') + + # test %p + # We cannot test the exact result, + # because it returns a hex representation of a C pointer, + # which is going to be different each time. But, we can test the format. + p_format_regex = r'^0x[a-zA-Z0-9]{3,}$' + p_format1 = PyUnicode_FromFormat(b'%p', 'abc') + self.assertIsInstance(p_format1, str) + self.assertRegex(p_format1, p_format_regex) + + p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz') + self.assertIsInstance(p_format2, str) + self.assertRegex(p_format2, + r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}') + + # Extra args are ignored: + p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz') + self.assertIsInstance(p_format3, str) + self.assertRegex(p_format3, p_format_regex) + + # Test string decode from parameter of %s using utf-8. + # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of + # '\u4eba\u6c11' + check_format('repr=\u4eba\u6c11', + b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') + + #Test replace error handler. + check_format('repr=abc\ufffd', + b'repr=%V', None, b'abc\xff') + + # not supported: copy the raw format string. these tests are just here + # to check for crashes and should not be considered as specifications + check_format('%s', + b'%1%s', b'abc') + check_format('%1abc', + b'%1abc') + check_format('%+i', + b'%+i', c_int(10)) + check_format('%.%s', + b'%.%s', b'abc') + + # Issue #33817: empty strings + check_format('', + b'') + check_format('', + b'%s', b'') + + # Test PyUnicode_AsWideChar() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_aswidechar(self): + from _testcapi import unicode_aswidechar + import_helper.import_module('ctypes') + from ctypes import c_wchar, sizeof + + wchar, size = unicode_aswidechar('abcdef', 2) + self.assertEqual(size, 2) + self.assertEqual(wchar, 'ab') + + wchar, size = unicode_aswidechar('abc', 3) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc') + + wchar, size = unicode_aswidechar('abc', 4) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidechar('abc', 10) + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidechar('abc\0def', 20) + self.assertEqual(size, 7) + self.assertEqual(wchar, 'abc\0def\0') + + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + buflen = 3 + nchar = 2 + else: # sizeof(c_wchar) == 4 + buflen = 2 + nchar = 1 + wchar, size = unicode_aswidechar(nonbmp, buflen) + self.assertEqual(size, nchar) + self.assertEqual(wchar, nonbmp + '\0') + + # Test PyUnicode_AsWideCharString() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_aswidecharstring(self): + from _testcapi import unicode_aswidecharstring + import_helper.import_module('ctypes') + from ctypes import c_wchar, sizeof + + wchar, size = unicode_aswidecharstring('abc') + self.assertEqual(size, 3) + self.assertEqual(wchar, 'abc\0') + + wchar, size = unicode_aswidecharstring('abc\0def') + self.assertEqual(size, 7) + self.assertEqual(wchar, 'abc\0def\0') + + nonbmp = chr(0x10ffff) + if sizeof(c_wchar) == 2: + nchar = 2 + else: # sizeof(c_wchar) == 4 + nchar = 1 + wchar, size = unicode_aswidecharstring(nonbmp) + self.assertEqual(size, nchar) + self.assertEqual(wchar, nonbmp + '\0') + + # Test PyUnicode_AsUCS4() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_asucs4(self): + from _testcapi import unicode_asucs4 + for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', + 'a\ud800b\udfffc', '\ud834\udd1e']: + l = len(s) + self.assertEqual(unicode_asucs4(s, l, True), s+'\0') + self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff') + self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff') + self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff') + self.assertRaises(SystemError, unicode_asucs4, s, l-1, True) + self.assertRaises(SystemError, unicode_asucs4, s, l-2, False) + s = '\0'.join([s, s]) + self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') + self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') + + # Test PyUnicode_AsUTF8() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_asutf8(self): + from _testcapi import unicode_asutf8 + + bmp = '\u0100' + bmp2 = '\uffff' + nonbmp = chr(0x10ffff) + + self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80') + self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf') + self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf') + self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc') + + # Test PyUnicode_AsUTF8AndSize() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_asutf8andsize(self): + from _testcapi import unicode_asutf8andsize + + bmp = '\u0100' + bmp2 = '\uffff' + nonbmp = chr(0x10ffff) + + self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2)) + self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3)) + self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4)) + self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc') + + # Test PyUnicode_FindChar() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_findchar(self): + from _testcapi import unicode_findchar + + for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1": + for i, ch in enumerate(str): + self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i) + self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) + + str = "!>_= end + self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1) + self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1) + # negative + self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0) + self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0) + + # Test PyUnicode_CopyCharacters() + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_copycharacters(self): + from _testcapi import unicode_copycharacters + + strings = [ + 'abcde', '\xa1\xa2\xa3\xa4\xa5', + '\u4f60\u597d\u4e16\u754c\uff01', + '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' + ] + + for idx, from_ in enumerate(strings): + # wide -> narrow: exceed maxchar limitation + for to in strings[:idx]: + self.assertRaises( + SystemError, + unicode_copycharacters, to, 0, from_, 0, 5 + ) + # same kind + for from_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, 0, from_, from_start, 5), + (from_[from_start:from_start+5].ljust(5, '\0'), + 5-from_start) + ) + for to_start in range(5): + self.assertEqual( + unicode_copycharacters(from_, to_start, from_, to_start, 5), + (from_[to_start:to_start+5].rjust(5, '\0'), + 5-to_start) + ) + # narrow -> wide + # Tests omitted since this creates invalid strings. + + s = strings[0] + self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) + self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) + self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_pep393_utf8_caching_bug(self): + # Issue #25709: Problem with string concatenation and utf-8 cache + from _testcapi import getargs_s_hash + for k in 0x24, 0xa4, 0x20ac, 0x1f40d: + s = '' + for i in range(5): + # Due to CPython specific optimization the 's' string can be + # resized in-place. + s += chr(k) + # Parsing with the "s#" format code calls indirectly + # PyUnicode_AsUTF8AndSize() which creates the UTF-8 + # encoded string cached in the Unicode object. + self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + # Check that the second call returns the same result + self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + + +if __name__ == "__main__": + unittest.main() diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index c8c07a463d1b2b..b3c02c1b998475 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -16,7 +16,6 @@ import unicodedata import unittest import warnings -from test.support import import_helper from test.support import warnings_helper from test import support, string_tests from test.support.script_helper import assert_python_failure @@ -2624,466 +2623,6 @@ def test_check_encoding_errors(self): self.assertEqual(proc.rc, 10, proc) -class CAPITest(unittest.TestCase): - - # Test PyUnicode_FromFormat() - def test_from_format(self): - import_helper.import_module('ctypes') - from ctypes import ( - c_char_p, - pythonapi, py_object, sizeof, - c_int, c_long, c_longlong, c_ssize_t, - c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p) - name = "PyUnicode_FromFormat" - _PyUnicode_FromFormat = getattr(pythonapi, name) - _PyUnicode_FromFormat.argtypes = (c_char_p,) - _PyUnicode_FromFormat.restype = py_object - - def PyUnicode_FromFormat(format, *args): - cargs = tuple( - py_object(arg) if isinstance(arg, str) else arg - for arg in args) - return _PyUnicode_FromFormat(format, *cargs) - - def check_format(expected, format, *args): - text = PyUnicode_FromFormat(format, *args) - self.assertEqual(expected, text) - - # ascii format, non-ascii argument - check_format('ascii\x7f=unicode\xe9', - b'ascii\x7f=%U', 'unicode\xe9') - - # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() - # raises an error - self.assertRaisesRegex(ValueError, - r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' - 'string, got a non-ASCII byte: 0xe9$', - PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') - - # test "%c" - check_format('\uabcd', - b'%c', c_int(0xabcd)) - check_format('\U0010ffff', - b'%c', c_int(0x10ffff)) - with self.assertRaises(OverflowError): - PyUnicode_FromFormat(b'%c', c_int(0x110000)) - # Issue #18183 - check_format('\U00010000\U00100000', - b'%c%c', c_int(0x10000), c_int(0x100000)) - - # test "%" - check_format('%', - b'%') - check_format('%', - b'%%') - check_format('%s', - b'%%s') - check_format('[%]', - b'[%%]') - check_format('%abc', - b'%%%s', b'abc') - - # truncated string - check_format('abc', - b'%.3s', b'abcdef') - check_format('abc[\ufffd', - b'%.5s', 'abc[\u20ac]'.encode('utf8')) - check_format("'\\u20acABC'", - b'%A', '\u20acABC') - check_format("'\\u20", - b'%.5A', '\u20acABCDEF') - check_format("'\u20acABC'", - b'%R', '\u20acABC') - check_format("'\u20acA", - b'%.3R', '\u20acABCDEF') - check_format('\u20acAB', - b'%.3S', '\u20acABCDEF') - check_format('\u20acAB', - b'%.3U', '\u20acABCDEF') - check_format('\u20acAB', - b'%.3V', '\u20acABCDEF', None) - check_format('abc[\ufffd', - b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) - - # following tests comes from #7330 - # test width modifier and precision modifier with %S - check_format("repr= abc", - b'repr=%5S', 'abc') - check_format("repr=ab", - b'repr=%.2S', 'abc') - check_format("repr= ab", - b'repr=%5.2S', 'abc') - - # test width modifier and precision modifier with %R - check_format("repr= 'abc'", - b'repr=%8R', 'abc') - check_format("repr='ab", - b'repr=%.3R', 'abc') - check_format("repr= 'ab", - b'repr=%5.3R', 'abc') - - # test width modifier and precision modifier with %A - check_format("repr= 'abc'", - b'repr=%8A', 'abc') - check_format("repr='ab", - b'repr=%.3A', 'abc') - check_format("repr= 'ab", - b'repr=%5.3A', 'abc') - - # test width modifier and precision modifier with %s - check_format("repr= abc", - b'repr=%5s', b'abc') - check_format("repr=ab", - b'repr=%.2s', b'abc') - check_format("repr= ab", - b'repr=%5.2s', b'abc') - - # test width modifier and precision modifier with %U - check_format("repr= abc", - b'repr=%5U', 'abc') - check_format("repr=ab", - b'repr=%.2U', 'abc') - check_format("repr= ab", - b'repr=%5.2U', 'abc') - - # test width modifier and precision modifier with %V - check_format("repr= abc", - b'repr=%5V', 'abc', b'123') - check_format("repr=ab", - b'repr=%.2V', 'abc', b'123') - check_format("repr= ab", - b'repr=%5.2V', 'abc', b'123') - check_format("repr= 123", - b'repr=%5V', None, b'123') - check_format("repr=12", - b'repr=%.2V', None, b'123') - check_format("repr= 12", - b'repr=%5.2V', None, b'123') - - # test integer formats (%i, %d, %u) - check_format('010', - b'%03i', c_int(10)) - check_format('0010', - b'%0.4i', c_int(10)) - check_format('-123', - b'%i', c_int(-123)) - check_format('-123', - b'%li', c_long(-123)) - check_format('-123', - b'%lli', c_longlong(-123)) - check_format('-123', - b'%zi', c_ssize_t(-123)) - - check_format('-123', - b'%d', c_int(-123)) - check_format('-123', - b'%ld', c_long(-123)) - check_format('-123', - b'%lld', c_longlong(-123)) - check_format('-123', - b'%zd', c_ssize_t(-123)) - - check_format('123', - b'%u', c_uint(123)) - check_format('123', - b'%lu', c_ulong(123)) - check_format('123', - b'%llu', c_ulonglong(123)) - check_format('123', - b'%zu', c_size_t(123)) - - # test long output - min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) - max_longlong = -min_longlong - 1 - check_format(str(min_longlong), - b'%lld', c_longlong(min_longlong)) - check_format(str(max_longlong), - b'%lld', c_longlong(max_longlong)) - max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 - check_format(str(max_ulonglong), - b'%llu', c_ulonglong(max_ulonglong)) - PyUnicode_FromFormat(b'%p', c_void_p(-1)) - - # test padding (width and/or precision) - check_format('123'.rjust(10, '0'), - b'%010i', c_int(123)) - check_format('123'.rjust(100), - b'%100i', c_int(123)) - check_format('123'.rjust(100, '0'), - b'%.100i', c_int(123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80i', c_int(123)) - - check_format('123'.rjust(10, '0'), - b'%010u', c_uint(123)) - check_format('123'.rjust(100), - b'%100u', c_uint(123)) - check_format('123'.rjust(100, '0'), - b'%.100u', c_uint(123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80u', c_uint(123)) - - check_format('123'.rjust(10, '0'), - b'%010x', c_int(0x123)) - check_format('123'.rjust(100), - b'%100x', c_int(0x123)) - check_format('123'.rjust(100, '0'), - b'%.100x', c_int(0x123)) - check_format('123'.rjust(80, '0').rjust(100), - b'%100.80x', c_int(0x123)) - - # test %A - check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", - b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') - - # test %V - check_format('repr=abc', - b'repr=%V', 'abc', b'xyz') - - # test %p - # We cannot test the exact result, - # because it returns a hex representation of a C pointer, - # which is going to be different each time. But, we can test the format. - p_format_regex = r'^0x[a-zA-Z0-9]{3,}$' - p_format1 = PyUnicode_FromFormat(b'%p', 'abc') - self.assertIsInstance(p_format1, str) - self.assertRegex(p_format1, p_format_regex) - - p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz') - self.assertIsInstance(p_format2, str) - self.assertRegex(p_format2, - r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}') - - # Extra args are ignored: - p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz') - self.assertIsInstance(p_format3, str) - self.assertRegex(p_format3, p_format_regex) - - # Test string decode from parameter of %s using utf-8. - # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of - # '\u4eba\u6c11' - check_format('repr=\u4eba\u6c11', - b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') - - #Test replace error handler. - check_format('repr=abc\ufffd', - b'repr=%V', None, b'abc\xff') - - # not supported: copy the raw format string. these tests are just here - # to check for crashes and should not be considered as specifications - check_format('%s', - b'%1%s', b'abc') - check_format('%1abc', - b'%1abc') - check_format('%+i', - b'%+i', c_int(10)) - check_format('%.%s', - b'%.%s', b'abc') - - # Issue #33817: empty strings - check_format('', - b'') - check_format('', - b'%s', b'') - - # Test PyUnicode_AsWideChar() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_aswidechar(self): - from _testcapi import unicode_aswidechar - import_helper.import_module('ctypes') - from ctypes import c_wchar, sizeof - - wchar, size = unicode_aswidechar('abcdef', 2) - self.assertEqual(size, 2) - self.assertEqual(wchar, 'ab') - - wchar, size = unicode_aswidechar('abc', 3) - self.assertEqual(size, 3) - self.assertEqual(wchar, 'abc') - - wchar, size = unicode_aswidechar('abc', 4) - self.assertEqual(size, 3) - self.assertEqual(wchar, 'abc\0') - - wchar, size = unicode_aswidechar('abc', 10) - self.assertEqual(size, 3) - self.assertEqual(wchar, 'abc\0') - - wchar, size = unicode_aswidechar('abc\0def', 20) - self.assertEqual(size, 7) - self.assertEqual(wchar, 'abc\0def\0') - - nonbmp = chr(0x10ffff) - if sizeof(c_wchar) == 2: - buflen = 3 - nchar = 2 - else: # sizeof(c_wchar) == 4 - buflen = 2 - nchar = 1 - wchar, size = unicode_aswidechar(nonbmp, buflen) - self.assertEqual(size, nchar) - self.assertEqual(wchar, nonbmp + '\0') - - # Test PyUnicode_AsWideCharString() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_aswidecharstring(self): - from _testcapi import unicode_aswidecharstring - import_helper.import_module('ctypes') - from ctypes import c_wchar, sizeof - - wchar, size = unicode_aswidecharstring('abc') - self.assertEqual(size, 3) - self.assertEqual(wchar, 'abc\0') - - wchar, size = unicode_aswidecharstring('abc\0def') - self.assertEqual(size, 7) - self.assertEqual(wchar, 'abc\0def\0') - - nonbmp = chr(0x10ffff) - if sizeof(c_wchar) == 2: - nchar = 2 - else: # sizeof(c_wchar) == 4 - nchar = 1 - wchar, size = unicode_aswidecharstring(nonbmp) - self.assertEqual(size, nchar) - self.assertEqual(wchar, nonbmp + '\0') - - # Test PyUnicode_AsUCS4() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_asucs4(self): - from _testcapi import unicode_asucs4 - for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600', - 'a\ud800b\udfffc', '\ud834\udd1e']: - l = len(s) - self.assertEqual(unicode_asucs4(s, l, True), s+'\0') - self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff') - self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff') - self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff') - self.assertRaises(SystemError, unicode_asucs4, s, l-1, True) - self.assertRaises(SystemError, unicode_asucs4, s, l-2, False) - s = '\0'.join([s, s]) - self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') - self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') - - # Test PyUnicode_AsUTF8() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_asutf8(self): - from _testcapi import unicode_asutf8 - - bmp = '\u0100' - bmp2 = '\uffff' - nonbmp = chr(0x10ffff) - - self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80') - self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf') - self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf') - self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc') - - # Test PyUnicode_AsUTF8AndSize() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_asutf8andsize(self): - from _testcapi import unicode_asutf8andsize - - bmp = '\u0100' - bmp2 = '\uffff' - nonbmp = chr(0x10ffff) - - self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2)) - self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3)) - self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4)) - self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc') - - # Test PyUnicode_FindChar() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_findchar(self): - from _testcapi import unicode_findchar - - for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1": - for i, ch in enumerate(str): - self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i) - self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i) - - str = "!>_= end - self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1) - self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1) - # negative - self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0) - self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0) - - # Test PyUnicode_CopyCharacters() - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_copycharacters(self): - from _testcapi import unicode_copycharacters - - strings = [ - 'abcde', '\xa1\xa2\xa3\xa4\xa5', - '\u4f60\u597d\u4e16\u754c\uff01', - '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604' - ] - - for idx, from_ in enumerate(strings): - # wide -> narrow: exceed maxchar limitation - for to in strings[:idx]: - self.assertRaises( - SystemError, - unicode_copycharacters, to, 0, from_, 0, 5 - ) - # same kind - for from_start in range(5): - self.assertEqual( - unicode_copycharacters(from_, 0, from_, from_start, 5), - (from_[from_start:from_start+5].ljust(5, '\0'), - 5-from_start) - ) - for to_start in range(5): - self.assertEqual( - unicode_copycharacters(from_, to_start, from_, to_start, 5), - (from_[to_start:to_start+5].rjust(5, '\0'), - 5-to_start) - ) - # narrow -> wide - # Tests omitted since this creates invalid strings. - - s = strings[0] - self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5) - self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5) - self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5) - self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5) - self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5) - self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1) - self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0) - - @support.cpython_only - @unittest.skipIf(_testcapi is None, 'need _testcapi module') - def test_pep393_utf8_caching_bug(self): - # Issue #25709: Problem with string concatenation and utf-8 cache - from _testcapi import getargs_s_hash - for k in 0x24, 0xa4, 0x20ac, 0x1f40d: - s = '' - for i in range(5): - # Due to CPython specific optimization the 's' string can be - # resized in-place. - s += chr(k) - # Parsing with the "s#" format code calls indirectly - # PyUnicode_AsUTF8AndSize() which creates the UTF-8 - # encoded string cached in the Unicode object. - self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) - # Check that the second call returns the same result - self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) - class StringModuleTest(unittest.TestCase): def test_formatter_parser(self): def parse(format):