gh-80480: array: Add 'w' typecode. (#105242)

python · Jun 4, 2023 · 1237fb6 · 1237fb6
1 parent 5a5ed7a
commit 1237fb6
Show file tree

Hide file tree

Showing 7 changed files with 158 additions and 58 deletions.
diff --git a/Doc/faq/programming.rst b/Doc/faq/programming.rst
@@ -924,12 +924,12 @@ module::
    'Hello, there!'
 
    >>> import array
-   >>> a = array.array('u', s)
+   >>> a = array.array('w', s)
    >>> print(a)
-   array('u', 'Hello, world')
+   array('w', 'Hello, world')
    >>> a[0] = 'y'
    >>> print(a)
-   array('u', 'yello, world')
+   array('w', 'yello, world')
    >>> a.tounicode()
    'yello, world'
 

diff --git a/Doc/library/array.rst b/Doc/library/array.rst
@@ -24,6 +24,8 @@ defined:
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'u'``   | wchar_t            | Unicode character | 2                     | \(1)  |
 +-----------+--------------------+-------------------+-----------------------+-------+
+| ``'w'``   | Py_UCS4            | Unicode character | 4                     |       |
++-----------+--------------------+-------------------+-----------------------+-------+
 | ``'h'``   | signed short       | int               | 2                     |       |
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'H'``   | unsigned short     | int               | 2                     |       |
@@ -56,6 +58,7 @@ Notes:
       ``Py_UNICODE`` is an alias of ``wchar_t`` since Python 3.3.
 
    .. deprecated-removed:: 3.3 4.0
+      Please migrate to the ``'w'`` typecode.
 
 
 The actual representation of values is determined by the machine architecture
@@ -174,9 +177,9 @@ The module defines the following type:
 
    .. method:: fromunicode(s)
 
-      Extends this array with data from the given unicode string.  The array must
-      be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised.  Use
-      ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
+      Extends this array with data from the given unicode string.
+      The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
+      Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
       array of some other type.
 
 
@@ -236,21 +239,22 @@ The module defines the following type:
 
    .. method:: tounicode()
 
-      Convert the array to a unicode string.  The array must be a type ``'u'`` array;
+      Convert the array to a unicode string.  The array must have type code ``'u'`` or ``'w'``;
       otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
       obtain a unicode string from an array of some other type.
 
 
 When an array object is printed or converted to a string, it is represented as
 ``array(typecode, initializer)``.  The *initializer* is omitted if the array is
-empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
-list of numbers.  The string is guaranteed to be able to be converted back to an
+empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
+otherwise it is a list of numbers.
+The string is guaranteed to be able to be converted back to an
 array with the same type and value using :func:`eval`, so long as the
 :class:`~array.array` class has been imported using ``from array import array``.
 Examples::
 
    array('l')
-   array('u', 'hello \u2641')
+   array('w', 'hello \u2641')
    array('l', [1, 2, 3, 4, 5])
    array('d', [1.0, 2.0, 3.14])
 

diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
@@ -87,6 +87,13 @@ New Modules
 Improved Modules
 ================
 
+array
+-----
+
+* Add the ``'w'`` type code, which can be used for Unicode strings.
+  It can be used instead of the ``'u'`` type code, which is deprecated.
+  (Contributed by Inada Naoki in :gh:`80480`.)
+
 io
 --
 

diff --git a/Lib/test/test_array.py b/Lib/test/test_array.py
@@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
     def __init__(self, typecode, newarg=None):
         array.array.__init__(self)
 
-typecodes = 'ubBhHiIlLfdqQ'
+typecodes = 'uwbBhHiIlLfdqQ'
 
 class MiscTest(unittest.TestCase):
 
@@ -186,11 +186,12 @@ def test_unicode(self):
         )
         for testcase in testcases:
             mformat_code, encoding = testcase
-            a = array.array('u', teststr)
-            b = array_reconstructor(
-                array.array, 'u', mformat_code, teststr.encode(encoding))
-            self.assertEqual(a, b,
-                msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
+            for c in 'uw':
+                a = array.array(c, teststr)
+                b = array_reconstructor(
+                    array.array, c, mformat_code, teststr.encode(encoding))
+                self.assertEqual(a, b,
+                    msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
 
 
 class BaseTest:
@@ -234,7 +235,7 @@ def test_buffer_info(self):
         self.assertEqual(bi[1], len(a))
 
     def test_byteswap(self):
-        if self.typecode == 'u':
+        if self.typecode in ('u', 'w'):
             example = '\U00100100'
         else:
             example = self.example
@@ -1079,7 +1080,7 @@ def test_buffer(self):
         self.assertEqual(m.tobytes(), expected)
         self.assertRaises(BufferError, a.frombytes, a.tobytes())
         self.assertEqual(m.tobytes(), expected)
-        if self.typecode == 'u':
+        if self.typecode in ('u', 'w'):
             self.assertRaises(BufferError, a.fromunicode, a.tounicode())
             self.assertEqual(m.tobytes(), expected)
         self.assertRaises(BufferError, operator.imul, a, 2)
@@ -1135,16 +1136,17 @@ def test_sizeof_without_buffer(self):
         support.check_sizeof(self, a, basesize)
 
     def test_initialize_with_unicode(self):
-        if self.typecode != 'u':
+        if self.typecode not in ('u', 'w'):
             with self.assertRaises(TypeError) as cm:
                 a = array.array(self.typecode, 'foo')
             self.assertIn("cannot use a str", str(cm.exception))
             with self.assertRaises(TypeError) as cm:
-                a = array.array(self.typecode, array.array('u', 'foo'))
+                a = array.array(self.typecode, array.array('w', 'foo'))
             self.assertIn("cannot use a unicode array", str(cm.exception))
         else:
             a = array.array(self.typecode, "foo")
             a = array.array(self.typecode, array.array('u', 'foo'))
+            a = array.array(self.typecode, array.array('w', 'foo'))
 
     @support.cpython_only
     def test_obsolete_write_lock(self):
@@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
     smallerexample = '\x01\u263a\x00\ufefe'
     biggerexample = '\x01\u263a\x01\ufeff'
     outside = str('\x33')
-    minitemsize = 2
+    minitemsize = sizeof_wchar
 
     def test_unicode(self):
         self.assertRaises(TypeError, array.array, 'b', 'foo')
 
-        a = array.array('u', '\xa0\xc2\u1234')
+        a = array.array(self.typecode, '\xa0\xc2\u1234')
         a.fromunicode(' ')
         a.fromunicode('')
         a.fromunicode('')
         a.fromunicode('\x11abc\xff\u1234')
         s = a.tounicode()
         self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
-        self.assertEqual(a.itemsize, sizeof_wchar)
+        self.assertEqual(a.itemsize, self.minitemsize)
 
         s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
-        a = array.array('u', s)
+        a = array.array(self.typecode, s)
         self.assertEqual(
             repr(a),
-            "array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
+            f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
 
         self.assertRaises(TypeError, a.fromunicode)
 
     def test_issue17223(self):
-        # this used to crash
-        if sizeof_wchar == 4:
-            # U+FFFFFFFF is an invalid code point in Unicode 6.0
-            invalid_str = b'\xff\xff\xff\xff'
-        else:
+        if self.typecode == 'u' and sizeof_wchar == 2:
             # PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
             self.skipTest("specific to 32-bit wchar_t")
-        a = array.array('u', invalid_str)
+
+        # this used to crash
+        # U+FFFFFFFF is an invalid code point in Unicode 6.0
+        invalid_str = b'\xff\xff\xff\xff'
+
+        a = array.array(self.typecode, invalid_str)
         self.assertRaises(ValueError, a.tounicode)
         self.assertRaises(ValueError, str, a)
 
+class UCS4Test(UnicodeTest):
+    typecode = 'w'
+    minitemsize = 4
+
 class NumberTest(BaseTest):
 
     def test_extslice(self):

diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
@@ -955,7 +955,7 @@ def test_float_write(self):
 
     def test_char_write(self):
         import array, string
-        a = array.array('u', string.ascii_letters)
+        a = array.array('w', string.ascii_letters)
 
         with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
             writer = csv.writer(fileobj, dialect="excel")

diff --git a/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
@@ -0,0 +1 @@
+:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.