Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'

@requires_resource('cpu')
def test_function_checksum(self):
Expand All @@ -97,13 +97,58 @@ def test_function_checksum(self):
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

@requires_resource('network')
def test_name(self):
    """Validate name() against the Unicode Consortium's DerivedName.txt."""
    base = "https://www.unicode.org/Public"
    datafile = "extracted/DerivedName.txt"
    url = f"{base}/{unicodedata.unidata_version}/ucd/{datafile}"

    # Trigger any download failure up front, before starting the checks.
    try:
        resource = open_urlresource(url, encoding="utf-8")
    except PermissionError:
        self.skipTest(f"Permission error when downloading {url} "
                      f"into the test data directory")
    except (OSError, HTTPException) as exc:
        self.skipTest(f"Failed to download {url}: {exc}")

    with resource as testdata:
        self.run_name_tests(testdata)

def run_name_tests(self, testdata):
    """Compare db.name() for every code point against DerivedName.txt.

    *testdata* is an iterable of lines in the UCD DerivedName.txt format:
    either ``CP; NAME`` or ``CP1..CP2; PREFIX*`` for ranged entries.
    """
    expected = {}

    # Build the reference code point -> name mapping from the data file.
    for raw_line in testdata:
        entry = raw_line.strip()
        if not entry or entry[0] == "#":
            continue
        raw_cp, name = entry.split("; ")
        if ".." not in raw_cp:
            expected[int(raw_cp, 16)] = name
        else:
            first, last = (int(part, 16) for part in raw_cp.split(".."))
            # Ranged entries end in '*', a placeholder for the code point.
            prefix = name[:-1]
            for cp in range(first, last + 1):
                expected[cp] = f"{prefix}{cp:0>4X}"

    # Every code point must agree with the reference (None if unnamed).
    for cp in range(sys.maxunicode + 1):
        self.assertEqual(self.db.name(chr(cp), None), expected.get(cp))

@requires_resource('cpu')
def test_name_inverse_lookup(self):
for i in range(sys.maxunicode + 1):
char = chr(i)
if looked_name := self.db.name(char, None):
self.assertEqual(self.db.lookup(looked_name), char)


def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
unicodedata: Fix missing names for Tangut ideographs.
51 changes: 47 additions & 4 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
is_cjk_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
Expand All @@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}

/* These ranges need to match makeunicodedata.py:tangut_ranges. */
static int
is_tangut_ideograph(Py_UCS4 code)
{
    if (0x17000 <= code && code <= 0x187F7) {   /* Tangut */
        return 1;
    }
    if (0x18D00 <= code && code <= 0x18D08) {   /* Tangut Supplement */
        return 1;
    }
    return 0;
}

/* macros used to determine if the given code point is in the PUA range that
* we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
Expand Down Expand Up @@ -1098,14 +1107,22 @@ _getucname(PyObject *self,
return 1;
}

if (is_unified_ideograph(code)) {
if (is_cjk_unified_ideograph(code)) {
if (buflen < 28)
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
return 0;
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
return 1;
}

if (is_tangut_ideograph(code)) {
if (buflen < 23)
/* Worst case: TANGUT IDEOGRAPH-18D08 */
return 0;
sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
return 1;
}

/* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
Expand Down Expand Up @@ -1236,7 +1253,7 @@ _getcode(PyObject* self,
return 0;
}

/* Check for unified ideographs. */
/* Check for CJK unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
v = 0;
Expand All @@ -1254,12 +1271,38 @@ _getcode(PyObject* self,
return 0;
name++;
}
if (!is_unified_ideograph(v))
if (!is_cjk_unified_ideograph(v))
return 0;
*code = v;
return 1;
}


/* Check for Tangut ideographs. */
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
/* Five hexdigits must follow. */
v = 0;
name += 17;
namelen -= 17;
if (namelen != 5)
return 0;
while (namelen--) {
v *= 16;
if (*name >= '0' && *name <= '9')
v += *name - '0';
else if (*name >= 'A' && *name <= 'F')
v += *name - 'A' + 10;
else
return 0;
name++;
}
Comment on lines +1289 to +1298
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little unsettled that this loop is duplicated from above, but I don't see a better way to do it aside from maybe some preprocessor abuse.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree. But since this MR has received very little attention, I am not going to dedicate time to this if there is no opportunity to merge it.

if (!is_tangut_ideograph(v))
return 0;
*code = v;
return 1;
}


/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */
Expand Down
20 changes: 16 additions & 4 deletions Tools/unicode/makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
cjk_ranges = [
('3400', '4DBF'),
('4E00', '9FFF'),
Expand All @@ -112,6 +112,12 @@
('31350', '323AF'),
]

# these ranges need to match unicodedata.c:is_tangut_ideograph
tangut_ranges = [
    ('17000', '187F7'),
    # Trailing comma for consistency with the cjk_ranges literal above.
    ('18D00', '18D08'),
]


def maketables(trace=0):

Expand All @@ -123,7 +129,7 @@ def maketables(trace=0):

for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(version, cjk_check=False)
old_unicode = UnicodeData(version, ideograph_check=False)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)

Expand Down Expand Up @@ -1020,14 +1026,15 @@ def from_row(row: List[str]) -> UcdRecord:
class UnicodeData:
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned

def __init__(self, version, cjk_check=True):
def __init__(self, version, ideograph_check=True):
self.changed = []
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = from_row(s)

cjk_ranges_found = []
tangut_ranges_found = []

# expand first-last ranges
field = None
Expand All @@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
elif s.name.startswith("<Tangut Ideograph"):
tangut_ranges_found.append((field[0],
s.codepoint))
s.name = ""
field = None
elif field:
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
if ideograph_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
if ideograph_check and tangut_ranges != tangut_ranges_found:
raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)

# public attributes
self.filename = UNICODE_DATA % ''
Expand Down