Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9'

@requires_resource('cpu')
def test_function_checksum(self):
Expand All @@ -97,13 +97,58 @@ def test_function_checksum(self):
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

@requires_resource('network')
def test_name(self):
    """Validate name() against the Unicode Consortium's DerivedName.txt."""
    base = "https://www.unicode.org/Public"
    datafile = "extracted/DerivedName.txt"
    url = f"{base}/{unicodedata.unidata_version}/ucd/{datafile}"

    # Trigger any download failure up front, before starting the checks.
    try:
        resource = open_urlresource(url, encoding="utf-8")
    except PermissionError:
        self.skipTest(f"Permission error when downloading {url} "
                      f"into the test data directory")
    except (OSError, HTTPException) as exc:
        self.skipTest(f"Failed to download {url}: {exc}")

    with resource as testdata:
        self.run_name_tests(testdata)

def run_name_tests(self, testdata):
    """Compare db.name() for every code point against DerivedName.txt.

    *testdata* is an iterable of lines in the UCD DerivedName.txt format:
    either ``CP; NAME`` or ``CP1..CP2; PREFIX*`` for ranged entries.
    """
    expected = {}

    # Build the reference code point -> name mapping from the data file.
    for raw_line in testdata:
        entry = raw_line.strip()
        if not entry or entry[0] == "#":
            continue
        raw_cp, name = entry.split("; ")
        if ".." not in raw_cp:
            expected[int(raw_cp, 16)] = name
        else:
            first, last = (int(part, 16) for part in raw_cp.split(".."))
            # Ranged entries end in '*', a placeholder for the code point.
            prefix = name[:-1]
            for cp in range(first, last + 1):
                expected[cp] = f"{prefix}{cp:0>4X}"

    # Every code point must agree with the reference (None if unnamed).
    for cp in range(sys.maxunicode + 1):
        self.assertEqual(self.db.name(chr(cp), None), expected.get(cp))

@requires_resource('cpu')
def test_name_inverse_lookup(self):
for i in range(sys.maxunicode + 1):
char = chr(i)
if looked_name := self.db.name(char, None):
self.assertEqual(self.db.lookup(looked_name), char)


def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
unicodedata: Fix missing names for Tangut ideographs.
51 changes: 47 additions & 4 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = {

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
is_cjk_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
Expand All @@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code)
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
}

/* These ranges need to match makeunicodedata.py:tangut_ranges. */
static int
is_tangut_ideograph(Py_UCS4 code)
{
    if (0x17000 <= code && code <= 0x187F7) {   /* Tangut */
        return 1;
    }
    if (0x18D00 <= code && code <= 0x18D08) {   /* Tangut Supplement */
        return 1;
    }
    return 0;
}

/* macros used to determine if the given code point is in the PUA range that
* we are using to store aliases and named sequences */
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
Expand Down Expand Up @@ -1098,14 +1107,22 @@ _getucname(PyObject *self,
return 1;
}

if (is_unified_ideograph(code)) {
if (is_cjk_unified_ideograph(code)) {
if (buflen < 28)
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
return 0;
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
return 1;
}

if (is_tangut_ideograph(code)) {
if (buflen < 23)
/* Worst case: TANGUT IDEOGRAPH-18D08 */
return 0;
sprintf(buffer, "TANGUT IDEOGRAPH-%X", code);
return 1;
}

/* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
Expand Down Expand Up @@ -1236,7 +1253,7 @@ _getcode(PyObject* self,
return 0;
}

/* Check for unified ideographs. */
/* Check for CJK unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
v = 0;
Expand All @@ -1254,12 +1271,38 @@ _getcode(PyObject* self,
return 0;
name++;
}
if (!is_unified_ideograph(v))
if (!is_cjk_unified_ideograph(v))
return 0;
*code = v;
return 1;
}


/* Check for Tangut ideographs. */
if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) {
/* Five hexdigits must follow. */
v = 0;
name += 17;
namelen -= 17;
if (namelen != 5)
return 0;
while (namelen--) {
v *= 16;
if (*name >= '0' && *name <= '9')
v += *name - '0';
else if (*name >= 'A' && *name <= 'F')
v += *name - 'A' + 10;
else
return 0;
name++;
}
Comment on lines +1289 to +1298
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little unsettled that this loop is duplicated from above, but I don't see a better way to do it aside from maybe some preprocessor abuse.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree. But since this MR has received very little attention, I am not going to dedicate time to this if there is no opportunity to merge it.

if (!is_tangut_ideograph(v))
return 0;
*code = v;
return 1;
}


/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */
Expand Down
20 changes: 16 additions & 4 deletions Tools/unicode/makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
# these ranges need to match unicodedata.c:is_cjk_unified_ideograph
cjk_ranges = [
('3400', '4DBF'),
('4E00', '9FFF'),
Expand All @@ -112,6 +112,12 @@
('31350', '323AF'),
]

# these ranges need to match unicodedata.c:is_tangut_ideograph
tangut_ranges = [
    ('17000', '187F7'),
    # Trailing comma for consistency with the cjk_ranges literal above.
    ('18D00', '18D08'),
]


def maketables(trace=0):

Expand All @@ -123,7 +129,7 @@ def maketables(trace=0):

for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(version, cjk_check=False)
old_unicode = UnicodeData(version, ideograph_check=False)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)

Expand Down Expand Up @@ -1020,14 +1026,15 @@ def from_row(row: List[str]) -> UcdRecord:
class UnicodeData:
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned

def __init__(self, version, cjk_check=True):
def __init__(self, version, ideograph_check=True):
self.changed = []
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = from_row(s)

cjk_ranges_found = []
tangut_ranges_found = []

# expand first-last ranges
field = None
Expand All @@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True):
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
elif s.name.startswith("<Tangut Ideograph"):
tangut_ranges_found.append((field[0],
s.codepoint))
s.name = ""
field = None
elif field:
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
if ideograph_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
if ideograph_check and tangut_ranges != tangut_ranges_found:
raise ValueError("Tangut ranges deviate: have %r" % tangut_ranges_found)

# public attributes
self.filename = UNICODE_DATA % ''
Expand Down