Skip to content

Commit

Permalink
bpo-40328: Add tool for generating cjk mapping headers (GH-19602)
Browse files Browse the repository at this point in the history
  • Loading branch information
corona10 committed Apr 29, 2020
1 parent 2d87577 commit 113feb3
Show file tree
Hide file tree
Showing 15 changed files with 51,015 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add tools for generating mapping headers for CJKCodecs.
4 changes: 1 addition & 3 deletions Modules/cjkcodecs/README
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
To generate or modify mapping headers
-------------------------------------
Mapping headers are imported from CJKCodecs as pre-generated form.
If you need to tweak or add something on it, please look at tools/
subdirectory of CJKCodecs' distribution.
Mapping headers are generated from Tools/unicode/genmap_*.py



Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_cn.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_schinese.py: DO NOT EDIT
static const ucs2_t __gb2312_decmap[7482] = {
12288,12289,12290,12539,713,711,168,12291,12293,8213,65374,8214,8230,8216,
8217,8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,
Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_jisx0213_pair.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
#define JISX0213_ENCPAIRS 46
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
Expand Down
1 change: 1 addition & 0 deletions Modules/cjkcodecs/mappings_jp.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_japanese.py: DO NOT EDIT
static const ucs2_t __jisx0208_decmap[6956] = {
12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,
65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,
Expand Down
2 changes: 2 additions & 0 deletions Modules/cjkcodecs/mappings_kr.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// AUTO-GENERATED FILE FROM genmap_korean.py: DO NOT EDIT
static const ucs2_t __ksx1001_decmap[8264] = {
12288,12289,12290,183,8229,8230,168,12291,173,8213,8741,65340,8764,8216,8217,
8220,8221,12308,12309,12296,12297,12298,12299,12300,12301,12302,12303,12304,
Expand Down Expand Up @@ -3249,3 +3250,4 @@ __cp949_encmap+31959,0,255},{__cp949_encmap+32215,0,255},{__cp949_encmap+32471
__cp949_encmap+32891,0,11},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{__cp949_encmap+
32903,1,230},
};

251 changes: 251 additions & 0 deletions Tools/unicode/genmap_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
#
# genmap_japanese.py: Japanese Codecs Map Generator
#
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
#
import os

from genmap_support import *

# Pairs of byte values handed to DecodeMapWriter.update_decode_map() in
# main(): each *_C1 tuple is passed as the first argument and the matching
# *_C2 tuple as the second.
# NOTE(review): presumably (first, last) inclusive bounds of the lead byte
# (C1) and trail byte (C2) -- confirm against genmap_support.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
# The three CP932 ranges below are all applied to the single "cp932ext"
# decode map in main().
CP932P0_C1 = (0x81, 0x81) # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87) # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc) # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Upstream locations of the mapping tables.  main() passes each URL to
# open_mapping_file() together with the local python-mappings/ path of the
# corresponding file.
# NOTE(review): these are plain-http URLs on third-party hosts; verify that
# they are still reachable before relying on them.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'


def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into per-level decode maps.

    Text after a ``#`` is ignored, and lines with fewer than two
    whitespace-separated fields are skipped.  In the first field, character
    0 is the decimal level (3 or 4), character 1 is a separator and the
    rest is the hexadecimal native code, e.g. ``3-2121``.  The second field
    is either a single code point (``U+3000``, ``U+2000B``) or a
    body+modifier pair (``U+304B+309A``).

    Args:
        fo: iterable of text lines, e.g. an open text file.

    Returns:
        A 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four map ``{code >> 8: {code & 0xff: unicode}}`` for
        level 3 / level 4 BMP code points and level 3 / level 4 code points
        in U+20000..U+2FFFF (stored with 0x20000 subtracted).
        ``decmap3_pair`` maps
        ``{body: {code >> 8: {code & 0xff: modifier}}}``.

    Raises:
        ValueError: if a field is not a valid number, a single code point
            is outside the BMP and U+2xxxx, the level is not 3 or 4, or a
            pair entry is not level 3.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the fields come from a data file and
        # must never be executed as Python source.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        unifield = row[1]

        if len(unifield.split('+')) == 2:  # single unicode
            uni = int(unifield[2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
            else:
                m = None
            # Check before storing, so unsupported entries are reported as
            # ValueError instead of failing on None.setdefault().
            if m is None:
                raise ValueError("invalid map")
        else:  # pair
            uniprefix = int(unifield[2:6], 16)  # body
            uni = int(unifield[7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair


def main():
    """Generate mappings_jp.h and mappings_jisx0213_pair.h.

    Loads the JIS X 0208, JIS X 0212, CP932 and JIS X 0213:2004 tables from
    the python-mappings/ directory, derives the encode maps from the decode
    maps, and writes both C headers into the current working directory
    using the writer classes imported from genmap_support.

    Raises:
        SystemExit: if the loaded JIS X 0213 table does not map 0x2124 to
            U+FF0C.
        ValueError: if a JIS X 0213 plane 1 entry that is also a JIS X 0208
            position maps to a different code point.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # JIS0208.TXT is loaded twice: column 0 feeds the Shift JIS map and
    # column 1 the JIS X 0208 map; both use column 2 as the Unicode value.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # The first native code seen for a code point wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Drop CP932 encode entries identical to Shift JIS so that only the
    # extension remains.  Iterate over copies because entries are deleted
    # from the live dictionaries.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Iterate over copies: entries duplicated in JIS X 0208 are removed from
    # jis3decmap while looping.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Create the row first; it may not exist yet when this
                    # is the first code point seen with this high byte.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Level 4 entries are stored with 0x8000 set.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Emit the (body, modifier, jis) tuples in sorted order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")

if __name__ == '__main__':
main()
62 changes: 62 additions & 0 deletions Tools/unicode/genmap_korean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#
# genmap_korean.py: Korean Codecs Map Generator
#
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
#
import os

from genmap_support import *


# Pairs of byte values handed to DecodeMapWriter.update_decode_map() in
# main(): each *_C1 tuple is passed as the first argument and the matching
# *_C2 tuple as the second.
# NOTE(review): presumably (first, last) inclusive bounds of the lead byte
# (C1) and trail byte (C2) -- confirm against genmap_support.
KSX1001_C1 = (0x21, 0x7e)
KSX1001_C2 = (0x21, 0x7e)
# Both UHC ranges below are applied to the single "cp949ext" decode map.
UHCL1_C1 = (0x81, 0xa0)
UHCL1_C2 = (0x41, 0xfe)
UHCL2_C1 = (0xa1, 0xfe)
UHCL2_C2 = (0x41, 0xa0)
# Upstream location of CP949.TXT; passed to open_mapping_file() in main()
# together with the local python-mappings/ path.
MAPPINGS_CP949 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT'


def main():
    """Generate mappings_kr.h from the CP949 mapping table.

    Entries whose two bytes are both >= 0xa1 go into the KS X 1001 decode
    map with each byte masked by 0x7f; every other entry goes into the UHC
    ("cp949ext") decode map unchanged.  One combined encode map covers both
    groups, and the header is written to the current working directory.
    """
    source = open_mapping_file('python-mappings/CP949.TXT', MAPPINGS_CP949)
    print("Loading Mapping File...")
    full_decmap = loadmap(source)

    ksx1001decmap = {}
    uhcdecmap = {}
    cp949encmap = {}
    for lead, trail_map in full_decmap.items():
        for trail, ucs in trail_map.items():
            native = lead << 8 | trail
            if lead < 0xa1 or trail < 0xa1:
                # UHC extension: the native code is stored unchanged.
                uhcdecmap.setdefault(lead, {})[trail] = ucs
                cp949encmap.setdefault(ucs >> 8, {})[ucs & 0xFF] = native
                continue
            # KS X 1001: both bytes are masked with 0x7f.
            ksx1001decmap.setdefault(lead & 0x7f, {})[trail & 0x7f] = ucs
            cp949encmap.setdefault(ucs >> 8, {})[ucs & 0xFF] = native & 0x7f7f

    with open('mappings_kr.h', 'w') as header:
        print_autogen(header, os.path.basename(__file__))

        print("Generating KS X 1001 decode map...")
        ksx1001_writer = DecodeMapWriter(header, "ksx1001", ksx1001decmap)
        ksx1001_writer.update_decode_map(KSX1001_C1, KSX1001_C2)
        ksx1001_writer.generate()

        print("Generating UHC decode map...")
        uhc_writer = DecodeMapWriter(header, "cp949ext", uhcdecmap)
        uhc_writer.update_decode_map(UHCL1_C1, UHCL1_C2)
        uhc_writer.update_decode_map(UHCL2_C1, UHCL2_C2)
        uhc_writer.generate()

        print("Generating CP949 (includes KS X 1001) encode map...")
        enc_writer = EncodeMapWriter(header, "cp949", cp949encmap)
        enc_writer.generate()

    print("Done!")


if __name__ == '__main__':
main()
Loading

0 comments on commit 113feb3

Please sign in to comment.