forked from whatwg/encoding
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools-index.py
74 lines (62 loc) · 2.67 KB
/
tools-index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import json
import requests
# Load the index tables. The script later iterates `data` by index name and
# reads `data[name]` as a sequence of code points (or ranges).
with open("indexes.json", "r") as indexes_file:
    data = json.load(indexes_file)

if not os.path.exists("UnicodeData.txt"):
    # Download UnicodeData.txt if it doesn't exist yet
    response = requests.get(
        "https://unicode.org/Public/UNIDATA/UnicodeData.txt",
        timeout=60,
    )
    # Fail loudly instead of caching an HTTP error page as UnicodeData.txt,
    # which would otherwise be reused silently on every later run.
    response.raise_for_status()
    with open("UnicodeData.txt", "w") as unicode_data_file:
        unicode_data_file.write(response.text)

# Raw lines of UnicodeData.txt; get_name() looks character names up in here.
with open("UnicodeData.txt", "r") as unicode_data_file:
    names = unicode_data_file.readlines()
# Romanized jamo fragments used to spell out Hangul syllable names.
# get_name() indexes this table as jamo[0][i // 28 // 21] (leading consonant),
# jamo[1][i // 28 % 21] (vowel) and jamo[2][i % 28] (trailing consonant),
# where i is the syllable's offset from U+AC00. Empty strings are entries
# that contribute no letters to the name.
_JAMO_LEADING = [
    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
    "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
]
_JAMO_VOWEL = [
    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
    "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
    "I",
]
_JAMO_TRAILING = [
    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
    "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
    "SS", "NG", "J", "C", "K", "T", "P", "H",
]
jamo = [_JAMO_LEADING, _JAMO_VOWEL, _JAMO_TRAILING]
def format_index(num, width):
    """Return str(num) right-aligned with spaces to at least `width` characters.

    Values whose text is already `width` characters or longer are returned
    unpadded and untruncated.
    """
    text = str(num)
    # A non-positive repeat count yields "", so no padding is added in that case.
    padding = " " * (width - len(text))
    return padding + text
def format_cp(cp):
    """Return `cp` as "0x" followed by at least four uppercase hex digits."""
    # hex() yields "0x..."; drop that prefix so the digits can be upper-cased
    # and zero-padded independently of the lowercase "0x" marker.
    digits = hex(cp)[2:].upper()
    return "0x" + digits.zfill(4)
# Lazily built table mapping the first field of each UnicodeData.txt line
# (the hex code point) to its second field (the character name).
_NAMES_BY_CODE = {}


def _unicode_data_names():
    """Return the code -> name table, building it from `names` on first use."""
    if not _NAMES_BY_CODE:
        for line in names:
            code, sep, rest = line.partition(";")
            if sep:
                # setdefault keeps the first line seen for a code, which is
                # what a top-to-bottom scan stopping at the first hit returns.
                _NAMES_BY_CODE.setdefault(code, rest.split(";", 1)[0])
    return _NAMES_BY_CODE


def get_name(cp):
    """Return the display name written next to code point `cp` in an index file.

    Code points inside the hard-coded ranges below get a generic bracketed
    label. Hangul syllables get a name composed from the `jamo` table. Anything
    else is looked up in the UnicodeData.txt lines held in `names`. If no line
    matches, a "name not found" message is printed and "<Private Use>" is
    returned.
    """
    # NOTE(review): these range bounds are hard-coded and are not derived from
    # the UnicodeData.txt that is loaded; confirm they still match the
    # First/Last range markers of the data file in use.
    if 0x3400 <= cp <= 0x4DB5:
        return "<CJK Ideograph Extension A>"
    if 0x4E00 <= cp <= 0x9FCB:
        return "<CJK Ideograph>"
    if 0xAC00 <= cp <= 0xD7A3:
        # Split the offset from U+AC00 into leading / vowel / trailing indexes
        # (21 vowels x 28 trailing entries per leading consonant).
        vowel_block, trailing = divmod(cp - 0xAC00, 28)
        leading, vowel = divmod(vowel_block, 21)
        return "HANGUL SYLLABLE " + jamo[0][leading] + jamo[1][vowel] + jamo[2][trailing]
    if 0xE000 <= cp <= 0xF8FF:
        return "<Private Use>"
    if 0x20000 <= cp <= 0x2A6D6:
        return "<CJK Ideograph Extension B>"
    if 0x2A700 <= cp <= 0x2B734:
        return "<CJK Ideograph Extension C>"
    if 0x2B740 <= cp <= 0x2B81D:
        return "<CJK Ideograph Extension D>"

    # UnicodeData.txt lines start with the code point in uppercase hex
    # followed by ";", so the digits of format_cp() are the lookup key.
    code = format_cp(cp)[2:]
    name = _unicode_data_names().get(code)
    if name is not None:
        return name
    print("name not found", code)
    return "<Private Use>"
import codecs
import datetime
import hashlib

# Write one "index-<name>.txt" file for every index in `data`.
for index in data:
    entries = data[index]
    # The `with` block closes (and flushes) the file on every path, including
    # the early `continue` for gb18030-ranges and any exception while writing.
    # codecs.open is kept so "\n" is written as-is on every platform.
    with codecs.open("index-" + index + ".txt", "w", "utf-8") as handle:
        handle.write("# For details on index index-" + index + ".txt see the Encoding Standard\n")
        handle.write("# https://encoding.spec.whatwg.org/\n")
        handle.write("#\n")
        # The identifier is a SHA-256 over the Python repr of the index contents.
        handle.write("# Identifier: " + hashlib.sha256(str(entries).encode("ascii")).hexdigest() + "\n")
        handle.write("# Date: " + str(datetime.date.today()) + "\n")
        handle.write("\n")

        # gb18030-ranges is not like the other indexes, it's an index of ranges
        if index == "gb18030-ranges":
            for pair in entries:
                handle.write(format_index(pair[0], 6) + "\t" + format_cp(pair[1]) + "\n")
            continue

        # Pad every position to the width of the largest possible position.
        width = len(str(len(entries)))
        for i, cp in enumerate(entries):
            # None entries produce no line but still consume their position.
            if cp is not None:
                name = get_name(cp)
                handle.write(format_index(i, width) + "\t" + format_cp(cp) + "\t" + chr(cp) + " (" + name + ")\n")