Commit c2732d2 (0 parents)

Showing 4 changed files with 288 additions and 0 deletions.
.gitignore
@@ -0,0 +1,3 @@
Unihan/
Unihan.zip
Unihan-*.zip
.travis.yml
@@ -0,0 +1,32 @@
language: python
python: "3.6"
install:
- wget ftp://ftp.unicode.org/Public/11.0.0/ucd/Unihan-11.0.0d2.zip
- unzip -d Unihan Unihan-*.zip
script:
- '[[ "$TRAVIS_TAG" = "" ]] || grep "$TRAVIS_TAG" README.md'
- '[[ "$TRAVIS_TAG" = "" ]] || grep "${TRAVIS_TAG%-[0-9]*}" .travis.yml'
- ./process.py Unihan unihan-json
- |
  git clone \
    -b gh-pages \
    "https://$GITHUB_TOKEN:[email protected]/dahlia/unihan-json.git" \
    /tmp/gh-pages
- rm -rf /tmp/gh-pages/$TRAVIS_BRANCH
- cp -r unihan-json /tmp/gh-pages/$TRAVIS_BRANCH
- |
  pushd /tmp/gh-pages
  git add "$TRAVIS_BRANCH"
  git commit -m "Update: $TRAVIS_BRANCH"
  git push origin gh-pages
  popd
before_deploy:
- tar cvfz "unihan-json-$TRAVIS_TAG.tar.gz" unihan-json/
deploy:
- provider: releases
  api_key: "$GITHUB_TOKEN"
  file_glob: true
  file: unihan-json-*.tar.gz
  skip_cleanup: true
  on:
    tags: true
README.md
@@ -0,0 +1,62 @@
Unihan JSON
===========

[![Build Status][ci-badge]][ci]

This project generates JSON data files parsed from the [Unicode Han Database]
(Unihan) in a structured way. Although the process is automated through the
*process.py* script, the goal of this project is not the script itself but
the JSON data files it produces.

To download the JSON data files, see the [latest release]. To load them from
a web page through XHR or the `window.fetch()` function, request the
following URI (replace `<PROP>` with a property name, e.g.,
`kSimplifiedVariant`):

    https://dahlia.github.io/unihan-json/11.0.0d2-0/<PROP>.json
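
For instance, here is a minimal sketch of loading one property table with
Python's standard library (the property name and release path are just the
ones from the example URI above):

~~~~~~~~ python
import json
import urllib.request

# Fetch the kCantonese table from the published GitHub Pages data set.
url = 'https://dahlia.github.io/unihan-json/11.0.0d2-0/kCantonese.json'
with urllib.request.urlopen(url) as response:
    cantonese = json.load(response)

print(cantonese.get('香'))  # e.g. ['hoeng1']
~~~~~~~~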

Each JSON file corresponds to a property and is an object that maps Unicode
characters to the values of that property. For example, *kCantonese.json*
looks like:

~~~~~~~~ json
{
  "香": ["hoeng1"],
  "港": ["gong2"],
  ...
}
~~~~~~~~

The following properties are parsed further into structured values:

    kAccountingNumeric
    kCantonese
    kFrequency
    kGB0
    kGB1
    kGB3
    kGB5
    kGB7
    kGB8
    kGradeLevel
    kHangul
    kHanyuPinlu
    kHanyuPinyin
    kJapaneseKun
    kJapaneseOn
    kLau
    kNelson
    kOtherNumeric
    kPrimaryNumeric
    kSimplifiedVariant
    kTaiwanTelegraph
    kTang
    kTotalStrokes
    kTraditionalVariant
    kVietnamese
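
For example (with an illustrative value), *kTotalStrokes.json* holds arrays
of integers rather than plain strings:

~~~~~~~~ json
{
  "香": [9],
  ...
}
~~~~~~~~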

The remaining properties are parsed into plain string values. Contributions
of more parsers are welcome; see also the `PROP_PARSERS` map in the
*process.py* script.

[ci-badge]: https://travis-ci.org/dahlia/unihan-json.svg?branch=master
[ci]: https://travis-ci.org/dahlia/unihan-json
[Unicode Han Database]: https://www.unicode.org/reports/tr38/
[latest release]: https://github.com/dahlia/unihan-json/releases/latest
process.py
@@ -0,0 +1,191 @@
#!/usr/bin/env python3.6
import json
import logging
import pathlib
import re
import sys
import tempfile
from typing import (Callable, Dict, IO, Iterator, List, Mapping,
                    Match, Pattern, Tuple)

__all__ = ('UNICODE_POINT_RE', 'PROP_PARSERS', 'PropWriter',
           'main', 'parse_unicode_point', 'parse_unicode_points')
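
# Unihan data files are tab-separated text: each non-comment line is
# <code point>\t<property>\t<value>, e.g. "U+9999\tkCantonese\thoeng1".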

# Matches one code point in U+XXXX notation, e.g. "U+9999".
UNICODE_POINT_RE: Pattern = re.compile(r'^U\+([0-9A-Fa-f]+)$')


def parse_unicode_point(unicode_point: str) -> str:
    m: Match = UNICODE_POINT_RE.match(unicode_point)
    if not m:
        raise ValueError(f'invalid unicode point: {unicode_point!r}')
    return chr(int(m.group(1), 16))


def parse_unicode_points(unicode_points: str) -> List[str]:
    try:
        return [parse_unicode_point(code) for code in unicode_points.split()]
    except ValueError:
        raise ValueError(f'invalid unicode points: {unicode_points!r}')


def parse_hangul(entries: str) -> Dict[str, str]:
    # Each entry pairs a hangul reading with its source tag, separated by
    # a colon.
    return dict(pair.split(':', 1) for pair in entries.split())


HANYU_PINLU_RE: Pattern = re.compile(
    '^([^()]+)[(]([0-9]+)[)]$'
)


def parse_hanyu_pinlu(entries: str) -> Dict[str, int]:
    # Each entry is a pinyin reading followed by its frequency in
    # parentheses, e.g. "xiāng(350)".
    m = HANYU_PINLU_RE.match
    return {
        reading: int(freq)
        for entry in entries.split()
        for (reading, freq) in (m(entry).groups(),)
    }


def parse_hanyu_pinyin(entries: str) -> Dict[str, List[str]]:
    # Each entry maps a dictionary position to its comma-separated readings.
    return {
        entry_num: readings.split(',')
        for entry in entries.split()
        for (entry_num, readings) in (entry.split(':', 1),)
    }


def parse_ints(entries: str) -> List[int]:
    return [int(entry) for entry in entries.split()]


PROP_PARSERS: Mapping[str, Callable[[str], object]] = {
    # CHECK: When a parser is added, update README.md as well.
    'kAccountingNumeric': int,
    'kCantonese': str.split,
    'kFrequency': int,
    'kGB0': int,
    'kGB1': int,
    'kGB3': int,
    'kGB5': int,
    'kGB7': int,
    'kGB8': int,
    'kGradeLevel': int,
    'kHangul': parse_hangul,
    'kHanyuPinlu': parse_hanyu_pinlu,
    'kHanyuPinyin': parse_hanyu_pinyin,
    'kJapaneseKun': str.split,
    'kJapaneseOn': str.split,
    'kLau': parse_ints,
    'kNelson': parse_ints,
    'kOtherNumeric': int,
    'kPrimaryNumeric': int,
    'kSimplifiedVariant': parse_unicode_points,
    'kTaiwanTelegraph': int,
    'kTang': str.split,
    'kTotalStrokes': parse_ints,
    'kTraditionalVariant': parse_unicode_points,
    'kVietnamese': str.split,
}


class PropWriter:

    DEFAULT_FILENAME_FORMAT: str = '{0}.json'

    def __init__(self,
                 directory_path: pathlib.Path,
                 filename_format: str = DEFAULT_FILENAME_FORMAT) -> None:
        self.directory_path = directory_path
        self.filename_format = filename_format
        # Per-property temporary files, keyed by property name.
        self.temp_files: Dict[str, IO[str]] = {}
        self.logger = logging.getLogger(
            f'{PropWriter.__module__}.{PropWriter.__qualname__}'
        )

    def __enter__(self) -> 'PropWriter':
        return self

    def __exit__(self, *exc_info) -> None:
        logger = self.logger
        for prop, file_path in self.flush():
            logger.info('%s: %s', prop, file_path)
        for tf in self.temp_files.values():
            tf.close()

    def feed_file(self, file: IO[str]) -> None:
        for line in file:
            self.feed_line(line)

    def feed_line(self, line: str) -> None:
        ltrimmed = line.lstrip()
        if not ltrimmed or ltrimmed.startswith('#'):
            return
        try:
            # Split only to validate the line; the raw line itself is what
            # gets buffered.
            code, prop, value = line.split('\t', 2)
        except ValueError as e:
            raise ValueError(f'{e}: {line!r}')
        try:
            f = self.temp_files[prop]
        except KeyError:
            # Lazily create one temporary file per property.
            f = tempfile.TemporaryFile(mode='w+', prefix=prop)
            self.temp_files[prop] = f
        print(line.rstrip('\r\n'), file=f)

    def flush(self) -> Iterator[Tuple[str, pathlib.Path]]:
        for prop, tf in self.temp_files.items():
            tf.flush()
            tf.seek(0)
            file_path = self.directory_path / self.filename_format.format(prop)
            yield prop, file_path
            # Fall back to the raw string value when no parser is registered
            # for the property.
            parse_value = PROP_PARSERS.get(prop, str)
            with file_path.open('w') as f:
                wf = f.write
                wf('{')
                first = True
                for line in tf:
                    try:
                        code, _prop, value = \
                            line.rstrip('\r\n').split('\t', 2)
                    except ValueError as e:
                        raise ValueError(f'{e}: {line!r}')
                    char = parse_unicode_point(code)
                    parsed_value = parse_value(value)
                    if first:
                        wf('\n')
                        first = False
                    else:
                        wf(',\n')
                    wf('\t')
                    wf(json.dumps(char, ensure_ascii=False))
                    wf(':')
                    wf(json.dumps(parsed_value, ensure_ascii=False))
                wf('\n}\n')


def main() -> None:
    if len(sys.argv) < 3:
        print(f'usage: {sys.argv[0]} UNIHAN_DIR DEST_DIR', file=sys.stderr)
        raise SystemExit(2)
    data_dir_path = pathlib.Path(sys.argv[1])
    dest_dir_path = pathlib.Path(sys.argv[2])
    if not data_dir_path.is_dir():
        print(f'error: {data_dir_path} is not a directory', file=sys.stderr)
        raise SystemExit(2)
    elif dest_dir_path.is_file():
        print(f'error: {dest_dir_path} already exists and is a file',
              file=sys.stderr)
        raise SystemExit(2)
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stderr,
        format='%(message)s'
    )
    dest_dir_path.mkdir(parents=True, exist_ok=True)
    with PropWriter(dest_dir_path) as prop_writer:
        for data_path in data_dir_path.glob('*.txt'):
            print(data_path, file=sys.stderr)
            with data_path.open() as f:
                prop_writer.feed_file(f)


if __name__ == '__main__':
    main()