Commit c2732d2: Initial commit
dahlia committed Apr 4, 2018 (0 parents)

Showing 4 changed files with 288 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
Unihan/
Unihan.zip
Unihan-*.zip
32 changes: 32 additions & 0 deletions .travis.yml
@@ -0,0 +1,32 @@
language: python
python: "3.6"
install:
- wget ftp://ftp.unicode.org/Public/11.0.0/ucd/Unihan-11.0.0d2.zip
- unzip -d Unihan Unihan-*.zip
script:
# On tagged builds, check that the tag appears in README.md and that the
# tag's Unihan version (the tag minus its trailing "-<n>" revision) matches
# the version this file downloads:
- '[[ "$TRAVIS_TAG" = "" ]] || grep "$TRAVIS_TAG" README.md'
- '[[ "$TRAVIS_TAG" = "" ]] || grep "${TRAVIS_TAG%-[0-9]*}" .travis.yml'
- ./process.py Unihan unihan-json
# Publish the generated JSON to a branch-named directory on gh-pages:
- |
  git clone \
    -b gh-pages \
    "https://$GITHUB_TOKEN:[email protected]/dahlia/unihan-json.git" \
    /tmp/gh-pages
- rm -rf /tmp/gh-pages/$TRAVIS_BRANCH
- cp -r unihan-json /tmp/gh-pages/$TRAVIS_BRANCH
- |
  pushd /tmp/gh-pages
  git add "$TRAVIS_BRANCH"
  git commit -m "Update: $TRAVIS_BRANCH"
  git push origin gh-pages
  popd
before_deploy:
- tar cvfz "unihan-json-$TRAVIS_TAG.tar.gz" unihan-json/
deploy:
- provider: releases
  api_key: "$GITHUB_TOKEN"
  file_glob: true
  file: unihan-json-*.tar.gz
  skip_cleanup: true
  on:
    tags: true
62 changes: 62 additions & 0 deletions README.md
@@ -0,0 +1,62 @@
Unihan JSON
===========

[![Build Status][ci-badge]][ci]

This project generates structured JSON data files parsed from the
[Unicode Han Database] (Unihan). Although the generation is automated through
the *process.py* script, the goal of this project is not the script but the
JSON data files themselves.

To download the JSON data files, see the [latest release]. To load them from
a web page through XHR or the `window.fetch()` function, request the following
URI (replace `<PROP>` with a property name, e.g., `kSimplifiedVariant`):

    https://dahlia.github.io/unihan-json/11.0.0d2-0/<PROP>.json
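
Outside a browser, the same files can be fetched over plain HTTP as well.
A minimal Python sketch, assuming the release path above and using the
*kCantonese.json* example shown below:

~~~~~~~~ python
import json
import urllib.request

# The versioned release path from this README; adjust to the release you want.
URL = 'https://dahlia.github.io/unihan-json/11.0.0d2-0/kCantonese.json'

with urllib.request.urlopen(URL) as response:
    cantonese = json.loads(response.read().decode('utf-8'))

print(cantonese['香'])  # ['hoeng1']
~~~~~~~~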

Each JSON file corresponds to one property and contains a single object that
maps Unicode characters to the property's values. For example,
*kCantonese.json* looks like:

~~~~~~~~ json
{
  "香": ["hoeng1"],
  "港": ["gong2"],
  ...
}
~~~~~~~~

The following properties are parsed further into structured values:

    kAccountingNumeric
    kCantonese
    kFrequency
    kGB0
    kGB1
    kGB3
    kGB5
    kGB7
    kGB8
    kGradeLevel
    kHangul
    kHanyuPinlu
    kHanyuPinyin
    kJapaneseKun
    kJapaneseOn
    kLau
    kNelson
    kOtherNumeric
    kPrimaryNumeric
    kSimplifiedVariant
    kTaiwanTelegraph
    kTang
    kTotalStrokes
    kTraditionalVariant
    kVietnamese

The remaining properties are parsed into plain string values. Contributions
of more parsers are welcome; see the `PROP_PARSERS` map in the *process.py*
script, and the sketch below.
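
As a sketch, a parser is just a callable that takes a raw field string and
returns a JSON-serializable value, registered in `PROP_PARSERS` under its
property name. The property name and splitting rule below are purely
illustrative; check the [Unicode Han Database] report for each property's
actual syntax:

~~~~~~~~ python
from typing import List


def parse_example(value: str) -> List[str]:
    # Hypothetical parser: split a space-separated field into a list.
    return value.split()


# Then, in process.py:
# PROP_PARSERS = {
#     ...,
#     'kSomeProperty': parse_example,  # placeholder property name
# }
~~~~~~~~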

[ci-badge]: https://travis-ci.org/dahlia/unihan-json.svg?branch=master
[ci]: https://travis-ci.org/dahlia/unihan-json
[Unicode Han Database]: https://www.unicode.org/reports/tr38/
[latest release]: https://github.com/dahlia/unihan-json/releases/latest
191 changes: 191 additions & 0 deletions process.py
@@ -0,0 +1,191 @@
#!/usr/bin/env python3.6
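"""Parses Unihan data files and writes one JSON file per property.

Usage: process.py UNIHAN_DIR DEST_DIR
"""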
import json
import logging
import pathlib
import re
import sys
import tempfile
from typing import (Callable, Dict, Iterator, List, Mapping,
                    Match, Pattern, Tuple)

__all__ = ('UNICODE_POINT_RE', 'PROP_PARSERS', 'PropWriter',
           'main', 'parse_unicode_point', 'parse_unicode_points')


# Matches a code point written like "U+4E00"; group 1 is the hex digits.
UNICODE_POINT_RE: Pattern = re.compile(r'^U\+([0-9A-Fa-f]+)$')


def parse_unicode_point(unicode_point: str) -> str:
    m: Match = UNICODE_POINT_RE.match(unicode_point)
    if not m:
        raise ValueError(f'invalid unicode point: {unicode_point!r}')
    return chr(int(m.group(1), 16))


def parse_unicode_points(unicode_points: str) -> List[str]:
    try:
        return [parse_unicode_point(code) for code in unicode_points.split()]
    except ValueError:
        raise ValueError(f'invalid unicode points: {unicode_points!r}')


def parse_hangul(entries: str) -> Dict[str, str]:
    # Each entry is a colon-separated pair; entries are space-separated.
    return dict(pair.split(':', 1) for pair in entries.split())


HANYU_PINLU_RE: Pattern = re.compile(
    '^([^()]+)[(]([0-9]+)[)]$'
)


def parse_hanyu_pinlu(entries: str) -> Dict[str, int]:
    # Each entry looks like "reading(frequency)".
    m = HANYU_PINLU_RE.match
    return {
        reading: int(freq)
        for entry in entries.split()
        for (reading, freq) in (m(entry).groups(),)
    }


def parse_hanyu_pinyin(entries: str) -> Dict[str, List[str]]:
    # Each entry looks like "entry:reading,reading,...".
    return {
        entry_num: readings.split(',')
        for entry in entries.split()
        for (entry_num, readings) in (entry.split(':', 1),)
    }


def parse_ints(entries: str) -> List[int]:
    return [int(entry) for entry in entries.split()]


PROP_PARSERS: Mapping[str, Callable[[str], object]] = {
    # CHECK: When a parser is added, update README.md as well.
    'kAccountingNumeric': int,
    'kCantonese': str.split,
    'kFrequency': int,
    'kGB0': int,
    'kGB1': int,
    'kGB3': int,
    'kGB5': int,
    'kGB7': int,
    'kGB8': int,
    'kGradeLevel': int,
    'kHangul': parse_hangul,
    'kHanyuPinlu': parse_hanyu_pinlu,
    'kHanyuPinyin': parse_hanyu_pinyin,
    'kJapaneseKun': str.split,
    'kJapaneseOn': str.split,
    'kLau': parse_ints,
    'kNelson': parse_ints,
    'kOtherNumeric': int,
    'kPrimaryNumeric': int,
    'kSimplifiedVariant': parse_unicode_points,
    'kTaiwanTelegraph': int,
    'kTang': str.split,
    'kTotalStrokes': parse_ints,
    'kTraditionalVariant': parse_unicode_points,
    'kVietnamese': str.split,
}


class PropWriter:
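    """Buckets incoming Unihan data lines into one temporary file per
    property, then writes each bucket out as a JSON object on flush.
    """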

    DEFAULT_FILENAME_FORMAT: str = '{0}.json'

    def __init__(self,
                 directory_path: pathlib.Path,
                 filename_format: str = DEFAULT_FILENAME_FORMAT) -> None:
        self.directory_path = directory_path
        self.filename_format = filename_format
        self.temp_files = {}
        self.logger = logging.getLogger(
            f'{PropWriter.__module__}.{PropWriter.__qualname__}'
        )

    def __enter__(self) -> 'PropWriter':
        return self

    def __exit__(self, *exc_info) -> None:
        logger = self.logger
        for prop, file_path in self.flush():
            logger.info('%s: %s', prop, file_path)
        for prop, tf in self.temp_files.items():
            tf.close()

    def feed_file(self, file) -> None:
        for line in file:
            self.feed_line(line)

    def feed_line(self, line: str) -> None:
        ltrimmed = line.lstrip()
        if not ltrimmed or ltrimmed.startswith('#'):
            # Skip blank lines and comments.
            return
        try:
            code, prop, value = line.split('\t', 2)
        except ValueError as e:
            raise ValueError(f'{e}: {line!r}')
        try:
            f = self.temp_files[prop]
        except KeyError:
            f = tempfile.TemporaryFile(mode='w+', prefix=prop)
            self.temp_files[prop] = f
        print(line.rstrip('\r\n'), file=f)

    def flush(self) -> Iterator[Tuple[str, pathlib.Path]]:
        for prop, tf in self.temp_files.items():
            tf.flush()
            tf.seek(0)
            file_path = self.directory_path / self.filename_format.format(prop)
            yield prop, file_path
            parse_value = PROP_PARSERS.get(prop, str)
            with file_path.open('w') as f:
                wf = f.write
                wf('{')
                first = True
                for line in tf:
                    # Every line in this bucket shares the same property name.
                    try:
                        code, prop, value = line.rstrip('\r\n').split('\t', 2)
                    except ValueError as e:
                        raise ValueError(f'{e}: {line!r}')
                    char = parse_unicode_point(code)
                    parsed_value = parse_value(value)
                    if first:
                        wf('\n')
                        first = False
                    else:
                        wf(',\n')
                    wf('\t')
                    wf(json.dumps(char, ensure_ascii=False))
                    wf(':')
                    wf(json.dumps(parsed_value, ensure_ascii=False))
                wf('\n}\n')


def main() -> None:
    if len(sys.argv) < 3:
        print(f'usage: {sys.argv[0]} UNIHAN_DIR DEST_DIR', file=sys.stderr)
        raise SystemExit(2)
    data_dir_path = pathlib.Path(sys.argv[1])
    dest_dir_path = pathlib.Path(sys.argv[2])
    if not data_dir_path.is_dir():
        print(f'error: {data_dir_path} is not a directory', file=sys.stderr)
        raise SystemExit(2)
    elif dest_dir_path.is_file():
        print(f'error: {dest_dir_path} already exists and is a file',
              file=sys.stderr)
        raise SystemExit(2)
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stderr,
        format='%(message)s'
    )
    dest_dir_path.mkdir(parents=True, exist_ok=True)
    with PropWriter(dest_dir_path) as prop_writer:
        for data_path in data_dir_path.glob('*.txt'):
            print(data_path, file=sys.stderr)
            with data_path.open() as f:
                prop_writer.feed_file(f)


if __name__ == '__main__':
    main()
