Commit c2732d2: Initial commit
dahlia committed Apr 4, 2018 (0 parents)

Showing 4 changed files with 288 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
Unihan/
Unihan.zip
Unihan-*.zip
32 changes: 32 additions & 0 deletions .travis.yml
@@ -0,0 +1,32 @@
language: python
python: "3.6"
install:
- wget ftp://ftp.unicode.org/Public/11.0.0/ucd/Unihan-11.0.0d2.zip
- unzip -d Unihan Unihan-*.zip
script:
# On tagged builds, check that the tag appears in README.md and that the
# tag's Unihan version (the tag minus its trailing "-<n>" revision) matches
# the version this file downloads:
- '[[ "$TRAVIS_TAG" = "" ]] || grep "$TRAVIS_TAG" README.md'
- '[[ "$TRAVIS_TAG" = "" ]] || grep "${TRAVIS_TAG%-[0-9]*}" .travis.yml'
- ./process.py Unihan unihan-json
# Publish the generated JSON to a branch-named directory on gh-pages:
- |
  git clone \
    -b gh-pages \
    "https://$GITHUB_TOKEN:[email protected]/dahlia/unihan-json.git" \
    /tmp/gh-pages
- rm -rf /tmp/gh-pages/$TRAVIS_BRANCH
- cp -r unihan-json /tmp/gh-pages/$TRAVIS_BRANCH
- |
  pushd /tmp/gh-pages
  git add "$TRAVIS_BRANCH"
  git commit -m "Update: $TRAVIS_BRANCH"
  git push origin gh-pages
  popd
before_deploy:
- tar cvfz "unihan-json-$TRAVIS_TAG.tar.gz" unihan-json/
deploy:
- provider: releases
  api_key: "$GITHUB_TOKEN"
  file_glob: true
  file: unihan-json-*.tar.gz
  skip_cleanup: true
  on:
    tags: true
62 changes: 62 additions & 0 deletions README.md
@@ -0,0 +1,62 @@
Unihan JSON
===========

[![Build Status][ci-badge]][ci]

This project generates structured JSON data files parsed from the
[Unicode Han Database] (Unihan). Although the generation is automated through
the *process.py* script, the goal of this project is not the script but the
JSON data files themselves.

To download the JSON data files, see the [latest release]. To load them from
a web page through XHR or the `window.fetch()` function, request the following
URI (replace `<PROP>` with a property name, e.g., `kSimplifiedVariant`):

    https://dahlia.github.io/unihan-json/11.0.0d2-0/<PROP>.json
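
Outside a browser, the same files can be fetched over plain HTTP as well.
A minimal Python sketch, assuming the release path above and using the
*kCantonese.json* example shown below:

~~~~~~~~ python
import json
import urllib.request

# The versioned release path from this README; adjust to the release you want.
URL = 'https://dahlia.github.io/unihan-json/11.0.0d2-0/kCantonese.json'

with urllib.request.urlopen(URL) as response:
    cantonese = json.loads(response.read().decode('utf-8'))

print(cantonese['香'])  # ['hoeng1']
~~~~~~~~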

Each JSON file corresponds to one property and contains a single object that
maps Unicode characters to the property's values. For example,
*kCantonese.json* looks like:

~~~~~~~~ json
{
  "香": ["hoeng1"],
  "港": ["gong2"],
  ...
}
~~~~~~~~

The following properties are parsed further into structured values:

    kAccountingNumeric
    kCantonese
    kFrequency
    kGB0
    kGB1
    kGB3
    kGB5
    kGB7
    kGB8
    kGradeLevel
    kHangul
    kHanyuPinlu
    kHanyuPinyin
    kJapaneseKun
    kJapaneseOn
    kLau
    kNelson
    kOtherNumeric
    kPrimaryNumeric
    kSimplifiedVariant
    kTaiwanTelegraph
    kTang
    kTotalStrokes
    kTraditionalVariant
    kVietnamese

The remaining properties are parsed into plain string values. Contributions
of more parsers are welcome; see the `PROP_PARSERS` map in the *process.py*
script, and the sketch below.
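
As a sketch, a parser is just a callable that takes a raw field string and
returns a JSON-serializable value, registered in `PROP_PARSERS` under its
property name. The property name and splitting rule below are purely
illustrative; check the [Unicode Han Database] report for each property's
actual syntax:

~~~~~~~~ python
from typing import List


def parse_example(value: str) -> List[str]:
    # Hypothetical parser: split a space-separated field into a list.
    return value.split()


# Then, in process.py:
# PROP_PARSERS = {
#     ...,
#     'kSomeProperty': parse_example,  # placeholder property name
# }
~~~~~~~~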

[ci-badge]: https://travis-ci.org/dahlia/unihan-json.svg?branch=master
[ci]: https://travis-ci.org/dahlia/unihan-json
[Unicode Han Database]: https://www.unicode.org/reports/tr38/
[latest release]: https://github.com/dahlia/unihan-json/releases/latest
191 changes: 191 additions & 0 deletions process.py
@@ -0,0 +1,191 @@
#!/usr/bin/env python3.6
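"""Parses Unihan data files and writes one JSON file per property.

Usage: process.py UNIHAN_DIR DEST_DIR
"""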
import json
import logging
import pathlib
import re
import sys
import tempfile
from typing import (Callable, Dict, Iterator, List, Mapping,
                    Match, Pattern, Tuple)

__all__ = ('UNICODE_POINT_RE', 'PROP_PARSERS', 'PropWriter',
           'main', 'parse_unicode_point', 'parse_unicode_points')


# Matches a code point written like "U+4E00"; group 1 is the hex digits.
UNICODE_POINT_RE: Pattern = re.compile(r'^U\+([0-9A-Fa-f]+)$')


def parse_unicode_point(unicode_point: str) -> str:
    m: Match = UNICODE_POINT_RE.match(unicode_point)
    if not m:
        raise ValueError(f'invalid unicode point: {unicode_point!r}')
    return chr(int(m.group(1), 16))


def parse_unicode_points(unicode_points: str) -> List[str]:
    try:
        return [parse_unicode_point(code) for code in unicode_points.split()]
    except ValueError:
        raise ValueError(f'invalid unicode points: {unicode_points!r}')


def parse_hangul(entries: str) -> Dict[str, str]:
    # Each entry is a colon-separated pair; entries are space-separated.
    return dict(pair.split(':', 1) for pair in entries.split())


HANYU_PINLU_RE: Pattern = re.compile(
    '^([^()]+)[(]([0-9]+)[)]$'
)


def parse_hanyu_pinlu(entries: str) -> Dict[str, int]:
    # Each entry looks like "reading(frequency)".
    m = HANYU_PINLU_RE.match
    return {
        reading: int(freq)
        for entry in entries.split()
        for (reading, freq) in (m(entry).groups(),)
    }


def parse_hanyu_pinyin(entries: str) -> Dict[str, List[str]]:
    # Each entry looks like "entry:reading,reading,...".
    return {
        entry_num: readings.split(',')
        for entry in entries.split()
        for (entry_num, readings) in (entry.split(':', 1),)
    }


def parse_ints(entries: str) -> List[int]:
    return [int(entry) for entry in entries.split()]


PROP_PARSERS: Mapping[str, Callable[[str], object]] = {
    # CHECK: When a parser is added, update README.md as well.
    'kAccountingNumeric': int,
    'kCantonese': str.split,
    'kFrequency': int,
    'kGB0': int,
    'kGB1': int,
    'kGB3': int,
    'kGB5': int,
    'kGB7': int,
    'kGB8': int,
    'kGradeLevel': int,
    'kHangul': parse_hangul,
    'kHanyuPinlu': parse_hanyu_pinlu,
    'kHanyuPinyin': parse_hanyu_pinyin,
    'kJapaneseKun': str.split,
    'kJapaneseOn': str.split,
    'kLau': parse_ints,
    'kNelson': parse_ints,
    'kOtherNumeric': int,
    'kPrimaryNumeric': int,
    'kSimplifiedVariant': parse_unicode_points,
    'kTaiwanTelegraph': int,
    'kTang': str.split,
    'kTotalStrokes': parse_ints,
    'kTraditionalVariant': parse_unicode_points,
    'kVietnamese': str.split,
}


class PropWriter:
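    """Buckets incoming Unihan data lines into one temporary file per
    property, then writes each bucket out as a JSON object on flush.
    """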

    DEFAULT_FILENAME_FORMAT: str = '{0}.json'

    def __init__(self,
                 directory_path: pathlib.Path,
                 filename_format: str = DEFAULT_FILENAME_FORMAT) -> None:
        self.directory_path = directory_path
        self.filename_format = filename_format
        self.temp_files = {}
        self.logger = logging.getLogger(
            f'{PropWriter.__module__}.{PropWriter.__qualname__}'
        )

    def __enter__(self) -> 'PropWriter':
        return self

    def __exit__(self, *exc_info) -> None:
        logger = self.logger
        for prop, file_path in self.flush():
            logger.info('%s: %s', prop, file_path)
        for prop, tf in self.temp_files.items():
            tf.close()

    def feed_file(self, file) -> None:
        for line in file:
            self.feed_line(line)

    def feed_line(self, line: str) -> None:
        ltrimmed = line.lstrip()
        if not ltrimmed or ltrimmed.startswith('#'):
            # Skip blank lines and comments.
            return
        try:
            code, prop, value = line.split('\t', 2)
        except ValueError as e:
            raise ValueError(f'{e}: {line!r}')
        try:
            f = self.temp_files[prop]
        except KeyError:
            f = tempfile.TemporaryFile(mode='w+', prefix=prop)
            self.temp_files[prop] = f
        print(line.rstrip('\r\n'), file=f)

    def flush(self) -> Iterator[Tuple[str, pathlib.Path]]:
        for prop, tf in self.temp_files.items():
            tf.flush()
            tf.seek(0)
            file_path = self.directory_path / self.filename_format.format(prop)
            yield prop, file_path
            parse_value = PROP_PARSERS.get(prop, str)
            with file_path.open('w') as f:
                wf = f.write
                wf('{')
                first = True
                for line in tf:
                    # Every line in this bucket shares the same property name.
                    try:
                        code, prop, value = line.rstrip('\r\n').split('\t', 2)
                    except ValueError as e:
                        raise ValueError(f'{e}: {line!r}')
                    char = parse_unicode_point(code)
                    parsed_value = parse_value(value)
                    if first:
                        wf('\n')
                        first = False
                    else:
                        wf(',\n')
                    wf('\t')
                    wf(json.dumps(char, ensure_ascii=False))
                    wf(':')
                    wf(json.dumps(parsed_value, ensure_ascii=False))
                wf('\n}\n')


def main() -> None:
    if len(sys.argv) < 3:
        print(f'usage: {sys.argv[0]} UNIHAN_DIR DEST_DIR', file=sys.stderr)
        raise SystemExit(2)
    data_dir_path = pathlib.Path(sys.argv[1])
    dest_dir_path = pathlib.Path(sys.argv[2])
    if not data_dir_path.is_dir():
        print(f'error: {data_dir_path} is not a directory', file=sys.stderr)
        raise SystemExit(2)
    elif dest_dir_path.is_file():
        print(f'error: {dest_dir_path} already exists and is a file',
              file=sys.stderr)
        raise SystemExit(2)
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stderr,
        format='%(message)s'
    )
    dest_dir_path.mkdir(parents=True, exist_ok=True)
    with PropWriter(dest_dir_path) as prop_writer:
        for data_path in data_dir_path.glob('*.txt'):
            print(data_path, file=sys.stderr)
            with data_path.open() as f:
                prop_writer.feed_file(f)


if __name__ == '__main__':
    main()
