adds code and tests

joniumGit · Feb 16, 2022 · 94a1186 · 94a1186
1 parent 70f0146
commit 94a1186
Show file tree

Hide file tree

Showing 12 changed files with 17,417 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,7 @@ venv
 .pytest_cache
 __pycache__
 .coverage
-htmlcov
+htmlcov
+
+# Exclude generated
+src/languager/languages.py
diff --git a/README.md b/README.md
@@ -1,2 +1,76 @@
-# languager
-ISO639 Language Service
+## Languager
+
+---
+
+#### ISO639 Language Service
+
+This module identifies languages from ISO639-1 and ISO639-3 codes or ISO names and provides a convenient class to access
+related attributes. It is also possible to look up codes for languages based on their ISO names. However, the name-based
+lookup is slower, as the input is compared against every language name in lowercase.
+
+Basic flows:
+
+- Code is 2 characters
+    - Lookup long code
+    - Lookup data
+- Code is 3 characters
+    - Lookup data
+- Code is something else
+    - Loop all language names to check for match
+        - `input_language == language_name.lower()`
+    - Lookup data
+
+This means that name-based lookup is roughly _n_ times slower than the other two options, where _n_ is the number of
+language names. In practice this should rarely make a difference.
+
+#### Data
+
+The data is taken from [iso639-3.sil.org](https://iso639-3.sil.org/code_tables/download_tables)
+and is stored in the [tables](./generator/tables) folder. Further releases will update these tables.
+
+#### Code
+
+The code in [generator.py](./generator/generator.py) generates a single Python file that contains all lookup tables and
+methods.
+
+#### Language
+
+- __code__: The ISO639-3 Code
+- __short__: ISO639-1 Code if available
+- __deprecated__: True if the definition is deprecated
+- __macro__: True if this is in a macrolanguage group
+- __parent__: The parent macrolanguage
+- __macros__: Any languages belonging to this macrolanguage
+
+#### Examples
+
+Checking the macrolanguages for Chinese:
+
+```python
+from languager import get_language
+
+lang = get_language('zho')
+# lang = get_language('zh')
+# lang = get_language('Chinese')
+# lang = get_language('does not exist', default='zho')
+
+for language in lang.macros:
+    print(language)
+
+# czo
+# csp
+# yue
+# cnp
+# cmn
+# czh
+# hak
+# nan
+# wuu
+# cjy
+# lzh
+# gan
+# mnp
+# cpx
+# hsn
+# cdo
+```
diff --git a/generator/NOTICE.txt b/generator/NOTICE.txt
@@ -0,0 +1,2 @@
+LIST OF ISO639 DATA IS OBTAINED FROM:
+https://iso639-3.sil.org/code_tables/download_tables
diff --git a/generator/generator.py b/generator/generator.py
@@ -0,0 +1,200 @@
+import csv
+import re
+import textwrap
+
+
+def skip_one(iter_):
+    """Discard the first item of an iterator and return the same iterator.
+
+    Used by ``read_csv`` to drop the header row of a table.
+
+    Parameters
+    ----------
+    iter_
+        An iterator (not merely an iterable); it is advanced in place.
+
+    Returns
+    -------
+    iterator
+        The object that was passed in, positioned after its first item.
+        An already exhausted iterator is returned unchanged.
+    """
+    # The default stops an empty iterator from raising StopIteration, which
+    # would turn into RuntimeError inside the read_csv generator (PEP 479).
+    next(iter_, None)
+    return iter_
+
+
+def read_csv(file: str, *columns: int):
+    """Yield selected columns from a tab-separated file, skipping its header row.
+
+    Parameters
+    ----------
+    file
+        Path of the tab-separated table to read.
+    columns
+        Zero-based indexes of the columns to extract from every row.
+
+    Yields
+    ------
+    str or tuple of str
+        The stripped cell itself when exactly one column is requested,
+        otherwise a tuple of stripped cells in the requested order.
+
+    Raises
+    ------
+    IndexError
+        If a row has fewer cells than a requested column index.
+    """
+    # Read as UTF-8 instead of the platform default (the SIL tables are
+    # published as UTF-8); newline='' is what the csv module asks for so it
+    # can do its own line-ending handling.
+    with open(file, 'r', encoding='utf-8', newline='') as f:
+        for row in skip_one(csv.reader(f, delimiter='\t')):
+            if len(columns) == 1:
+                yield row[columns[0]].strip()
+            else:
+                yield tuple(row[i].strip() for i in columns)
+
+
+# Lookup tables read from the downloaded tab files; every code is lower-cased.
+# NOTE(review): if the name index lists several rows for one id, only the last
+# row survives in name_table - confirm that this is the intended name.
+name_table = {id_.lower(): name for id_, name in read_csv('./tables/iso-639-3_Name_Index.tab', 0, 1)}
+# id -> part1 code, restricted to rows whose part1 column is not empty.
+ref_table = {id_.lower(): part1.lower() for id_, part1 in read_csv('./tables/iso-639-3.tab', 0, 3) if part1 != ''}
+# Keyed by column 1 of the macrolanguage table, valued by column 0.
+macro_table = {macro.lower(): id_.lower() for id_, macro in read_csv('./tables/iso-639-3-macrolanguages.tab', 0, 1)}
+retired_set = {id_.lower() for id_ in read_csv('./tables/iso-639-3_Retirements.tab', 0)}
+
+# Pre-rendered bodies of the dict/set literals in the generated module.
+# Names are emitted with repr() so that a quote or backslash inside a name
+# still produces a valid string literal.
+names = ',\n'.join(f'    "{k}": {v!r}' for k, v in name_table.items())
+stl = ',\n'.join(f'    "{v}": "{k}"' for k, v in ref_table.items())
+lts = ',\n'.join(f'    "{k}": "{v}"' for k, v in ref_table.items())
+# Set iteration order varies between runs; sorting keeps the output reproducible.
+dep = ',\n'.join(f'    "{v}"' for v in sorted(retired_set))
+
+
+def macros(lang_id: str):
+    """Render the macro_table keys that map to ``lang_id`` as quoted list items.
+
+    Parameters
+    ----------
+    lang_id
+        The value to look for in ``macro_table``.
+
+    Returns
+    -------
+    str
+        One ``"key"`` entry per line, indented by eight spaces and joined
+        with ``,\\n``; an empty string when nothing maps to ``lang_id``.
+    """
+    members = [member for member, parent in macro_table.items() if parent == lang_id]
+    return ",\n".join(f'        "{member}"' for member in members)
+
+
+# Body of the generated MACRO_TO_PARENT dict: one "key": "value" pair per
+# macro_table entry.
+mtp = ',\n'.join(f'    "{k}": "{v}"' for k, v in macro_table.items())
+# Body of the generated PARENT_TO_MACROS dict. Every distinct parent gets an
+# entry, in first-seen order. Iterating ref_table here would drop all
+# macrolanguages without an ISO639-1 code, because ref_table only holds ids
+# whose part1 column is not empty.
+ptm = ',\n'.join(f'    "{k}": [\n{macros(k)}\n    ]' for k in dict.fromkeys(macro_table.values()))
+
+# Template of the generated module. It is filled in with str.format() when the
+# output file is written, so literal braces are doubled ({{ }}) and the only
+# placeholders are names, stl, lts, dep, mtp and ptm. The inner docstring
+# quotes are escaped because the template itself is a triple-quoted string,
+# and textwrap.dedent removes the common leading indentation.
+source = textwrap.dedent(
+    """
+    \"\"\"
+    Generated File
+    
+    Do not modify directly.
+    \"\"\"
+    
+    from dataclasses import dataclass, field
+    from functools import lru_cache
+    from typing import Optional, FrozenSet
+    
+    NAMES = {{
+    {names}
+    }}
+    SHORT_TO_LONG = {{
+    {stl}
+    }}
+    LONG_TO_SHORT = {{
+    {lts}
+    }}
+    DEPRECATED = {{
+    {dep}
+    }}
+    MACRO_TO_PARENT = {{
+    {mtp}
+    }}
+    PARENT_TO_MACROS = {{
+    {ptm}
+    }}
+    
+    
+    @dataclass(order=True, frozen=True)
+    class Language:
+        \"\"\"Represents a language
+        
+        Attributes
+        ----------
+        code
+            ISO639-03 Language code
+        short
+            ISO639-01 Language code if available
+        name
+            ISO Standard Name for the language
+        deprecated
+            Indicates this code is deprecated
+        macro
+            If True, indicates this is a member of a macrolanguage group
+        parent
+            Parent for this macrolanguage group member
+        macros
+            Contains all children if this is a macrogroup parent
+        \"\"\"
+        code:           str             = field(compare=True,  hash=True)
+        short:          Optional[str]   = field(compare=False, hash=False)
+        name:           str             = field(compare=False, hash=False) 
+        deprecated:     bool            = field(compare=False, hash=False)
+        macro:          bool            = field(compare=False, hash=False)
+        parent:         Optional[str]   = field(compare=False, hash=False)
+        macros:         FrozenSet[str]  = field(compare=False, hash=False)
+    
+    
+    def _get_language(language: str) -> Language:
+        try:
+            language = language.lower()
+            if len(language) == 2:
+                long_id = SHORT_TO_LONG[language]
+                short_id = language
+            elif len(language) == 3:
+                short_id = LONG_TO_SHORT.get(language, None)
+                long_id = language
+            else:
+                long_id = None
+                for e in NAMES.items():
+                    if e[1].lower() == language:
+                        long_id = e[0]
+                if long_id is None:
+                    raise ValueError('Invalid Language Tag: Not Found')
+                short_id = LONG_TO_SHORT.get(long_id, None)
+            parent = MACRO_TO_PARENT.get(long_id, None)
+            return Language(
+                code=long_id,
+                short=short_id,
+                name=NAMES[long_id],
+                parent=parent,
+                macros=frozenset(PARENT_TO_MACROS.get(long_id, frozenset())),
+                deprecated=long_id in DEPRECATED,
+                macro=parent is not None
+            )
+        except KeyError:
+            raise ValueError('Invalid Language Tag: Not Found')
+    
+    
+    @lru_cache(maxsize=16, typed=False)
+    def get_language(language: str, *, default: Optional[str] = None) -> Language:
+        \"\"\"Looks up a language
+        
+        The language can be in one of the following formats:
+        
+        - ISO639-01 e.g. fi
+        - ISO639-03 e.g. fin
+        - ISO Name  e.g.  Finnish
+        
+        Parameters
+        ----------
+        language
+            The language code or name to look up
+        default: optional
+            The language code or name to use if the language is not found
+        
+        Returns
+        -------
+        language
+            Language instance for the looked up language
+        
+        Raises
+        ------
+        ValueError
+            If the language lookup fails, format _Invalid Language Tag: reason_
+        
+        Examples
+        ---------
+        >>> get_language('nor')
+        Language(code='nor', short='no', name='Norwegian', deprecated=False, macro=False, parent=None, macros=frozenset({{'nob', 'nno'}}))
+    
+        >>> get_language('Finnish')
+        Language(code='fin', short='fi', name='Finnish', deprecated=False, macro=False, parent=None, macros=frozenset())
+        
+        Notes
+        -----
+        The lookup is case insensitive, so FI, fi, Fi are all the same thing.
+        
+        The function is wrapped with lru_cache of size 16.
+        \"\"\"
+        if language is None:
+            if default is None:
+                raise ValueError('Invalid Language Tag: None')
+            else:
+                language = default
+        if default is None:
+            return _get_language(language)
+        else:
+            try:
+                return _get_language(language)
+            except ValueError as e:
+                try:
+                    return _get_language(default)
+                except ValueError as ex:
+                    raise ex from e
+    
+    
+    __all__ = [
+        'get_language',
+        'Language'
+    ]
+    
+    """
+)
+
+# Collapse the column-aligned field annotations of the template
+# ("code:    str    =") to single spaces before the tables are filled in.
+source = re.sub(r'^(\s+.+?):\s{2,}(.*?)\s{2,}=', r'\1: \2 =', source, flags=re.MULTILINE)
+# Python reads source files as UTF-8 by default and the rendered names may
+# contain non-ASCII characters, so do not rely on the platform's encoding.
+with open('../src/languager/languages.py', 'w', encoding='utf-8') as out:
+    out.write(source.format(names=names, dep=dep, lts=lts, stl=stl, mtp=mtp, ptm=ptm).strip())
+    out.write('\n')
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		LIST OF ISO639 DATA IS OBTAINED FROM:
		https://iso639-3.sil.org/code_tables/download_tables