Skip to content

Commit

Permalink
adds code and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
joniumGit committed Feb 16, 2022
1 parent 70f0146 commit 94a1186
Show file tree
Hide file tree
Showing 12 changed files with 17,417 additions and 3 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ venv
.pytest_cache
__pycache__
.coverage
htmlcov
htmlcov

# Exclude generated
src/languager/languages.py
78 changes: 76 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,76 @@
# languager
ISO639 Language Service
## Languager

---

#### ISO639 Language Service

This module identifies languages from ISO639-1 and ISO639-3 codes or ISO names and provides a convenient class to access
related attributes. It is also possible to lookup codes for languages based on their ISO names. However, the name based
lookup will be slower as all language names are compared in lowercase for that.

Basic flows:

- Code is 2 characters
    - Lookup long code
    - Lookup data
- Code is 3 characters
    - Lookup data
- Code is something else
    - Loop all language names to check for match
        - `input_language == language_name.lower()`
    - Lookup data

This means that name-based lookup is _n_-times slower than the other two options, but this should not make a
noticeable difference in practice.

#### Data

The data is taken from [iso639-3.sil.org](https://iso639-3.sil.org/code_tables/download_tables)
and is stored in the [tables](./generator/tables) folder. Further releases will update these tables.

#### Code

The code in [generator.py](./generator/generator.py) generates a single Python file that contains all lookup tables and
methods.

#### Language

- __code__: The ISO639-3 Code
- __short__: ISO639-1 Code if available
- __deprecated__: True if the definition is deprecated
- __macro__: True if this is in a macrolanguage group
- __parent__: The parent macrolanguage
- __macros__: Any languages belonging to this macrolanguage

#### Examples

Checking the macrolanguages for Chinese:

```python
from languager import get_language

lang = get_language('zho')
# lang = get_language('zh')
# lang = get_language('Chinese')
# lang = get_language('does not exist', default='zho')

for language in lang.macros:
    print(language)

# czo
# csp
# yue
# cnp
# cmn
# czh
# hak
# nan
# wuu
# cjy
# lzh
# gan
# mnp
# cpx
# hsn
# cdo
```
2 changes: 2 additions & 0 deletions generator/NOTICE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
LIST OF ISO639 DATA IS OBTAINED FROM:
https://iso639-3.sil.org/code_tables/download_tables
200 changes: 200 additions & 0 deletions generator/generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import csv
import re
import textwrap


def skip_one(iter_):
    """Discard the first item of ``iter_`` and return an iterator over the rest.

    Parameters
    ----------
    iter_
        Any iterable or iterator. An iterator is advanced in place and
        returned as-is; other iterables are wrapped with ``iter()`` first.

    Returns
    -------
    iterator
        Iterator positioned after the first item. It is simply exhausted
        when the input was empty.
    """
    iterator = iter(iter_)
    # Use a default so an empty input does not raise StopIteration; inside a
    # generator that would be turned into a RuntimeError (PEP 479).
    next(iterator, None)
    return iterator


def read_csv(file: str, *columns: int):
    """Yield selected columns from every data row of a tab-separated file.

    The first row is treated as a header and skipped. Every selected cell is
    stripped of surrounding whitespace.

    Parameters
    ----------
    file
        Path of the tab-separated file to read.
    columns
        Zero-based indexes of the columns to extract.

    Yields
    ------
    str or tuple of str
        The bare cell value when exactly one column is requested, otherwise a
        tuple with one value per requested column, in the requested order.

    Raises
    ------
    IndexError
        If a non-empty row has fewer cells than a requested column index.
    """
    single_column = len(columns) == 1
    # Explicit UTF-8 keeps the result independent of the platform locale
    # (NOTE(review): the SIL tables are presumably UTF-8 - confirm), and
    # newline='' is what the csv module asks for on files it reads.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        for row in skip_one(csv.reader(f, delimiter='\t')):
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue
            if single_column:
                yield row[columns[0]].strip()
            else:
                yield tuple(row[i].strip() for i in columns)


# Raw lookup tables read from the tab-separated ISO639-3 files. All ids are
# lower-cased so the generated lookups can be case-insensitive.
# id -> name (a later row for the same id replaces an earlier one)
name_table = {id_.lower(): name for id_, name in read_csv('./tables/iso-639-3_Name_Index.tab', 0, 1)}
# id -> part1 code, only for rows whose part1 column is not empty
ref_table = {id_.lower(): part1.lower() for id_, part1 in read_csv('./tables/iso-639-3.tab', 0, 3) if part1}
# second column (member id) -> first column (its macrolanguage id)
macro_table = {macro.lower(): id_.lower() for id_, macro in read_csv('./tables/iso-639-3-macrolanguages.tab', 0, 1)}
# ids listed in the retirements table
# NOTE(review): if retired ids are absent from the name index, looking them up
# in the generated module fails before `deprecated` can be reported - confirm.
retired_set = {id_.lower() for id_ in read_csv('./tables/iso-639-3_Retirements.tab', 0)}

# Source fragments substituted into the module template.
names = ',\n'.join(f' "{k}": "{v}"' for k, v in name_table.items())
stl = ',\n'.join(f' "{v}": "{k}"' for k, v in ref_table.items())
lts = ',\n'.join(f' "{k}": "{v}"' for k, v in ref_table.items())
# Sorted because set iteration order of strings changes between interpreter
# runs (hash randomisation); sorting keeps the generated file reproducible.
dep = ',\n'.join(f' "{v}"' for v in sorted(retired_set))


def macros(lang_id: str):
    """Render the members of one macrolanguage as quoted source lines.

    Parameters
    ----------
    lang_id
        Id of the parent to collect members for.

    Returns
    -------
    str
        One quoted member id per line, joined with ``",\\n"``, for every
        ``macro_table`` entry whose value equals ``lang_id``. Empty when
        there is no such entry.
    """
    members = [f' "{member}"' for member, parent in macro_table.items() if parent == lang_id]
    return ",\n".join(members)


# member id -> parent id entries for the generated MACRO_TO_PARENT table
mtp = ',\n'.join(f' "{k}": "{v}"' for k, v in macro_table.items())
# parent id -> member list entries for the generated PARENT_TO_MACROS table.
# Iterate over every distinct parent found in macro_table: ref_table only
# holds ids with a non-empty part1 code, so walking its keys silently dropped
# parents that have no such code. Sorted to keep the output reproducible.
ptm = ',\n'.join(f' "{k}": [\n{macros(k)}\n ]' for k in sorted(set(macro_table.values())))

# Template for the generated module; the script formats it and writes the
# result to ../src/languager/languages.py at the end of this file.
# - {names}, {stl}, {lts}, {dep}, {mtp} and {ptm} are str.format placeholders
#   filled from the variables of the same name.
# - Literal braces are doubled ({{ and }}) so that str.format leaves them as
#   single braces in the output.
# - Triple quotes inside the template are backslash-escaped so they do not
#   terminate this string literal.
# Everything inside the literal is emitted into the generated file, so any
# change here (including whitespace) changes the generated module.
source = textwrap.dedent(
"""
\"\"\"
Generated File
Do not modify directly.
\"\"\"
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Optional, FrozenSet
NAMES = {{
{names}
}}
SHORT_TO_LONG = {{
{stl}
}}
LONG_TO_SHORT = {{
{lts}
}}
DEPRECATED = {{
{dep}
}}
MACRO_TO_PARENT = {{
{mtp}
}}
PARENT_TO_MACROS = {{
{ptm}
}}
@dataclass(order=True, frozen=True)
class Language:
\"\"\"Represents a language
Attributes
----------
code
ISO639-03 Language code
short
ISO639-01 Language code if available
name
ISO Standard Name for the language
deprecated
Indicates this code is deprecated
macro
If True, indicates this is a member of a macrolanguage group
parent
Parent for this macrolanguage group member
macros
Contains all children if this is a macrogroup parent
\"\"\"
code: str = field(compare=True, hash=True)
short: Optional[str] = field(compare=False, hash=False)
name: str = field(compare=False, hash=False)
deprecated: bool = field(compare=False, hash=False)
macro: bool = field(compare=False, hash=False)
parent: Optional[str] = field(compare=False, hash=False)
macros: FrozenSet[str] = field(compare=False, hash=False)
def _get_language(language: str) -> Language:
try:
language = language.lower()
if len(language) == 2:
long_id = SHORT_TO_LONG[language]
short_id = language
elif len(language) == 3:
short_id = LONG_TO_SHORT.get(language, None)
long_id = language
else:
long_id = None
for e in NAMES.items():
if e[1].lower() == language:
long_id = e[0]
if long_id is None:
raise ValueError('Invalid Language Tag: Not Found')
short_id = LONG_TO_SHORT.get(long_id, None)
parent = MACRO_TO_PARENT.get(long_id, None)
return Language(
code=long_id,
short=short_id,
name=NAMES[long_id],
parent=parent,
macros=frozenset(PARENT_TO_MACROS.get(long_id, frozenset())),
deprecated=long_id in DEPRECATED,
macro=parent is not None
)
except KeyError:
raise ValueError('Invalid Language Tag: Not Found')
@lru_cache(maxsize=16, typed=False)
def get_language(language: str, *, default: Optional[str] = None) -> Language:
\"\"\"Looks up a language
The language can be in one of the following formats:
- ISO639-01 e.g. fi
- ISO639-03 e.g. fin
- ISO Name e.g. Finnish
Parameters
----------
language
The language code or name to look up
default: optional
The language code or name to use if the language is not found
Returns
-------
language
Language instance for the looked up language
Raises
------
ValueError
If the language lookup fails, format _Invalid Language Tag: reason_
Examples
---------
>>> get_language('nor')
Language(code='nor', short='no', name='Norwegian', deprecated=False, macro=False, parent=None, macros=frozenset({{'nob', 'nno'}}))
>>> get_language('Finnish')
Language(code='fin', short='fi', name='Finnish', deprecated=False, macro=False, parent=None, macros=frozenset())
Notes
-----
The lookup is case insensitive, so FI, fi, Fi are all the same thing.
The function is wrapped with lru_cache of size 16.
\"\"\"
if language is None:
if default is None:
raise ValueError('Invalid Language Tag: None')
else:
language = default
if default is None:
return _get_language(language)
else:
try:
return _get_language(language)
except ValueError as e:
try:
return _get_language(default)
except ValueError as ex:
raise ex from e
__all__ = [
'get_language',
'Language'
]
"""
)

# Collapse column-aligned declarations in the template: on indented lines of
# the form "<name>:  <type>  =", any run of two or more whitespace characters
# after the colon and before the equals sign becomes a single space.
source = re.sub(r'^(\s+.+?):\s{2,}(.*?)\s{2,}=', r'\1: \2 =', source, flags=re.MULTILINE)

# Render the module text before opening the output, so a formatting error
# cannot leave a truncated file behind.
generated = source.format(names=names, dep=dep, lts=lts, stl=stl, mtp=mtp, ptm=ptm).strip()

# Python reads source files as UTF-8 by default, so write the generated module
# as UTF-8 explicitly instead of relying on the platform's locale encoding.
with open('../src/languager/languages.py', 'w', encoding='utf-8') as out:
    out.write(generated)
    out.write('\n')
Loading

0 comments on commit 94a1186

Please sign in to comment.