-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 7ca01c9
Showing
35 changed files
with
47,534 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
*.7z filter=lfs diff=lfs merge=lfs -text | ||
*.arrow filter=lfs diff=lfs merge=lfs -text | ||
*.bin filter=lfs diff=lfs merge=lfs -text | ||
*.bz2 filter=lfs diff=lfs merge=lfs -text | ||
*.ckpt filter=lfs diff=lfs merge=lfs -text | ||
*.ftz filter=lfs diff=lfs merge=lfs -text | ||
*.gz filter=lfs diff=lfs merge=lfs -text | ||
*.h5 filter=lfs diff=lfs merge=lfs -text | ||
*.joblib filter=lfs diff=lfs merge=lfs -text | ||
*.lfs.* filter=lfs diff=lfs merge=lfs -text | ||
*.mlmodel filter=lfs diff=lfs merge=lfs -text | ||
*.model filter=lfs diff=lfs merge=lfs -text | ||
*.msgpack filter=lfs diff=lfs merge=lfs -text | ||
*.npy filter=lfs diff=lfs merge=lfs -text | ||
*.npz filter=lfs diff=lfs merge=lfs -text | ||
*.onnx filter=lfs diff=lfs merge=lfs -text | ||
*.ot filter=lfs diff=lfs merge=lfs -text | ||
*.parquet filter=lfs diff=lfs merge=lfs -text | ||
*.pb filter=lfs diff=lfs merge=lfs -text | ||
*.pickle filter=lfs diff=lfs merge=lfs -text | ||
*.pkl filter=lfs diff=lfs merge=lfs -text | ||
*.pt filter=lfs diff=lfs merge=lfs -text | ||
*.pth filter=lfs diff=lfs merge=lfs -text | ||
*.rar filter=lfs diff=lfs merge=lfs -text | ||
*.safetensors filter=lfs diff=lfs merge=lfs -text | ||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text | ||
*.tar.* filter=lfs diff=lfs merge=lfs -text | ||
*.tflite filter=lfs diff=lfs merge=lfs -text | ||
*.tgz filter=lfs diff=lfs merge=lfs -text | ||
*.wasm filter=lfs diff=lfs merge=lfs -text | ||
*.xz filter=lfs diff=lfs merge=lfs -text | ||
*.zip filter=lfs diff=lfs merge=lfs -text | ||
*.zst filter=lfs diff=lfs merge=lfs -text | ||
*tfevents* filter=lfs diff=lfs merge=lfs -text | ||
*.ipynb linguist-vendored |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Data and logs | ||
logs | ||
stores | ||
data | ||
changelog.md | ||
|
||
# Other folders | ||
papers | ||
ignore | ||
yomikata/*-artifacts | ||
yomikata/*.tar.gz | ||
|
||
# Binaries | ||
*.bin | ||
|
||
# Job scripts | ||
*.sl | ||
|
||
# Packaging | ||
venv | ||
dist | ||
*.egg-info | ||
__pycache__ | ||
|
||
# Misc | ||
.DS_Store | ||
.ipynb_checkpoints | ||
*.out | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2022 Sam Passaglia | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Makefile
SHELL = /bin/bash

# Print the list of available commands.
.PHONY: help
help:
	@echo "Commands:"
	@echo "venv : creates a virtual environment."
	@echo "style : executes style formatting."
	@echo "app : runs the streamlit app."
	@echo "build : builds the package."

# Styling
.PHONY: style
style:
	source venv/bin/activate && \
	black . && \
	isort . && \
	flake8

# Environment
# Deliberately not .PHONY: the recipe creates the `venv` directory, which is the target itself.
venv:
	python3 -m venv venv
	source venv/bin/activate && \
	python3 -m pip install pip setuptools wheel && \
	python3 -m pip install -e .

# Streamlit app
# .PHONY so that a file or directory named `app` can never make this target look up to date.
.PHONY: app
app:
	source venv/bin/activate && \
	streamlit run app.py --server.fileWatcherType none

# Build
# .PHONY so that an existing `build` directory does not turn `make build` into a no-op.
.PHONY: build
build:
	python3 -m build
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# yomikata | ||
|
||
<img src="robot_reading.png" width=125 height=125 alt="A robot reading a book" /> | ||
|
||
**Yomikata** uses context to resolve ambiguous words in Japanese. Check out the [**interactive demo**](https://huggingface.co/spaces/passaglia/yomikata-demo)! | ||
|
||
**Yomikata** supports 130 ambiguous forms and reaches a global accuracy of 94%. See the demo page for detailed performance information. | ||
|
||
**Yomikata** follows the approach of [Sato et al. 2022](https://aclanthology.org/2022.lrec-1.770/) by fine-tuning the Tohoku group's [Japanese BERT transformer](https://github.com/cl-tohoku/bert-japanese) to classify words into different readings based on the sentence context. A similar approach was used in English by [Nicolis et al. 2021](https://www.amazon.science/publications/homograph-disambiguation-with-contextual-word-embeddings-for-tts-systems). | ||
|
||
**Yomikata** recognizes ~50% more heteronyms than Sato et al. by adding support for words which are not in the original BERT vocabulary, and it expands the original [Aozora Bunko](https://github.com/ndl-lab/huriganacorpus-aozora) and [NDL titles](https://github.com/ndl-lab/huriganacorpus-ndlbib) training data to include the [core BCCWJ corpus](https://clrd.ninjal.ac.jp/bccwj/) and the [KWDLC corpus](https://github.com/ku-nlp/KWDLC). | ||
|
||
# Usage | ||
|
||
```python | ||
from yomikata.dbert import dBert | ||
reader = dBert() | ||
reader.furigana('そして、畳の表は、すでに幾年前に換えられたのか分らなかった。') | ||
# => そして、畳の{表/おもて}は、すでに幾年前に換えられたのか分らなかった。 | ||
``` | ||
|
||
This example sentence, from the short story *When I Was Looking for a Room to Let* (1923) by Mimei Ogawa, contains the very common heteronym 表, which admits the readings *omote* (surface) and *hyō* (table). **Yomikata**'s dBert (disambiguation BERT) correctly determines that in this sentence it refers to the surface of a tatami mat and should be read *omote*. | ||
|
||
The furigana function outputs the sentence with the heteronym annotated. Readings for the other words can be obtained with a simple dictionary lookup. | ||
|
||
```python | ||
from yomikata.dictionary import Dictionary | ||
dictreader = Dictionary() # defaults to unidic. | ||
dictreader.furigana("そして、畳の{表/おもて}は、すでに幾年前に換えられたのか分らなかった。") | ||
# => そして、{畳/たたみ}の{表/おもて}は、すでに{幾年/いくねん}{前/まえ}に{換/か}えられたのか{分/わ}らなかった。 | ||
``` | ||
|
||
Without **Yomikata**, the dictionary outputs the wrong reading for the heteronym. | ||
|
||
# Installation | ||
|
||
Inference should work fine on CPU. | ||
|
||
For details on data processing and training, see the [main notebook](https://github.com/passaglia/yomikata/tree/main/notebooks). | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
"""app.py | ||
streamlit demo of yomikata""" | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
import spacy | ||
import streamlit as st | ||
from speach import ttlig | ||
|
||
from yomikata import utils | ||
from yomikata.dictionary import Dictionary | ||
from yomikata.utils import parse_furigana | ||
|
||
|
||
@st.cache
def add_border(html: str):
    """Wrap an HTML fragment in a bordered, rounded, inline-block ``<div>``.

    Newlines in ``html`` are replaced with single spaces before wrapping.
    """
    flattened = " ".join(html.split("\n"))
    return f"""<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{flattened}</div>"""
|
||
|
||
def get_random_sentence():
    """Return the ``sentence`` value of one randomly sampled row of the test CSV.

    The CSV is ``test_optimized_strict_heteronyms.csv`` inside ``TEST_DATA_DIR``.
    """
    from yomikata.config.config import TEST_DATA_DIR

    csv_path = Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv")
    sampled_row = pd.read_csv(csv_path).sample(1).iloc[0]
    return sampled_row.sentence
|
||
|
||
@st.cache
def get_dbert_prediction_and_heteronym_list(text):
    """Run a fresh ``dBert`` reader on ``text``.

    Returns:
        A tuple of the reader's ``furigana(text)`` output and its
        ``heteronyms`` attribute.
    """
    from yomikata.dbert import dBert

    model = dBert()
    annotated = model.furigana(text)
    return annotated, model.heteronyms
|
||
|
||
@st.cache
def get_stats():
    """Load the stored dBert test metrics and tabulate them per heteronym.

    Reads ``dbert/training_performance.json`` from ``config.STORES_DIR``.

    Returns:
        A ``(global_accuracy, df)`` tuple. ``global_accuracy`` is the value
        stored at ``["test"]["accuracy"]``. ``df`` has the columns
        ``heteronym``, ``accuracy`` and ``readings (correct/total)``, is sorted
        by descending accuracy and is indexed from 1.
    """
    from yomikata.config import config
    from yomikata.utils import load_dict

    performance = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
    global_accuracy = performance["test"]["accuracy"]
    per_heteronym = performance["test"]["heteronym_performance"]

    rows = []
    for heteronym, entry in per_heteronym.items():
        summaries = []
        for reading, counts in entry["readings"].items():
            correct = counts["found"][reading]
            # Hide the catch-all "<OTHER>" class when it was never correctly predicted.
            if reading == "<OTHER>" and correct == 0:
                continue
            summaries.append(f"{reading} ({correct}/{counts['n']})")
        rows.append(
            {
                "heteronym": heteronym,
                "accuracy": entry["accuracy"],
                "readings": "、".join(summaries),
            }
        )

    df = pd.DataFrame(rows, columns=["heteronym", "accuracy", "readings"])

    # Keep only rows whose summary contains the "、" joiner, i.e. (presumably) at
    # least two listed readings. The explicit copy makes the column assignment
    # below operate on an independent frame rather than on a slice of the original.
    df = df[df["readings"].str.contains("、")].copy()

    # regex=False: "<OTHER>" is a literal marker, and the default for `regex`
    # differs between pandas versions.
    df["readings"] = df["readings"].str.replace("<OTHER>", "Other", regex=False)

    df = df.rename(columns={"readings": "readings (correct/total)"})
    df = df.sort_values("accuracy", ascending=False, ignore_index=True)

    # Number the rows from 1 for display.
    df.index += 1

    return global_accuracy, df
|
||
|
||
@st.cache
def furigana_to_spacy(text_with_furigana):
    """Convert furigana-annotated text into a manual-mode displaCy "ent" dict.

    Only the ``ttlig.RubyFrag`` groups returned by ``parse_furigana`` are kept.
    Their texts are joined with ", " to form the output text, and each one gets
    an entity whose label is its ``furi`` value and whose ``start``/``end`` are
    character offsets into that joined text.

    Returns:
        A dict with the keys ``"text"``, ``"ents"`` and ``"title"`` (always ``None``).
    """
    ruby_frags = [
        group
        for group in parse_furigana(text_with_furigana).groups
        if isinstance(group, ttlig.RubyFrag)
    ]

    separator = ", "
    entities = []
    offset = 0
    for position, frag in enumerate(ruby_frags):
        if position:
            # Account for the separator inserted before every fragment but the first.
            offset += len(separator)
        stop = offset + len(frag.text)
        entities.append({"start": offset, "end": stop, "label": frag.furi})
        offset = stop

    return {
        "text": separator.join(frag.text for frag in ruby_frags),
        "ents": entities,
        "title": None,
    }
|
||
|
||
# --- Streamlit page body: executed top to bottom on every rerun ---
st.title("Yomikata: Disambiguate Japanese Heteronyms")

# Input text box
st.markdown("Input a Japanese sentence:")

# Seed the text area once per session; the randomize button below overwrites it.
if "default_sentence" not in st.session_state:
    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"

input_text = st.text_area(
    "Input a Japanese sentence:",
    utils.remove_furigana(st.session_state.default_sentence),
    label_visibility="collapsed",
)

# Yomikata prediction
dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)

# spacy-style output for the predictions
colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
spacy_dict = furigana_to_spacy(dbert_prediction)
# De-duplicate the readings while keeping first-appearance order. Enumerating a
# set of strings would give an order that can differ between interpreter
# processes, so the same reading could change color from one run to the next.
unique_labels = dict.fromkeys(item["label"] for item in spacy_dict["ents"])
label_colors = {
    reading: colors[i % len(colors)] for i, reading in enumerate(unique_labels)
}
html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})

if spacy_dict["ents"]:
    st.markdown("**Yomikata** disambiguated the following words with multiple readings:")
    st.write(
        add_border(html),
        unsafe_allow_html=True,
    )
else:
    st.markdown("**Yomikata** found no heteronyms in the input text.")

# Dictionary + Yomikata prediction
st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
dictionary = st.radio(
    "It can be coupled with a dictionary",
    ("sudachi", "unidic", "ipadic", "juman"),
    horizontal=True,
    label_visibility="collapsed",
)

dictreader = Dictionary(dictionary)
dictionary_prediction = dictreader.furigana(dbert_prediction)
html = parse_furigana(dictionary_prediction).to_html()
st.write(
    add_border(html),
    unsafe_allow_html=True,
)

# Dictionary alone prediction (only shown when Yomikata annotated something)
if spacy_dict["ents"]:
    dictionary_prediction = dictreader.furigana(utils.remove_furigana(input_text))
    html = parse_furigana(dictionary_prediction).to_html()
    st.markdown("Without **Yomikata** disambiguation, the dictionary would yield:")
    st.write(
        add_border(html),
        unsafe_allow_html=True,
    )

# Randomize button: store a new default sentence and rerun so the text area picks it up
if st.button("🎲 Randomize the input sentence"):
    st.session_state.default_sentence = get_random_sentence()
    st.experimental_rerun()

# Stats section
global_accuracy, stats_df = get_stats()

st.subheader(
    f"**Yomikata** supports {len(stats_df)} heteronyms, with a global accuracy of {global_accuracy:.0%}!"
)

st.dataframe(stats_df)

st.subheader("Check out **Yomikata** on [GitHub](https://github.com/passaglia/yomikata) today!")

# Hide the footer
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"latest": { | ||
"url": "", | ||
"version": "0.0.1" | ||
}, | ||
"0.0.1": { | ||
"url": "", | ||
"version": "0.0.1" | ||
} | ||
} |
Oops, something went wrong.