Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions silnlp/common/translate_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from machine.scripture import VerseRef, book_id_to_number

from .paratext import book_file_name_digits
from .translator import SentenceTranslation, SentenceTranslationGroup, Translator
from .translator import SentenceTranslation, SentenceTranslationGroup, TranslationInputSentence, Translator
from .utils import get_git_revision_hash, get_mt_exp_dir

LOGGER = logging.getLogger((__package__ or "") + ".translate")
Expand All @@ -18,21 +18,18 @@ def __init__(self) -> None:

def translate(
self,
sentences: Iterable[str],
src_iso: str,
trg_iso: str,
sentences: Iterable[TranslationInputSentence],
produce_multiple_translations: bool = False,
vrefs: Optional[Iterable[VerseRef]] = None,
) -> Generator[SentenceTranslationGroup, None, None]:
if produce_multiple_translations:
LOGGER.warning("Google Translator does not support --multiple-translations")

for sentence in sentences:
if len(sentence) == 0:
yield ""
if sentence.text is None or len(sentence.text) == 0:
yield [SentenceTranslation("", [], [], None)]
else:
results = self._translate_client.translate(
sentence, source_language=src_iso, target_language=trg_iso, format_="text"
sentence.text, source_language=sentence.src_iso, target_language=sentence.trg_iso, format_="text"
)
translation: str = results["translatedText"]
yield [SentenceTranslation(translation, [], [], None)]
Expand Down
176 changes: 149 additions & 27 deletions silnlp/common/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from contextlib import AbstractContextManager
from dataclasses import dataclass
from datetime import date
from itertools import groupby
from math import exp
from pathlib import Path
from typing import DefaultDict, Generator, Iterable, List, Optional, Tuple, cast
from typing import DefaultDict, Generator, List, Optional, Tuple

import docx
import nltk
Expand Down Expand Up @@ -40,6 +41,101 @@
CONFIDENCE_SCORES_SUFFIX = ".confidences.tsv"


@dataclass
class TranslationInputSentence:
def __init__(
self,
text: str | None = None,
tokens: List[str] | None = None,
src_iso: str = "",
trg_iso: str = "",
scripture_ref: ScriptureRef | None = None,
vref: VerseRef | None = None,
):
self._text = text
self._tokens = tokens
self._src_iso = src_iso
self._trg_iso = trg_iso
self._scripture_ref = scripture_ref
self._vref = vref

@property
def text(self) -> str | None:
return self._text

@property
def tokens(self) -> List[str] | None:
return self._tokens

@property
def src_iso(self) -> str:
return self._src_iso

@property
def trg_iso(self) -> str:
return self._trg_iso

@property
def scripture_ref(self) -> ScriptureRef | None:
return self._scripture_ref

@property
def vref(self) -> VerseRef | None:
return self._vref

def has_tokens(self) -> bool:
return self.tokens is not None

class Builder:
def __init__(self):
self._text = None
self._tokens = None
self._src_iso = None
self._trg_iso = None
self._scripture_ref = None
self._vref = None

def set_text(self, text: str) -> "TranslationInputSentence.Builder":
self._text = text
return self

def set_tokens(self, tokens: List[str]) -> "TranslationInputSentence.Builder":
self._tokens = tokens
return self

def set_src_iso(self, src_iso: str) -> "TranslationInputSentence.Builder":
self._src_iso = src_iso
return self

def set_trg_iso(self, trg_iso: str) -> "TranslationInputSentence.Builder":
self._trg_iso = trg_iso
return self

def set_scripture_ref(self, scripture_ref: ScriptureRef) -> "TranslationInputSentence.Builder":
self._scripture_ref = scripture_ref
return self

def set_verse_ref(self, vref: VerseRef) -> "TranslationInputSentence.Builder":
self._vref = vref
return self

def build(self) -> "TranslationInputSentence":
if self._text is None and self._tokens is None:
raise ValueError("TranslationInputSentence must have either text or tokens defined")
if self._src_iso is None:
raise ValueError("TranslationInputSentence must have a src_iso defined")
if self._trg_iso is None:
raise ValueError("TranslationInputSentence must have a trg_iso defined")
return TranslationInputSentence(
text=self._text,
tokens=self._tokens,
src_iso=self._src_iso,
trg_iso=self._trg_iso,
scripture_ref=self._scripture_ref,
vref=self._vref,
)


# A single translation of a single sentence
class SentenceTranslation:
def __init__(
Expand Down Expand Up @@ -265,11 +361,8 @@ class Translator(AbstractContextManager["Translator"], ABC):
@abstractmethod
def translate(
self,
sentences: Iterable[str],
src_iso: str,
trg_iso: str,
sentences: List[TranslationInputSentence],
produce_multiple_translations: bool = False,
vrefs: Optional[Iterable[VerseRef]] = None,
) -> Generator[SentenceTranslationGroup, None, None]:
pass

Expand All @@ -285,9 +378,16 @@ def translate_text(
tags: Optional[List[str]] = None,
) -> None:

sentences = [add_tags_to_sentence(tags, sentence) for sentence in load_corpus(src_file_path)]
translation_inputs = [
TranslationInputSentence.Builder()
.set_text(add_tags_to_sentence(tags, sentence))
.set_src_iso(src_iso)
.set_trg_iso(trg_iso)
.build()
for sentence in load_corpus(src_file_path)
]
sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(sentences, src_iso, trg_iso, produce_multiple_translations)
self.translate(translation_inputs, produce_multiple_translations)
)
draft_set = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
Expand Down Expand Up @@ -386,37 +486,49 @@ def translate_usfm(

src_file_text = UsfmFileText(stylesheet, "utf-8-sig", book_id, src_file_path, include_all_text=True)

sentences = [re.sub(" +", " ", add_tags_to_sentence(tags, s.text.strip())) for s in src_file_text]
scripture_refs: List[ScriptureRef] = [s.ref for s in src_file_text]
vrefs: List[VerseRef] = [sr.verse_ref for sr in scripture_refs]
sentences = [
TranslationInputSentence.Builder()
.set_text(re.sub(" +", " ", add_tags_to_sentence(tags, s.text.strip())))
.set_src_iso(src_iso)
.set_trg_iso(trg_iso)
.set_scripture_ref(s.ref)
.set_verse_ref(s.ref.verse_ref)
.build()
for s in src_file_text
]
LOGGER.info(f"File {src_file_path} parsed correctly.")

# Filter sentences
for i in reversed(range(len(sentences))):
marker = scripture_refs[i].path[-1].name if len(scripture_refs[i].path) > 0 else ""
sentence_scripture_ref = sentences[i].scripture_ref
if sentence_scripture_ref is None:
continue
marker = sentence_scripture_ref.path[-1].name if len(sentence_scripture_ref.path) > 0 else ""
if (
(len(chapters) > 0 and scripture_refs[i].chapter_num not in chapters)
(len(chapters) > 0 and sentence_scripture_ref.chapter_num not in chapters)
or marker in PARAGRAPH_TYPE_EMBEDS
or stylesheet.get_tag(marker).text_type == UsfmTextType.NOTE_TEXT
):
sentences.pop(i)
scripture_refs.pop(i)
empty_sents: List[Tuple[int, ScriptureRef]] = []
for i in reversed(range(len(sentences))):
if len(sentences[i].strip()) == 0:
sentence_scripture_ref = sentences[i].scripture_ref
if sentence_scripture_ref is None:
continue
sentence_text = sentences[i].text
if (sentence_text is None or len(sentence_text.strip()) == 0) and sentence_scripture_ref is not None:
empty_sents.append((i, sentence_scripture_ref))
sentences.pop(i)
empty_sents.append((i, scripture_refs.pop(i)))

sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs)
self.translate(sentences, produce_multiple_translations)
)
num_drafts = len(sentence_translation_groups[0])

# Add empty sentences back in
# Prevents pre-existing text from showing up in the sections of translated text
for idx, vref in reversed(empty_sents):
sentences.insert(idx, "")
scripture_refs.insert(idx, vref)
sentences.insert(idx, TranslationInputSentence(None, None, "", "", vref, vref.verse_ref))
sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)] * num_drafts)

text_behavior = (
Expand All @@ -425,13 +537,19 @@ def translate_usfm(

draft_set: DraftGroup = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
postprocess_handler.construct_rows(scripture_refs, sentences, translated_draft.get_all_translations())
postprocess_handler.construct_rows(
[s.scripture_ref for s in sentences if s.scripture_ref is not None],
[s.text or "" for s in sentences],
translated_draft.get_all_translations(),
)

for config in postprocess_handler.configs:
first_scripture_ref = sentences[0].scripture_ref
assert first_scripture_ref is not None

# Compile draft remarks
draft_src_str = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
draft_remark = f"This draft of {scripture_refs[0].book} was machine translated on {date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be reviewed and edited carefully."
draft_remark = f"This draft of {first_scripture_ref.book} was machine translated on {date.today()} from {draft_src_str} using model {experiment_ckpt_str}. It should be reviewed and edited carefully."
postprocess_remark = config.get_postprocess_remark()
remarks = [draft_remark] + ([postprocess_remark] if postprocess_remark else [])

Expand Down Expand Up @@ -466,7 +584,7 @@ def translate_usfm(
usfm = f.read()
handler = UpdateUsfmParserHandler(
rows=config.rows,
id_text=scripture_refs[0].book,
id_text=first_scripture_ref.book,
text_behavior=text_behavior,
paragraph_behavior=config.get_paragraph_behavior(),
embed_behavior=config.get_embed_behavior(),
Expand Down Expand Up @@ -513,7 +631,7 @@ def translate_usfm(
translated_draft,
trg_file_path,
produce_multiple_translations=produce_multiple_translations,
scripture_refs=scripture_refs,
scripture_refs=[s.scripture_ref for s in sentences if s.scripture_ref is not None],
draft_index=draft_index,
)

Expand All @@ -537,17 +655,21 @@ def translate_docx(
with src_file_path.open("rb") as file:
doc = docx.Document(file)

sentences: List[str] = []
translation_inputs: List[TranslationInputSentence] = []
paras: List[int] = []

for i, paragraph in enumerate(doc.paragraphs):
for sentence in tokenizer.tokenize(paragraph.text):
sentences.append(add_tags_to_sentence(tags, sentence))
translation_inputs.append(
TranslationInputSentence.Builder()
.set_text(add_tags_to_sentence(tags, sentence))
.set_src_iso(src_iso)
.set_trg_iso(trg_iso)
.build()
)
paras.append(i)

draft_set: DraftGroup = DraftGroup(
list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations))
)
draft_set: DraftGroup = DraftGroup(list(self.translate(translation_inputs, produce_multiple_translations)))

for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
for para, group in groupby(zip(translated_draft.get_all_translations(), paras), key=lambda t: t[1]):
Expand Down
Loading