Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 193 additions & 95 deletions silnlp/common/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,112 +36,211 @@

CONFIDENCE_SCORES_SUFFIX = ".confidences.tsv"


# A single translation of a single sentence
class SentenceTranslation:
    """One candidate translation of one source sentence.

    Holds the translated text plus optional confidence data: the target tokens,
    their per-token log scores, and the overall sequence log score (``None``
    when the sentence was not translated with beam search).
    """

    def __init__(
        self,
        translation: str,
        tokens: List[str],
        token_scores: List[float],
        sequence_score: Optional[float],
    ):
        self._translation = translation
        self._tokens = tokens
        self._token_scores = token_scores
        self._sequence_score = sequence_score

    def get_translation(self) -> str:
        """Return the translated sentence text."""
        return self._translation

    def has_sequence_confidence_score(self) -> bool:
        """Return True when a sequence-level score is available."""
        return self._sequence_score is not None

    def get_sequence_confidence_score(self) -> Optional[float]:
        """Return the sequence log score, or None when unavailable."""
        return self._sequence_score

    def join_tokens_for_confidence_file(self) -> str:
        """Return the target tokens as a single tab-separated string."""
        return "\t".join(self._tokens)

    def join_token_scores_for_confidence_file(self) -> str:
        """Return exp(sequence score) followed by exp(each token score), tab-separated.

        Callers must check has_sequence_confidence_score() first; a None
        sequence score cannot be exponentiated.
        """
        log_scores = [self._sequence_score, *self._token_scores]
        return "\t".join(str(exp(score)) for score in log_scores)


# A group of multiple translations of a single sentence
TranslationGroup = List[str]  # legacy plain-string alias — presumably kept for older callers; verify before removing
SentenceTranslationGroup = List[SentenceTranslation]  # one SentenceTranslation per requested draft


# A class representing a single draft (one translation of each input sentence)
class TranslatedDraft:
    """One complete draft: one SentenceTranslation per input sentence.

    NOTE(review): the previous revision left the removed module-level lines
    (`TranslatedDraft = List[str]`) embedded in this region; they have been
    dropped here because they broke the class body and rebound the class name.
    """

    def __init__(self, sentence_translations: List[SentenceTranslation]):
        self._sentence_translations = sentence_translations

    def has_sequence_confidence_scores(self) -> bool:
        """Return True if any sentence carries a sequence score (i.e. beam search was used)."""
        return any(st.has_sequence_confidence_score() for st in self._sentence_translations)

    def write_confidence_scores_to_file(
        self,
        confidences_path: Path,
        row1col1_label: str,
        vrefs: Optional[List[VerseRef]] = None,
    ) -> None:
        """Write a TSV with two header rows, then a token row and a score row per sentence.

        Sentences without a sequence score are skipped. When `vrefs` is given,
        each row is labeled with its verse reference instead of its index.
        """
        with confidences_path.open("w", encoding="utf-8", newline="\n") as confidences_file:
            # The 200-column header matches the original file format; rows may be shorter.
            confidences_file.write("\t".join([f"{row1col1_label}"] + [f"Token {i}" for i in range(200)]) + "\n")
            confidences_file.write("\t".join(["Sequence Score"] + [f"Token Score {i}" for i in range(200)]) + "\n")
            for sentence_num, sentence_translation in enumerate(self._sentence_translations):
                if not sentence_translation.has_sequence_confidence_score():
                    continue
                sequence_label = str(sentence_num)
                if vrefs is not None:
                    sequence_label = str(vrefs[sentence_num])
                confidences_file.write(
                    sequence_label + "\t" + sentence_translation.join_tokens_for_confidence_file() + "\n"
                )
                confidences_file.write(sentence_translation.join_token_scores_for_confidence_file() + "\n")

    def write_chapter_confidence_scores_to_file(self, chapter_confidences_path: Path, vrefs: List[VerseRef]) -> None:
        """Write a TSV of per-chapter geometric means of exp(sequence score)."""
        chapter_confidences: DefaultDict[int, List[float]] = defaultdict(list)
        for sentence_num, vref in enumerate(vrefs):
            # Fetch once instead of twice (the original called the getter for the
            # None-check and again for exp()).
            sequence_score = self._sentence_translations[sentence_num].get_sequence_confidence_score()
            if not vref.is_verse or sequence_score is None:
                continue
            chapter_confidences[vref.chapter_num].append(exp(sequence_score))

        with chapter_confidences_path.open("w", encoding="utf-8", newline="\n") as chapter_confidences_file:
            chapter_confidences_file.write("Chapter\tConfidence\n")
            for chapter, confidences in chapter_confidences.items():
                chapter_confidence = gmean(confidences)
                chapter_confidences_file.write(f"{chapter}\t{chapter_confidence}\n")

    def get_all_sequence_confidence_scores(self) -> List[float]:
        """Return exp(sequence score) for every sentence that has one."""
        return [
            exp(st.get_sequence_confidence_score())
            for st in self._sentence_translations
            if st.get_sequence_confidence_score() is not None
        ]

    def get_all_translations(self) -> List[str]:
        """Return the plain translated text of every sentence, in order."""
        return [st.get_translation() for st in self._sentence_translations]

# A wrapper around List[SentenceTranslationGroup] that allows upstream consumers to view a
# list of translation groups as a collection of discrete drafts
class DraftGroup:
    def __init__(self, translation_groups: List[SentenceTranslationGroup]):
        # translation_groups is sentence-major: one group per source sentence,
        # each holding one SentenceTranslation per draft.
        self.translation_groups = translation_groups
        # Guard empty input instead of raising IndexError on translation_groups[0].
        self.num_drafts: int = len(self.translation_groups[0]) if self.translation_groups else 0

    def get_drafts(self) -> List[TranslatedDraft]:
        """Transpose sentence-major groups into draft-major TranslatedDraft objects."""
        translated_draft_sentences: List[List[SentenceTranslation]] = [[] for _ in range(self.num_drafts)]

        for translation_group in self.translation_groups:
            if len(translation_group) == 0:
                # An untranslated sentence still needs a placeholder in every draft.
                translation_group = self._createEmptyTranslationGroup()

            for draft_index in range(self.num_drafts):
                translated_draft_sentences[draft_index].append(translation_group[draft_index])

        return [TranslatedDraft(sentences) for sentences in translated_draft_sentences]

    def _createEmptyTranslationGroup(self) -> SentenceTranslationGroup:
        # Placeholders must be SentenceTranslation objects, not bare strings:
        # TranslatedDraft calls has_sequence_confidence_score()/get_translation()
        # on every element, which plain "" entries do not provide.
        return [SentenceTranslation("", [], [], None) for _ in range(self.num_drafts)]


def generate_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    translate_step: bool = False,
    trg_prefix: str = "",
    produce_multiple_translations: bool = False,
    vrefs: Optional[List[VerseRef]] = None,
    draft_index: int = 0,
) -> None:
    """Dispatch confidence-file generation based on the target file's extension.

    Skips (with a warning) drafts that carry no sequence scores, computes the
    per-draft confidences path, then delegates to the USFM or TXT writer.

    NOTE(review): `translate_step` is retained for caller compatibility but is
    no longer read here — confirm whether it can be dropped from the signature.
    """
    if not translated_draft.has_sequence_confidence_scores():
        LOGGER.warning(
            f"{trg_file_path} was not translated with beam search, so confidence scores will not be calculated for this file."
        )
        return

    # Multiple-draft runs get a draft index embedded in the file name.
    if produce_multiple_translations:
        confidences_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")
    else:
        confidences_path = trg_file_path.with_suffix(f"{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")

    ext = trg_file_path.suffix.lower()
    if ext in {".usfm", ".sfm"}:
        assert vrefs is not None
        generate_usfm_confidence_files(translated_draft, trg_file_path, confidences_path, vrefs, draft_index)
    elif ext == ".txt":
        generate_txt_confidence_files(translated_draft, trg_file_path, confidences_path, trg_prefix)
    else:
        # Fixed missing space between the two implicitly-concatenated sentences
        # ("...translate step.Valid file extensions...").
        raise ValueError(
            f"Invalid trg file extension {ext} when using --save-confidences in the translate step. "
            f"Valid file extensions for --save-confidences are .usfm, .sfm, and .txt."
        )


def generate_usfm_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    confidences_path: Path,
    vrefs: List[VerseRef],
    draft_index: int = 0,
) -> None:
    """Write verse-, chapter-, and book-level confidence files for a USFM draft.

    `draft_index` is currently unused here — presumably kept so the USFM and
    TXT dispatch signatures stay parallel; confirm before removing.
    """
    chapters_path = confidences_path.with_suffix(".chapters.tsv")
    translated_draft.write_confidence_scores_to_file(confidences_path, "VRef", vrefs)
    translated_draft.write_chapter_confidence_scores_to_file(chapters_path, vrefs)
    _append_book_confidence_score(translated_draft, trg_file_path, vrefs)


def _append_book_confidence_score(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    vrefs: List[VerseRef],
) -> None:
    """Append one `<book>\\t<geometric-mean confidence>` row to confidences.books.tsv.

    The TSV lives next to the target file and is shared across books; a header
    row is written only when the file is empty.
    """
    books_tsv_path = trg_file_path.parent / "confidences.books.tsv"
    book_label = vrefs[0].book if vrefs else trg_file_path.stem

    with books_tsv_path.open("a", encoding="utf-8", newline="\n") as books_tsv_file:
        if books_tsv_file.tell() == 0:
            books_tsv_file.write("Book\tConfidence\n")
        books_tsv_file.write(f"{book_label}\t{gmean(translated_draft.get_all_sequence_confidence_scores())}\n")


def generate_txt_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    confidences_path: Path,
    trg_prefix: str = "",
) -> None:
    """Write per-sequence confidences for a TXT draft, then append its file-level score."""
    translated_draft.write_confidence_scores_to_file(confidences_path, "Sequence Number")
    _append_file_confidence_score(translated_draft, trg_file_path, trg_prefix)


def _append_file_confidence_score(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    trg_prefix: str = "",
) -> None:
    """Append one `<file name>\\t<geometric-mean confidence>` row to the shared files TSV.

    The TSV is opened in append mode; the header row is emitted only when the
    file is still empty.
    """
    files_tsv_path = trg_file_path.parent / f"{trg_prefix}confidences.files.tsv"

    with files_tsv_path.open("a", encoding="utf-8", newline="\n") as files_tsv_file:
        if files_tsv_file.tell() == 0:
            files_tsv_file.write("File\tConfidence\n")
        file_confidence = gmean(translated_draft.get_all_sequence_confidence_scores())
        files_tsv_file.write(f"{trg_file_path.name}\t{file_confidence}\n")


def generate_test_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    produce_multiple_translations: bool = False,
    draft_index: int = 0,
) -> None:
    """Write per-sequence confidence scores for a test-set translation file.

    Multiple-draft runs embed the draft index in the output file name.
    """
    suffix = trg_file_path.suffix
    if produce_multiple_translations:
        out_suffix = f".{draft_index}{suffix}{CONFIDENCE_SCORES_SUFFIX}"
    else:
        out_suffix = f"{suffix}{CONFIDENCE_SCORES_SUFFIX}"
    translated_draft.write_confidence_scores_to_file(trg_file_path.with_suffix(out_suffix), "Sequence Number")


class Translator(ABC):
Expand All @@ -153,7 +252,7 @@ def translate(
trg_iso: str,
produce_multiple_translations: bool = False,
vrefs: Optional[Iterable[VerseRef]] = None,
) -> Iterable[TranslationGroup]:
) -> Iterable[SentenceTranslationGroup]:
pass

def translate_text(
Expand All @@ -166,21 +265,21 @@ def translate_text(
save_confidences: bool = False,
trg_prefix: str = "",
) -> None:
output = list(self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations))
translations = [translation for translation, _, _, _ in output]
draft_set = DraftGroup(translations)
sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations)
)
draft_set = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
if produce_multiple_translations:
trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
else:
trg_draft_file_path = trg_file_path
write_corpus(trg_draft_file_path, translated_draft)
write_corpus(trg_draft_file_path, translated_draft.get_all_translations())

if save_confidences:
generate_confidence_files(
output,
translated_draft,
trg_file_path,
translate_step=True,
trg_prefix=trg_prefix,
produce_multiple_translations=produce_multiple_translations,
draft_index=draft_index,
Expand Down Expand Up @@ -280,25 +379,25 @@ def translate_usfm(
sentences.pop(i)
empty_sents.append((i, vrefs.pop(i)))

output = list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs))

translations = [translation for translation, _, _, _ in output]
sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs)
)
num_drafts = len(sentence_translation_groups[0])

# Add empty sentences back in
# Prevents pre-existing text from showing up in the sections of translated text
for idx, vref in reversed(empty_sents):
sentences.insert(idx, "")
translations.insert(idx, ["" for _ in range(len(translations[0]))])
vrefs.insert(idx, vref)
output.insert(idx, [None, None, None, None])
sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)] * num_drafts)

text_behavior = (
UpdateUsfmTextBehavior.PREFER_NEW if trg_project is not None else UpdateUsfmTextBehavior.STRIP_EXISTING
)

draft_set: DraftGroup = DraftGroup(translations)
draft_set: DraftGroup = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
postprocess_handler.construct_rows(vrefs, sentences, translated_draft)
postprocess_handler.construct_rows(vrefs, sentences, translated_draft.get_all_translations())

for config in postprocess_handler.configs:

Expand Down Expand Up @@ -382,12 +481,11 @@ def translate_usfm(

if save_confidences:
generate_confidence_files(
output,
translated_draft,
trg_file_path,
translate_step=True,
produce_multiple_translations=produce_multiple_translations,
draft_index=draft_index,
vrefs=vrefs,
draft_index=draft_index,
)

def translate_docx(
Expand Down
4 changes: 2 additions & 2 deletions silnlp/nmt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
write_corpus,
)
from ..common.environment import SIL_NLP_ENV
from ..common.translator import TranslationGroup
from ..common.translator import SentenceTranslationGroup
from ..common.utils import NoiseMethod, Side, get_mt_exp_dir, set_seed
from .augment import AugmentMethod
from .corpora import (
Expand Down Expand Up @@ -86,7 +86,7 @@ def translate(
trg_iso: str,
vrefs: Optional[Iterable[VerseRef]] = None,
ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
) -> Iterable[TranslationGroup]: ...
) -> Iterable[SentenceTranslationGroup]: ...

@abstractmethod
def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: ...
Expand Down
Loading