Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 193 additions & 95 deletions silnlp/common/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,112 +36,211 @@

CONFIDENCE_SCORES_SUFFIX = ".confidences.tsv"


# A single translation of a single sentence
class SentenceTranslation:
    """One candidate translation of one source sentence.

    Holds the translated text plus optional confidence data: the target tokens,
    their per-token log scores, and the overall sequence log score (``None``
    when the sentence was not translated with beam search).
    """

    def __init__(
        self,
        translation: str,
        tokens: List[str],
        token_scores: List[float],
        sequence_score: Optional[float],
    ):
        self._translation = translation
        self._tokens = tokens
        self._token_scores = token_scores
        self._sequence_score = sequence_score

    def get_translation(self) -> str:
        """Return the translated sentence text."""
        return self._translation

    def has_sequence_confidence_score(self) -> bool:
        """Return True when a sequence-level score is available."""
        return self._sequence_score is not None

    def get_sequence_confidence_score(self) -> Optional[float]:
        """Return the sequence log score, or None when unavailable."""
        return self._sequence_score

    def join_tokens_for_confidence_file(self) -> str:
        """Return the target tokens as a single tab-separated string."""
        return "\t".join(self._tokens)

    def join_token_scores_for_confidence_file(self) -> str:
        """Return exp(sequence score) followed by exp(each token score), tab-separated.

        Callers must check has_sequence_confidence_score() first; a None
        sequence score cannot be exponentiated.
        """
        log_scores = [self._sequence_score, *self._token_scores]
        return "\t".join(str(exp(score)) for score in log_scores)


# A group of multiple translations of a single sentence
TranslationGroup = List[str]  # legacy plain-string alias — presumably kept for older callers; verify before removing
SentenceTranslationGroup = List[SentenceTranslation]  # one SentenceTranslation per requested draft


# A class representing a single draft (one translation of each input sentence)
class TranslatedDraft:
    """One complete draft: one SentenceTranslation per input sentence.

    NOTE(review): the previous revision left the removed module-level lines
    (`TranslatedDraft = List[str]`) embedded in this region; they have been
    dropped here because they broke the class body and rebound the class name.
    """

    def __init__(self, sentence_translations: List[SentenceTranslation]):
        self._sentence_translations = sentence_translations

    def has_sequence_confidence_scores(self) -> bool:
        """Return True if any sentence carries a sequence score (i.e. beam search was used)."""
        return any(st.has_sequence_confidence_score() for st in self._sentence_translations)

    def write_confidence_scores_to_file(
        self,
        confidences_path: Path,
        row1col1_label: str,
        vrefs: Optional[List[VerseRef]] = None,
    ) -> None:
        """Write a TSV with two header rows, then a token row and a score row per sentence.

        Sentences without a sequence score are skipped. When `vrefs` is given,
        each row is labeled with its verse reference instead of its index.
        """
        with confidences_path.open("w", encoding="utf-8", newline="\n") as confidences_file:
            # The 200-column header matches the original file format; rows may be shorter.
            confidences_file.write("\t".join([f"{row1col1_label}"] + [f"Token {i}" for i in range(200)]) + "\n")
            confidences_file.write("\t".join(["Sequence Score"] + [f"Token Score {i}" for i in range(200)]) + "\n")
            for sentence_num, sentence_translation in enumerate(self._sentence_translations):
                if not sentence_translation.has_sequence_confidence_score():
                    continue
                sequence_label = str(sentence_num)
                if vrefs is not None:
                    sequence_label = str(vrefs[sentence_num])
                confidences_file.write(
                    sequence_label + "\t" + sentence_translation.join_tokens_for_confidence_file() + "\n"
                )
                confidences_file.write(sentence_translation.join_token_scores_for_confidence_file() + "\n")

    def write_chapter_confidence_scores_to_file(self, chapter_confidences_path: Path, vrefs: List[VerseRef]) -> None:
        """Write a TSV of per-chapter geometric means of exp(sequence score)."""
        chapter_confidences: DefaultDict[int, List[float]] = defaultdict(list)
        for sentence_num, vref in enumerate(vrefs):
            # Fetch once instead of twice (the original called the getter for the
            # None-check and again for exp()).
            sequence_score = self._sentence_translations[sentence_num].get_sequence_confidence_score()
            if not vref.is_verse or sequence_score is None:
                continue
            chapter_confidences[vref.chapter_num].append(exp(sequence_score))

        with chapter_confidences_path.open("w", encoding="utf-8", newline="\n") as chapter_confidences_file:
            chapter_confidences_file.write("Chapter\tConfidence\n")
            for chapter, confidences in chapter_confidences.items():
                chapter_confidence = gmean(confidences)
                chapter_confidences_file.write(f"{chapter}\t{chapter_confidence}\n")

    def get_all_sequence_confidence_scores(self) -> List[float]:
        """Return exp(sequence score) for every sentence that has one."""
        return [
            exp(st.get_sequence_confidence_score())
            for st in self._sentence_translations
            if st.get_sequence_confidence_score() is not None
        ]

    def get_all_translations(self) -> List[str]:
        """Return the plain translated text of every sentence, in order."""
        return [st.get_translation() for st in self._sentence_translations]

# A wrapper around List[SentenceTranslationGroup] that allows upstream consumers to view a
# list of translation groups as a collection of discrete drafts
class DraftGroup:
    def __init__(self, translation_groups: List[SentenceTranslationGroup]):
        # translation_groups is sentence-major: one group per source sentence,
        # each holding one SentenceTranslation per draft.
        self.translation_groups = translation_groups
        # Guard empty input instead of raising IndexError on translation_groups[0].
        self.num_drafts: int = len(self.translation_groups[0]) if self.translation_groups else 0

    def get_drafts(self) -> List[TranslatedDraft]:
        """Transpose sentence-major groups into draft-major TranslatedDraft objects."""
        translated_draft_sentences: List[List[SentenceTranslation]] = [[] for _ in range(self.num_drafts)]

        for translation_group in self.translation_groups:
            if len(translation_group) == 0:
                # An untranslated sentence still needs a placeholder in every draft.
                translation_group = self._createEmptyTranslationGroup()

            for draft_index in range(self.num_drafts):
                translated_draft_sentences[draft_index].append(translation_group[draft_index])

        return [TranslatedDraft(sentences) for sentences in translated_draft_sentences]

    def _createEmptyTranslationGroup(self) -> SentenceTranslationGroup:
        # Placeholders must be SentenceTranslation objects, not bare strings:
        # TranslatedDraft calls has_sequence_confidence_score()/get_translation()
        # on every element, which plain "" entries do not provide.
        return [SentenceTranslation("", [], [], None) for _ in range(self.num_drafts)]


def generate_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    translate_step: bool = False,
    trg_prefix: str = "",
    produce_multiple_translations: bool = False,
    vrefs: Optional[List[VerseRef]] = None,
    draft_index: int = 0,
) -> None:
    """Dispatch confidence-file generation based on the target file's extension.

    Skips (with a warning) drafts that carry no sequence scores, computes the
    per-draft confidences path, then delegates to the USFM or TXT writer.

    NOTE(review): `translate_step` is retained for caller compatibility but is
    no longer read here — confirm whether it can be dropped from the signature.
    """
    if not translated_draft.has_sequence_confidence_scores():
        LOGGER.warning(
            f"{trg_file_path} was not translated with beam search, so confidence scores will not be calculated for this file."
        )
        return

    # Multiple-draft runs get a draft index embedded in the file name.
    if produce_multiple_translations:
        confidences_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")
    else:
        confidences_path = trg_file_path.with_suffix(f"{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")

    ext = trg_file_path.suffix.lower()
    if ext in {".usfm", ".sfm"}:
        assert vrefs is not None
        generate_usfm_confidence_files(translated_draft, trg_file_path, confidences_path, vrefs, draft_index)
    elif ext == ".txt":
        generate_txt_confidence_files(translated_draft, trg_file_path, confidences_path, trg_prefix)
    else:
        # Fixed missing space between the two implicitly-concatenated sentences
        # ("...translate step.Valid file extensions...").
        raise ValueError(
            f"Invalid trg file extension {ext} when using --save-confidences in the translate step. "
            f"Valid file extensions for --save-confidences are .usfm, .sfm, and .txt."
        )


def generate_usfm_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    confidences_path: Path,
    vrefs: List[VerseRef],
    draft_index: int = 0,
) -> None:
    """Write verse-, chapter-, and book-level confidence files for a USFM draft.

    `draft_index` is currently unused here — presumably kept so the USFM and
    TXT dispatch signatures stay parallel; confirm before removing.
    """
    chapters_path = confidences_path.with_suffix(".chapters.tsv")
    translated_draft.write_confidence_scores_to_file(confidences_path, "VRef", vrefs)
    translated_draft.write_chapter_confidence_scores_to_file(chapters_path, vrefs)
    _append_book_confidence_score(translated_draft, trg_file_path, vrefs)


def _append_book_confidence_score(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    vrefs: List[VerseRef],
) -> None:
    """Append one `<book>\\t<geometric-mean confidence>` row to confidences.books.tsv.

    The TSV lives next to the target file and is shared across books; a header
    row is written only when the file is empty.
    """
    books_tsv_path = trg_file_path.parent / "confidences.books.tsv"
    book_label = vrefs[0].book if vrefs else trg_file_path.stem

    with books_tsv_path.open("a", encoding="utf-8", newline="\n") as books_tsv_file:
        if books_tsv_file.tell() == 0:
            books_tsv_file.write("Book\tConfidence\n")
        books_tsv_file.write(f"{book_label}\t{gmean(translated_draft.get_all_sequence_confidence_scores())}\n")


def generate_txt_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    confidences_path: Path,
    trg_prefix: str = "",
) -> None:
    """Write per-sequence confidences for a TXT draft, then append its file-level score."""
    translated_draft.write_confidence_scores_to_file(confidences_path, "Sequence Number")
    _append_file_confidence_score(translated_draft, trg_file_path, trg_prefix)


def _append_file_confidence_score(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    trg_prefix: str = "",
) -> None:
    """Append one `<file name>\\t<geometric-mean confidence>` row to the shared files TSV.

    The TSV is opened in append mode; the header row is emitted only when the
    file is still empty.
    """
    files_tsv_path = trg_file_path.parent / f"{trg_prefix}confidences.files.tsv"

    with files_tsv_path.open("a", encoding="utf-8", newline="\n") as files_tsv_file:
        if files_tsv_file.tell() == 0:
            files_tsv_file.write("File\tConfidence\n")
        file_confidence = gmean(translated_draft.get_all_sequence_confidence_scores())
        files_tsv_file.write(f"{trg_file_path.name}\t{file_confidence}\n")


def generate_test_confidence_files(
    translated_draft: TranslatedDraft,
    trg_file_path: Path,
    produce_multiple_translations: bool = False,
    draft_index: int = 0,
) -> None:
    """Write per-sequence confidence scores for a test-set translation file.

    Multiple-draft runs embed the draft index in the output file name.
    """
    suffix = trg_file_path.suffix
    if produce_multiple_translations:
        out_suffix = f".{draft_index}{suffix}{CONFIDENCE_SCORES_SUFFIX}"
    else:
        out_suffix = f"{suffix}{CONFIDENCE_SCORES_SUFFIX}"
    translated_draft.write_confidence_scores_to_file(trg_file_path.with_suffix(out_suffix), "Sequence Number")


class Translator(ABC):
Expand All @@ -153,7 +252,7 @@ def translate(
trg_iso: str,
produce_multiple_translations: bool = False,
vrefs: Optional[Iterable[VerseRef]] = None,
) -> Iterable[TranslationGroup]:
) -> Iterable[SentenceTranslationGroup]:
pass

def translate_text(
Expand All @@ -166,21 +265,21 @@ def translate_text(
save_confidences: bool = False,
trg_prefix: str = "",
) -> None:
output = list(self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations))
translations = [translation for translation, _, _, _ in output]
draft_set = DraftGroup(translations)
sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations)
)
draft_set = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
if produce_multiple_translations:
trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
else:
trg_draft_file_path = trg_file_path
write_corpus(trg_draft_file_path, translated_draft)
write_corpus(trg_draft_file_path, translated_draft.get_all_translations())

if save_confidences:
generate_confidence_files(
output,
translated_draft,
trg_file_path,
translate_step=True,
trg_prefix=trg_prefix,
produce_multiple_translations=produce_multiple_translations,
draft_index=draft_index,
Expand Down Expand Up @@ -280,25 +379,25 @@ def translate_usfm(
sentences.pop(i)
empty_sents.append((i, vrefs.pop(i)))

output = list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs))

translations = [translation for translation, _, _, _ in output]
sentence_translation_groups: List[SentenceTranslationGroup] = list(
self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs)
)
num_drafts = len(sentence_translation_groups[0])

# Add empty sentences back in
# Prevents pre-existing text from showing up in the sections of translated text
for idx, vref in reversed(empty_sents):
sentences.insert(idx, "")
translations.insert(idx, ["" for _ in range(len(translations[0]))])
vrefs.insert(idx, vref)
output.insert(idx, [None, None, None, None])
sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)] * num_drafts)

text_behavior = (
UpdateUsfmTextBehavior.PREFER_NEW if trg_project is not None else UpdateUsfmTextBehavior.STRIP_EXISTING
)

draft_set: DraftGroup = DraftGroup(translations)
draft_set: DraftGroup = DraftGroup(sentence_translation_groups)
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
postprocess_handler.construct_rows(vrefs, sentences, translated_draft)
postprocess_handler.construct_rows(vrefs, sentences, translated_draft.get_all_translations())

for config in postprocess_handler.configs:

Expand Down Expand Up @@ -382,12 +481,11 @@ def translate_usfm(

if save_confidences:
generate_confidence_files(
output,
translated_draft,
trg_file_path,
translate_step=True,
produce_multiple_translations=produce_multiple_translations,
draft_index=draft_index,
vrefs=vrefs,
draft_index=draft_index,
)

def translate_docx(
Expand Down
4 changes: 2 additions & 2 deletions silnlp/nmt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
write_corpus,
)
from ..common.environment import SIL_NLP_ENV
from ..common.translator import TranslationGroup
from ..common.translator import SentenceTranslationGroup
from ..common.utils import NoiseMethod, Side, get_mt_exp_dir, set_seed
from .augment import AugmentMethod
from .corpora import (
Expand Down Expand Up @@ -86,7 +86,7 @@ def translate(
trg_iso: str,
vrefs: Optional[Iterable[VerseRef]] = None,
ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
) -> Iterable[TranslationGroup]: ...
) -> Iterable[SentenceTranslationGroup]: ...

@abstractmethod
def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: ...
Expand Down
Loading