From aec324a652405265a3854b2c910a5fc1f0cd5311 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 26 Sep 2025 13:23:54 +0000 Subject: [PATCH 1/5] Update multiple translations classes with confidence methods --- silnlp/common/translator.py | 283 ++++++++++++++++++++---------- silnlp/nmt/hugging_face_config.py | 123 ++++++------- 2 files changed, 244 insertions(+), 162 deletions(-) diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index ebbf9a1b..8c7be5a3 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -36,112 +36,209 @@ CONFIDENCE_SCORES_SUFFIX = ".confidences.tsv" + +class SentenceTranslation: + def __init__( + self, + translation: str, + tokens: List[str], + token_scores: List[float], + sequence_score: Optional[float], + ): + self._translation = translation + self._tokens = tokens + self._token_scores = token_scores + self._sequence_score = sequence_score + + def get_translation(self) -> str: + return self._translation + + def has_sequence_confidence_score(self) -> bool: + return self._sequence_score is not None + + def get_sequence_confidence_score(self) -> Optional[float]: + return self._sequence_score + + def join_tokens_for_confidence_file(self) -> str: + return "\t".join(self._tokens) + + def join_token_scores_for_confidence_file(self) -> str: + return "\t".join([self._sequence_score] + self._token_scores) + + # A group of multiple translations of a single sentence -TranslationGroup = List[str] +SentenceTranslationGroup = List[SentenceTranslation] + # A list representing a single draft (one translation of each input sentence) -TranslatedDraft = List[str] +class TranslatedDraft: + def __init__(self, sentence_translations: List[SentenceTranslation]): + self._sentence_translations = sentence_translations + + def has_sequence_confidence_scores(self) -> bool: + # If any sentence has a sequence score, all sentences should have one + return self._sentence_translations[0].has_sequence_confidence_score() + + def write_confidence_scores_to_file( + self, + confidences_path: Path, + row1col1_label: str, + vrefs: Optional[List[VerseRef]] = None, + ) -> None: + with confidences_path.open("w", encoding="utf-8", newline="\n") as confidences_file: + confidences_file.write("\t".join([f"{row1col1_label}"] + [f"Token {i}" for i in range(200)]) + "\n") + confidences_file.write("\t".join(["Sequence Score"] + [f"Token Score {i}" for i in range(200)]) + "\n") + for sentence_num, sentence_translation in enumerate(self._sentence_translations): + sequence_label = str(sentence_num) + if vrefs is not None: + sequence_label = str(vrefs[sentence_num]) + confidences_file.write( + sequence_label + "\t" + sentence_translation.join_tokens_for_confidence_file() + "\n" + ) + confidences_file.write(sentence_translation.join_token_scores_for_confidence_file() + "\n") + + def write_chapter_confidence_scores_to_file(self, chapter_confidences_path: Path, vrefs: List[VerseRef]): + chapter_confidences: DefaultDict[int, List[float]] = defaultdict(list) + for sentence_num, vref in enumerate(vrefs): + if not vref.is_verse or self._sentence_translations[sentence_num].get_sequence_confidence_score() is None: + continue + vref_confidence = exp(self._sentence_translations[sentence_num].get_sequence_confidence_score()) + chapter_confidences[vref.chapter_num].append(vref_confidence) + + with chapter_confidences_path.open("w", encoding="utf-8", newline="\n") as chapter_confidences_file: + chapter_confidences_file.write("Chapter\tConfidence\n") + for chapter, confidences in 
chapter_confidences.items(): + chapter_confidence = gmean(confidences) + chapter_confidences_file.write(f"{chapter}\t{chapter_confidence}\n") + + def get_all_sequence_confidence_scores(self) -> List[float]: + return [ + st.get_sequence_confidence_score() + for st in self._sentence_translations + if st.get_sequence_confidence_score() is not None + ] + def get_rows_for_postprocess(self) -> List[str]: + return [st.get_translation() for st in self._sentence_translations] -# A wrapper around List[TranslationGroup] that allows upstream consumers to view a + +# A wrapper around List[SentenceTranslationGroup] that allows upstream consumers to view a # list of translation groups as a collection of discrete drafts class DraftGroup: - def __init__(self, translation_groups: List[TranslationGroup]): + def __init__(self, translation_groups: List[SentenceTranslationGroup]): self.translation_groups = translation_groups self.num_drafts: int = len(self.translation_groups[0]) def get_drafts(self) -> List[TranslatedDraft]: - translated_draft_sentences = [[] for _ in range(self.num_drafts)] + translated_draft_sentences: List[List[SentenceTranslation]] = [[] for _ in range(self.num_drafts)] for translation_group in self.translation_groups: - if len(translation_group) == 0: - translation_group = self._createEmptyTranslationGroup() - for draft_index in range(self.num_drafts): translated_draft_sentences[draft_index].append(translation_group[draft_index]) - return translated_draft_sentences - - def _createEmptyTranslationGroup(self): - return ["" for _ in range(self.num_drafts)] + return [TranslatedDraft(sentences) for sentences in translated_draft_sentences] def generate_confidence_files( - output: List[TranslationGroup], + translated_draft: TranslatedDraft, trg_file_path: Path, - translate_step: bool = False, trg_prefix: str = "", produce_multiple_translations: bool = False, - draft_index: int = 0, vrefs: Optional[List[VerseRef]] = None, + draft_index: int = 0, ) -> None: + if not translated_draft.has_sequence_confidence_scores(): + LOGGER.warning( + f"{trg_file_path} was not translated with beam search, so confidence scores will not be calculated for this file." 
+        )
+        return
+
     if produce_multiple_translations:
         confidences_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")
     else:
         confidences_path = trg_file_path.with_suffix(f"{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}")
-    sequence_confidences: List[float] = []
+
     ext = trg_file_path.suffix.lower()
-    with confidences_path.open("w", encoding="utf-8", newline="\n") as confidences_file:
-        if translate_step and ext in {".usfm", ".sfm"}:
-            row1_col1_header = "VRef"
-        else:
-            row1_col1_header = "Sequence Number"
-        confidences_file.write("\t".join([f"{row1_col1_header}"] + [f"Token {i}" for i in range(200)]) + "\n")
-        confidences_file.write("\t".join(["Sequence Score"] + [f"Token Score {i}" for i in range(200)]) + "\n")
-        for sentence_num, _ in enumerate(output):
-            if output[sentence_num][0] is None:
-                continue
-            sequence_label = [str(sentence_num)]
-            if translate_step:
-                if ext in {".usfm", ".sfm"}:
-                    sequence_label = [str(vrefs[sentence_num])]
-                elif ext == ".txt":
-                    sequence_confidences.append(exp(output[sentence_num][3][draft_index - 1]))
-            confidences_file.write("\t".join(sequence_label + output[sentence_num][1][draft_index - 1]) + "\n")
-            confidences_file.write(
-                "\t".join(
-                    [str(exp(output[sentence_num][3][draft_index - 1]))]
-                    + [str(exp(token_score)) for token_score in output[sentence_num][2][draft_index - 1]]
-                )
-                + "\n"
-            )
-    if translate_step:
-        if ext in {".usfm", ".sfm"}:
-            chapter_confidences: DefaultDict[int, List[float]] = defaultdict(list)
-            for sentence_num, vref in enumerate(vrefs):
-                if not vref.is_verse or output[sentence_num][0] is None:
-                    continue
-                vref_confidence = exp(output[sentence_num][3][draft_index - 1])
-                chapter_confidences[vref.chapter_num].append(vref_confidence)
-
-            with confidences_path.with_suffix(".chapters.tsv").open(
-                "w", encoding="utf-8", newline="\n"
-            ) as chapter_confidences_file:
-                chapter_confidences_file.write("Chapter\tConfidence\n")
-                for chapter, confidences in chapter_confidences.items():
-                    sequence_confidences += confidences
-                    chapter_confidence = gmean(confidences)
-                    chapter_confidences_file.write(f"{chapter}\t{chapter_confidence}\n")
-
-            file_confidences_path = trg_file_path.parent / "confidences.books.tsv"
-            row1_col1_header = "Book"
-            if vrefs:
-                col1_entry = vrefs[0].book
-            else:
-                col1_entry = trg_file_path.stem
-        elif ext == ".txt":
-            file_confidences_path = trg_file_path.parent / f"{trg_prefix}confidences.files.tsv"
-            row1_col1_header = "File"
-            col1_entry = trg_file_path.name
-        else:
-            raise ValueError(
-                f"Invalid trg file extension {ext} when using --save-confidences in the translate step."
-                f"Valid file extensions for --save-confidences are .usfm, .sfm, and .txt."
-            )
-        with file_confidences_path.open("a", encoding="utf-8", newline="\n") as file_confidences_file:
-            if file_confidences_file.tell() == 0:
-                file_confidences_file.write(f"{row1_col1_header}\tConfidence\n")
-            file_confidences_file.write(f"{col1_entry}\t{gmean(sequence_confidences)}\n")
+    if ext in {".usfm", ".sfm"}:
+        assert vrefs is not None
+        generate_usfm_confidence_files(translated_draft, trg_file_path, confidences_path, vrefs, draft_index)
+    elif ext == ".txt":
+        generate_txt_confidence_files(translated_draft, trg_file_path, confidences_path, trg_prefix)
+    else:
+        raise ValueError(
+            f"Invalid trg file extension {ext} when using --save-confidences in the translate step. "
+            f"Valid file extensions for --save-confidences are .usfm, .sfm, and .txt."
+ ) + + +def generate_usfm_confidence_files( + translated_draft: TranslatedDraft, + trg_file_path: Path, + confidences_path: Path, + vrefs: List[VerseRef], + draft_index: int = 0, +) -> None: + + translated_draft.write_confidence_scores_to_file(confidences_path, "VRef", vrefs) + translated_draft.write_chapter_confidence_scores_to_file(confidences_path.with_suffix(".chapters.tsv"), vrefs) + _append_book_confidence_score(translated_draft, trg_file_path, vrefs) + + +def _append_book_confidence_score( + translated_draft: TranslatedDraft, + trg_file_path: Path, + vrefs: List[VerseRef], +) -> None: + file_confidences_path = trg_file_path.parent / "confidences.books.tsv" + row1_col1_header = "Book" + if vrefs: + col1_entry = vrefs[0].book + else: + col1_entry = trg_file_path.stem + + with file_confidences_path.open("a", encoding="utf-8", newline="\n") as file_confidences_file: + if file_confidences_file.tell() == 0: + file_confidences_file.write(f"{row1_col1_header}\tConfidence\n") + file_confidences_file.write(f"{col1_entry}\t{gmean(translated_draft.get_all_sequence_confidence_scores())}\n") + + +def generate_txt_confidence_files( + translated_draft: TranslatedDraft, + trg_file_path: Path, + confidences_path: Path, + trg_prefix: str = "", +) -> None: + translated_draft.write_confidence_scores_to_file(confidences_path, "Sequence Number") + + _append_file_confidence_score(translated_draft, trg_file_path, trg_prefix) + + +def _append_file_confidence_score( + translated_draft: TranslatedDraft, + trg_file_path: Path, + trg_prefix: str = "", +) -> None: + file_confidences_path = trg_file_path.parent / f"{trg_prefix}confidences.files.tsv" + + with file_confidences_path.open("a", encoding="utf-8", newline="\n") as file_confidences_file: + if file_confidences_file.tell() == 0: + file_confidences_file.write("File\tConfidence\n") + file_confidences_file.write( + f"{trg_file_path.name}\t{gmean(translated_draft.get_all_sequence_confidence_scores())}\n" + ) + + +def generate_test_confidence_files( + translated_draft: TranslatedDraft, + trg_file_path: Path, + produce_multiple_translations: bool = False, + draft_index: int = 0, +) -> None: + if produce_multiple_translations: + confidences_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}") + else: + confidences_path = trg_file_path.with_suffix(f"{trg_file_path.suffix}{CONFIDENCE_SCORES_SUFFIX}") + translated_draft.write_confidence_scores_to_file(confidences_path, "Sequence Number") class Translator(ABC): @@ -153,7 +250,7 @@ def translate( trg_iso: str, produce_multiple_translations: bool = False, vrefs: Optional[Iterable[VerseRef]] = None, - ) -> Iterable[TranslationGroup]: + ) -> Iterable[SentenceTranslationGroup]: pass def translate_text( @@ -166,21 +263,21 @@ def translate_text( save_confidences: bool = False, trg_prefix: str = "", ) -> None: - output = list(self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations)) - translations = [translation for translation, _, _, _ in output] - draft_set = DraftGroup(translations) + sentence_translation_groups: List[SentenceTranslationGroup] = list( + self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations) + ) + draft_set = DraftGroup(sentence_translation_groups) for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1): if produce_multiple_translations: trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}") else: trg_draft_file_path = trg_file_path - 
write_corpus(trg_draft_file_path, translated_draft) + write_corpus(trg_draft_file_path, translated_draft.get_all_translations()) if save_confidences: generate_confidence_files( - output, + translated_draft, trg_file_path, - translate_step=True, trg_prefix=trg_prefix, produce_multiple_translations=produce_multiple_translations, draft_index=draft_index, @@ -280,25 +377,24 @@ def translate_usfm( sentences.pop(i) empty_sents.append((i, vrefs.pop(i))) - output = list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs)) - - translations = [translation for translation, _, _, _ in output] + sentence_translation_groups: List[SentenceTranslationGroup] = list( + self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs) + ) # Add empty sentences back in # Prevents pre-existing text from showing up in the sections of translated text for idx, vref in reversed(empty_sents): sentences.insert(idx, "") - translations.insert(idx, ["" for _ in range(len(translations[0]))]) vrefs.insert(idx, vref) - output.insert(idx, [None, None, None, None]) + sentence_translation_groups.insert(idx, SentenceTranslation("", [], [], None)) text_behavior = ( UpdateUsfmTextBehavior.PREFER_NEW if trg_project is not None else UpdateUsfmTextBehavior.STRIP_EXISTING ) - draft_set: DraftGroup = DraftGroup(translations) + draft_set: DraftGroup = DraftGroup(sentence_translation_groups) for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1): - postprocess_handler.construct_rows(vrefs, sentences, translated_draft) + postprocess_handler.construct_rows(vrefs, sentences, translated_draft.get_rows_for_postprocess()) for config in postprocess_handler.configs: @@ -382,12 +478,11 @@ def translate_usfm( if save_confidences: generate_confidence_files( - output, + translated_draft, trg_file_path, - translate_step=True, produce_multiple_translations=produce_multiple_translations, - draft_index=draft_index, vrefs=vrefs, + draft_index=draft_index, ) def translate_docx( diff --git a/silnlp/nmt/hugging_face_config.py b/silnlp/nmt/hugging_face_config.py index 7c4679bb..88618137 100644 --- a/silnlp/nmt/hugging_face_config.py +++ b/silnlp/nmt/hugging_face_config.py @@ -6,9 +6,9 @@ import shutil from contextlib import ExitStack from copy import deepcopy +from dataclasses import dataclass from enum import Enum from itertools import repeat -from math import exp, prod from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union, cast @@ -19,7 +19,6 @@ import torch import transformers.utils.logging as transformers_logging import yaml -from accelerate import infer_auto_device_map, init_empty_weights from accelerate.utils.memory import should_reduce_batch_size from datasets import Dataset from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef @@ -74,7 +73,12 @@ from ..common.corpus import Term, count_lines, get_terms from ..common.environment import SIL_NLP_ENV -from ..common.translator import DraftGroup, TranslationGroup, generate_confidence_files +from ..common.translator import ( + DraftGroup, + SentenceTranslation, + SentenceTranslationGroup, + generate_test_confidence_files, +) from ..common.utils import NoiseMethod, ReplaceRandomToken, Side, create_noise_methods, get_mt_exp_dir, merge_dict from .config import CheckpointType, Config, NMTModel from .corpora import DataFile @@ -816,21 +820,38 @@ def batch_sentences( yield batch, None -class OutputGroup: - def __init__(self, outputs: List[dict]): - self.outputs = outputs 
+@dataclass +class ModelOutput: + translated_text: str + translation_token_ids: List[int] + token_scores: List[float] + sequence_score: Optional[float] - def get_translated_text(self) -> List[str]: - return [output["translation_text"] for output in self.outputs] + def convert_to_sentence_translation(self, tokenizer: PreTrainedTokenizer) -> SentenceTranslation: + tokens = tokenizer.convert_ids_to_tokens(self.translation_token_ids) + return SentenceTranslation(self.translated_text, tokens, self.token_scores, self.sequence_score) - def get_token_ids(self) -> List[List[int]]: - return [output["translation_token_ids"] for output in self.outputs] - def get_token_scores(self) -> List[float]: - return [output["token_scores"] for output in self.outputs] +# This class represents multiple translations of a single input sequence +class ModelOutputGroup: + def __init__(self, outputs: List[dict]): + self._outputs = outputs + + def _get_model_outputs(self) -> List[ModelOutput]: + return [ + ModelOutput( + output["translation_text"], + output["translation_token_ids"], + output["token_scores"], + output["sequence_score"], + ) + for output in self._outputs + ] - def get_sequence_score(self) -> List[float]: - return [output["sequence_score"] for output in self.outputs] + def convert_to_sentence_translation_group(self, tokenizer: PreTrainedTokenizer) -> SentenceTranslationGroup: + return SentenceTranslationGroup( + [model_output.convert_to_sentence_translation(tokenizer) for model_output in self._get_model_outputs()] + ) class HuggingFaceNMTModel(NMTModel): @@ -1152,8 +1173,8 @@ def translate_test_files( out_file.write("\n".join(translated_draft) + "\n") if save_confidences: - generate_confidence_files( - output, + generate_test_confidence_files( + translated_draft, translation_path, produce_multiple_translations=produce_multiple_translations, draft_index=draft_index, @@ -1167,7 +1188,7 @@ def _translate_test_sentences( vrefs: Iterable[VerseRef], length: int, produce_multiple_translations: bool = False, - ) -> Iterable[TranslationGroup]: + ) -> Iterable[SentenceTranslationGroup]: num_drafts = self.get_num_drafts() if produce_multiple_translations and num_drafts > 1: LOGGER.info("Producing %i translated drafts", num_drafts) @@ -1177,31 +1198,14 @@ def _translate_test_sentences( "Falling back to a single translation." 
) - for output_group in tqdm( + for model_output_group in tqdm( self._translate_sentences( tokenizer, pipeline, sentences, vrefs, produce_multiple_translations, return_tensors=True ), total=length, unit="ex", ): - all_ids = to_py_obj(output_group.get_token_ids()) - all_scores = to_py_obj(output_group.get_token_scores()) - sequence_score = to_py_obj(output_group.get_sequence_score()) - ids = [] - token_scores = [] - for output_id, output_score in zip(all_ids, all_scores): - output_ids = [] - output_scores = [] - for id, score in zip(output_id[1:], output_score[1:]): - if id == tokenizer.pad_token_id: - continue - output_ids.append(id) - output_scores.append(score) - ids.append(output_ids) - token_scores.append(output_scores) - # ids = [[id for id in output[1:] if id != tokenizer.pad_token_id] for output in ids] - tokens = [tokenizer.convert_ids_to_tokens(id_group) for id_group in ids] - yield [" ".join(token_group) for token_group in tokens], tokens, token_scores, sequence_score + yield model_output_group.convert_to_sentence_translation_group(tokenizer) def get_num_drafts(self) -> int: num_drafts = self._config.infer.get("num_drafts", 1) @@ -1215,7 +1219,7 @@ def translate( produce_multiple_translations: bool = False, vrefs: Optional[Iterable[VerseRef]] = None, ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST, - ) -> Iterable[TranslationGroup]: + ) -> Iterable[SentenceTranslationGroup]: src_lang = self._config.data["lang_codes"].get(src_iso, src_iso) trg_lang = self._config.data["lang_codes"].get(trg_iso, trg_iso) tokenizer = self._config.get_tokenizer() @@ -1248,32 +1252,12 @@ def translate( pipeline.model = torch.compile(pipeline.model) if not isinstance(sentences, list): sentences = list(sentences) - for outputs in tqdm( + for model_output_group in tqdm( self._translate_sentences(tokenizer, pipeline, sentences, vrefs, produce_multiple_translations), total=len(sentences), unit="ex", ): - if isinstance(outputs, OutputGroup): - outputs = [outputs] - for output_group in outputs: - translated_text = to_py_obj(output_group.get_translated_text()) - all_ids = to_py_obj(output_group.get_token_ids()) - all_scores = to_py_obj(output_group.get_token_scores()) - sequence_score = to_py_obj(output_group.get_sequence_score()) - ids = [] - token_scores = [] - for output_id, output_score in zip(all_ids, all_scores): - output_ids = [] - output_scores = [] - for id, score in zip(output_id[1:], output_score[1:]): - if id == tokenizer.pad_token_id: - continue - output_ids.append(id) - output_scores.append(score) - ids.append(output_ids) - token_scores.append(output_scores) - tokens = [tokenizer.convert_ids_to_tokens(id_group) for id_group in ids] - yield translated_text, tokens, token_scores, sequence_score + yield model_output_group.convert_to_sentence_translation_group(model_output_group, tokenizer) def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: step: Optional[int] = None @@ -1474,7 +1458,7 @@ def _translate_sentences( vrefs: Optional[Iterable[VerseRef]], produce_multiple_translations: bool = False, return_tensors: bool = False, - ) -> Iterable[OutputGroup]: + ) -> Iterable[ModelOutputGroup]: batch_size: int = self._config.infer["infer_batch_size"] dictionary = self._get_dictionary() @@ -1511,7 +1495,7 @@ def _translate_sentence_helper( return_tensors: bool, force_words_ids: List[List[List[int]]] = None, produce_multiple_translations: bool = False, - ) -> Iterable[OutputGroup]: + ) -> Iterable[ModelOutputGroup]: num_drafts = self.get_num_drafts() if 
produce_multiple_translations and num_drafts > 1: @@ -1540,12 +1524,13 @@ def _translate_sentence_helper( # concatenate the beam search results with the sampling results yield from [ - OutputGroup(beam_search_results[i] + sampling_results[i]) for i in range(len(beam_search_results)) + ModelOutputGroup(beam_search_results[i] + sampling_results[i]) + for i in range(len(beam_search_results)) ] elif multiple_translations_method == "sampling": yield from [ - OutputGroup(result) + ModelOutputGroup(result) for result in self._translate_with_sampling( pipeline, sentences, @@ -1558,7 +1543,7 @@ def _translate_sentence_helper( elif multiple_translations_method == "beam_search": yield from [ - OutputGroup(result) + ModelOutputGroup(result) for result in self._translate_with_beam_search( pipeline, sentences, @@ -1571,7 +1556,7 @@ def _translate_sentence_helper( elif multiple_translations_method == "diverse_beam_search": yield from [ - OutputGroup(result) + ModelOutputGroup(result) for result in self._translate_with_diverse_beam_search( pipeline, sentences, @@ -1586,7 +1571,7 @@ def _translate_sentence_helper( else: yield from [ - OutputGroup([translated_sentence[0]]) + ModelOutputGroup([translated_sentence[0]]) for translated_sentence in self._translate_with_beam_search( pipeline, sentences, @@ -1925,7 +1910,7 @@ def _forward(self, model_inputs, **generate_kwargs): beam_indices = torch.zeros_like(output_ids) assert output.scores is not None scores = tuple(torch.nn.functional.log_softmax(logits, dim=-1) for logits in output.scores) - sequences_scores = output.sequences_scores + sequences_scores = None else: raise RuntimeError("Cannot postprocess the output of the model.") @@ -1980,7 +1965,9 @@ def postprocess(self, model_outputs, return_type=None, clean_up_tokenization_spa "translation_tokens": output_tokens, "translation_token_ids": output_token_ids, "token_scores": scores, - "sequence_score": model_outputs["sequences_scores"][0], + "sequence_score": ( + model_outputs["sequences_scores"][0] if model_outputs["sequences_scores"] is not None else None + ), "translation_text": self.tokenizer.decode( output_ids, skip_special_tokens=True, From 7e06364f69c988e7f504eb06cc22f858e5b4e1f5 Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 26 Sep 2025 19:15:08 +0000 Subject: [PATCH 2/5] Minor bug fixes for implementation --- silnlp/common/translator.py | 11 ++++++----- silnlp/nmt/config.py | 4 ++-- silnlp/nmt/hugging_face_config.py | 12 +++++++++--- silnlp/nmt/translate.py | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index 8c7be5a3..ecf65b2c 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -63,7 +63,7 @@ def join_tokens_for_confidence_file(self) -> str: return "\t".join(self._tokens) def join_token_scores_for_confidence_file(self) -> str: - return "\t".join([self._sequence_score] + self._token_scores) + return "\t".join([str(exp(ts)) for ts in [self._sequence_score] + self._token_scores]) # A group of multiple translations of a single sentence @@ -76,8 +76,7 @@ def __init__(self, sentence_translations: List[SentenceTranslation]): self._sentence_translations = sentence_translations def has_sequence_confidence_scores(self) -> bool: - # If any sentence has a sequence score, all sentences should have one - return self._sentence_translations[0].has_sequence_confidence_score() + return any([st.has_sequence_confidence_score() for st in self._sentence_translations]) def write_confidence_scores_to_file( self, 
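[Aside: a minimal sketch, not part of the patch, of the conversion the hunk above adopts. Beam search reports log-probabilities, so the confidence rows are exponentiated into probabilities before being joined; the score values below are hypothetical.]

from math import exp

sequence_score = -0.105        # hypothetical sequence log-probability
token_scores = [-0.02, -0.31]  # hypothetical per-token log-probabilities

# Mirrors join_token_scores_for_confidence_file after this fix: exponentiate,
# then join, so the TSV holds probabilities rather than raw negative logs.
row = "\t".join(str(exp(s)) for s in [sequence_score] + token_scores)
print(row)  # e.g. 0.900...	0.980...	0.733...
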
@@ -89,6 +88,8 @@ def write_confidence_scores_to_file( confidences_file.write("\t".join([f"{row1col1_label}"] + [f"Token {i}" for i in range(200)]) + "\n") confidences_file.write("\t".join(["Sequence Score"] + [f"Token Score {i}" for i in range(200)]) + "\n") for sentence_num, sentence_translation in enumerate(self._sentence_translations): + if not sentence_translation.has_sequence_confidence_score(): + continue sequence_label = str(sentence_num) if vrefs is not None: sequence_label = str(vrefs[sentence_num]) @@ -113,7 +114,7 @@ def write_chapter_confidence_scores_to_file(self, chapter_confidences_path: Path def get_all_sequence_confidence_scores(self) -> List[float]: return [ - st.get_sequence_confidence_score() + exp(st.get_sequence_confidence_score()) for st in self._sentence_translations if st.get_sequence_confidence_score() is not None ] @@ -386,7 +387,7 @@ def translate_usfm( for idx, vref in reversed(empty_sents): sentences.insert(idx, "") vrefs.insert(idx, vref) - sentence_translation_groups.insert(idx, SentenceTranslation("", [], [], None)) + sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)]) text_behavior = ( UpdateUsfmTextBehavior.PREFER_NEW if trg_project is not None else UpdateUsfmTextBehavior.STRIP_EXISTING diff --git a/silnlp/nmt/config.py b/silnlp/nmt/config.py index 5f81694f..359e503b 100644 --- a/silnlp/nmt/config.py +++ b/silnlp/nmt/config.py @@ -32,7 +32,7 @@ write_corpus, ) from ..common.environment import SIL_NLP_ENV -from ..common.translator import TranslationGroup +from ..common.translator import SentenceTranslationGroup from ..common.utils import NoiseMethod, Side, get_mt_exp_dir, set_seed from .augment import AugmentMethod from .corpora import ( @@ -86,7 +86,7 @@ def translate( trg_iso: str, vrefs: Optional[Iterable[VerseRef]] = None, ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST, - ) -> Iterable[TranslationGroup]: ... + ) -> Iterable[SentenceTranslationGroup]: ... @abstractmethod def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: ... 
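[Aside: a sketch of how the SentenceTranslationGroup alias used in these signatures flows through DraftGroup. It assumes the classes from silnlp/common/translator.py as patched above; get_all_translations is the name the final patch in this series settles on (it is still get_rows_for_postprocess at this point in the history).]

from silnlp.common.translator import DraftGroup, SentenceTranslation

# translate() yields one group per source sentence; each group holds one
# SentenceTranslation per draft. DraftGroup transposes that sentence-major
# input into draft-major TranslatedDraft objects.
groups = [
    [SentenceTranslation("s1 d1", [], [], None), SentenceTranslation("s1 d2", [], [], None)],
    [SentenceTranslation("s2 d1", [], [], None), SentenceTranslation("s2 d2", [], [], None)],
]
drafts = DraftGroup(groups).get_drafts()
print(drafts[1].get_all_translations())  # ["s1 d2", "s2 d2"]
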
diff --git a/silnlp/nmt/hugging_face_config.py b/silnlp/nmt/hugging_face_config.py index 88618137..c4b48d77 100644 --- a/silnlp/nmt/hugging_face_config.py +++ b/silnlp/nmt/hugging_face_config.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from enum import Enum from itertools import repeat +from math import prod from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union, cast @@ -829,7 +830,12 @@ class ModelOutput: def convert_to_sentence_translation(self, tokenizer: PreTrainedTokenizer) -> SentenceTranslation: tokens = tokenizer.convert_ids_to_tokens(self.translation_token_ids) - return SentenceTranslation(self.translated_text, tokens, self.token_scores, self.sequence_score) + return SentenceTranslation( + to_py_obj(self.translated_text), + to_py_obj(tokens), + to_py_obj(self.token_scores), + to_py_obj(self.sequence_score), + ) # This class represents multiple translations of a single input sequence @@ -849,7 +855,7 @@ def _get_model_outputs(self) -> List[ModelOutput]: ] def convert_to_sentence_translation_group(self, tokenizer: PreTrainedTokenizer) -> SentenceTranslationGroup: - return SentenceTranslationGroup( + return list( [model_output.convert_to_sentence_translation(tokenizer) for model_output in self._get_model_outputs()] ) @@ -1257,7 +1263,7 @@ def translate( total=len(sentences), unit="ex", ): - yield model_output_group.convert_to_sentence_translation_group(model_output_group, tokenizer) + yield model_output_group.convert_to_sentence_translation_group(tokenizer) def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: step: Optional[int] = None diff --git a/silnlp/nmt/translate.py b/silnlp/nmt/translate.py index 52771a26..581d4b1b 100644 --- a/silnlp/nmt/translate.py +++ b/silnlp/nmt/translate.py @@ -11,7 +11,7 @@ from ..common.environment import SIL_NLP_ENV from ..common.paratext import book_file_name_digits, get_project_dir from ..common.postprocesser import PostprocessConfig, PostprocessHandler -from ..common.translator import TranslationGroup, Translator +from ..common.translator import SentenceTranslationGroup, Translator from ..common.utils import get_git_revision_hash, show_attrs from .clearml_connection import SILClearML from .config import CheckpointType, Config, NMTModel @@ -31,7 +31,7 @@ def translate( trg_iso: str, produce_multiple_translations: bool = False, vrefs: Optional[Iterable[VerseRef]] = None, - ) -> Iterable[TranslationGroup]: + ) -> Iterable[SentenceTranslationGroup]: return self._model.translate( sentences, src_iso, trg_iso, produce_multiple_translations, vrefs, self._checkpoint ) From 4b483c0af5ff752c4e982aad28ece7e2990300db Mon Sep 17 00:00:00 2001 From: Ben King Date: Mon, 29 Sep 2025 15:00:43 +0000 Subject: [PATCH 3/5] Fix for empty sentences --- silnlp/common/translator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index ecf65b2c..3c589f84 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -381,13 +381,14 @@ def translate_usfm( sentence_translation_groups: List[SentenceTranslationGroup] = list( self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs) ) + num_drafts = len(sentence_translation_groups[0]) # Add empty sentences back in # Prevents pre-existing text from showing up in the sections of translated text for idx, vref in reversed(empty_sents): sentences.insert(idx, "") vrefs.insert(idx, vref) - 
sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)]) + sentence_translation_groups.insert(idx, [SentenceTranslation("", [], [], None)] * num_drafts) text_behavior = ( UpdateUsfmTextBehavior.PREFER_NEW if trg_project is not None else UpdateUsfmTextBehavior.STRIP_EXISTING From 634c600e71c499ff5cae963d0a44aaec7be7a7f0 Mon Sep 17 00:00:00 2001 From: Ben King Date: Mon, 29 Sep 2025 15:24:04 +0000 Subject: [PATCH 4/5] Improved comments on new classes --- silnlp/common/translator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index 3c589f84..a235cca5 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -37,6 +37,7 @@ CONFIDENCE_SCORES_SUFFIX = ".confidences.tsv" +# A single translation of a single sentence class SentenceTranslation: def __init__( self, @@ -70,7 +71,7 @@ def join_token_scores_for_confidence_file(self) -> str: SentenceTranslationGroup = List[SentenceTranslation] -# A list representing a single draft (one translation of each input sentence) +# A class representing a single draft (one translation of each input sentence) class TranslatedDraft: def __init__(self, sentence_translations: List[SentenceTranslation]): self._sentence_translations = sentence_translations From 4fbf1c76e8c9a7f6717c04438b307ce96ded677f Mon Sep 17 00:00:00 2001 From: Ben King Date: Fri, 3 Oct 2025 20:27:16 +0000 Subject: [PATCH 5/5] Fix method naming bug --- silnlp/common/translator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py index a235cca5..7431b082 100644 --- a/silnlp/common/translator.py +++ b/silnlp/common/translator.py @@ -120,7 +120,7 @@ def get_all_sequence_confidence_scores(self) -> List[float]: if st.get_sequence_confidence_score() is not None ] - def get_rows_for_postprocess(self) -> List[str]: + def get_all_translations(self) -> List[str]: return [st.get_translation() for st in self._sentence_translations] @@ -397,7 +397,7 @@ def translate_usfm( draft_set: DraftGroup = DraftGroup(sentence_translation_groups) for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1): - postprocess_handler.construct_rows(vrefs, sentences, translated_draft.get_rows_for_postprocess()) + postprocess_handler.construct_rows(vrefs, sentences, translated_draft.get_all_translations()) for config in postprocess_handler.configs:
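
[Aside: taken together, the series leaves the confidence flow looking roughly like the sketch below. The file name, sentences, and scores are hypothetical, and it assumes the patched silnlp modules are importable.]

from pathlib import Path

from silnlp.common.translator import DraftGroup, SentenceTranslation, generate_confidence_files

# One single-draft group per source sentence, carrying beam-search
# log-probabilities for each token and for the whole sequence.
groups = [
    [SentenceTranslation("In the beginning", ["In", "the", "beginning"], [-0.02, -0.11, -0.05], -0.18)],
    [SentenceTranslation("God created", ["God", "created"], [-0.04, -0.09], -0.13)],
]
for draft_index, draft in enumerate(DraftGroup(groups).get_drafts(), 1):
    # For a .txt target this writes demo.txt.confidences.tsv and appends a
    # geometric-mean confidence row to confidences.files.tsv alongside it.
    generate_confidence_files(draft, Path("demo.txt"), draft_index=draft_index)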