mozilla
diff --git a/‎pipeline/common/command_runner.py‎
Lines changed: 9 additions & 29 deletions b/‎pipeline/common/command_runner.py‎
Lines changed: 9 additions & 29 deletions
diff --git a/‎pipeline/common/marian.py‎
Lines changed: 52 additions & 0 deletions b/‎pipeline/common/marian.py‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎pipeline/eval/eval.py‎
Lines changed: 1 addition & 1 deletion b/‎pipeline/eval/eval.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pipeline/translate/translate.py‎
Lines changed: 82 additions & 7 deletions b/‎pipeline/translate/translate.py‎
Lines changed: 82 additions & 7 deletions
@@ -1,8 +1,8 @@
+import os
 import re
 from shlex import join
 import shlex
 import subprocess
-from typing import Union
 
 
 def _get_indented_command_string(command_parts: list[str]) -> str:
@@ -97,7 +97,9 @@ def run_command_pipeline(
     subprocess.check_call(command_string, shell=True)
 
 
-def run_command(command: list[str], capture=False, logger=None) -> str | None:
+def run_command(
+    command: list[str], capture=False, shell=False, logger=None, env=None
+) -> str | None:
     """
     Runs a command and outputs a nice representation of the command to a logger, if supplied.
 
@@ -107,13 +109,17 @@ def run_command(command: list[str], capture=False, logger=None) -> str | None:
         pipeline. If False, output is printed to stdout.
       logger: A logger instance used for logging the command execution. If provided,
         it will log the pipeline commands.
+      env: The environment object.
 
     Example:
       directory_listing = run_command(
         ["ls", "-l"],
         capture=True
       )
     """
+    # Expand any environment variables.
+    command = [os.path.expandvars(part) for part in command]
+
     if logger:
         # Log out a nice representation of this command.
         logger.info("Running:")
@@ -123,30 +129,4 @@ def run_command(command: list[str], capture=False, logger=None) -> str | None:
     if capture:
         return subprocess.check_output(command).decode("utf-8")
 
-    subprocess.check_call(command)
-
-
-def marian_args_to_dict(extra_marian_args: list[str]) -> dict[str, Union[str, list[str]]]:
-    """
-    Converts marian args, to the dict format.
-
-    e.g. `--precision float16` becomes {"precision": "float16"}
-    """
-    decoder_config = {}
-    key = None
-    for arg in extra_marian_args:
-        if arg.startswith("--"):
-            key = arg[2:]
-        elif key:
-            existing_arg = decoder_config.get(key)
-            if existing_arg is None:
-                decoder_config[key] = arg
-            elif isinstance(existing_arg, list):
-                existing_arg.append(arg)
-            else:
-                # Convert these arguments into a list, since there are multiple
-                decoder_config[key] = [existing_arg, arg]
-        else:
-            raise ValueError("Marian args should start with a --key")
-
-    return decoder_config
+    subprocess.check_call(command, env=env)
@@ -0,0 +1,52 @@
+"""
+Common utilities related to working with Marian.
+"""
+
+from pathlib import Path
+
+import yaml
+
+
+def get_combined_config(config_path: Path, extra_marian_args: list[str]) -> dict[str, any]:
+    """
+    Frequently we combine a Marian yml config with extra marian args when running
+    training. To get the final value, add both here.
+
+    Args:
+      config_path: The path to the Marian yml config.
+      extra_marian_args: Command line style Marian arguments,
+        e.g. ["--precision", "float16"]
+
+    Returns:
+      The yml config with the extra marian args applied on top. When a key is in
+      both, the extra marian arg wins. The values from the extra args are not type
+      converted, so they are strings, lists of strings, or True for bare flags.
+    """
+    # Use a context manager so that the file handle is always closed.
+    with config_path.open() as config_file:
+        # An empty yml document loads as None, which can't be unpacked with `**`.
+        yaml_config = yaml.safe_load(config_file) or {}
+
+    return {
+        **yaml_config,
+        **marian_args_to_dict(extra_marian_args),
+    }
+
+
+def marian_args_to_dict(extra_marian_args: list[str]) -> dict:
+    """
+    Converts marian args, to the dict format.
+
+    e.g. `--precision float16` becomes {"precision": "float16"}
+
+    A leading "--" separator is skipped. A key with no value is treated as a flag
+    and is set to True. A key with multiple values is converted to a list of the
+    values. The values are always kept as strings.
+
+    Args:
+      extra_marian_args: Command line style Marian arguments.
+
+    Raises:
+      ValueError: When a value is provided before any `--key`.
+    """
+    decoder_config = {}
+    if extra_marian_args and extra_marian_args[0] == "--":
+        # Take off the initial "--" that separates the marian args from the other args.
+        extra_marian_args = extra_marian_args[1:]
+
+    previous_key = None
+    for arg in extra_marian_args:
+        if arg.startswith("--"):
+            previous_key = arg[2:]
+            # Assume this is a flag until a value is found for it.
+            decoder_config[previous_key] = True
+            continue
+
+        if not previous_key:
+            raise ValueError(
+                f"Expected to have a previous key when converting marian args to a dict: {extra_marian_args}"
+            )
+
+        prev_value = decoder_config.get(previous_key)
+        if prev_value is True:
+            # This is the first value for the key, replace the flag placeholder.
+            decoder_config[previous_key] = arg
+        elif isinstance(prev_value, list):
+            prev_value.append(arg)
+        else:
+            # Convert these arguments into a list, since there are multiple.
+            decoder_config[previous_key] = [prev_value, arg]
+
+    return decoder_config
@@ -65,7 +65,7 @@
         list_existing_group_logs_metrics,
     )
 
-    WANDB_AVAILABLE = True
+    # W&B is only enabled when the tracking module imports and TASKCLUSTER_PROXY_URL is
+    # set. The key must match the variable name exactly, with no surrounding whitespace.
+    WANDB_AVAILABLE = "TASKCLUSTER_PROXY_URL" in os.environ
 except ImportError as e:
     print(f"Failed to import tracking module: {e}")
     WANDB_AVAILABLE = False
 
@@ -3,17 +3,38 @@
 """
 
 import argparse
+from enum import Enum
 from glob import glob
+import os
 from pathlib import Path
 import tempfile
 
+
 from pipeline.common.command_runner import apply_command_args, run_command
 from pipeline.common.datasets import compress, decompress
 from pipeline.common.downloads import count_lines, is_file_empty, write_lines
 from pipeline.common.logging import get_logger
+from pipeline.common.marian import get_combined_config
+from pipeline.translate.translate_ctranslate2 import translate_with_ctranslate2
 
 logger = get_logger(__file__)
 
+DECODER_CONFIG_PATH = Path(__file__).parent / "decoder.yml"
+
+
+class Decoder(Enum):
+    """The translation backends that can be selected with the --decoder argument."""
+
+    marian = "marian"
+    ctranslate2 = "ctranslate2"
+
+
+class Device(Enum):
+    """The devices that can be selected with the --device argument."""
+
+    cpu = "cpu"
+    gpu = "gpu"
+
+
+def get_beam_size(extra_marian_args: list[str]) -> int:
+    """
+    Get the beam size from the decoder.yml combined with the extra marian args.
+
+    Args:
+      extra_marian_args: Command line style Marian arguments, which take precedence
+        over the values in the decoder.yml.
+
+    Raises:
+      KeyError: When "beam-size" is in neither the decoder.yml nor the extra args.
+      ValueError: When the "beam-size" value is not an integer.
+    """
+    # The extra marian args are parsed as strings, e.g. `--beam-size 8` gives "8". Convert
+    # to an int, otherwise multiplying a line count by it would repeat the string.
+    return int(get_combined_config(DECODER_CONFIG_PATH, extra_marian_args)["beam-size"])
+
 
 def run_marian(
     marian_dir: Path,
@@ -30,7 +51,7 @@ def run_marian(
     marian_bin = str(marian_dir / "marian-decoder")
     log = input.parent / f"{input.name}.log"
     if is_nbest:
-        extra_args = ["--nbest", *extra_args]
+        extra_args = ["--n-best", *extra_args]
 
     logger.info("Starting Marian to translate")
 
@@ -52,6 +73,7 @@ def run_marian(
             *extra_args,
         ],
         logger=logger,
+        env={**os.environ},
     )
 
 
@@ -69,6 +91,7 @@ def main() -> None:
         "--models_glob",
         type=str,
         required=True,
+        nargs="+",
         help="A glob pattern to the Marian model(s)",
     )
     parser.add_argument(
@@ -91,6 +114,18 @@ def main() -> None:
         required=True,
         help="The amount of Marian memory (in MB) to preallocate",
     )
+    parser.add_argument(
+        "--decoder",
+        type=Decoder,
+        default=Decoder.marian,
+        help="Either use the normal marian decoder, or opt for CTranslate2.",
+    )
+    parser.add_argument(
+        "--device",
+        type=Device,
+        default=Device.gpu,
+        # The device value is passed through to CTranslate2. For Marian it is only
+        # checked for consistency against the --cpu-threads flag.
+        help="The device to translate on, either cpu or gpu.",
+    )
     parser.add_argument(
         "extra_marian_args",
         nargs=argparse.REMAINDER,
@@ -103,13 +138,19 @@ def main() -> None:
     marian_dir: Path = args.marian_dir
     input_zst: Path = args.input
     artifacts: Path = args.artifacts
-    models_glob: str = args.models_glob
-    models: list[Path] = [Path(path) for path in glob(models_glob)]
+    models_globs: str = args.models_glob
+    models: list[Path] = []
+    for models_glob in models_globs:
+        for path in glob(models_glob):
+            models.append(Path(path))
     postfix = "nbest" if args.nbest else "out"
     output_zst = artifacts / f"{input_zst.stem}.{postfix}.zst"
     vocab: Path = args.vocab
     gpus: list[str] = args.gpus.split(" ")
     extra_marian_args: list[str] = args.extra_marian_args
+    decoder: Decoder = args.decoder
+    is_nbest: bool = args.nbest
+    device: Device = args.device
 
     # Do some light validation of the arguments.
     assert input_zst.exists(), f"The input file exists: {input_zst}"
@@ -118,6 +159,7 @@ def main() -> None:
         artifacts.mkdir()
     for gpu_index in gpus:
         assert gpu_index.isdigit(), f'GPUs must be list of numbers: "{gpu_index}"'
+    assert models, "There must be at least one model"
     for model in models:
         assert model.exists(), f"The model file exists {model}"
     if extra_marian_args and extra_marian_args[0] != "--":
@@ -136,6 +178,29 @@ def main() -> None:
             pass
         return
 
+    if decoder == Decoder.ctranslate2:
+        translate_with_ctranslate2(
+            input_zst=input_zst,
+            artifacts=artifacts,
+            extra_marian_args=extra_marian_args,
+            models_glob=models_glob,
+            is_nbest=is_nbest,
+            vocab=[str(vocab)],
+            device=device.value,
+        )
+        return
+
+    # The device flag is for use with CTranslate, but add some assertions here so that
+    # we can be consistent in usage.
+    if device == Device.cpu:
+        assert (
+            "--cpu-threads" in extra_marian_args
+        ), "Marian's cpu should be controlled with the flag --cpu-threads"
+    else:
+        assert (
+            "--cpu-threads" not in extra_marian_args
+        ), "Requested a GPU device, but --cpu-threads was provided"
+
     # Run the training.
     with tempfile.TemporaryDirectory() as temp_dir_str:
         temp_dir = Path(temp_dir_str)
@@ -152,16 +217,26 @@ def main() -> None:
             output=output_txt,
             gpus=gpus,
             workspace=args.workspace,
-            is_nbest=args.nbest,
+            is_nbest=is_nbest,
             # Take off the initial "--"
             extra_args=extra_marian_args[1:],
         )
-        assert count_lines(input_txt) == count_lines(
-            output_txt
-        ), "The input and output had the same number of lines"
 
         compress(output_txt, destination=output_zst, remove=True, logger=logger)
 
+        input_count = count_lines(input_txt)
+        output_count = count_lines(output_zst)
+        if is_nbest:
+            beam_size = get_beam_size(extra_marian_args)
+            expected_output = input_count * beam_size
+            assert (
+                expected_output == output_count
+            ), f"The nbest output had {beam_size}x as many lines ({expected_output} vs {output_count})"
+        else:
+            assert (
+                input_count == output_count
+            ), f"The input ({input_count} and output ({output_count}) had the same number of lines"
+
 
 if __name__ == "__main__":
     main()
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@`
`65`	`65`	`list_existing_group_logs_metrics,`
`66`	`66`	`)`
`67`	`67`
`68`		`- WANDB_AVAILABLE = True`
	`68`	`+ WANDB_AVAILABLE = "TASKCLUSTER_PROXY_URL" in os.environ`
`69`	`69`	`except ImportError as e:`
`70`	`70`	`print(f"Failed to import tracking module: {e}")`
`71`	`71`	`WANDB_AVAILABLE = False`