Skip to content

Commit 42639fe

Browse files
committed
Use the config.ci.yml for the training defaults
1 parent 9d355d8 commit 42639fe

File tree

6 files changed

+52
-138
lines changed

6 files changed

+52
-138
lines changed

taskcluster/configs/config.ci.yml

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,13 @@ experiment:
99

1010
teacher-ensemble: 1
1111

12-
mono-max-sentences-src: 10000
13-
mono-max-sentences-trg: 10000
14-
spm-sample-size: 10000
12+
mono-max-sentences-src:
13+
total: 10000
14+
per-dataset: 10000
15+
mono-max-sentences-trg:
16+
total: 10000
17+
per-dataset: 10000
18+
spm-sample-size: 1000
1519
spm-vocab-size: 1000
1620

1721
best-model: chrf
@@ -20,50 +24,57 @@ experiment:
2024
opuscleaner-mode: "custom"
2125
teacher-mode: "two-stage"
2226

23-
2427
bicleaner:
2528
default-threshold: 0.5
2629
dataset-thresholds:
2730
opus_ada83/v1: 0.0
2831
opus_ELRC-3075-wikipedia_health/v1: 0.6
2932

33+
min-fluency-threshold:
34+
mono-src: 0.8
35+
mono-trg: 0.9
36+
3037
marian-args:
3138
training-backward:
32-
disp-freq: "1"
33-
save-freq: "5"
34-
valid-freq: "10"
35-
after: 10u
39+
# Run training for 50 updates, displaying progress every 2 updates. Validate the
40+
# model only once, and save it at updates 25 and 50.
41+
disp-freq: "2"
42+
save-freq: "25"
43+
valid-freq: "50"
44+
after: 50u
3645
dim-vocabs: "1000 1000"
3746
training-teacher:
3847
disp-freq: "1"
39-
save-freq: "5"
40-
valid-freq: "10"
41-
after: 10u
48+
save-freq: "25"
49+
valid-freq: "50"
50+
after: 50u
4251
dim-vocabs: "1000 1000"
4352
task: transformer-base
4453
training-student:
4554
disp-freq: "1"
46-
save-freq: "5"
47-
valid-freq: "10"
48-
after: 10u
55+
save-freq: "25"
56+
valid-freq: "50"
57+
after: 50u
4958
dim-vocabs: "1000 1000"
5059
training-student-finetuned:
5160
disp-freq: "1"
52-
save-freq: "5"
53-
valid-freq: "10"
54-
after: 10u
55-
dim-vocabs: "1000 1000"
61+
save-freq: "25"
62+
valid-freq: "50"
63+
after: 50u
64+
dim-vocabs: "1000 1000"
5665
decoding-backward:
5766
mini-batch-words: "2000"
5867
decoding-teacher:
5968
mini-batch-words: "1000"
6069
precision: float16
6170

71+
# Ensure that we have adequate coverage for dataset types in CI.
6272
datasets:
6373
train:
6474
- opus_ada83/v1
6575
- opus_ELRC-3075-wikipedia_health/v1
6676
- url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst
77+
- mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus
6778
devtest:
6879
- flores_dev
6980
- sacrebleu_aug-upper_wmt19
@@ -76,7 +87,8 @@ datasets:
7687
- news-crawl_news.2007
7788
- opus_tldr-pages/v2023-08-29 # 39,646 sentences
7889

79-
wandb-publication: false
90+
# Publishes to the "ci" project.
91+
wandb-publication: true
8092
target-stage: all
8193
taskcluster:
8294
split-chunks: 2

taskcluster/test/test_default_params.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
from taskgraph.taskgraph import TaskGraph
44

5-
from translations_taskgraph.parameters import get_defaults
5+
from translations_taskgraph.parameters import get_ci_training_config
66

7-
PARAMS = deepcopy(get_defaults(None))
7+
PARAMS = deepcopy(get_ci_training_config())
88
PARAMS["target_tasks_method"] = "train-target-tasks"
99

1010
MOCK_REQUESTS = [

taskcluster/test/test_target_stage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
from taskgraph.taskgraph import TaskGraph
44

5-
from translations_taskgraph.parameters import get_defaults
5+
from translations_taskgraph.parameters import get_ci_training_config
66

7-
PARAMS = deepcopy(get_defaults(None))
7+
PARAMS = deepcopy(get_ci_training_config())
88
PARAMS["target_tasks_method"] = "train-target-tasks"
99
PARAMS["training_config"]["target-stage"] = "train-teacher"
1010

taskcluster/test/test_training_continuation_backwards.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
from taskgraph.taskgraph import TaskGraph
44

5-
from translations_taskgraph.parameters import get_defaults
5+
from translations_taskgraph.parameters import get_ci_training_config
66

7-
PARAMS = deepcopy(get_defaults(None))
7+
PARAMS = deepcopy(get_ci_training_config())
88
PARAMS["target_tasks_method"] = "train-target-tasks"
99
PARAMS["training_config"]["experiment"]["pretrained-models"] = {
1010
"train-backwards": {

taskcluster/translations_taskgraph/actions/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from taskgraph.taskgraph import TaskGraph
1111
from taskgraph.util.taskcluster import get_ancestors, get_artifact
1212

13-
from translations_taskgraph.parameters import get_defaults
13+
from translations_taskgraph.parameters import get_ci_training_config
1414

1515
logger = logging.getLogger(__name__)
1616

@@ -34,7 +34,7 @@ def can_train(parameters):
3434
)
3535

3636

37-
defaults = get_defaults("")["training_config"]
37+
defaults = get_ci_training_config()["training_config"]
3838

3939

4040
def validate_pretrained_models(params):

taskcluster/translations_taskgraph/parameters.py

Lines changed: 13 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -2,119 +2,21 @@
22
# License, v. 2.0. If a copy of the MPL was not distributed with this
33
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
44

5+
from pathlib import Path
56
from taskgraph.parameters import extend_parameters_schema
67
from voluptuous import Extra, Optional, Required
8+
import yaml
79

810

9-
# These defaults line up with the `config.ci.yml` pipeline as much as possible.
10-
# Their purpose is to provide a minimal config with a few datasets that can run
11-
# the entire pipeline reasonably quickly to validate changes to the pipeline
12-
# itself. Any real training should be overriding most, if not all, of these
13-
# via the input to the `train` action.
14-
def get_defaults(_) -> dict:
15-
return {
16-
"training_config": {
17-
"target-stage": "all",
18-
"experiment": {
19-
"name": "ci",
20-
"src": "ru",
21-
"trg": "en",
22-
"teacher-ensemble": 1,
23-
"teacher-mode": "two-stage",
24-
"mono-max-sentences-trg": {"total": 10000, "per-dataset": 10000},
25-
"mono-max-sentences-src": {"total": 10000, "per-dataset": 10000},
26-
"spm-sample-size": 10000,
27-
"spm-vocab-size": 1000,
28-
"best-model": "chrf",
29-
"use-opuscleaner": "true",
30-
"opuscleaner-mode": "custom",
31-
"bicleaner": {
32-
"default-threshold": 0.5,
33-
"dataset-thresholds": {
34-
"opus_ada83/v1": 0.0,
35-
"opus_ELRC-3075-wikipedia_health/v1": 0.6,
36-
},
37-
},
38-
"min-fluency-threshold": {
39-
"mono-src": 0.8,
40-
"mono-trg": 0.9,
41-
},
42-
},
43-
"marian-args": {
44-
"training-backward": {
45-
"disp-freq": "2",
46-
"save-freq": "25",
47-
"valid-freq": "50",
48-
"after": "50u",
49-
"dim-vocabs": "1000 1000",
50-
},
51-
"training-teacher": {
52-
"disp-freq": "1",
53-
"save-freq": "25",
54-
"valid-freq": "50",
55-
"after": "50u",
56-
"dim-vocabs": "1000 1000",
57-
"task": "transformer-base",
58-
},
59-
"training-student": {
60-
"disp-freq": "1",
61-
"save-freq": "25",
62-
"valid-freq": "50",
63-
"after": "50u",
64-
"dim-vocabs": "1000 1000",
65-
},
66-
"training-student-finetuned": {
67-
"disp-freq": "1",
68-
"save-freq": "25",
69-
"valid-freq": "50",
70-
"after": "50u",
71-
"dim-vocabs": "1000 1000",
72-
},
73-
"decoding-backward": {
74-
"mini-batch-words": "2000",
75-
},
76-
"decoding-teacher": {
77-
"mini-batch-words": "1000",
78-
"precision": "float16",
79-
},
80-
},
81-
# These will never be used in practice, but specifying them ensures
82-
# that we always generate at least one task for each kind, which helps
83-
# to avoid bustage that doesn't show up until we run the training action.
84-
"datasets": {
85-
"train": [
86-
"opus_ada83/v1",
87-
"opus_ELRC-3075-wikipedia_health/v1",
88-
"url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst",
89-
"mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
90-
],
91-
"devtest": [
92-
"flores_dev",
93-
"sacrebleu_aug-upper_wmt19",
94-
],
95-
"test": [
96-
"flores_devtest",
97-
],
98-
"mono-src": [
99-
"news-crawl_news.2008",
100-
"opus_tldr-pages/v2023-08-29",
101-
],
102-
"mono-trg": [
103-
"news-crawl_news.2007",
104-
"opus_tldr-pages/v2023-08-29",
105-
],
106-
},
107-
# Taskcluster-specific configuration
108-
"taskcluster": {
109-
"split-chunks": 2,
110-
"worker-classes": {
111-
"default": "gcp-spot",
112-
},
113-
},
114-
# Disable Weight & Biases publication on CI
115-
"wandb-publication": True,
116-
},
117-
}
11+
# By default, provide a very minimal config for CI that runs very quickly. This allows
12+
# the pipeline to be validated in CI. The production training configs should override
13+
# all of these values.
14+
def get_ci_training_config(_=None) -> dict:
15+
vcs_path = (Path(__file__).parent / "../..").resolve()
16+
config_path = vcs_path / "taskcluster/configs/config.ci.yml"
17+
18+
with config_path.open() as file:
19+
return {"training_config": yaml.safe_load(file)}
11820

11921

12022
extend_parameters_schema(
@@ -184,7 +86,7 @@ def get_defaults(_) -> dict:
18486
Optional("wandb-publication"): bool,
18587
},
18688
},
187-
defaults_fn=get_defaults,
89+
defaults_fn=get_ci_training_config,
18890
)
18991

19092

@@ -198,4 +100,4 @@ def deep_setdefault(dict_, defaults):
198100

199101
def get_decision_parameters(graph_config, parameters):
200102
parameters.setdefault("training_config", {})
201-
deep_setdefault(parameters, get_defaults(""))
103+
deep_setdefault(parameters, get_ci_training_config())

0 commit comments

Comments
 (0)