# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

+ from pathlib import Path
from taskgraph.parameters import extend_parameters_schema
from voluptuous import Extra, Optional, Required
+ import yaml


- # These defaults line up with the `config.ci.yml` pipeline as much as possible.
- # Their purpose is to provide a minimal config with a few datasets that can run
- # the entire pipeline reasonably quickly to validate changes to the pipeline
- # itself. Any real training should be overriding most, if not all, of these
- # via the input to the `train` action.
- def get_defaults(_) -> dict:
-     return {
-         "training_config": {
-             "target-stage": "all",
-             "experiment": {
-                 "name": "ci",
-                 "src": "ru",
-                 "trg": "en",
-                 "teacher-ensemble": 1,
-                 "teacher-mode": "two-stage",
-                 "mono-max-sentences-trg": {"total": 10000, "per-dataset": 10000},
-                 "mono-max-sentences-src": {"total": 10000, "per-dataset": 10000},
-                 "spm-sample-size": 10000,
-                 "spm-vocab-size": 1000,
-                 "best-model": "chrf",
-                 "use-opuscleaner": "true",
-                 "opuscleaner-mode": "custom",
-                 "bicleaner": {
-                     "default-threshold": 0.5,
-                     "dataset-thresholds": {
-                         "opus_ada83/v1": 0.0,
-                         "opus_ELRC-3075-wikipedia_health/v1": 0.6,
-                     },
-                 },
-                 "min-fluency-threshold": {
-                     "mono-src": 0.8,
-                     "mono-trg": 0.9,
-                 },
-             },
-             "marian-args": {
-                 "training-backward": {
-                     "disp-freq": "2",
-                     "save-freq": "25",
-                     "valid-freq": "50",
-                     "after": "50u",
-                     "dim-vocabs": "1000 1000",
-                 },
-                 "training-teacher": {
-                     "disp-freq": "1",
-                     "save-freq": "25",
-                     "valid-freq": "50",
-                     "after": "50u",
-                     "dim-vocabs": "1000 1000",
-                     "task": "transformer-base",
-                 },
-                 "training-student": {
-                     "disp-freq": "1",
-                     "save-freq": "25",
-                     "valid-freq": "50",
-                     "after": "50u",
-                     "dim-vocabs": "1000 1000",
-                 },
-                 "training-student-finetuned": {
-                     "disp-freq": "1",
-                     "save-freq": "25",
-                     "valid-freq": "50",
-                     "after": "50u",
-                     "dim-vocabs": "1000 1000",
-                 },
-                 "decoding-backward": {
-                     "mini-batch-words": "2000",
-                 },
-                 "decoding-teacher": {
-                     "mini-batch-words": "1000",
-                     "precision": "float16",
-                 },
-             },
-             # These will never be used in practice, but specifying them ensures
-             # that we always generate at least one task for each kind, which helps
-             # to avoid bustage that doesn't show up until we run the training action.
-             "datasets": {
-                 "train": [
-                     "opus_ada83/v1",
-                     "opus_ELRC-3075-wikipedia_health/v1",
-                     "url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst",
-                     "mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus",
-                 ],
-                 "devtest": [
-                     "flores_dev",
-                     "sacrebleu_aug-upper_wmt19",
-                 ],
-                 "test": [
-                     "flores_devtest",
-                 ],
-                 "mono-src": [
-                     "news-crawl_news.2008",
-                     "opus_tldr-pages/v2023-08-29",
-                 ],
-                 "mono-trg": [
-                     "news-crawl_news.2007",
-                     "opus_tldr-pages/v2023-08-29",
-                 ],
-             },
-             # Taskcluster-specific configuration
-             "taskcluster": {
-                 "split-chunks": 2,
-                 "worker-classes": {
-                     "default": "gcp-spot",
-                 },
-             },
-             # Disable Weight & Biases publication on CI
-             "wandb-publication": True,
-         },
-     }
+ # By default, provide a very minimal config for CI that runs very quickly. This allows
+ # the pipeline to be validated in CI. The production training configs should override
+ # all of these values.
+ def get_ci_training_config(_=None) -> dict:
+     vcs_path = (Path(__file__).parent / "../..").resolve()
+     config_path = vcs_path / "taskcluster/configs/config.ci.yml"
+
+     with config_path.open() as file:
+         return {"training_config": yaml.safe_load(file)}


extend_parameters_schema(
@@ -184,7 +86,7 @@ def get_defaults(_) -> dict:
            Optional("wandb-publication"): bool,
        },
    },
-     defaults_fn=get_defaults,
+     defaults_fn=get_ci_training_config,
)


@@ -198,4 +100,4 @@ def deep_setdefault(dict_, defaults):

def get_decision_parameters(graph_config, parameters):
    parameters.setdefault("training_config", {})
-     deep_setdefault(parameters, get_defaults(""))
+     deep_setdefault(parameters, get_ci_training_config())
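
Not part of the patch: a minimal sketch of how the new loader could be exercised from a
repository checkout. The module path (translations_taskgraph.parameters) and the exact
keys inside config.ci.yml are assumptions here; the diff only confirms that the parsed
YAML is wrapped under a single "training_config" key.

from translations_taskgraph.parameters import get_ci_training_config

params = get_ci_training_config()
# Confirmed by the diff: the parsed YAML is returned under "training_config".
assert set(params) == {"training_config"}
# Assumption: config.ci.yml parses to a mapping of pipeline settings.
assert isinstance(params["training_config"], dict)
print(sorted(params["training_config"]))  # e.g. "datasets", "experiment", ... (assumed keys)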