Skip to content

Commit c699c75

Browse files
authored
Add TensorFlow 2.14 tests for TPU VM. (#972)
* Add TensorFlow 2.14 tests for TPU VM. * Setting the version in additional places. Also adding to dashboard.
1 parent df26987 commit c699c75

13 files changed

+1190
-1
lines changed

dashboard/app.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ resources:
3333
env_variables:
3434
REDISHOST: '10.25.27.107'
3535
REDISPORT: '6379'
36-
TEST_NAME_PREFIXES: 'pt-nightly,pt-2.0,tf-r2.14.0,tf.nightly,tf.nightly-se,tf.exp,%-1vm,jax,flax,pax-stable,pax-nightly,mp-jax,mp-pax,mp-pt'
36+
TEST_NAME_PREFIXES: 'pt-nightly,pt-2.0,tf.2.14.0,tf.nightly,tf.nightly-se,tf.exp,%-1vm,jax,flax,pax-stable,pax-nightly,mp-jax,mp-pax,mp-pt'
3737
JOB_HISTORY_TABLE_NAME: 'xl-ml-test.metrics_handler_dataset.job_history'
3838
METRIC_HISTORY_TABLE_NAME: 'xl-ml-test.metrics_handler_dataset.metric_history'
3939

Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
// Copyright 2020 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
local common = import '../common.libsonnet';
16+
local experimental = import '../experimental.libsonnet';
17+
local metrics = import 'templates/metrics.libsonnet';
18+
local mixins = import 'templates/mixins.libsonnet';
19+
local utils = import 'templates/utils.libsonnet';
20+
local volumes = import 'templates/volumes.libsonnet';
21+
22+
{
23+
// Base config for HuggingFace Transformers tests on TensorFlow 2.14.0.
// Extends common.ModelGardenTest, pins the framework prefix, TPU software
// version and image tag to 2.14.0, and supplies an `initialSetup` script that
// clones huggingface/transformers into /tmp, pip-installs it, and installs the
// TensorFlow example test requirements.
HuggingFaceTransformer:: common.ModelGardenTest {
26+
frameworkPrefix: 'tf.2.14.0',
27+
tpuSettings+: {
28+
softwareVersion: '2.14.0',
29+
},
30+
imageTag: 'r2.14.0',
31+
script: {
32+
initialSetup:
33+
|||
34+
cd /tmp
35+
git clone https://github.com/huggingface/transformers.git
36+
cd transformers
37+
pip install .
38+
pip install -r examples/tensorflow/_tests_requirements.txt
39+
|||,
40+
},
41+
},
42+
ModelGardenTest:: common.ModelGardenTest {
43+
local config = self,
44+
45+
frameworkPrefix: 'tf.2.14.0',
46+
tpuSettings+: {
47+
softwareVersion: '2.14.0',
48+
},
49+
imageTag: 'r2.14.0',
50+
podTemplate+:: if config.accelerator.type == 'tpu' then
51+
{
52+
spec+: {
53+
initContainerMap+:: {
54+
// Init container that runs in the same image as the `train` container.
// It prints the Python and TensorFlow versions, asks the TPU (through
// cloud_tpu_client) to switch runtime version, waits for the TPU to report
// healthy, and prints the version details returned by the TPU's
// `requestversion` endpoint.
'tpu-version': {
55+
image: config.podTemplate.spec.containerMap.train.image,
56+
env+: [
57+
{
58+
name: 'TPU_NAME',
59+
valueFrom: {
60+
fieldRef: {
61+
fieldPath: "metadata.annotations['name.cloud-tpus.google.com/train']",
62+
},
63+
},
64+
},
65+
{
66+
name: 'POD_UID',
67+
valueFrom: {
68+
fieldRef: {
69+
fieldPath: 'metadata.uid',
70+
},
71+
},
72+
},
73+
],
81+
// The script splits $(TPU_NAME) with os.path.dirname/basename, so the
// annotation value is presumably of the form `<zone>/<tpu-name>` -- confirm.
// `urllib.request` is imported explicitly: a bare `import urllib` does not
// guarantee that the `request` submodule is loaded.
// NOTE(review): the script requests the 'nightly' runtime although this config
// pins tpuSettings.softwareVersion to '2.14.0' -- confirm this is intended.
command: [
82+
'python3',
83+
'-c',
84+
|||
85+
import os
86+
import tensorflow as tf
87+
import urllib.request
88+
import json
89+
import cloud_tpu_client
90+
import sys
91+
print('python version: ' + str(sys.version))
92+
print('tf_version: ' + str(tf.__version__))
93+
#TODO(chandrasekhard):
94+
# Add extra condition to fail if it picks stale image
95+
print(str(tf.__file__))
96+
ctc = cloud_tpu_client.Client(tpu=os.path.basename('$(TPU_NAME)'), zone=os.path.dirname('$(TPU_NAME)'))
97+
ctc.wait_for_healthy()
98+
ctc.configure_tpu_version('nightly', restart_type='always')
99+
ctc.wait_for_healthy()
100+
_VERSION_SWITCHER_ENDPOINT = 'http://{}:8475/requestversion'
101+
url = _VERSION_SWITCHER_ENDPOINT.format(ctc.network_endpoints()[0]['ipAddress'])
102+
req = urllib.request.Request(url)
103+
resp = urllib.request.urlopen(req)
104+
version_details = json.loads(resp.read())
105+
print(version_details)
106+
|||,
107+
],
108+
},
109+
},
110+
},
111+
}
112+
else
113+
{},
114+
},
115+
tpuVm:: experimental.TensorFlowTpuVmMixin {
116+
local config = self,
117+
// Runtime version used by the create-tpu init container: it is sent as
// `runtime_version` in the create request, and the script checks for a
// trailing "pod" to decide whether to patch the TPU runtime service.
// A single replica selects 'tpu-vm-tf-2.13.0'; anything else selects the
// '-pod' variant.
// NOTE(review): these are 2.13.0 runtime names inside a 2.14.0 config. The
// create request also passes a tpu-vm-tf-2-14-0 boot disk image and a
// tf-2.14.0 docker URL, so 2.13.0 is presumably a deliberate base -- confirm.
tpuSettings+: {
118+
softwareVersion: if config.accelerator.replicas == 1 then
119+
'tpu-vm-tf-2.13.0'
120+
else
121+
'tpu-vm-tf-2.13.0-pod',
122+
},
123+
podTemplate+:: {
124+
spec+: {
125+
initContainerMap+:: {
126+
'create-tpu'+: {
127+
local tpuCreateSettings = {
128+
acceleratorName: std.escapeStringBash(config.accelerator.name),
129+
softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion),
130+
startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript),
131+
sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds,
132+
testName: std.strReplace(config.testName, '.', '-'),
133+
},
134+
command: utils.scriptCommand(|||
135+
project=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/project/project-id" -H "Metadata-Flavor: Google")
136+
zone=$(curl -sS "http://metadata.google.internal/computeMetadata/v1/instance/zone" -H "Metadata-Flavor: Google" | awk -F'/' '{print $4}')
137+
tpu_name=tpu-${POD_UID}
138+
ssh-keygen -t rsa -f /scripts/id_rsa -q -N ""
139+
echo "
140+
gcloud alpha compute tpus tpu-vm delete -q ${tpu_name} --zone=${zone}
141+
" > /scripts/cleanup.sh
142+
143+
curl -X POST \
144+
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
145+
-H "Content-Type: application/json" \
146+
-d "{
147+
accelerator_type: %(acceleratorName)s,
148+
runtime_version: %(softwareVersion)s,
149+
network_config: {enable_external_ips: true},
150+
labels: {test_name: '%(testName)s' },
151+
boot_disk: {source_image: 'projects/cloud-tpu-v2-images-dev/global/images/family/tpu-vm-tf-2-14-0'},
152+
metadata: {
153+
'ssh-keys': 'xl-ml-test:$(cat /scripts/id_rsa.pub)',
154+
'startup-script': %(startupScript)s,
155+
'tensorflow-docker-url': 'gcr.io/cloud-tpu-v2-images-dev/grpc_tpu_worker:tf-2.14.0'
156+
}
157+
}" https://tpu.googleapis.com/v2alpha1/projects/${project}/locations/${zone}/nodes?node_id=${tpu_name}
158+
echo "Waiting for TPU Pod ${tpu_name} to become ready..."
159+
timeout 10m bash -c -- "
160+
while [[ \${health:-NONE} != READY ]];
161+
do sleep 60 && \
162+
health=\$(gcloud \
163+
--project=${project} \
164+
compute \
165+
tpus \
166+
describe \
167+
${tpu_name} \
168+
--zone=${zone} \
169+
--format='value(state)') && \
170+
echo 'Waiting for ready TPU (current state \${health:-NONE})...';
171+
done
172+
"
173+
echo ${zone} > /scripts/zone
174+
echo ${tpu_name} > /scripts/tpu_name
175+
gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --format="value(networkEndpoints[0].ipAddress)" > /scripts/tpu_ip
176+
gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --flatten="networkEndpoints[]" --format="csv[no-heading](networkEndpoints.ipAddress)" > /scripts/all_tpu_ips
177+
sleep %(sleepTime)d
178+
179+
softwareVersion=%(softwareVersion)s
180+
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "echo 'WRAPT_DISABLE_EXTENSIONS=true' | sudo tee -a /etc/environment"
181+
if [[ ${softwareVersion: -3} == "pod" ]]; then
182+
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo sed -i 's/TF_DOCKER_URL=.*/TF_DOCKER_URL=gcr.io\/cloud-tpu-v2-images-dev\/grpc_tpu_worker:tf-2.14.0\"/' /etc/systemd/system/tpu-runtime.service"
183+
gcloud alpha compute tpus tpu-vm ssh ${tpu_name} --zone=${zone} --project=${project} --internal-ip --ssh-key-file=/scripts/id_rsa --worker=all --command "sudo systemctl daemon-reload && sudo systemctl restart tpu-runtime"
184+
fi
185+
||| % tpuCreateSettings),
186+
},
187+
'tpu-version': {
188+
image: 'google/cloud-sdk',
189+
command: null,
190+
},
191+
},
192+
},
193+
},
194+
},
195+
TfVisionTest:: self.ModelGardenTest + common.TfNlpVisionMixin {
196+
scriptConfig+: {
197+
runnerPath: 'official/vision/train.py',
198+
},
199+
},
200+
TfNlpTest:: self.ModelGardenTest + common.TfNlpVisionMixin {
201+
scriptConfig+: {
202+
runnerPath: 'official/nlp/train.py',
203+
},
204+
},
205+
TfRankingTest:: self.ModelGardenTest {
206+
paramsOverride:: {
207+
runtime: {
208+
distribution_strategy: error 'Must set `runtime.distribution_strategy`',
209+
},
210+
task: {
211+
train_data: {
212+
input_path: '$(CRITEO_DATA_DIR)/train/*',
213+
global_batch_size: 16384,
214+
},
215+
validation_data: {
216+
input_path: '$(CRITEO_DATA_DIR)/eval/*',
217+
global_batch_size: 16384,
218+
},
219+
model: {
220+
num_dense_features: 13,
221+
bottom_mlp: [512, 256, 64],
222+
embedding_dim: 64,
223+
top_mlp: [1024, 1024, 512, 256, 1],
224+
vocab_sizes: [
225+
39884406,
226+
39043,
227+
17289,
228+
7420,
229+
20263,
230+
3,
231+
7120,
232+
1543,
233+
63,
234+
38532951,
235+
2953546,
236+
403346,
237+
10,
238+
2208,
239+
11938,
240+
155,
241+
4,
242+
976,
243+
14,
244+
39979771,
245+
25641295,
246+
39664984,
247+
585935,
248+
12972,
249+
108,
250+
36,
251+
],
252+
},
253+
},
254+
trainer: {
255+
use_orbit: true,
256+
validation_interval: 90000,
257+
checkpoint_interval: 270000,
258+
validation_steps: 5440,
259+
train_steps: 256054,
260+
optimizer_config: {
261+
embedding_optimizer: 'SGD',
262+
lr_config: {
263+
decay_exp: 1.6,
264+
decay_start_steps: 150000,
265+
decay_steps: 136054,
266+
learning_rate: 30,
267+
warmup_steps: 8000,
268+
},
269+
},
270+
},
271+
},
272+
command: [
273+
'python3',
274+
'official/recommendation/ranking/train.py',
275+
'--params_override=%s' % (std.manifestYamlDoc(self.paramsOverride) + '\n'),
276+
'--model_dir=$(MODEL_DIR)',
277+
],
278+
},
279+
imagenet:: {
280+
scriptConfig+: {
281+
trainFilePattern: '$(IMAGENET_DIR)/train*',
282+
evalFilePattern: '$(IMAGENET_DIR)/valid*',
283+
},
284+
},
285+
coco:: {
286+
scriptConfig+: {
287+
trainFilePattern: '$(COCO_DIR)/train*',
288+
evalFilePattern: '$(COCO_DIR)/val*',
289+
paramsOverride+: {
290+
task+: {
291+
annotation_file: '$(COCO_DIR)/instances_val2017.json',
292+
},
293+
},
294+
},
295+
},
296+
local functional_schedule = '0 9 * * *',
297+
// Functional-test mixin layered on mixins.Functional.
// Schedule: `functional_schedule` is used when the accelerator is not a TPU,
// or when it is named 'v3-8' or 'v4-8'; every other accelerator gets a null
// schedule.
// Metrics: adds a tensorboard aggregate assertion on the AVERAGE of
// `examples_per_second`, using a std_devs_from_mean check with comparison
// 'GREATER' and 4.0 standard deviations. (Exact pass/fail semantics are
// defined by the metrics handler, not visible here.)
Functional:: mixins.Functional {
298+
schedule: if !(self.accelerator.type == 'tpu') || self.accelerator.name == 'v3-8' || self.accelerator.name == 'v4-8' then
299+
functional_schedule
300+
else
301+
null,
302+
metricConfig+: {
303+
sourceMap+:: {
304+
tensorboard+: {
305+
aggregateAssertionsMap+:: {
306+
examples_per_second: {
307+
AVERAGE: {
308+
inclusive_bounds: true,
309+
std_devs_from_mean: {
310+
comparison: 'GREATER',
311+
std_devs: 4.0,
312+
},
313+
wait_for_n_data_points: 0,
314+
},
315+
},
316+
},
317+
},
318+
},
319+
},
320+
},
321+
// Override default schedule for Functional.
322+
RunNightly:: {
323+
schedule: functional_schedule,
324+
},
325+
// Convergence-test mixin layered on mixins.Convergence.
// Schedule: cron expression '0 5 * * 0,2,4' (05:00 on days-of-week 0, 2 and 4).
// Metrics: same `examples_per_second` AVERAGE assertion shape as Functional,
// but with a tighter bound of 2.0 standard deviations.
Convergence:: mixins.Convergence {
326+
schedule: '0 5 * * 0,2,4',
327+
metricConfig+: {
328+
sourceMap+:: {
329+
tensorboard+: {
330+
aggregateAssertionsMap+:: {
331+
examples_per_second: {
332+
AVERAGE: {
333+
inclusive_bounds: true,
334+
std_devs_from_mean: {
335+
comparison: 'GREATER',
336+
// TODO(wcromar): Tighten this restriction
337+
std_devs: 2.0,
338+
},
339+
wait_for_n_data_points: 0,
340+
},
341+
},
342+
},
343+
},
344+
},
345+
},
346+
},
347+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2020 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
local bert = import 'tf-bert-glue_mnli.libsonnet';
16+
local dlrm = import 'tf-dlrm-criteo.libsonnet';
17+
local gpt2 = import 'tf-gpt2-wikitext.libsonnet';
18+
local keras_api = import 'tf-keras-api.libsonnet';
19+
local maskrcnn = import 'tf-maskrcnn-coco.libsonnet';
20+
local resnet = import 'tf-resnet-imagenet.libsonnet';
21+
local resnetrs = import 'tf-resnetrs-imagenet.libsonnet';
22+
local retinanet = import 'tf-retinanet-coco.libsonnet';
23+
local wmt = import 'tf-wmt-wmt14_translate.libsonnet';
24+
25+
// Add new models here
26+
std.flattenArrays([
27+
// dlrm.configs,
28+
keras_api.configs,
29+
// bert.configs,
30+
// wmt.configs,
31+
// maskrcnn.configs,
32+
// retinanet.configs,
33+
resnet.configs,
34+
// resnetrs.configs,
35+
// gpt2.configs,
36+
])

0 commit comments

Comments
 (0)