Commit d4e0a09

testing fix

1 parent 1688073 commit d4e0a09

8 files changed: +187 -68 lines changed

cloudbuild/Dockerfile

Lines changed: 5 additions & 1 deletion
@@ -22,11 +22,15 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
 RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools python3-pip bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container
 RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_version} 1 && \
     update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
+
+# Install Python dependencies
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install -r /init-actions/requirements.txt

cloudbuild/run-presubmit-on-k8s.sh

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ spec:
     image: "${IMAGE}"
     resources:
       requests:
-        memory: "4G"
+        memory: "8G"
         cpu: "6000m"
     env:
     - name: COMMIT_SHA

gpu/BUILD

Lines changed: 13 additions & 1 deletion
@@ -2,6 +2,17 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["install_gpu_driver.sh", "mig.sh"])
 
+py_library(
+    name = "gpu_test_case_base",
+    srcs = ["gpu_test_case_base.py"],
+    srcs_version = "PY3",
+    testonly = True,
+    deps = [
+        "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "test_gpu",
     size = "enormous",
@@ -10,7 +21,8 @@ py_test(
     local = True,
     shard_count = 15,
     deps = [
+        ":gpu_test_case_base",
         "//integration_tests:dataproc_test_case",
         "@io_abseil_py//absl/testing:parameterized",
     ],
-)
+)
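The new py_library lets test modules share the GPU verification helpers instead of duplicating them. test_gpu.py itself is not part of this commit, so the snippet below is only a minimal sketch of how a test might consume the new target, assuming it subclasses GpuTestCaseBase; createCluster and getClusterName are assumed helpers inherited from DataprocTestCase and their exact signatures may differ in the real suite.

import unittest

from absl.testing import parameterized

from gpu.gpu_test_case_base import GpuTestCaseBase


class NvidiaGpuDriverTestCase(GpuTestCaseBase, parameterized.TestCase):

    def test_install_gpu(self):
        # Hypothetical: bring up a cluster with the GPU init action, then
        # reuse the shared verification helpers on the master node.
        self.createCluster("STANDARD", "gpu/install_gpu_driver.sh")
        master = self.getClusterName() + "-m"
        self.verify_instance(master)             # nvidia-smi works
        self.verify_instance_gpu_agent(master)   # GPU utilization agent active


if __name__ == "__main__":
    unittest.main()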

gpu/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-ge
 WORKDIR /init-actions
 
 USER ia-tests
+COPY --chown=ia-tests:ia-tests "cloudbuild/key.json" /key.json
 COPY --chown=ia-tests:ia-tests . ${WORKDIR}
 
 ENTRYPOINT ["/bin/bash"]

gpu/gpu_test_case_base.py

Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+import os
+import time
+import random
+from packaging import version
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+DEFAULT_TIMEOUT = 45  # minutes
+
+class GpuTestCaseBase(DataprocTestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run_dataproc_job(self,
+                         cluster_name,
+                         job_type,
+                         job_params,
+                         timeout_in_minutes=DEFAULT_TIMEOUT):
+        """Executes Dataproc job on a cluster and returns results.
+
+        Args:
+            cluster_name: cluster name to submit job to
+            job_type: type of the job, e.g. spark, hadoop, pyspark
+            job_params: job parameters
+            timeout_in_minutes: timeout in minutes
+
+        Returns:
+            ret_code: the return code of the job
+            stdout: standard output of the job
+            stderr: error output of the job
+        """
+
+        ret_code, stdout, stderr = DataprocTestCase.run_command(
+            'gcloud dataproc jobs submit {} --cluster={} --region={} {}'.
+            format(job_type, cluster_name, self.REGION,
+                   job_params), timeout_in_minutes)
+        return ret_code, stdout, stderr
+
+    # Tests for PyTorch
+    TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
+
+    # Tests for TensorFlow
+    TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
+
+    def assert_instance_command(self,
+                                instance,
+                                cmd,
+                                timeout_in_minutes=DEFAULT_TIMEOUT):
+        retry_count = 5
+        # Escape embedded double quotes so cmd survives the --command="..." wrapper
+        ssh_cmd = 'gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60 -o StrictHostKeyChecking=no'.format(
+            instance, self.cluster_zone, cmd.replace('"', '\\"'))
+
+        while retry_count > 0:
+            try:
+                # Use self.assert_command from DataprocTestCase
+                ret_code, stdout, stderr = self.assert_command(ssh_cmd, timeout_in_minutes)
+                return ret_code, stdout, stderr
+            except Exception as e:
+                print(f"An error occurred in assert_instance_command: {e}")
+                retry_count -= 1
+                if retry_count > 0:
+                    print("Retrying in 10 seconds...")
+                    time.sleep(10)
+                    continue
+                else:
+                    print("Max retries reached.")
+                    raise
+
+    def verify_instance(self, name):
+        # Verify that nvidia-smi works
+        self.assert_instance_command(name, "nvidia-smi", 1)
+        print(f"OK: nvidia-smi on {name}")
+
+    def verify_instance_gpu_agent(self, name):
+        print(f"--- Verifying GPU Agent on {name} ---")
+        self.assert_instance_command(
+            name, "systemctl is-active gpu-utilization-agent.service")
+        print(f"OK: GPU Agent on {name}")
+
+    def get_dataproc_image_version(self, instance):
+        _, stdout, _ = self.assert_instance_command(instance, "grep DATAPROC_IMAGE_VERSION /etc/environment | cut -d= -f2")
+        return stdout.strip()
+
+    def version_lt(self, v1, v2):
+        return version.parse(v1) < version.parse(v2)
+
+    def verify_pytorch(self, name):
+        print(f"--- Verifying PyTorch on {name} ---")
+        test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
+                                     self.TORCH_TEST_SCRIPT_FILE_NAME)
+        self.upload_test_file(test_filename, name)
+
+        image_version = self.get_dataproc_image_version(name)
+        conda_root_path = "/opt/conda/miniconda3"
+        if not self.version_lt(image_version, "2.3"):
+            conda_root_path = "/opt/conda"
+
+        conda_env = "dpgce"
+        env_path = f"{conda_root_path}/envs/{conda_env}"
+        python_bin = f"{env_path}/bin/python3"
+
+        verify_cmd = (
+            f"for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node; do "
+            f"  if [[ -e \\\"$f\\\" ]]; then echo 0 > \\\"$f\\\"; fi; "
+            f"done; "
+            f"if /usr/share/google/get_metadata_value attributes/include-pytorch; then"
+            f"  {python_bin} {self.TORCH_TEST_SCRIPT_FILE_NAME}; "
+            f"else echo 'PyTorch test skipped as include-pytorch is not set'; fi"
+        )
+        _, stdout, _ = self.assert_instance_command(name, verify_cmd)
+        if "PyTorch test skipped" not in stdout:
+            self.assertTrue("True" in stdout, f"PyTorch CUDA not available or python not found in {env_path}")
+        print(f"OK: PyTorch on {name}")
+        self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)
+
+    def verify_tensorflow(self, name):
+        print(f"--- Verifying TensorFlow on {name} ---")
+        test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu",
+                                     self.TF_TEST_SCRIPT_FILE_NAME)
+        self.upload_test_file(test_filename, name)
+
+        image_version = self.get_dataproc_image_version(name)
+        conda_root_path = "/opt/conda/miniconda3"
+        if not self.version_lt(image_version, "2.3"):
+            conda_root_path = "/opt/conda"
+
+        conda_env = "dpgce"
+        env_path = f"{conda_root_path}/envs/{conda_env}"
+        python_bin = f"{env_path}/bin/python3"
+
+        verify_cmd = (
+            f"for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${{f}} ; done ;"
+            f"{python_bin} {self.TF_TEST_SCRIPT_FILE_NAME}"
+        )
+        self.assert_instance_command(name, verify_cmd)
+        print(f"OK: TensorFlow on {name}")
+        self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)
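verify_pytorch.py and verify_tensorflow.py are uploaded to the instance but are not included in this commit. Since verify_pytorch() above only asserts that the string "True" appears in the job's stdout, a minimal sketch of what verify_pytorch.py needs to do could look like the following (the real script may check more):

import torch

# Report whether PyTorch can reach a CUDA device; the test harness greps
# stdout for the string "True".
print(torch.cuda.is_available())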

gpu/install_gpu_driver.sh

Lines changed: 15 additions & 55 deletions
@@ -18,16 +18,18 @@
 set -xeuo pipefail
 
 NM_WAS_RUNNING=0
-if systemctl is-active --quiet hadoop-yarn-nodemanager.service; then
-  echo "NodeManager is running, disabling and stopping..."
-  NM_WAS_RUNNING=1
-  systemctl disable hadoop-yarn-nodemanager.service
-  systemctl stop hadoop-yarn-nodemanager.service
-  echo "NodeManager disabled and stopped."
-else
-  echo "NodeManager is not running."
-  # Ensure it's disabled even if not running
-  systemctl disable hadoop-yarn-nodemanager.service >/dev/null 2>&1 || true
+if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "false" ]]; then
+  if systemctl is-active --quiet hadoop-yarn-nodemanager.service; then
+    echo "NodeManager is running, disabling and stopping..."
+    NM_WAS_RUNNING=1
+    systemctl disable hadoop-yarn-nodemanager.service
+    systemctl stop hadoop-yarn-nodemanager.service
+    echo "NodeManager disabled and stopped."
+  else
+    echo "NodeManager is not running."
+    # Ensure it's disabled even if not running
+    systemctl disable hadoop-yarn-nodemanager.service >/dev/null 2>&1 || true
+  fi
 fi
 
 function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; }
@@ -2621,7 +2623,9 @@ function main() {
   fi
   # --- End Apply or Defer ---
 
-  yarn_exit_handler # Restart YARN services
+  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "false" ]]; then
+    yarn_exit_handler # Restart YARN services
+  fi
 }
 
 function cache_fetched_package() {
@@ -2637,50 +2641,6 @@ function cache_fetched_package() {
   fi
 }
 
-function fix_nodemanager_init_script() {
-  local lsb_script="/etc/init.d/hadoop-yarn-nodemanager"
-  local service_name="hadoop-yarn-nodemanager"
-  local generated_unit_file="/run/systemd/generator.late/${service_name}.service"
-  local changed=0
-
-  if [[ ! -f "${lsb_script}" ]]; then
-    echo "WARN: LSB script ${lsb_script} not found."
-    return
-  fi
-
-  # 1. Fix the stop command
-  local broken_stop_cmd='start_daemon $EXEC_PATH --config "$CONF_DIR" stop $DAEMON_FLAGS'
-  local fixed_stop_cmd='start_daemon $EXEC_PATH --config "$CONF_DIR" --daemon stop $DAEMON_FLAGS'
-
-  if grep -qF "${broken_stop_cmd}" "${lsb_script}"; then
-    echo "Fixing stop command in ${lsb_script}..."
-    local sed_broken_stop_cmd=$(printf '%s\n' "${broken_stop_cmd}" | sed 's/[][\/.^$*]/\\&/g')
-    local sed_fixed_stop_cmd=$(printf '%s\n' "${fixed_cmd}" | sed 's/[][\/.^$*]/\\&/g')
-    sed -i "s|${sed_broken_stop_cmd}|${sed_fixed_stop_cmd}|" "${lsb_script}"
-    changed=1
-  fi
-
-  # 2. Prepend source commands to the 'su -c' line in the start function
-  local start_line_marker='--daemon start $DAEMON_FLAGS'
-  if grep -qF "${start_line_marker}" "${lsb_script}" && ! grep -qF "source /etc/environment && source /etc/default/hadoop-yarn-nodemanager" "${lsb_script}"; then
-    echo "Adding source commands to su -c in start() for ${lsb_script}"
-    sed -i '/su -s \/bin\/bash yarn -c "/s|yarn -c "|yarn -c "source /etc/environment && source /etc/default/hadoop-yarn-nodemanager && |' "${lsb_script}"
-    changed=1
-  fi
-
-  if [[ "${changed}" -eq 1 ]]; then
-    if [[ -f "${generated_unit_file}" ]]; then
-      echo "Removing old generated unit file: ${generated_unit_file}"
-      rm -f "${generated_unit_file}"
-    fi
-    echo "Reloading systemd daemon to regenerate units..."
-    systemctl daemon-reload
-    echo "Systemd daemon reloaded."
-  else
-    echo "No changes made to ${lsb_script}."
-  fi
-}
-
 function clean_up_sources_lists() {
   if ! is_debuntu; then return; fi
   #

integration_tests/dataproc_test_case.py

Lines changed: 12 additions & 10 deletions
@@ -9,7 +9,7 @@
 import sys
 from threading import Timer
 
-from packaging import version
+import pkg_resources
 from absl import flags
 from absl.testing import parameterized
 
@@ -252,7 +252,7 @@ def getImageVersion():
         # Special case a 'preview' image versions and return a large number
         # instead to make it a higher image version in comparisons
         version = FLAGS.image_version
-        return version.parse('999') if version.startswith(
+        return pkg_resources.parse_version('999') if version.startswith(
             'preview') else pkg_resources.parse_version(version.split('-')[0])
 
     @staticmethod
@@ -307,21 +307,23 @@ def assert_dataproc_job(self,
 
         Args:
            cluster_name: cluster name to submit job to
-           job_type: type of the job, e.g. spark, hadoop, pyspark
-           job_params: job parameters
-           timeout_in_minutes: timeout in minutes
+           job_type: job type (hadoop, spark, etc)
+           job_params: job command parameters
+           timeout_in_minutes: timeout in minutes after which process that
+                               waits on job will be killed if job did not
+                               finish
+        Returns:
+           ret_code: the return code of the job
+           stdout: standard output of the job
+           stderr: error output of the job
         Raises:
            AssertionError: if job returned non-0 exit code.
         """
 
-        ret_code, stdout, stderr = DataprocTestCase.run_command(
+        ret_code, stdout, stderr = self.assert_command(
            'gcloud dataproc jobs submit {} --cluster={} --region={} {}'.
            format(job_type, cluster_name, self.REGION,
                   job_params), timeout_in_minutes)
-        self.assertEqual(
-            ret_code, 0,
-            "Job failed with code {}. stdout: {}. stderr: {}".format(
-                ret_code, stdout, stderr))
         return ret_code, stdout, stderr
 
     def assert_command(self, cmd, timeout_in_minutes=DEFAULT_TIMEOUT):
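Both the 'preview' special case and regular image versions now go through pkg_resources.parse_version. A quick illustration of the ordering this relies on, independent of the test suite:

from pkg_resources import parse_version

# 'preview' images are mapped to '999', which sorts above any real
# Dataproc image version such as 2.2 or 2.3.
assert parse_version("999") > parse_version("2.3")
assert parse_version("2.2") < parse_version("2.3")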

requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+bootstrapping==0.1.2
+click==8.1.7
+packaging
+setuptools<70.0.0
