import os
import time
import random
from packaging import version
from integration_tests.dataproc_test_case import DataprocTestCase

DEFAULT_TIMEOUT = 45  # minutes

class GpuTestCaseBase(DataprocTestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def run_dataproc_job(self,
                         cluster_name,
                         job_type,
                         job_params,
                         timeout_in_minutes=DEFAULT_TIMEOUT):
        """Executes a Dataproc job on a cluster and returns the results.

        Args:
            cluster_name: name of the cluster to submit the job to
            job_type: type of the job, e.g. spark, hadoop, pyspark
            job_params: job parameters
            timeout_in_minutes: timeout in minutes

        Returns:
            ret_code: the return code of the job
            stdout: standard output of the job
            stderr: error output of the job
        """

        ret_code, stdout, stderr = DataprocTestCase.run_command(
            'gcloud dataproc jobs submit {} --cluster={} --region={} {}'.
            format(job_type, cluster_name, self.REGION,
                   job_params), timeout_in_minutes)
        return ret_code, stdout, stderr
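
    # Example usage (hypothetical cluster name and job file, for illustration only):
    #
    #   ret_code, stdout, stderr = self.run_dataproc_job(
    #       "my-gpu-cluster", "pyspark", "gs://my-bucket/sample_job.py",
    #       timeout_in_minutes=30)
    #   self.assertEqual(ret_code, 0, "job failed: {}".format(stderr))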

    # Tests for PyTorch
    TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py"
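    # Minimal sketch of what verify_pytorch.py is assumed to contain (the real
    # script lives under ../gpu/ and is not part of this file); verify_pytorch()
    # below only checks that its stdout contains "True":
    #
    #   import torch
    #   print(torch.cuda.is_available())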

    # Tests for TensorFlow
    TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py"
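    # Assumed sketch of verify_tensorflow.py (also not part of this file);
    # verify_tensorflow() only asserts that the script exits successfully, so the
    # script is assumed to exit non-zero when no GPU is usable:
    #
    #   import tensorflow as tf
    #   assert tf.config.list_physical_devices('GPU'), "no GPU visible to TensorFlow"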

    def assert_instance_command(self,
                                instance,
                                cmd,
                                timeout_in_minutes=DEFAULT_TIMEOUT):
        retry_count = 5
        # Escape embedded double quotes so cmd survives the --command="..." wrapper.
        ssh_cmd = 'gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60 -o StrictHostKeyChecking=no'.format(
            instance, self.cluster_zone, cmd.replace('"', '\\"'))

        while retry_count > 0:
            try:
                # Use self.assert_command from DataprocTestCase
                ret_code, stdout, stderr = self.assert_command(
                    ssh_cmd, timeout_in_minutes)
                return ret_code, stdout, stderr
            except Exception as e:
                print(f"An error occurred in assert_instance_command: {e}")
                retry_count -= 1
                if retry_count > 0:
                    print("Retrying in 10 seconds...")
                    time.sleep(10)
                    continue
                else:
                    print("Max retries reached.")
                    raise

    def verify_instance(self, name):
        # Verify that nvidia-smi works
        self.assert_instance_command(name, "nvidia-smi", 1)
        print(f"OK: nvidia-smi on {name}")

    def verify_instance_gpu_agent(self, name):
        print(f"--- Verifying GPU Agent on {name} ---")
        self.assert_instance_command(
            name, "systemctl is-active gpu-utilization-agent.service")
        print(f"OK: GPU Agent on {name}")

    def get_dataproc_image_version(self, instance):
        _, stdout, _ = self.assert_instance_command(
            instance, "grep DATAPROC_IMAGE_VERSION /etc/environment | cut -d= -f2")
        return stdout.strip()

    def version_lt(self, v1, v2):
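        # e.g. version_lt("2.2", "2.3") -> True; version_lt("2.3", "2.3") -> False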
        return version.parse(v1) < version.parse(v2)

    def verify_pytorch(self, name):
        print(f"--- Verifying PyTorch on {name} ---")
        test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     "..", "gpu", self.TORCH_TEST_SCRIPT_FILE_NAME)
        self.upload_test_file(test_filename, name)

        image_version = self.get_dataproc_image_version(name)
        # On 2.3+ images conda lives at /opt/conda rather than /opt/conda/miniconda3.
        conda_root_path = "/opt/conda/miniconda3"
        if not self.version_lt(image_version, "2.3"):
            conda_root_path = "/opt/conda"

        conda_env = "dpgce"
        env_path = f"{conda_root_path}/envs/{conda_env}"
        python_bin = f"{env_path}/bin/python3"

        verify_cmd = (
            f"for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node; do "
            f"  if [[ -e \"$f\" ]]; then echo 0 > \"$f\"; fi; "
            f"done; "
            f"if /usr/share/google/get_metadata_value attributes/include-pytorch; then"
            f"  {python_bin} {self.TORCH_TEST_SCRIPT_FILE_NAME}; "
            f"else echo 'PyTorch test skipped as include-pytorch is not set'; fi"
        )
        _, stdout, _ = self.assert_instance_command(name, verify_cmd)
        if "PyTorch test skipped" not in stdout:
            self.assertTrue(
                "True" in stdout,
                f"PyTorch CUDA not available or python not found in {env_path}")
        print(f"OK: PyTorch on {name}")
        self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name)

    def verify_tensorflow(self, name):
        print(f"--- Verifying TensorFlow on {name} ---")
        test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     "..", "gpu", self.TF_TEST_SCRIPT_FILE_NAME)
        self.upload_test_file(test_filename, name)

        image_version = self.get_dataproc_image_version(name)
        conda_root_path = "/opt/conda/miniconda3"
        if not self.version_lt(image_version, "2.3"):
            conda_root_path = "/opt/conda"

        conda_env = "dpgce"
        env_path = f"{conda_root_path}/envs/{conda_env}"
        python_bin = f"{env_path}/bin/python3"

        verify_cmd = (
            f"for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${{f}} ; done ; "
            f"{python_bin} {self.TF_TEST_SCRIPT_FILE_NAME}"
        )
        self.assert_instance_command(name, verify_cmd)
        print(f"OK: TensorFlow on {name}")
        self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name)