From 0e9874288203154517f487eb145080ef1b431ee3 Mon Sep 17 00:00:00 2001
From: PaliC <sahancpal@gmail.com>
Date: Wed, 26 Oct 2022 00:31:26 +0000
Subject: [PATCH] Add OSS GPU tests

[ghstack-poisoned]
---
 .github/scripts/install_nvidia_utils_linux.sh | 89 +++++++++++++++++++
 .github/workflows/runtime_tests.yaml          | 20 ++++-
 Dockerfile                                    | 26 +++---
 setup.py                                      | 12 ++-
 4 files changed, 125 insertions(+), 22 deletions(-)
 create mode 100644 .github/scripts/install_nvidia_utils_linux.sh

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
new file mode 100644
index 00000000..16c9aa0b
--- /dev/null
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+set -eou pipefail  # strict mode: abort on errors, unset variables, and failed pipeline stages
+
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
+DRIVER_VERSION="515.57"
+DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
+
+install_nvidia_docker2_amzn2() {  # add the nvidia-docker yum repo, install nvidia-docker2, restart docker
+    (  # subshell keeps the `set -x` tracing local to this function
+        set -x
+        # Install yum-utils first: it is needed for yum-config-manager, which registers ${YUM_REPO_URL} below
+        sudo yum install -y yum-utils
+        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
+        sudo yum install -y nvidia-docker2
+        sudo systemctl restart docker
+    )
+}
+
+install_nvidia_driver_amzn2() {  # install NVIDIA driver ${DRIVER_VERSION} unless that exact version is present, then check nvidia-smi
+    (
+        set -x
+
+        # Purge any nvidia driver installed from RHEL repo
+        sudo yum remove -y nvidia-driver-latest-dkms
+
+        HAS_NVIDIA_DRIVER=0
+        # Check if NVIDIA driver has already been installed
+        if [ -x "$(command -v nvidia-smi)" ]; then
+            # The driver exists; query only GPU 0 so multi-GPU hosts yield a single version line to compare
+            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
+
+            if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
+            else
+                HAS_NVIDIA_DRIVER=1
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
+            fi
+        fi
+
+        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
+            sudo yum groupinstall -y "Development Tools"
+            # ensure our kernel install is the same as our underlying kernel,
+            # groupinstall "Development Tools" has a habit of mismatching kernel headers
+            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
+            sudo modprobe backlight
+            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+            sudo rm -fv /tmp/nvidia_driver
+        fi
+
+        (
+            set +e
+            nvidia-smi
+            status=$?
+            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
+            if [ $status -eq 0 ] || [ $status -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${status}"
+            else
+                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
+                exit ${status}
+            fi
+        )
+    )
+}
+
+echo "== Installing nvidia driver ${DRIVER_FN} =="  # progress banner; only amzn* distributions are handled below
+case "${DISTRIBUTION}" in
+    amzn*)
+        install_nvidia_driver_amzn2
+        ;;
+    *)
+        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
+        exit 1
+        ;;
+esac
+
+# Install the container toolkit (nvidia-docker2) for the detected distribution; anything but amzn* exits with status 1
+echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
+case "${DISTRIBUTION}" in
+    amzn*)
+        install_nvidia_docker2_amzn2
+        ;;
+    *)
+        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
+        exit 1
+        ;;
+esac
+
diff --git a/.github/workflows/runtime_tests.yaml b/.github/workflows/runtime_tests.yaml
index 57657a79..c9535412 100644
--- a/.github/workflows/runtime_tests.yaml
+++ b/.github/workflows/runtime_tests.yaml
@@ -13,7 +13,7 @@ jobs:
       matrix:
         python-major-version: [3]
         python-minor-version: [7,8,9,10]
-        platform: [ubuntu-18.04]
+        platform: [linux.4xlarge.nvidia.gpu]
       fail-fast: false
     runs-on: ${{ matrix.platform }}
     steps:
@@ -21,7 +21,19 @@ jobs:
         uses: actions/checkout@v2
         with:
           submodules: true
-
+      - name: Clean up previous CUDA driver installations  # best-effort: every yum command is followed by `|| true`
+        shell: bash
+        run: |
+          set -x
+          yum list installed | grep nvidia || true
+          yum list installed | grep cuda || true
+          sudo yum remove -y cuda || true
+          sudo yum remove -y cuda-drivers || true
+          sudo yum remove -y "*nvidia*" || true
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG  # script lives in .github/scripts/
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh || true
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
       - name: Setup SSH (Click me for login details)
         uses: ./.github/actions/setup-ssh
         with:
@@ -30,11 +42,11 @@ jobs:
       - name: Build
         env:
           DOCKER_BUILDKIT: 1
-        run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
+        run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .
 
       - name: Test
         run: |
-          docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
+          docker run --rm ${GPU_FLAG:-} multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy && multipy/runtime/build/test_deploy_gpu"
 
       - name: Examples
         run: |
diff --git a/Dockerfile b/Dockerfile
index 6564803e..7e472f0c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
+ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04
 
 FROM ${BASE_IMAGE} as dev-base
 
@@ -59,6 +59,9 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY multipy multipy
 COPY compat-requirements.txt compat-requirements.txt
+COPY setup.py setup.py
+COPY README.md README.md
+COPY dev-requirements.txt dev-requirements.txt
 
 RUN git submodule update --init --recursive --jobs 0
 
@@ -66,6 +69,7 @@ RUN git submodule update --init --recursive --jobs 0
 FROM dev-base as conda-pyenv
 ARG PYTHON_MAJOR_VERSION=3
 ARG PYTHON_MINOR_VERSION=8
+ARG BUILD_CUDA_TESTS=0
 ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
 ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
 RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
@@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
     rm ~/miniconda.sh && \
     /opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
     /opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
-    /opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
+    /opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
     /opt/conda/bin/conda clean -ya; \
     else \
     pip3 install virtualenv && \
@@ -84,29 +88,23 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
     ~/.pyenv/bin/pyenv install --force 3.7.10 && \
     virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
     source ~/venvs/multipy/bin/activate && \
-    pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
+    pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
     fi
 
-# Build/Install pytorch with post-cxx11 ABI
 FROM conda-pyenv as build
-WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
 COPY --from=conda-pyenv /opt/conda* /opt/conda
 COPY --from=submodule-update /opt/multipy /opt/multipy
 
 WORKDIR /opt/multipy
 
 # Build Multipy
-RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
-    cd multipy/runtime/build && \
-    if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
-    source ~/venvs/multipy/bin/activate && \
-    cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
+RUN rm -rf multipy/runtime/build && if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && \
+    if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
+    python -m pip install -e . --install-option="--cudatests"; \
     else \
-    cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
+    python -m pip install -e .; \
     fi && \
-    cmake --build . --config Release -j && \
-    cmake --install . --prefix "." && \
-    cd ../example && python generate_examples.py
+    python multipy/runtime/example/generate_examples.py
 
 # Build examples
 COPY examples examples
diff --git a/setup.py b/setup.py
index 5a4a17a3..03f8d8f3 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@ def get_cmake_version():
 
 
 class MultipyRuntimeCmake(object):
-    user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
+    user_options = [("cmakeoff", None, None), ("cudatests", None, None), ("abicxx", None, None)]
 
 
 class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
@@ -41,24 +41,28 @@ def initialize_options(self):
         # TODO(tristanr): remove once unused
         self.abicxx = None
 
+        self.cudatests = None
     def finalize_options(self):
         develop.finalize_options(self)
         if self.cmakeoff is not None:
             self.distribution.get_command_obj("build_ext").cmake_off = True
+        if self.cudatests is not None:  # stays None unless the --cudatests option was supplied
+            self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"
 
 
 class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
     user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
     cmake_off = False
+    cuda_tests_flag = "OFF"
 
     def run(self):
         if self.cmake_off:
             return
         try:
             cmake_version_comps = get_cmake_version().split(".")
-            if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
+            if tuple(map(int, cmake_version_comps[:2])) < (3, 12):  # compare numerically, not as strings
                 raise RuntimeError(
-                    "CMake 3.19 or later required for multipy runtime installation."
+                    "CMake 3.12 or later required for multipy runtime installation."
                 )
         except OSError:
             raise RuntimeError(
@@ -74,7 +78,7 @@ def run(self):
         print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
         try:
             subprocess.run(
-                [f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
+                [f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
                 cwd=build_dir_abs,
                 shell=True,
                 check=True,