From 0e9874288203154517f487eb145080ef1b431ee3 Mon Sep 17 00:00:00 2001
From: PaliC
Date: Wed, 26 Oct 2022 00:31:26 +0000
Subject: [PATCH] Add OSS GPU tests

[ghstack-poisoned]
---
Reviewer notes (commentary only; git-am ignores everything between the
"---" line above and the first "diff --git" line):

 * This patch was restored from a whitespace-collapsed copy. Line breaks
   were recovered from the +/- markers and the hunk line counts, but the
   indentation and blank-line placement of context lines are a best
   guess. Verify them against the target tree before applying.
 * install_nvidia_utils_linux.sh: the first line read "et -eou pipefail",
   which fails with "command not found" and leaves strict mode off. It is
   now "set -eou pipefail".
 * runtime_tests.yaml: the install step invoked the script under
   .github/workflows/ although this patch creates it under
   .github/scripts/; the trailing "|| true" hid the failure. The Test
   step now passes GPU_FLAG to "docker run" so that test_deploy_gpu can
   actually see a GPU.
 * setup.py: the CMake version check compared version components as
   strings ("9" < "12" is False, "0" < "12" is True). It now compares
   integers, so 3.9 is rejected and 4.0 is accepted.
 * All fixes are same-line replacements, so the hunk headers and the
   diffstat below are unchanged.

 .github/scripts/install_nvidia_utils_linux.sh | 89 +++++++++++++++++++
 .github/workflows/runtime_tests.yaml          | 20 ++++-
 Dockerfile                                    | 26 +++---
 setup.py                                      | 12 ++-
 4 files changed, 125 insertions(+), 22 deletions(-)
 create mode 100644 .github/scripts/install_nvidia_utils_linux.sh

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
new file mode 100644
index 00000000..16c9aa0b
--- /dev/null
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -0,0 +1,89 @@
+set -eou pipefail
+
+
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
+DRIVER_VERSION="515.57"
+DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
+
+install_nvidia_docker2_amzn2() {
+    (
+        set -x
+        # Needed for yum-config-manager
+        sudo yum install -y yum-utils
+        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
+        sudo yum install -y nvidia-docker2
+        sudo systemctl restart docker
+    )
+}
+
+install_nvidia_driver_amzn2() {
+    (
+        set -x
+
+        # Purge any nvidia driver installed from RHEL repo
+        sudo yum remove -y nvidia-driver-latest-dkms
+
+        HAS_NVIDIA_DRIVER=0
+        # Check if NVIDIA driver has already been installed
+        if [ -x "$(command -v nvidia-smi)" ]; then
+            # The driver exists, check its version next
+            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+
+            if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
+            else
+                HAS_NVIDIA_DRIVER=1
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
+            fi
+        fi
+
+        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
+            sudo yum groupinstall -y "Development Tools"
+            # ensure our kernel install is the same as our underlying kernel,
+            # groupinstall "Development Tools" has a habit of mismatching kernel headers
+            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
+            sudo modprobe backlight
+            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+            sudo rm -fv /tmp/nvidia_driver
+        fi
+
+        (
+            set +e
+            nvidia-smi
+            status=$?
+            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
+            if [ $status -eq 0 ] || [ $status -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${status}"
+            else
+                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
+                exit ${status}
+            fi
+        )
+    )
+}
+
+echo "== Installing nvidia driver ${DRIVER_FN} =="
+case "${DISTRIBUTION}" in
+    amzn*)
+        install_nvidia_driver_amzn2
+        ;;
+    *)
+        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
+        exit 1
+        ;;
+esac
+
+# Install container toolkit based on distribution
+echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
+case "${DISTRIBUTION}" in
+    amzn*)
+        install_nvidia_docker2_amzn2
+        ;;
+    *)
+        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
+        exit 1
+        ;;
+esac
+
diff --git a/.github/workflows/runtime_tests.yaml b/.github/workflows/runtime_tests.yaml
index 57657a79..c9535412 100644
--- a/.github/workflows/runtime_tests.yaml
+++ b/.github/workflows/runtime_tests.yaml
@@ -13,7 +13,7 @@ jobs:
       matrix:
         python-major-version: [3]
         python-minor-version: [7,8,9,10]
-        platform: [ubuntu-18.04]
+        platform: [linux.4xlarge.nvidia.gpu]
       fail-fast: false
     runs-on: ${{ matrix.platform }}
     steps:
@@ -21,7 +21,19 @@ jobs:
         uses: actions/checkout@v2
         with:
           submodules: true
-
+      - name: Clean up previous CUDA driver installations
+        shell: bash
+        run: |
+          set -x
+          yum list installed | grep nvidia || true
+          yum list installed | grep cuda || true
+          sudo yum remove -y cuda || true
+          sudo yum remove -y cuda-drivers || true
+          sudo yum remove -y "*nvidia*" || true
+      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+        run: |
+          bash .github/scripts/install_nvidia_utils_linux.sh || true
+          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
       - name: Setup SSH (Click me for login details)
         uses: ./.github/actions/setup-ssh
         with:
@@ -30,11 +42,11 @@ jobs:
       - name: Build
         env:
           DOCKER_BUILDKIT: 1
-        run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
+        run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .
 
       - name: Test
         run: |
-          docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
+          docker run --rm ${GPU_FLAG:-} multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy && multipy/runtime/build/test_deploy_gpu"
 
       - name: Examples
         run: |
diff --git a/Dockerfile b/Dockerfile
index 6564803e..7e472f0c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
+ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04
 
 FROM ${BASE_IMAGE} as dev-base
 
@@ -59,6 +59,9 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY multipy multipy
 COPY compat-requirements.txt compat-requirements.txt
+COPY setup.py setup.py
+COPY README.md README.md
+COPY dev-requirements.txt dev-requirements.txt
 
 RUN git submodule update --init --recursive --jobs 0
 
@@ -66,6 +69,7 @@ RUN git submodule update --init --recursive --jobs 0
 FROM dev-base as conda-pyenv
 ARG PYTHON_MAJOR_VERSION=3
 ARG PYTHON_MINOR_VERSION=8
+ARG BUILD_CUDA_TESTS=0
 ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
 ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
 RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
@@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
     rm ~/miniconda.sh && \
     /opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
     /opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
-    /opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
+    /opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
     /opt/conda/bin/conda clean -ya; \
     else \
     pip3 install virtualenv && \
@@ -84,29 +88,23 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
     ~/.pyenv/bin/pyenv install --force 3.7.10 && \
     virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
     source ~/venvs/multipy/bin/activate && \
-    pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
+    pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
     fi
 
-# Build/Install pytorch with post-cxx11 ABI
 FROM conda-pyenv as build
-WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
 COPY --from=conda-pyenv /opt/conda* /opt/conda
 COPY --from=submodule-update /opt/multipy /opt/multipy
 
 WORKDIR /opt/multipy
 
 # Build Multipy
-RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
-    cd multipy/runtime/build && \
-    if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
-    source ~/venvs/multipy/bin/activate && \
-    cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
+RUN ls && pwd && rm -rf multipy/runtime/build && \
+    if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
+    python -m pip install -e . --install-option="--cudatests"; \
     else \
-    cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
+    python -m pip install -e .; \
     fi && \
-    cmake --build . --config Release -j && \
-    cmake --install . --prefix "." && \
-    cd ../example && python generate_examples.py
+    python multipy/runtime/example/generate_examples.py
 
 # Build examples
 COPY examples examples
diff --git a/setup.py b/setup.py
index 5a4a17a3..03f8d8f3 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,7 @@ def get_cmake_version():
 
 
 class MultipyRuntimeCmake(object):
-    user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
+    user_options = [("cmakeoff", None, None), ("cudatests", None, None), ("abicxx", None, None)]
 
 
 class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
@@ -41,24 +41,28 @@ def initialize_options(self):
         # TODO(tristanr): remove once unused
         self.abicxx = None
+        self.cudatests = None
 
     def finalize_options(self):
         develop.finalize_options(self)
         if self.cmakeoff is not None:
             self.distribution.get_command_obj("build_ext").cmake_off = True
+        if self.cudatests is not None:
+            self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"
 
 
 class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
     user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
     cmake_off = False
+    cuda_tests_flag = "OFF"
 
     def run(self):
         if self.cmake_off:
             return
         try:
             cmake_version_comps = get_cmake_version().split(".")
-            if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
+            if tuple(int(c) for c in cmake_version_comps[:2]) < (3, 12):
                 raise RuntimeError(
-                    "CMake 3.19 or later required for multipy runtime installation."
+                    "CMake 3.12 or later required for multipy runtime installation."
                 )
         except OSError:
             raise RuntimeError(
@@ -74,7 +78,7 @@ def run(self):
         print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
         try:
             subprocess.run(
-                [f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
+                [f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
                 cwd=build_dir_abs,
                 shell=True,
                 check=True,