NVIDIA-NeMo · praateekmahajan · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml
@@ -2,8 +2,7 @@ name: "GPU CI/CD"
 
 on:
   push:
-    branches:
-      - main
+    branches: [main]
   pull_request:
     branches:
       # We can run gpuCI on any PR targeting these branches
@@ -12,28 +11,36 @@ on:
       - '[rv][0-9].[0-9].[0-9]rc[0-9]'
     # PR has to be labeled with "gpuCI" label
     # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI
-    types: [ labeled ]
+    types: [labeled]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+# Jobs: build the stable and nightly containers, then run the GPU PyTests in each
 jobs:
   # First, we build and push a NeMo-Curator container
   build-container:
     # "build-container" job is run if the "gpuci" label is added to the PR
     if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
+    strategy:
+      matrix:
+        include:
+          - type: stable
+            image-suffix: ""
+          - type: nightly
+            image-suffix: "_nightly"
     uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
     with:
-      image-name: nemo_curator_container
+      image-name: nemo_curator_container${{ matrix.image-suffix }}
       dockerfile: Dockerfile
-      image-label: nemo-curator
+      image-label: nemo-curator${{ matrix.image-suffix }}
       build-args: |
-        IMAGE_LABEL=nemo-curator
+        IMAGE_LABEL=nemo-curator${{ matrix.image-suffix }}
         REPO_URL=https://github.com/${{ github.repository }}.git
         CURATOR_COMMIT=${{ github.sha }}
+        BUILD_TYPE=${{ matrix.type }}
       prune-filter-timerange: 24h
-
   # Then, we run our PyTests in the container we just built
   run-gpu-tests:
     needs: build-container
@@ -42,48 +49,59 @@ jobs:
     runs-on: self-hosted-azure
     # "run-gpu-tests" job is run if the "gpuci" label is added to the PR
     if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
+    strategy:
+      matrix:
+        include:
+          - type: stable
+            image-suffix: ""
+          - type: nightly
+            image-suffix: "_nightly"
+
+    env:
+      CONTAINER_NAME: nemo-curator-container${{ matrix.image-suffix }}
+      IMAGE_NAME: nemoci.azurecr.io/nemo_curator_container${{ matrix.image-suffix }}:${{ github.run_id }}
 
     steps:
       # If something went wrong during the last cleanup, this step ensures any existing container is removed
     - name: Remove existing container if it exists
       run: |
-        if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then
-            docker rm -f nemo-curator-container
+        if [ "$(docker ps -aq -f 'name=^/?${{ env.CONTAINER_NAME }}$')" ]; then  # exact match: a bare name= filter also matches "<name>_nightly"
+          docker rm -f ${{ env.CONTAINER_NAME }}
         fi
 
-      # This runs the container which was pushed by build-container, which we call "nemo-curator-container"
-      # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
-      # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
-      # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
+    # This runs the container which was pushed by build-container, which we call "nemo-curator-container"
+    # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
+    # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
+    # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
     - name: Run Docker container
       run: |
-        docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
-
-      # Expect `whoami` to be "azureuser"
-      # Expect `nvidia-smi` to show our 2 A100 GPUs
-    - name: Check GPUs
-      run: |
-        whoami
-        docker exec nemo-curator-container nvidia-smi
+        docker run --gpus all --name ${{ env.CONTAINER_NAME }} \
+          -d ${{ env.IMAGE_NAME }} \
+          bash -c "sleep infinity"
 
-      # In the virtual environment (called "curator") we created in the container,
-      # list all of our packages. Useful for debugging
-    - name: Verify installations
+    # Sanity checks before running the tests. Useful for debugging
+    # Expect `whoami` on the runner to be "azureuser"
+    # Expect `nvidia-smi` to show our 2 A100 GPUs
+    # Finally, in the virtual environment (called "curator") we created in the container, list all of our packages
+    - name: Check GPUs + Verify installations
       run: |
-        docker exec nemo-curator-container pip list
-
-      # In the virtual environment (called "curator") we created in the container,
-      # run our PyTests marked with `@pytest.mark.gpu`
-      # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository),
-      # and then the directory where the PyTests are located
+        echo "Checking system user:"
+        whoami
+        docker exec ${{ env.CONTAINER_NAME }} whoami
+        echo "Checking GPU availability:"
+        docker exec ${{ env.CONTAINER_NAME }} nvidia-smi
+        echo "Checking installed packages:"
+        docker exec ${{ env.CONTAINER_NAME }} pip list
+    # In the virtual environment (called "curator") we created in the container,
+    # run our PyTests marked with `@pytest.mark.gpu`
+    # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository),
+    # and then the directory where the PyTests are located
     - name: Run PyTests with GPU mark
       run: |
-        docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
-
+        docker exec ${{ env.CONTAINER_NAME }} pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
       # After running `docker stop`, the container remains in an exited state
       # It is still present on our system and could be restarted with `docker start`
       # Thus, we use `docker rm` to permanently removed it from the system
     - name: Cleanup
       if: always()
-      run: |
-        docker stop nemo-curator-container && docker rm nemo-curator-container
+      run: docker stop ${{ env.CONTAINER_NAME }} && docker rm ${{ env.CONTAINER_NAME }}
diff --git a/Dockerfile b/Dockerfile
@@ -6,6 +6,7 @@ ARG PYTHON_VER=3.10
 ARG IMAGE_LABEL
 ARG REPO_URL
 ARG CURATOR_COMMIT
+ARG BUILD_TYPE=stable
 
 FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} as curator-update
 # Needed to navigate to and pull the forked repository's changes
@@ -23,14 +24,16 @@ RUN bash -exu <<EOF
   git checkout $CURATOR_COMMIT
 EOF
 
-
 FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER}
 LABEL "nemo.library"=${IMAGE_LABEL}
 WORKDIR /opt
 
+# Re-declare ARGs after new FROM to make them available in this stage
+ARG CUDA_VER
+ARG BUILD_TYPE
+
 # Install the minimal libcu* libraries needed by NeMo Curator
-ENV _CUDA_VER=${CUDA_VER}
-RUN conda create -y --name curator -c nvidia/label/cuda-${_CUDA_VER} -c conda-forge \
+RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-forge \
   python=3.10 \
   cuda-cudart \
   libcufft \
@@ -48,15 +51,23 @@ RUN \
 --mount=type=bind,source=/opt/NeMo-Curator/pyproject.toml,target=/opt/NeMo-Curator/pyproject.toml,from=curator-update \
   cd /opt/NeMo-Curator && \
   source activate curator && \
-  pip install ".[all]"
+  if [ "$BUILD_TYPE" = "nightly" ]; then \
+    pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple ".[all_nightly]"; \
+  else \
+    pip install ".[all]"; \
+  fi
 
 COPY --from=curator-update /opt/NeMo-Curator/ /opt/NeMo-Curator/
 
 # Clone the user's repository, find the relevant commit, and install everything we need
 RUN bash -exu <<EOF
   source activate curator
   cd /opt/NeMo-Curator/
-  pip install --extra-index-url https://pypi.nvidia.com ".[all]"
+  if [ "$BUILD_TYPE" = "nightly" ]; then
+    pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple ".[all_nightly]"
+  else
+    pip install --extra-index-url https://pypi.nvidia.com ".[all]"
+  fi
 EOF
 
 ENV PATH /opt/conda/envs/curator/bin:$PATH
diff --git a/pyproject.toml b/pyproject.toml
@@ -47,7 +47,7 @@ dependencies = [
     "dask[complete]>=2021.7.1",
     "datasets",
     "distributed>=2021.7.1",
-    "fasttext==0.9.2",
+    "fasttext==0.9.3",
     "ftfy==6.1.1",
     "in-place==0.5.0",
     "jieba==0.42.1",
@@ -75,20 +75,20 @@ dynamic = ["version"]
 [project.optional-dependencies]
 # Installs CPU + GPU text curation modules
 cuda12x = [
-    "cudf-cu12>=24.10",
-    "cugraph-cu12>=24.10",
-    "cuml-cu12>=24.10",
-    "dask-cuda>=24.10",
-    "dask-cudf-cu12>=24.10",
+    "cudf-cu12>=24.12",
+    "cugraph-cu12>=24.12",
+    "cuml-cu12>=24.12",
+    "dask-cuda>=24.12",
+    "dask-cudf-cu12>=24.12",
     "spacy[cuda12x]>=3.6.0, <3.8.0",
 ]
 # Installs CPU + GPU text curation modules with RAPIDS Nightlies
 cuda12x_nightly = [
-    "cudf-cu12>=24.12.0a0,<=24.12",
-    "cugraph-cu12>=24.12.0a0,<=24.12",
-    "cuml-cu12>=24.12.0a0,<=24.12",
-    "dask-cuda>=24.12.0a0,<=24.12",
-    "dask-cudf-cu12>=24.12.0a0,<=24.12",
+    "cudf-cu12>=25.02.0a0,<=25.02",
+    "cugraph-cu12>=25.02.0a0,<=25.02",
+    "cuml-cu12>=25.02.0a0,<=25.02",
+    "dask-cuda>=25.02.0a0,<=25.02",
+    "dask-cudf-cu12>=25.02.0a0,<=25.02",
     "spacy[cuda12x]>=3.6.0, <3.8.0",
 ]
 # Installs CPU + GPU text and image curation modules