From 1792a4171577a91dc8056ba84b66c8bc466ab9f1 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 9 Nov 2025 07:44:55 +0000 Subject: [PATCH 1/4] feat: allow uv-less execution and fingerprint the environment Signed-off-by: Terry Kong fix Signed-off-by: Terry Kong seed Signed-off-by: Terry Kong init Signed-off-by: Terry Kong fix Signed-off-by: Terry Kong fix Signed-off-by: Terry Kong fix symlink creation to use exec as opposed to symlink which doesn't work Signed-off-by: Terry Kong frozen grpo Signed-off-by: Terry Kong use a json fingerprint Signed-off-by: Terry Kong logging level Signed-off-by: Terry Kong docs update Signed-off-by: Terry Kong fix up Signed-off-by: Terry Kong more functional test Signed-off-by: Terry Kong mermaid Signed-off-by: Terry Kong nrl-force rebuild and build-v will skip fingerprint check Signed-off-by: Terry Kong fix tests Signed-off-by: Terry Kong explain the local development Signed-off-by: Terry Kong fix ci container Signed-off-by: Terry Kong safe.directory fix Signed-off-by: Terry Kong --- .github/workflows/cicd-main.yml | 2 +- docker/Dockerfile | 9 + docs/conf.py | 1 + docs/design-docs/dependency-management.md | 343 +++++++++++++++ docs/index.md | 1 + nemo_rl/__init__.py | 159 +++++++ nemo_rl/utils/prefetch_venvs.py | 96 ++++ pyproject.toml | 1 + tests/functional/L1_Functional_Tests_GPU.sh | 4 + tests/functional/grpo_frozen_env.sh | 45 ++ tests/functional/test_frozen_env.sh | 213 +++++++++ tests/unit/test_version_check.py | 460 ++++++++++++++++++++ tools/build-custom-vllm.sh | 2 +- tools/generate_fingerprint.py | 139 ++++++ tools/list_editable_packages.py | 167 +++++++ uv.lock | 19 + 16 files changed, 1659 insertions(+), 2 deletions(-) create mode 100644 docs/design-docs/dependency-management.md create mode 100755 tests/functional/grpo_frozen_env.sh create mode 100755 tests/functional/test_frozen_env.sh create mode 100644 tests/unit/test_version_check.py create mode 100755 tools/generate_fingerprint.py create mode 100755 
tools/list_editable_packages.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 07f1a1bb24..9efa03d06a 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -204,7 +204,7 @@ jobs: image-name: nemo_rl_container dockerfile: docker/Dockerfile image-label: nemo-rl - target: hermetic + target: release build-contexts: | nemo-rl=${{ github.run_id }}/ build-args: | diff --git a/docker/Dockerfile b/docker/Dockerfile index bd327db27d..d9a2184c62 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,6 +10,8 @@ ARG NRL_GIT_REF=main ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} / FROM ${BASE_IMAGE} AS base +# An environment variable to indicate that we are in a container. +ENV NRL_CONTAINER=1 # It is more convenient for users to run as root USER root @@ -76,10 +78,13 @@ ENV TORCH_CUDA_ARCH_LIST="9.0 10.0" # First copy only the dependency files COPY --from=nemo-rl pyproject.toml uv.lock ./ +# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist. +COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/ COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/ RUN <<"EOF" bash -exu +uv venv --seed if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then bash tools/build-custom-vllm.sh source 3rdparty/vllm/nemo-rl.env @@ -124,3 +129,7 @@ COPY --from=nemo-rl . 
/opt/nemo-rl RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py +# Generate container fingerprint for frozen environment support +# Store outside /opt/nemo-rl to avoid being overwritten by user mounts +RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint + diff --git a/docs/conf.py b/docs/conf.py index 4e4b34630d..adac132816 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -66,6 +66,7 @@ "tasklist", # Adds support for GitHub-style task lists with [ ] and [x] ] myst_heading_anchors = 5 # Generates anchor links for headings up to level 5 +myst_fence_as_directive = ["mermaid"] # Treat ```mermaid blocks as directives # -- Options for Autodoc2 --------------------------------------------------- sys.path.insert(0, os.path.abspath("..")) diff --git a/docs/design-docs/dependency-management.md b/docs/design-docs/dependency-management.md new file mode 100644 index 0000000000..26151f7809 --- /dev/null +++ b/docs/design-docs/dependency-management.md @@ -0,0 +1,343 @@ +# Dependency Management + +NeMo RL's dependency management system supports both production and development workflows through a flexible virtual environment architecture. This document explains how NeMo RL manages Python dependencies and when to use each workflow. + +## Workflows Overview + +NeMo RL supports two distinct workflows based on your use case: + +### Production Workflow + +A **production workflow** is when you run NeMo RL out-of-the-box (OOTB) without modifying dependencies. This is the typical scenario for: +- Running NeMo RL with pre-built Docker containers +- Using released versions without local modifications +- Executing examples with default dependencies + +In a production workflow, the container's dependencies are aligned with your NeMo RL code version, and you can run applications directly without rebuilding environments. 
+ +> [!NOTE] +> This workflow is similar to how other machine learning projects work: the Docker image is static, and there's an assumption that the code works with the container's pre-installed dependencies. However, NeMo RL goes further by providing mechanisms to align container dependencies dynamically, offering more flexibility than traditional static containers. + +### Development Workflow + +A **development workflow** is when you actively modify dependencies, submodules, or work with code that has different dependency requirements than the container. Common scenarios include: + +- **Version mismatch**: Using a container built from commit A, but your local NeMo RL code is at commit B, where B has different submodule versions or Python dependencies than A +- **Dependency changes**: Actively developing new features that require updated Python packages +- **Submodule modifications**: Working with modified versions of Megatron-LM, NeMo-Automodel, or other submodules + +> [!WARNING] +> If your container was built from commit `abc123` which used `vllm==0.9.0`, but your local checkout is at commit `def456` which requires `vllm==0.10.0`, you are in a development workflow. The container's cached environments won't match your code's requirements. + +## How `uv run` Works + +When you execute a NeMo RL application, such as: + +```bash +uv run examples/run_grpo_math.py +``` + +This command actually performs several steps behind the scenes: + +```bash +uv lock + uv sync + source .venv/bin/activate + python examples/run_grpo_math.py +``` + +Let's break down each component: + +### 1. `uv lock` + +Resolves all dependencies specified in [`pyproject.toml`](https://github.com/NVIDIA-NeMo/RL/blob/main/pyproject.toml#L21-L54) and generates a lock file (`uv.lock`) that pins exact versions of all packages. This ensures reproducible builds across different environments. + +### 2. `uv sync` + +Synchronizes your local virtual environment with the locked dependencies. 
It installs or updates packages as needed to match the lock file. + +The virtual environment location depends on your runtime environment: +- **Bare metal**: The venv defaults to `.venv/` local to your NeMo RL clone +- **Container**: The container sets [`UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv`](https://github.com/NVIDIA-NeMo/RL/blob/main/docker/Dockerfile#L67), so the environment is synced to `/opt/nemo_rl_venv`. Note that this location is ephemeral to the container instance. + +### 3. `source .venv/bin/activate` + +Activates the virtual environment, setting up the Python path and environment variables so your script runs with the correct dependencies. + +### 4. `python examples/run_grpo_math.py` + +Executes your driver script within the activated environment. + +## Multi-Environment Architecture + +```mermaid +graph TD + subgraph Container["uv run examples/run_grpo_math.py"] + A[Driver Script Environment
Default dependencies from pyproject.toml] + A --> B[Starts Ray Worker Groups] + B --> C[Policy Workers
Separate venv: MCORE] + B --> D[Generation Workers
Separate venv: VLLM] + B --> E[Environment Workers
Separate venv: SYSTEM] + end +``` + +The driver script (`examples/run_grpo_math.py`) runs with the [default dependencies specified in `pyproject.toml`](https://github.com/NVIDIA-NeMo/RL/blob/main/pyproject.toml#L21-L54) (without optional extras like `mcore` or `vllm`). However, the application creates multiple worker groups, each potentially requiring different Python environments. + +### Worker Groups and Virtual Environments + +Within the driver script, NeMo RL starts multiple [`RayWorkerGroup`](https://github.com/NVIDIA-NeMo/RL/blob/main/nemo_rl/distributed/worker_groups.py#L303-L313) instances. Each worker group manages a set of Ray actors that execute tasks in parallel. These workers may have specialized dependency requirements: + +- **Policy workers** (e.g., using Megatron-Core): Require `mcore` dependencies +- **Generation workers** (e.g., vLLM): Require `vllm` dependencies +- **Environment workers** (e.g., math evaluation): Use system/base dependencies + +Each worker type is mapped to a specific Python executable configuration in the [`ACTOR_ENVIRONMENT_REGISTRY`](https://github.com/NVIDIA-NeMo/RL/blob/main/nemo_rl/distributed/ray_actor_environment_registry.py#L27-L46). This registry defines which virtual environment should be used for each actor type: + +```python +ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { + "nemo_rl.models.generation.vllm.vllm_worker.VllmGenerationWorker": VLLM_EXECUTABLE, + "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": MCORE_EXECUTABLE, + "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, + # ... more mappings +} +``` + +> [!NOTE] +> For more details on how workers define and use their Python executables, see the [UV Documentation](uv.md#worker-configuration). + +## Container Pre-caching + +When a [release container](../docker.md#release-image) is built, it pre-caches: + +1. **Virtual environments**: All worker virtual environments are created and stored in the container +2. 
**UV cache**: Python packages are pre-downloaded into the UV cache directory + +This pre-caching significantly speeds up application startup in production workflows, as workers can immediately use their required environments without downloading or compiling packages. + +### When Pre-cached Environments Are Sufficient + +If your local NeMo RL checkout has the **same** Python dependencies and submodules as the container was built with, the pre-cached environments work seamlessly. You can simply run: + +```bash +uv run examples/run_grpo_math.py +``` + +The workers will use the pre-cached virtual environments, and your application starts quickly. + +## Handling Dependency Changes + +When your local code has **different** dependencies than the container (development workflow), you have two options: + +### Option 1: Force Rebuild Environments + +Set the `NRL_FORCE_REBUILD_VENVS` environment variable to rebuild all worker virtual environments on every node: + +```bash +export NRL_FORCE_REBUILD_VENVS=true +uv run examples/run_grpo_math.py +``` + +This approach works on both single-node and multi-node setups. On multi-node runs, each node will independently rebuild its virtual environments. + +> [!TIP] +> This approach is convenient for local development and small-scale experiments. It automatically rebuilds environments to match your current dependency specifications without requiring a container rebuild. + +> [!WARNING] +> On large-scale distributed runs (e.g., >=32 nodes), rebuilding environments on all ranks can add significant overhead. Consider rebuilding the container for these large runs + +### Option 2: Rebuild the Container + +For production deployments or large-scale runs, rebuild the container to pre-cache the new dependencies: + +```bash +docker buildx build --target release -f docker/Dockerfile --tag my-registry/nemo-rl:custom . 
+``` + +> [!TIP] +> Rebuilding the container is recommended when: +> - Running a job with many nodes (>=32 nodes) +> - Dependencies have changed significantly +> - You need reproducible, fast startup times +> - Multiple team members need the same environment + +The rebuilt container will have all virtual environments pre-cached with your updated dependencies, eliminating runtime overhead. + +### Option 3: Classic Workflow - Mounting Modified Submodules + +For situations where you're **only changing submodules** (like nemo-automodel, Penguin, Megatron-LM, or Megatron-Bridge) but **not changing Python package versions**, you can use a classic mounting approach. This workflow assumes that the non-submodule Python packages in your local checkout match what the container was built with. + +The container's NeMo RL code is located at `/opt/nemo-rl`. By mounting your local `3rdparty/` directory over the container's `/opt/nemo-rl/3rdparty/`, you can swap out submodules without rebuilding environments or containers. + +**Example - Mounting Modified Submodules on Slurm:** + +Assuming you're launching from the root of your local NeMo RL clone: + +```bash +# Run from the root of NeMo RL repo + +CONTAINER=YOUR_CONTAINER \ +MOUNTS="$PWD:$PWD,$PWD/3rdparty:/opt/nemo-rl/3rdparty" \ +sbatch \ + --nodes=1 \ + --account=YOUR_ACCOUNT \ + --job-name=YOUR_JOBNAME \ + --partition=YOUR_PARTITION \ + --time=1:0:0 \ + ray.sub +``` + +This mounts: +1. `$PWD:$PWD` - Your local NeMo RL directory to the same path in the container +2. `$PWD/3rdparty:/opt/nemo-rl/3rdparty` - Your local submodules override the container's submodules at `/opt/nemo-rl/3rdparty` + +> [!NOTE] +> This approach works because Python packages are already installed in the cached virtual environments. You're only swapping out the source code in the `3rdparty/` submodules, which doesn't require reinstalling packages or rebuilding environments. 
+ +> [!IMPORTANT] +> This workflow is **only suitable when**: +> - Python package versions in `pyproject.toml` and `uv.lock` haven't changed +> - You're only modifying code within submodules (nemo-automodel, Penguin, Megatron-LM, Megatron-Bridge) +> - The submodule commits/branches are compatible with the installed package versions + +If you've changed Python package versions or dependencies outside of submodules, use Option 1 (`NRL_FORCE_REBUILD_VENVS=true`) or Option 2 (rebuild the container) instead. + +## Decision Guide + +Use this flowchart to determine which workflow applies to you: + +```mermaid +flowchart TD + A[Start] --> B{Are you modifying
dependencies or submodules?} + + B -->|No| C{Container built from
same commit as code?} + B -->|Yes| D{Small scale
or testing?} + + C ---->|Yes| F["✓ Run with + NRL_FORCE_REBUILD_VENVS=true uv run examples/..."] + C -->|No| D + + D -->|Yes| E["✓ Run directly + uv run examples/..."] + D -->|No| G[✓ Rebuild container with new dependencies] + + G --> E +``` + +## Frozen Environments + +For users who prefer or do not need to use `uv` at runtime, NeMo RL containers provide "frozen" environments. In these environments, Python executables—each corresponding to an actor's `PY_EXECUTABLE`—are prebuilt with all required dependencies and made available directly in your `PATH`. + +### What Are Frozen Environments? + +In a frozen environment setup: +- `pip` is available in all virtual environments +- Python executables like `python-MegatronPolicyWorker` are accessible directly +- Users can manually install packages with `python-MegatronPolicyWorker -m pip install ` + +> [!WARNING] +> While `pip` installing packages into a frozen environment is possible for experimentation or local debugging, **all dependencies must ultimately be added to `pyproject.toml` and locked in `uv.lock` before any change is upstreamed**. Direct `pip` installs are not reproducible or supported for collaborative or production workflows. **We cannot accept package additions that only exist via manual pip installs.** + +### When to Use Frozen Environments + +Frozen environments are useful when: +- You prefer traditional Python virtual environment workflows +- You want to manually manage package installations with `pip` +- You do not need `uv run` at runtime to automatically check if your dependencies are in sync + +> [!NOTE] +> For most users, `uv run` is still the recommended approach as it ensures reproducible builds and automatic dependency management. Frozen environments require manual intervention to keep dependencies in sync. 
+ +### Available Python Executables + +Containers provide convenience symlinks for each worker type: + +```bash +# List all available python executables +ls /usr/local/bin/python-* + +# Examples: +python # Default executable for driver scripts (e.g., examples/run_grpo_math.py) +python-MegatronPolicyWorker # For Megatron policy workers +python-VllmGenerationWorker # For vLLM generation workers +python-MathEnvironment # For environment workers +``` + +> [!NOTE] +> The `python` executable (without any suffix) corresponds to the default frozen environment used to launch driver scripts like `examples/run_grpo_math.py`. This environment contains the base dependencies from `pyproject.toml` without optional extras. + +To see which packages can be mounted for each executable: + +```bash +python tools/list_editable_packages.py +``` + +### Container Version Checking + +NeMo RL containers enforce environment reproducibility by automatically checking that your code and dependencies match the state of the container at build time. The version checking mechanism works by comparing: + +- The **md5sum of `pyproject.toml`** +- The **md5sum of `uv.lock`** +- The **commit hashes of relevant submodules** + +If any of these values differ between your code and the container image, NeMo RL will alert you and show exactly what has changed: + +```text +-------------------------------------------------------------------------------- +WARNING: Container/Code Version Mismatch Detected! + +-------------------------------------------------------------------------------- +Your container's dependencies do not match your current code. + +Differences found: + - pyproject.toml: + Container: abc123def456 + Current: xyz789abc012 + - uv.lock: + Container: 0987f6543210 + Current: 1234abcd5678 + - submodules/3rdparty/ExampleSubmodule: + Container: a1b2c3d4e5f6 + Current: f6e5d4c3b2a1 + +This can lead to unexpected behavior or errors. + +Solutions: + 1. Rebuild the container to match your code + 2. 
Set NRL_FORCE_REBUILD_VENVS=true to rebuild virtual environments + (This forces Ray workers to recreate their venvs with updated dependencies) + 3. Set NRL_IGNORE_VERSION_MISMATCH=1 to bypass this check (not recommended) + +Learn more about dependency management: + https://github.com/NVIDIA-NeMo/RL/blob/main/docs/design-docs/dependency-management.md + +-------------------------------------------------------------------------------- +``` + +This check **only runs in containers** (when `NRL_CONTAINER=1` is set) and can be bypassed if absolutely needed: + +```bash +export NRL_IGNORE_VERSION_MISMATCH=1 +``` + +> [!WARNING] +> Bypassing version checks can result in subtle, hard-to-debug errors due to dependency mismatches. Only do this if you fully understand the risks and have a specific need. + +> [!CAUTION] +> **If you modify a frozen environment manually** (for example, by running `python-MegatronPolicyWorker -m pip install `) this change will *not* be detected or tracked by the container version check described above. This is strongly discouraged as it leads to a non-reproducible setup, increases the chance of hard-to-debug environment errors, and breaks the guarantee of consistency across developer machines and production deployments. +> +> Always make dependency changes in `pyproject.toml` and use the recommended workflows so that your environment stays consistent and traceable. 
+ +## Summary + +NeMo RL's dependency management balances flexibility and performance: + +- **Production workflows** leverage pre-cached environments for fast, reliable startup +- **Development workflows** can dynamically rebuild environments as needed (this works on multi-node setups as well) +- **Submodule-only changes** can use the classic mount workflow to swap submodules without rebuilding environments +- **Container rebuilds** provide the best performance for large-scale production runs +- **`NRL_FORCE_REBUILD_VENVS`** offers flexibility for development without container rebuilds +- **Frozen environments** provide an alternative to `uv run` for users who prefer traditional Python virtual environment workflows with direct access to specialized Python executables + +Choose the approach that best fits your scale and development velocity: +- For most users, the **production workflow** with pre-built containers provides the optimal experience +- When iterating on submodule code, the **classic mount workflow** offers a fast middle ground +- For significant dependency changes, use **`NRL_FORCE_REBUILD_VENVS`** for small runs or **rebuild containers** for large-scale deployments +- For manual dependency management, **frozen environments** are available, though `uv run` is recommended for reproducibility + diff --git a/docs/index.md b/docs/index.md index c1b81b2b54..c3d987a619 100644 --- a/docs/index.md +++ b/docs/index.md @@ -242,6 +242,7 @@ design-docs/design-and-philosophy.md design-docs/padding.md design-docs/logger.md design-docs/uv.md +design-docs/dependency-management.md design-docs/chat-datasets.md design-docs/generation.md design-docs/checkpointing.md diff --git a/nemo_rl/__init__.py b/nemo_rl/__init__.py index 9217b6a580..d8a1c3b334 100644 --- a/nemo_rl/__init__.py +++ b/nemo_rl/__init__.py @@ -16,6 +16,12 @@ import sys from pathlib import Path +# Configure logging to show file location for warnings +logging.basicConfig( + 
format="%(levelname)s:%(name)s:%(filename)s:%(lineno)d: %(message)s", + level=logging.WARNING, +) + """ This is a work around to ensure whenever NeMo RL is imported, that we add Megatron-LM to the python path. This is because the only sub-package @@ -49,6 +55,159 @@ os.environ["RAY_ENABLE_UV_RUN_RUNTIME_ENV"] = "0" +def _is_build_isolation(): + """Detect if we're running in a uv build isolation environment. + + When running uv lock/sync, uv creates a temporary isolated environment + in ~/.cache/uv/builds-v*/ to build packages and introspect metadata. + We skip the fingerprint check in this context since the user is updating dependencies. + + Returns True if in build isolation, False otherwise. + """ + # Check if we're in uv's build isolation directory + # uv always uses paths like: /root/.cache/uv/builds-v0/.tmp*/ + return "/builds-v" in sys.prefix + + +def _check_container_fingerprint(): + """Check if container dependencies match the current code (container-only). + + This check only runs when NRL_CONTAINER=1 is set (inside containers). + It compares the container's fingerprint (computed at build time) with + the current code's fingerprint to detect dependency drift. + + This check is also skipped entirely if NRL_FORCE_REBUILD_VENVS=true is set, + since environment rebuilding will ensure dependencies are consistent regardless + of a mismatch. + + If there's a mismatch, raises RuntimeError unless NRL_IGNORE_VERSION_MISMATCH is set. 
+ """ + # Skip check if not in container or if we're going to force venv rebuild anyway + if not os.environ.get("NRL_CONTAINER"): + return + if os.environ.get("NRL_FORCE_REBUILD_VENVS", "").lower() == "true": + logging.info( + "Skipping container fingerprint check because NRL_FORCE_REBUILD_VENVS=true (venvs will be rebuilt anyway)" + ) + return + + # Skip check if we're in a build isolation environment (e.g., during uv lock/sync) + if _is_build_isolation(): + logging.debug( + "Skipping container fingerprint check because we're in a build isolation environment" + ) + return + + try: + import json + import runpy + import sys + from io import StringIO + + # Get repo root (relative to this module) + repo_root = Path(__file__).parent.parent + fingerprint_script = repo_root / "tools" / "generate_fingerprint.py" + + # Check if script exists + if not fingerprint_script.exists(): + logging.warning( + f"Fingerprint script not found at {fingerprint_script}, skipping version check" + ) + return + + # Compute current code fingerprint using runpy (cleaner than subprocess) + old_stdout = sys.stdout + sys.stdout = captured_output = StringIO() + try: + runpy.run_path(str(fingerprint_script), run_name="__main__") + current_fingerprint_json = captured_output.getvalue().strip() + finally: + sys.stdout = old_stdout + + if not current_fingerprint_json: + logging.warning("Failed to compute code fingerprint: empty output") + return + + current_fingerprint = json.loads(current_fingerprint_json) + + # Read container fingerprint + container_fingerprint_file = Path("/opt/nemo_rl_container_fingerprint") + if not container_fingerprint_file.exists(): + logging.warning( + "Container fingerprint file not found, skipping version check" + ) + return + + container_fingerprint = json.loads( + container_fingerprint_file.read_text().strip() + ) + + # Compare fingerprints and find differences + all_keys = set(current_fingerprint.keys()) | set(container_fingerprint.keys()) + differences = [] + + for key 
in sorted(all_keys): + current_val = current_fingerprint.get(key, "missing") + container_val = container_fingerprint.get(key, "missing") + + if current_val != container_val: + differences.append(f" - {key}:") + differences.append(f" Container: {container_val}") + differences.append(f" Current: {current_val}") + + if differences: + diff_text = "\n".join(differences) + sep_line = "\n" + ("-" * 80) + warning_msg = ( + f"{sep_line}\n" + "WARNING: Container/Code Version Mismatch Detected!\n" + f"{sep_line}\n" + "Your container's dependencies do not match your current code.\n" + "\n" + "Differences found:\n" + f"{diff_text}\n" + "\n" + "This can lead to unexpected behavior or errors.\n" + "\n" + "Solutions:\n" + " 1. Rebuild the container to match your code\n" + " 2. Set NRL_FORCE_REBUILD_VENVS=true to rebuild virtual environments\n" + " (This forces Ray workers to recreate their venvs with updated dependencies)\n" + " 3. Update the container fingerprint to match your current code (for local dev):\n" + " python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint\n" + " 4. Set NRL_IGNORE_VERSION_MISMATCH=1 to bypass this check (not recommended)\n" + "\n" + "Learn more about dependency management:\n" + " https://github.com/NVIDIA-NeMo/RL/blob/main/docs/design-docs/dependency-management.md\n" + f"{sep_line}\n" + ) + + # Check if user wants to ignore the mismatch + if os.environ.get("NRL_IGNORE_VERSION_MISMATCH"): + logging.warning( + warning_msg + + "Proceeding anyway (NRL_IGNORE_VERSION_MISMATCH is set)..." 
+ ) + else: + raise RuntimeError( + warning_msg + + "To proceed anyway, set: export NRL_IGNORE_VERSION_MISMATCH=1" + ) + else: + logging.debug("Container fingerprint matches code fingerprint") + + except RuntimeError: + # Re-raise RuntimeError for version mismatches (user should see this) + raise + except Exception as e: + # Log other errors but don't crash on version check failures + logging.debug(f"Version check failed (non-fatal): {e}") + + +# Perform container version check +_check_container_fingerprint() + + def _patch_nsight_file(): """Patch the nsight.py file to fix the context.py_executable assignment. diff --git a/nemo_rl/utils/prefetch_venvs.py b/nemo_rl/utils/prefetch_venvs.py index 618b54a99e..c6e95722c1 100755 --- a/nemo_rl/utils/prefetch_venvs.py +++ b/nemo_rl/utils/prefetch_venvs.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import sys +from pathlib import Path from nemo_rl.distributed.ray_actor_environment_registry import ( ACTOR_ENVIRONMENT_REGISTRY, @@ -52,6 +54,100 @@ def prefetch_venvs(): print("\nVenv prefetching complete!") + # Create convenience python wrapper scripts for frozen environment support (container-only) + create_frozen_environment_symlinks(venv_configs) + + +def create_frozen_environment_symlinks(venv_configs): + """Create python-{ClassName} wrapper scripts in /usr/local/bin for frozen environment support. + + Only runs in container (when NRL_CONTAINER=1 is set). 
+ + Args: + venv_configs: Dictionary mapping py_executable to list of actor FQNs + """ + # Only create wrapper scripts in container + if not os.environ.get("NRL_CONTAINER"): + print( + "\nSkipping frozen environment wrapper script creation (not in container)" + ) + return + + print("\nCreating frozen environment wrapper scripts...") + + # Collect all wrapper mappings: class_name -> venv_path + wrapper_mappings = {} + + for py_executable, actor_fqns in venv_configs.items(): + for actor_fqn in actor_fqns: + # Extract class name from FQN (last part) + # e.g., "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker" -> "MegatronPolicyWorker" + class_name = actor_fqn.split(".")[-1] + + # Get the venv path that was created + try: + python_path = create_local_venv(py_executable, actor_fqn) + + # Check for collisions + if class_name in wrapper_mappings: + existing_path = wrapper_mappings[class_name] + if existing_path != python_path: + raise RuntimeError( + f"Collision detected: Multiple venvs want to use name '{class_name}'\n" + f" Existing: {existing_path}\n" + f" New: {python_path}\n" + f"This indicates two different worker classes have the same name." 
+ ) + else: + wrapper_mappings[class_name] = python_path + except Exception as e: + print(f" Warning: Could not get venv path for {actor_fqn}: {e}") + continue + + # Create wrapper scripts + wrapper_dir = Path("/usr/local/bin") + created_wrappers = [] + + for class_name, python_path in sorted(wrapper_mappings.items()): + wrapper_name = f"python-{class_name}" + wrapper_path = wrapper_dir / wrapper_name + + # Get the venv directory path (parent of bin/python) + venv_path = Path(python_path).parent.parent + + # Create wrapper script content + wrapper_content = f"""#!/bin/bash +VENV_PATH="{venv_path}" +export VIRTUAL_ENV="$VENV_PATH" +export PATH="$VENV_PATH/bin:$PATH" +exec "$VENV_PATH/bin/python" "$@" +""" + + try: + # Remove existing wrapper if present + if wrapper_path.exists() or wrapper_path.is_symlink(): + wrapper_path.unlink() + + # Write wrapper script + wrapper_path.write_text(wrapper_content) + + # Make executable + wrapper_path.chmod(0o755) + + created_wrappers.append(wrapper_name) + print(f" Created: {wrapper_name} -> {python_path}") + except Exception as e: + print(f" Warning: Could not create wrapper script {wrapper_name}: {e}") + continue + + if created_wrappers: + print(f"\nCreated {len(created_wrappers)} frozen environment wrapper scripts") + print("Users can now use these python executables directly:") + for name in created_wrappers: + print(f" - {name}") + else: + print("\nNo frozen environment wrapper scripts were created") + if __name__ == "__main__": prefetch_venvs() diff --git a/pyproject.toml b/pyproject.toml index e64a6441f6..51806191b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ requires-python = ">=3.12" license = { text = "Apache 2.0" } dependencies = [ "setuptools", + "pip", # Required for frozen environments; uv venv --seed may not reliably install pip "ninja", # for flash-attn parallel build "torch==2.8.0", "triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", diff 
--git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh index 9de07d28bd..5aa4f83755 100644 --- a/tests/functional/L1_Functional_Tests_GPU.sh +++ b/tests/functional/L1_Functional_Tests_GPU.sh @@ -19,6 +19,10 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..) cd ${PROJECT_ROOT} +# This test is intentionally not run with uv run --no-sync to verify that the frozen environment is working correctly. +time bash ./tests/functional/grpo_frozen_env.sh +time bash ./tests/functional/test_frozen_env.sh + time uv run --no-sync bash ./tests/functional/sft.sh time uv run --no-sync bash ./tests/functional/grpo.sh time uv run --no-sync bash ./tests/functional/grpo_async.sh diff --git a/tests/functional/grpo_frozen_env.sh b/tests/functional/grpo_frozen_env.sh new file mode 100755 index 0000000000..4d4edc7eb0 --- /dev/null +++ b/tests/functional/grpo_frozen_env.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetches metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +set -eou pipefail + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR + +cd $PROJECT_ROOT + +# Test frozen environment by using bare python instead of uv run +# This verifies that direct python execution works with all dependencies pre-installed +python -m coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJECT_ROOT/nemo_rl \ + $PROJECT_ROOT/examples/run_grpo_math.py \ + policy.model_name=Qwen/Qwen3-0.6B \ + grpo.num_prompts_per_step=2 \ + grpo.num_generations_per_prompt=4 \ + policy.train_global_batch_size=4 \ + policy.train_micro_batch_size=1 \ + cluster.gpus_per_node=2 \ + grpo.max_num_steps=2 \ + logger.tensorboard_enabled=true \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=false \ + logger.monitor_gpus=true \ + checkpointing.enabled=false \ + $@ \ + 2>&1 | tee $RUN_LOG + +python tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +python tests/check_metrics.py $JSON_METRICS \ + 'max(data["train/token_mult_prob_error"]) < 1.05' + diff --git a/tests/functional/test_frozen_env.sh b/tests/functional/test_frozen_env.sh new file mode 100755 index 0000000000..38dfb27aed --- /dev/null +++ b/tests/functional/test_frozen_env.sh @@ -0,0 +1,213 @@ +#!/bin/bash +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +PROJECT_ROOT=$(cd -- "$SCRIPT_DIR/../.." 
&& pwd) + +echo "==========================================" +echo "Frozen Environment Functional Test" +echo "==========================================" +echo + +# ============================================================================ +# Test 1: Ray Import Across All Python Executables +# ============================================================================ +echo "Test 1: Verifying ray imports across all python-* executables" +echo "---------------------------------------------------------------" + +# Find all python-* executables in /usr/local/bin +PYTHON_EXECUTABLES=($(ls -1 /usr/local/bin/python-* 2>/dev/null || true)) + +if [ ${#PYTHON_EXECUTABLES[@]} -eq 0 ]; then + echo "ERROR: No python-* executables found in /usr/local/bin" + echo "This test requires frozen environment setup (NRL_CONTAINER=1)" + exit 1 +fi + +echo "Found ${#PYTHON_EXECUTABLES[@]} python-* executables to test" +echo + +for py_exec in "${PYTHON_EXECUTABLES[@]}"; do + py_name=$(basename "$py_exec") + echo -n " Testing $py_name ... 
" + + if $py_exec -c "import ray" 2>/dev/null; then + echo "✓ OK" + else + echo "✗ FAILED" + echo "ERROR: $py_name cannot import ray" + exit 1 + fi +done + +echo +echo "Test 1: PASSED - All python-* executables can import ray" +echo + +# ============================================================================ +# Test 2: Mutation Detection in Frozen Environment +# ============================================================================ +echo "Test 2: Verifying mutation detection in frozen environment" +echo "-----------------------------------------------------------" + +# Create temporary directory for testing +TEMP_DIR=$(mktemp -d -t nemo-rl-frozen-env-test-XXXXXX) +echo "Created temporary directory: $TEMP_DIR" + +# Setup cleanup trap +cleanup() { + echo "Cleaning up temporary directory: $TEMP_DIR" + rm -rf "$TEMP_DIR" +} +trap cleanup EXIT + +# Copy codebase using rsync with git ls-files (like code_snapshot.sh) +echo "Copying codebase to temporary directory..." +cd "$PROJECT_ROOT" +# Add --git-dir and --work-tree to ensure we can run git without the safe.directory check +rsync -a --files-from=<( + git --git-dir="$PROJECT_ROOT/.git" --work-tree="$PROJECT_ROOT" ls-files --recurse-submodules --cached --full-name +) ./ "$TEMP_DIR/" + +# Also copy .git directories so fingerprint check can determine submodule commits +echo "Copying .git metadata for fingerprint verification..." +rsync -a .git "$TEMP_DIR/" +rsync -a 3rdparty/ "$TEMP_DIR/3rdparty/" --include='*/.git' --include='*/.git/**' --exclude='*' 2>/dev/null || true + +# Navigate to temp directory and set PYTHONPATH +cd "$TEMP_DIR" +export PYTHONPATH="$TEMP_DIR:${PYTHONPATH:-}" + +echo +echo "Test 2a: Baseline - import nemo_rl should succeed without mutations" +echo -n " Testing python -c 'import nemo_rl' ... 
" +if python -c "import nemo_rl" 2>/dev/null; then + echo "✓ OK" +else + echo "✗ FAILED" + echo "ERROR: Baseline import of nemo_rl failed (should succeed)" + exit 1 +fi + +echo +echo "Test 2b: Pyproject mutation - import nemo_rl should fail after mutation" +echo " Adding newline to top of pyproject.toml..." +# Add a newline to the top of pyproject.toml +echo "" | cat - pyproject.toml > pyproject.toml.tmp && mv pyproject.toml.tmp pyproject.toml + +echo -n " Testing python -c 'import nemo_rl' (should fail) ... " +if python -c "import nemo_rl" 2>/dev/null; then + echo "✗ FAILED" + echo "ERROR: import nemo_rl succeeded after pyproject.toml mutation (should fail)" + exit 1 +else + echo "✓ OK (import failed as expected)" +fi + +# Restore pyproject.toml for next test +echo " Restoring pyproject.toml..." +cd "$PROJECT_ROOT" +rsync -a pyproject.toml "$TEMP_DIR/pyproject.toml" +cd "$TEMP_DIR" + +echo +echo "Test 2c: Submodule mutation - import nemo_rl should fail after updating submodule" +echo " Updating 3rdparty/megatron-lm submodule to HEAD of main branch..." + +# Check if megatron-lm submodule exists +if [ ! -d "3rdparty/megatron-lm/.git" ]; then + echo " WARNING: megatron-lm submodule not initialized, skipping submodule mutation test" +else + cd "$TEMP_DIR/3rdparty/megatron-lm" + + # Fetch latest from remote and checkout main + if git fetch origin main 2>/dev/null && git checkout origin/main 2>/dev/null; then + echo " Successfully updated submodule to latest main" + + cd "$TEMP_DIR" + echo -n " Testing python -c 'import nemo_rl' (should fail) ... 
" + if python -c "import nemo_rl" 2>/dev/null; then + echo "✗ FAILED" + echo "ERROR: import nemo_rl succeeded after submodule mutation (should fail)" + exit 1 + else + echo "✓ OK (import failed as expected)" + fi + else + echo " WARNING: Could not update submodule (network issue?), skipping this test" + fi +fi + +echo +echo "Test 2: PASSED - Mutation detection working correctly" +echo + +# ============================================================================ +# Test 3: Import Isolation Between Worker Environments +# ============================================================================ +echo "Test 3: Verifying import isolation between worker environments" +echo "---------------------------------------------------------------" + +# Return to project root for test 3 +cd "$PROJECT_ROOT" +unset PYTHONPATH + +echo +echo "Test 3a: python-MegatronPolicyWorker should have megatron.core but not nemo_automodel" + +echo -n " Testing python-MegatronPolicyWorker can import megatron.core ... " +if python-MegatronPolicyWorker -c "import megatron.core" 2>/dev/null; then + echo "✓ OK" +else + echo "✗ FAILED" + echo "ERROR: python-MegatronPolicyWorker cannot import megatron.core" + exit 1 +fi + +echo -n " Testing python-MegatronPolicyWorker cannot import nemo_automodel ... " +if python-MegatronPolicyWorker -c "import nemo_automodel" 2>/dev/null; then + echo "✗ FAILED" + echo "ERROR: python-MegatronPolicyWorker can import nemo_automodel (should fail)" + exit 1 +else + echo "✓ OK (import failed as expected)" +fi + +echo +echo "Test 3b: python-DTensorPolicyWorkerV2 should have nemo_automodel but not megatron.core" + +echo -n " Testing python-DTensorPolicyWorkerV2 can import nemo_automodel ... 
" +if python-DTensorPolicyWorkerV2 -c "import nemo_automodel" 2>/dev/null; then + echo "✓ OK" +else + echo "✗ FAILED" + echo "ERROR: python-DTensorPolicyWorkerV2 cannot import nemo_automodel" + exit 1 +fi + +echo -n " Testing python-DTensorPolicyWorkerV2 cannot import megatron.core ... " +if python-DTensorPolicyWorkerV2 -c "import megatron.core" 2>/dev/null; then + echo "✗ FAILED" + echo "ERROR: python-DTensorPolicyWorkerV2 can import megatron.core (should fail)" + exit 1 +else + echo "✓ OK (import failed as expected)" +fi + +echo +echo "Test 3: PASSED - Import isolation working correctly" +echo + +# ============================================================================ +# Summary +# ============================================================================ +echo "==========================================" +echo "All Frozen Environment Tests PASSED ✓" +echo "==========================================" +echo +echo "Summary:" +echo " ✓ Test 1: All ${#PYTHON_EXECUTABLES[@]} python-* executables can import ray" +echo " ✓ Test 2: Mutation detection working (pyproject.toml and submodule changes detected)" +echo " ✓ Test 3: Import isolation between worker environments verified" +echo + diff --git a/tests/unit/test_version_check.py b/tests/unit/test_version_check.py new file mode 100644 index 0000000000..cdd8b7752f --- /dev/null +++ b/tests/unit/test_version_check.py @@ -0,0 +1,460 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for container version fingerprint checking.""" + +import os +from pathlib import Path +from unittest import mock + +import pytest + +# Import the functions before any tests run +from nemo_rl import _check_container_fingerprint + + +class TestContainerFingerprintCheck: + """Test the container fingerprint check functionality.""" + + def test_skip_check_on_baremetal(self, monkeypatch): + """Test that version check is skipped when not in container.""" + # Ensure NRL_CONTAINER is not set + monkeypatch.delenv("NRL_CONTAINER", raising=False) + + # Call should return early without doing anything + _check_container_fingerprint() + + # No exception should be raised + assert True + + def test_check_passes_when_fingerprints_match(self, monkeypatch): + """Test that check passes silently when fingerprints match.""" + import json + + # Set up environment to simulate container + monkeypatch.setenv("NRL_CONTAINER", "1") + + # Create a mock fingerprint dictionary + fingerprint = { + "pyproject.toml": "abc123", + "uv.lock": "def456", + "submodules/third_party/NeMo": "789xyz", + } + fingerprint_json = json.dumps(fingerprint, indent=2, sort_keys=True) + + # Mock runpy to return matching fingerprint + def mock_run_path(path, run_name=None): + print(fingerprint_json) + + with mock.patch("runpy.run_path", side_effect=mock_run_path): + with mock.patch("nemo_rl.Path") as mock_path: + mock_fp_script = mock.MagicMock() + mock_fp_script.exists.return_value = True + + mock_container_fp_file = mock.MagicMock() + mock_container_fp_file.exists.return_value = True + mock_container_fp_file.read_text.return_value = ( + fingerprint_json # Same fingerprint + ) + + def path_constructor(arg): + if "/opt/nemo_rl_container_fingerprint" in str(arg): + return mock_container_fp_file + m = mock.MagicMock() + m.exists.return_value = True + m.__truediv__ = 
mock.MagicMock(return_value=mock_fp_script) + return m + + mock_path.side_effect = path_constructor + + # Should complete without exception + _check_container_fingerprint() + + # No exception raised + assert True + + @pytest.mark.skip(reason="Complex mocking - integration test more appropriate") + def test_check_raises_on_mismatch_without_ignore_flag(self, monkeypatch, tmp_path): + """Test that check raises RuntimeError when fingerprints don't match.""" + # Set up environment to simulate container + monkeypatch.setenv("NRL_CONTAINER", "1") + monkeypatch.delenv("NRL_IGNORE_VERSION_MISMATCH", raising=False) + + # Create actual files with different fingerprints + container_fingerprint = "abc123def456" + code_fingerprint = "different999" + + # Create a fake fingerprint script that just prints code_fingerprint + fake_script = tmp_path / "generate_fingerprint.py" + fake_script.write_text(f"#!/usr/bin/env python3\nprint('{code_fingerprint}')\n") + + # Create container fingerprint file + container_fp_file = tmp_path / "nemo_rl_container_fingerprint" + container_fp_file.write_text(container_fingerprint) + + # Patch Path to point to our temp files + original_path_init = Path.__init__ + + def mock_path_init(self, *args): + path_str = str(args[0]) if args else "" + if "/opt/nemo_rl_container_fingerprint" in path_str: + original_path_init(self, container_fp_file) + elif "generate_fingerprint.py" in path_str: + original_path_init(self, fake_script) + else: + original_path_init(self, *args) + + with mock.patch.object(Path, "__init__", mock_path_init): + # Should raise RuntimeError + with pytest.raises(RuntimeError, match="Container/Code Version Mismatch"): + _check_container_fingerprint() + + @pytest.mark.skip(reason="Complex mocking - integration test more appropriate") + def test_check_logs_warning_with_ignore_flag(self, monkeypatch, caplog): + """Test that check logs warning but continues when NRL_IGNORE_VERSION_MISMATCH is set.""" + # Set up environment to simulate container 
with ignore flag + monkeypatch.setenv("NRL_CONTAINER", "1") + monkeypatch.setenv("NRL_IGNORE_VERSION_MISMATCH", "1") + + # Create a mock fingerprint file with different fingerprint + container_fingerprint = "abc123def456" + code_fingerprint = "different999" + + # Mock runpy to return a different fingerprint + def mock_run_path(path, run_name=None): + print(code_fingerprint) + + with mock.patch("runpy.run_path", side_effect=mock_run_path): + # Mock the Path class + with mock.patch("nemo_rl.Path") as mock_path_class: + mock_repo_root = mock.MagicMock() + mock_fingerprint_script = mock.MagicMock() + mock_fingerprint_script.exists.return_value = True + mock_repo_root.__truediv__ = mock.MagicMock( + return_value=mock_fingerprint_script + ) + + mock_container_fp = mock.MagicMock() + mock_container_fp.exists.return_value = True + mock_container_fp.read_text.return_value = container_fingerprint + + def path_side_effect(arg): + if str(arg) == "/opt/nemo_rl_container_fingerprint": + return mock_container_fp + return Path(arg) + + mock_path_class.side_effect = path_side_effect + + from nemo_rl import _check_container_fingerprint + + # Should not raise, just log warning + _check_container_fingerprint() + + # No exception raised + assert True + + @pytest.mark.skip(reason="Complex mocking - integration test more appropriate") + def test_check_handles_missing_fingerprint_file(self, monkeypatch): + """Test that check handles missing container fingerprint gracefully.""" + # Set up environment to simulate container + monkeypatch.setenv("NRL_CONTAINER", "1") + + # Mock runpy to return a fingerprint + def mock_run_path(path, run_name=None): + print("abc123") + + with mock.patch("runpy.run_path", side_effect=mock_run_path): + with mock.patch("nemo_rl.Path") as mock_path_class: + mock_fingerprint_script = mock.MagicMock() + mock_fingerprint_script.exists.return_value = True + + mock_container_fp = mock.MagicMock() + mock_container_fp.exists.return_value = False # Missing file + + def 
path_side_effect(arg): + if str(arg) == "/opt/nemo_rl_container_fingerprint": + return mock_container_fp + elif "generate_fingerprint.py" in str(arg): + return mock_fingerprint_script + return Path(arg) + + mock_path_class.side_effect = path_side_effect + + from nemo_rl import _check_container_fingerprint + + # Should not raise exception + _check_container_fingerprint() + + assert True + + @pytest.mark.skip(reason="Complex mocking - integration test more appropriate") + def test_check_handles_runpy_failure(self, monkeypatch): + """Test that check handles runpy failures gracefully.""" + # Set up environment to simulate container + monkeypatch.setenv("NRL_CONTAINER", "1") + + # Mock runpy to raise an exception + def mock_run_path(path, run_name=None): + raise RuntimeError("Error generating fingerprint") + + with mock.patch("runpy.run_path", side_effect=mock_run_path): + with mock.patch("nemo_rl.Path") as mock_path_class: + mock_fingerprint_script = mock.MagicMock() + mock_fingerprint_script.exists.return_value = True + + def path_side_effect(arg): + if "generate_fingerprint.py" in str(arg): + return mock_fingerprint_script + return Path(arg) + + mock_path_class.side_effect = path_side_effect + + from nemo_rl import _check_container_fingerprint + + # Should not raise exception (handles gracefully) + _check_container_fingerprint() + + assert True + + +class TestBuildIsolationDetection: + """Test the build isolation detection functionality with real uv commands.""" + + @pytest.fixture + def dummy_project(self, tmp_path): + """Create a minimal dummy project for testing.""" + import subprocess + + project_dir = tmp_path / "dummy_project" + project_dir.mkdir() + + # Get the nemo_rl project root + nemo_rl_root = Path(__file__).parent.parent.parent + + # Create a minimal pyproject.toml that imports nemo_rl + pyproject = project_dir / "pyproject.toml" + pyproject.write_text(f"""[project] +name = "dummy-test-package" +version = "0.1.0" +dependencies = [ + "nemo-rl @ 
file://{nemo_rl_root}", +] + +[build-system] +requires = ["setuptools", "nemo-rl @ file://{nemo_rl_root}"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["dummy_pkg"] +""") + + # Create a minimal package + pkg_dir = project_dir / "dummy_pkg" + pkg_dir.mkdir() + + # Create a file to log build isolation detection results + log_file = project_dir / "build_isolation_log.txt" + + init_file = pkg_dir / "__init__.py" + init_file.write_text("""__version__ = "0.1.0" +""") + + # Create a setup.py that will be executed during build + setup_py = project_dir / "setup.py" + setup_py.write_text(f"""import sys +import os +from pathlib import Path +from setuptools import setup + +# Log file to write build isolation detection results +log_file = Path(r"{log_file}") + +# Set ignore flag to avoid actual fingerprint check failing +os.environ["NRL_IGNORE_VERSION_MISMATCH"] = "1" + +try: + # Import and test the build isolation detection + from nemo_rl import _is_build_isolation + result = _is_build_isolation() + + # Write results to log file + with open(log_file, "a") as f: + f.write(f"PREFIX:{{sys.prefix}}\\n") + f.write(f"IS_BUILD_ISOLATION:{{result}}\\n") + f.write("---\\n") +except Exception as e: + # Write error to log file + with open(log_file, "a") as f: + f.write(f"ERROR:{{e}}\\n") + f.write("---\\n") + +# Call setup() - setuptools will read pyproject.toml for config +setup() +""") + + # Initialize uv.lock with isolated environment + dummy_venv = project_dir / ".venv" + test_env = { + **os.environ, + "NRL_IGNORE_VERSION_MISMATCH": "1", + "UV_PROJECT_ENVIRONMENT": str(dummy_venv), + } + + try: + subprocess.run( + ["uv", "lock"], + cwd=project_dir, + capture_output=True, + check=True, + timeout=30, + env=test_env, + ) + except subprocess.TimeoutExpired: + pytest.skip("uv lock timed out") + except Exception as e: + pytest.skip(f"Failed to initialize dummy project: {e}") + + return project_dir + + def test_build_isolation_detected_during_uv_sync(self, 
dummy_project): + """Test that build isolation is detected during uv sync.""" + import subprocess + + log_file = dummy_project / "build_isolation_log.txt" + + # Clear log file if it exists + if log_file.exists(): + log_file.unlink() + + # Touch the package to force a rebuild + init_file = dummy_project / "dummy_pkg" / "__init__.py" + init_file.touch() + + # Set up isolated environment for the dummy project + dummy_venv = dummy_project / ".venv" + test_env = { + **os.environ, + "NRL_IGNORE_VERSION_MISMATCH": "1", + "UV_PROJECT_ENVIRONMENT": str(dummy_venv), + } + + # Run uv sync which will trigger a build + result = subprocess.run( + ["uv", "sync"], + cwd=dummy_project, + capture_output=True, + text=True, + timeout=60, + env=test_env, + ) + + # Read the log file written during build + assert log_file.exists(), ( + f"Log file not created. uv sync output:\n" + f"STDOUT: {result.stdout}\n" + f"STDERR: {result.stderr}" + ) + + log_content = log_file.read_text() + log_lines = log_content.strip().split("\n") + + # Look for our markers in the log + prefix_lines = [line for line in log_lines if "PREFIX:" in line] + isolation_lines = [line for line in log_lines if "IS_BUILD_ISOLATION:" in line] + + # During build, we should see at least one invocation with build isolation + assert len(prefix_lines) > 0, f"No prefix lines found in log:\n{log_content}" + assert len(isolation_lines) > 0, ( + f"No isolation detection lines found in log:\n{log_content}" + ) + + # Check that at least one prefix contains /builds-v (build isolation) + has_build_isolation = any("/builds-v" in line for line in prefix_lines) + assert has_build_isolation, ( + f"Expected /builds-v in at least one prefix:\n{'\n'.join(prefix_lines)}" + ) + + # Check that at least one isolation check returned True + has_true_isolation = any( + "IS_BUILD_ISOLATION:True" in line for line in isolation_lines + ) + assert has_true_isolation, ( + f"Expected at least one True isolation detection:\n{'\n'.join(isolation_lines)}" + ) 
+ + def test_build_isolation_not_detected_during_uv_run(self, dummy_project): + """Test that build isolation is NOT detected during uv run.""" + import subprocess + + log_file = dummy_project / "build_isolation_log.txt" + + # Clear log file + if log_file.exists(): + log_file.unlink() + + # Set up isolated environment for the dummy project + dummy_venv = dummy_project / ".venv" + test_env = { + **os.environ, + "NRL_IGNORE_VERSION_MISMATCH": "1", + "UV_PROJECT_ENVIRONMENT": str(dummy_venv), + } + + # Run a simple command with uv run that writes to our log file + result = subprocess.run( + [ + "uv", + "run", + "python", + "-c", + f"import sys; import os; " + f"os.environ['NRL_IGNORE_VERSION_MISMATCH']='1'; " + f"from nemo_rl import _is_build_isolation; " + f"from pathlib import Path; " + f"log = Path(r'{log_file}'); " + f"log.write_text(f'PREFIX:{{sys.prefix}}\\nIS_BUILD_ISOLATION:{{_is_build_isolation()}}\\n')", + ], + cwd=dummy_project, + capture_output=True, + text=True, + timeout=60, + env=test_env, + ) + + # Read the log file + assert log_file.exists(), ( + f"Log file not created. 
uv run output:\n{result.stdout}\n{result.stderr}" + ) + + log_content = log_file.read_text() + + # During uv run, we should NOT be in build isolation + assert "/builds-v" not in log_content, ( + f"Unexpected build isolation path in uv run:\n{log_content}" + ) + assert "IS_BUILD_ISOLATION:False" in log_content, ( + f"Expected IS_BUILD_ISOLATION:False in uv run:\n{log_content}" + ) + + def test_fingerprint_check_skipped_with_force_rebuild_venvs(self, monkeypatch): + """Test that fingerprint check is skipped when NRL_FORCE_REBUILD_VENVS=true.""" + # Set up environment to simulate container with force rebuild + monkeypatch.setenv("NRL_CONTAINER", "1") + monkeypatch.setenv("NRL_FORCE_REBUILD_VENVS", "true") + + # Should complete without exception (check is skipped) + _check_container_fingerprint() + + # No exception raised + assert True diff --git a/tools/build-custom-vllm.sh b/tools/build-custom-vllm.sh index 399f361643..260dae7295 100644 --- a/tools/build-custom-vllm.sh +++ b/tools/build-custom-vllm.sh @@ -66,7 +66,7 @@ uv run --no-project use_existing_torch.py echo "Installing dependencies..." uv pip install --upgrade pip uv pip install numpy setuptools setuptools_scm -uv pip install torch==2.8.0 --torch-backend=cu128 +uv pip install torch==2.8.0 --torch-backend=cu129 # Install vLLM using precompiled wheel echo "Installing vLLM with precompiled wheel..." diff --git a/tools/generate_fingerprint.py b/tools/generate_fingerprint.py new file mode 100755 index 0000000000..2a23578804 --- /dev/null +++ b/tools/generate_fingerprint.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate a fingerprint for the NeMo RL codebase. + +This script computes hashes for individual dependency components: +- pyproject.toml contents +- uv.lock contents +- Git submodule commit SHAs + +The fingerprint is printed to stdout as JSON and can be used to detect container/code drift. +This script uses ONLY Python stdlib (no external packages) for maximum portability. + +Usage: + python tools/generate_fingerprint.py + +Output: + JSON object mapping component names to their hashes/commits +""" + +import hashlib +import json +import subprocess +from pathlib import Path + + +def get_repo_root() -> Path: + """Get the repository root directory relative to this script.""" + script_dir = Path(__file__).parent.resolve() + repo_root = script_dir.parent + return repo_root + + +def compute_file_hash(file_path: Path) -> str: + """Compute MD5 hash of a file's contents.""" + if not file_path.exists(): + return "missing" + + md5 = hashlib.md5() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + md5.update(chunk) + return md5.hexdigest() + + +def get_submodule_shas(repo_root: Path) -> dict[str, str]: + """Get commit SHAs for all git submodules. 
+
+    Returns:
+        Dictionary mapping submodule path to commit SHA
+    """
+    submodules = {}
+
+    try:
+        # Run git submodule status to get current commits
+        result = subprocess.run(
+            # Add --git-dir and --work-tree to ensure we can run git without the safe.directory check
+            [
+                "git",
+                f"--git-dir={repo_root}/.git",
+                f"--work-tree={repo_root}",
+                "submodule",
+                "status",
+            ],
+            cwd=repo_root,
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+
+        # Parse output: " <sha> <path> (<describe>)" or "+<sha> <path> (<describe>)"
+        for line in result.stdout.strip().split("\n"):
+            if not line:
+                continue
+
+            parts = line.strip().split()
+            if len(parts) >= 2:
+                # Remove leading +/- indicators
+                commit = parts[0].lstrip("+-")
+                path = parts[1]
+                submodules[path] = commit
+
+    except subprocess.CalledProcessError:
+        # If git command fails, return empty dict (e.g., not in a git repo)
+        pass
+    except FileNotFoundError:
+        # Git not available
+        pass
+
+    return submodules
+
+
+def generate_fingerprint() -> dict[str, str]:
+    """Generate a fingerprint for the current codebase state.
+ + Returns: + Dictionary mapping component names to their hashes/commits: + - "pyproject.toml": MD5 hash of pyproject.toml + - "uv.lock": MD5 hash of uv.lock + - "submodules/": Commit SHA for each submodule + """ + repo_root = get_repo_root() + + fingerprint = {} + + # Hash pyproject.toml + fingerprint["pyproject.toml"] = compute_file_hash(repo_root / "pyproject.toml") + + # Hash uv.lock + fingerprint["uv.lock"] = compute_file_hash(repo_root / "uv.lock") + + # Get submodule SHAs (sorted by path for consistency) + submodules = get_submodule_shas(repo_root) + for path, sha in sorted(submodules.items()): + fingerprint[f"submodules/{path}"] = sha + + return fingerprint + + +def main(): + """Main entry point: print fingerprint JSON to stdout.""" + fingerprint = generate_fingerprint() + print(json.dumps(fingerprint, indent=2, sort_keys=True)) + + +if __name__ == "__main__": + main() diff --git a/tools/list_editable_packages.py b/tools/list_editable_packages.py new file mode 100755 index 0000000000..68f1f19a93 --- /dev/null +++ b/tools/list_editable_packages.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""List editable packages for all python executables in PATH. + +This utility helps users identify which packages can be mounted for development. +It searches for all python* executables in PATH and lists their editable installs. 
+""" + +import json +import os +import re +import subprocess +import sys + + +def find_python_executables(): + """Find all python* executables in PATH. + + Returns: + List of (name, path) tuples for python executables + """ + # Pattern to match: + # - python (exact match, as representative of driver script's python) + # - python-* wrapper scripts (like python-AsyncTrajectoryCollector, python-DTensorPolicyWorker, etc.) + # Excludes python3, python3.12, etc. and argcomplete-related scripts + python_pattern = re.compile(r"^python$|^python-(?!.*argcomplete).*$") + + executables = [] + path_dirs = os.environ.get("PATH", "").split(os.pathsep) + + seen = set() + for path_dir in path_dirs: + if not path_dir or not os.path.isdir(path_dir): + continue + + try: + for entry in os.listdir(path_dir): + # Filter by pattern (includes argcomplete exclusion via negative lookahead) + if python_pattern.match(entry) and entry not in seen: + full_path = os.path.join(path_dir, entry) + if os.path.isfile(full_path) and os.access(full_path, os.X_OK): + # Verify it's actually a python executable + try: + result = subprocess.run( + [full_path, "--version"], + capture_output=True, + timeout=2, + ) + if result.returncode == 0: + executables.append((entry, full_path)) + seen.add(entry) + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + except (PermissionError, OSError): + continue + + # Sort by name for consistent output + executables.sort(key=lambda x: x[0]) + return executables + + +def get_editable_packages(python_exe): + """Get list of editable packages for a python executable. 
+ + Args: + python_exe: Path to python executable + + Returns: + List of (package_name, location) tuples for editable packages + """ + try: + result = subprocess.run( + [python_exe, "-m", "pip", "list", "--format=json", "--editable"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode != 0: + return None + + packages = json.loads(result.stdout) + editable_packages = [] + + for pkg in packages: + # Get more details about the package location + show_result = subprocess.run( + [python_exe, "-m", "pip", "show", pkg["name"]], + capture_output=True, + text=True, + timeout=5, + ) + + if show_result.returncode == 0: + location = None + editable_location = None + + for line in show_result.stdout.split("\n"): + if line.startswith("Location:"): + location = line.split(":", 1)[1].strip() + elif line.startswith("Editable project location:"): + editable_location = line.split(":", 1)[1].strip() + + # Prefer editable location if available + final_location = editable_location or location + if final_location: + editable_packages.append((pkg["name"], final_location)) + + return editable_packages + + except ( + subprocess.TimeoutExpired, + subprocess.CalledProcessError, + json.JSONDecodeError, + ): + return None + + +def main(): + """Main entry point: list editable packages for all python executables.""" + print("Searching for python executables in PATH...") + print() + + executables = find_python_executables() + + if not executables: + print("No python executables found in PATH.") + return 1 + + found_any_editable = False + + for name, path in executables: + editable_packages = get_editable_packages(path) + + if editable_packages is None: + continue # Skip executables where pip list failed + + if not editable_packages: + continue # Skip executables with no editable packages + + found_any_editable = True + print(f"{name}:") + for pkg_name, pkg_location in sorted(editable_packages): + print(f" - {pkg_name}: {pkg_location}") + print() + + if not 
found_any_editable: + print("No editable packages found in any python executable.") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/uv.lock b/uv.lock index 7b06abd41f..3886a5f5f1 100644 --- a/uv.lock +++ b/uv.lock @@ -3218,6 +3218,7 @@ dependencies = [ { name = "nvtx" }, { name = "omegaconf" }, { name = "pillow" }, + { name = "pip" }, { name = "plotly" }, { name = "pyzmq" }, { name = "ray", extra = ["default"] }, @@ -3294,9 +3295,13 @@ docs = [ { name = "sphinx-autobuild" }, { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, +<<<<<<< HEAD { name = "sphinx-design" }, { name = "sphinxcontrib-mermaid" }, { name = "swagger-plugin-for-sphinx" }, +======= + { name = "sphinxcontrib-mermaid" }, +>>>>>>> 7d1d7f2d (feat: allow uv-less execution and fingerprint the environment) ] test = [ { name = "pytest" }, @@ -3340,6 +3345,7 @@ requires-dist = [ { name = "omegaconf" }, { name = "penguin", marker = "extra == 'penguin'", editable = "3rdparty/Penguin-workspace" }, { name = "pillow", specifier = ">=11.3.0" }, + { name = "pip" }, { name = "plotly" }, { name = "pyzmq" }, { name = "ray", extras = ["default"], specifier = "==2.49.2" }, @@ -3391,9 +3397,13 @@ docs = [ { name = "sphinx-autobuild" }, { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, +<<<<<<< HEAD { name = "sphinx-design" }, { name = "sphinxcontrib-mermaid" }, { name = "swagger-plugin-for-sphinx" }, +======= + { name = "sphinxcontrib-mermaid" }, +>>>>>>> 7d1d7f2d (feat: allow uv-less execution and fingerprint the environment) ] test = [ { name = "pytest", specifier = ">=7.0.0" }, @@ -4251,6 +4261,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, ] +[[package]] +name = "pip" +version = "25.3" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/6e/74a3f0179a4a73a53d66ce57fdb4de0080a8baa1de0063de206d6167acc2/pip-25.3.tar.gz", hash = "sha256:8d0538dbbd7babbd207f261ed969c65de439f6bc9e5dbd3b3b9a77f25d95f343", size = 1803014, upload-time = "2025-10-25T00:55:41.394Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/3c/d717024885424591d5376220b5e836c2d5293ce2011523c9de23ff7bf068/pip-25.3-py3-none-any.whl", hash = "sha256:9655943313a94722b7774661c21049070f6bbb0a1516bf02f7c8d5d9201514cd", size = 1778622, upload-time = "2025-10-25T00:55:39.247Z" }, +] + [[package]] name = "platformdirs" version = "4.3.8" From 907dc91461fd6569ff2ba6997b56b3fb891e300f Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 18 Nov 2025 07:02:30 +0000 Subject: [PATCH 2/4] fix uv.lock Signed-off-by: Terry Kong --- uv.lock | 8 -------- 1 file changed, 8 deletions(-) diff --git a/uv.lock b/uv.lock index 3886a5f5f1..3ba086094e 100644 --- a/uv.lock +++ b/uv.lock @@ -3295,13 +3295,9 @@ docs = [ { name = "sphinx-autobuild" }, { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, -<<<<<<< HEAD { name = "sphinx-design" }, { name = "sphinxcontrib-mermaid" }, { name = "swagger-plugin-for-sphinx" }, -======= - { name = "sphinxcontrib-mermaid" }, ->>>>>>> 7d1d7f2d (feat: allow uv-less execution and fingerprint the environment) ] test = [ { name = "pytest" }, @@ -3397,13 +3393,9 @@ docs = [ { name = "sphinx-autobuild" }, { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, -<<<<<<< HEAD { name = "sphinx-design" }, { name = "sphinxcontrib-mermaid" }, { name = "swagger-plugin-for-sphinx" }, -======= - { name = "sphinxcontrib-mermaid" }, ->>>>>>> 7d1d7f2d (feat: allow uv-less execution and fingerprint the environment) ] test = [ { name = "pytest", specifier = ">=7.0.0" }, From 29f027aac1040464779d3358bac7f988fa7fe145 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 18 Nov 2025 07:05:24 +0000 Subject: [PATCH 3/4] lint 
Signed-off-by: Terry Kong --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 51806191b7..25d3da1aec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ requires-python = ">=3.12" license = { text = "Apache 2.0" } dependencies = [ "setuptools", - "pip", # Required for frozen environments; uv venv --seed may not reliably install pip + "pip", # Required for frozen environments; uv venv --seed may not reliably install pip "ninja", # for flash-attn parallel build "torch==2.8.0", "triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", From 20e266b0284eb2ace103d91675fe9bb98b7cacf2 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 18 Nov 2025 22:11:54 +0000 Subject: [PATCH 4/4] fix custom vllm Signed-off-by: Terry Kong --- docker/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d9a2184c62..f0c798e7e3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,3 +1,4 @@ +# syntax=docker/dockerfile:1 # Usage: # Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag /nemo-rl:latest --push . # Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag /nemo-rl:r0.3.0 --push . @@ -122,7 +123,8 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" ENV NEMO_RL_VENV_DIR=/opt/ray_venvs # Copy in source from build context (defaults to cloned repo, can be overridden) -COPY --from=nemo-rl . /opt/nemo-rl +# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh +COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . /opt/nemo-rl # Unshallow the repo to get the full history (in the case it was from the scratch layer). 
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history), # so do a quick check before trying to unshallow.