Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ jobs:
image-name: nemo_rl_container
dockerfile: docker/Dockerfile
image-label: nemo-rl
target: hermetic
target: release
build-contexts: |
nemo-rl=${{ github.run_id }}/
build-args: |
Expand Down
13 changes: 12 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# syntax=docker/dockerfile:1
# Usage:
# Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile --tag <registry>/nemo-rl:latest --push .
# Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push .
Expand All @@ -10,6 +11,8 @@ ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

FROM ${BASE_IMAGE} AS base
# An environment variable to indicate that we are in a container.
ENV NRL_CONTAINER=1

# It is more convenient for users to run as root
USER root
Expand Down Expand Up @@ -76,10 +79,13 @@ ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"

# First copy only the dependency files
COPY --from=nemo-rl pyproject.toml uv.lock ./
# Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist.
COPY --from=nemo-rl nemo_rl/__init__.py nemo_rl/package_info.py ./nemo_rl/
COPY --from=nemo-rl tools/build-custom-vllm.sh ./tools/build-custom-vllm.sh
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/

RUN <<"EOF" bash -exu
uv venv --seed
if [[ -n "${BUILD_CUSTOM_VLLM:-}" ]]; then
bash tools/build-custom-vllm.sh
source 3rdparty/vllm/nemo-rl.env
Expand Down Expand Up @@ -117,10 +123,15 @@ LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
COPY --from=nemo-rl . /opt/nemo-rl
# Exclude pyproject.toml and uv.lock since those may be altered by build-custom-vllm.sh
COPY --from=nemo-rl --exclude=pyproject.toml --exclude=uv.lock . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
RUN UV_LINK_MODE=symlink uv run nemo_rl/utils/prefetch_venvs.py

# Generate container fingerprint for frozen environment support
# Store outside /opt/nemo-rl to avoid being overwritten by user mounts
RUN python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint

1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
"tasklist", # Adds support for GitHub-style task lists with [ ] and [x]
]
myst_heading_anchors = 5 # Generates anchor links for headings up to level 5
myst_fence_as_directive = ["mermaid"] # Treat ```mermaid blocks as directives

# -- Options for Autodoc2 ---------------------------------------------------
sys.path.insert(0, os.path.abspath(".."))
Expand Down
343 changes: 343 additions & 0 deletions docs/design-docs/dependency-management.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ design-docs/design-and-philosophy.md
design-docs/padding.md
design-docs/logger.md
design-docs/uv.md
design-docs/dependency-management.md
design-docs/chat-datasets.md
design-docs/generation.md
design-docs/checkpointing.md
Expand Down
159 changes: 159 additions & 0 deletions nemo_rl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@
import sys
from pathlib import Path

# Configure logging to show file location for warnings
logging.basicConfig(
format="%(levelname)s:%(name)s:%(filename)s:%(lineno)d: %(message)s",
level=logging.WARNING,
)

"""
This is a workaround to ensure that, whenever NeMo RL is imported, we
add Megatron-LM to the python path. This is because the only sub-package
Expand Down Expand Up @@ -49,6 +55,159 @@
os.environ["RAY_ENABLE_UV_RUN_RUNTIME_ENV"] = "0"


def _is_build_isolation():
    """Report whether the interpreter lives in a uv build-isolation environment.

    ``uv lock`` / ``uv sync`` build packages and introspect their metadata from
    a temporary environment under the uv cache (``~/.cache/uv/builds-v*/``).
    The container fingerprint check is skipped in that context because the
    user is in the middle of updating dependencies.

    Returns:
        True if ``sys.prefix`` contains the ``/builds-v`` marker, False otherwise.
    """
    # uv build environments are expected to look like:
    #   /root/.cache/uv/builds-v0/.tmp*/
    uv_build_marker = "/builds-v"
    interpreter_prefix = sys.prefix
    return interpreter_prefix.find(uv_build_marker) != -1


def _compute_code_fingerprint(fingerprint_script):
    """Run the fingerprint script in-process and return what it printed to stdout.

    Args:
        fingerprint_script: Path to ``tools/generate_fingerprint.py``.

    Returns:
        The captured stdout, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If the script terminates via ``sys.exit()`` with a
            failing status. Any exception raised by the script itself
            propagates unchanged.
    """
    import contextlib
    import io
    import runpy

    captured_output = io.StringIO()
    try:
        # redirect_stdout restores sys.stdout even if the script raises.
        with contextlib.redirect_stdout(captured_output):
            runpy.run_path(str(fingerprint_script), run_name="__main__")
    except SystemExit as exit_request:
        # The script runs as __main__ and may legitimately end with sys.exit().
        # SystemExit is not an Exception subclass, so without this it would
        # escape the caller's best-effort handling and abort the import.
        if exit_request.code not in (None, 0):
            raise RuntimeError(
                f"fingerprint script exited with status {exit_request.code}"
            ) from None
    return captured_output.getvalue().strip()


def _collect_fingerprint_differences():
    """Compare the current code fingerprint with the one stored in the container.

    Returns:
        A list of human-readable difference lines (empty when the fingerprints
        match), or None when the comparison had to be skipped because the
        fingerprint script, its output, or the container fingerprint file is
        missing. A warning is logged in the skipped cases.
    """
    import json

    # Get repo root (relative to this module)
    repo_root = Path(__file__).parent.parent
    fingerprint_script = repo_root / "tools" / "generate_fingerprint.py"
    if not fingerprint_script.exists():
        logging.warning(
            f"Fingerprint script not found at {fingerprint_script}, skipping version check"
        )
        return None

    current_fingerprint_json = _compute_code_fingerprint(fingerprint_script)
    if not current_fingerprint_json:
        logging.warning("Failed to compute code fingerprint: empty output")
        return None
    current_fingerprint = json.loads(current_fingerprint_json)

    # The container fingerprint is written at image build time.
    container_fingerprint_file = Path("/opt/nemo_rl_container_fingerprint")
    if not container_fingerprint_file.exists():
        logging.warning("Container fingerprint file not found, skipping version check")
        return None
    container_fingerprint = json.loads(container_fingerprint_file.read_text().strip())

    differences = []
    for key in sorted(set(current_fingerprint.keys()) | set(container_fingerprint.keys())):
        current_val = current_fingerprint.get(key, "missing")
        container_val = container_fingerprint.get(key, "missing")
        if current_val != container_val:
            differences.append(f" - {key}:")
            differences.append(f" Container: {container_val}")
            differences.append(f" Current: {current_val}")
    return differences


def _format_fingerprint_mismatch(differences):
    """Render the user-facing banner describing a container/code mismatch."""
    diff_text = "\n".join(differences)
    sep_line = "\n" + ("-" * 80)
    return (
        f"{sep_line}\n"
        "WARNING: Container/Code Version Mismatch Detected!\n"
        f"{sep_line}\n"
        "Your container's dependencies do not match your current code.\n"
        "\n"
        "Differences found:\n"
        f"{diff_text}\n"
        "\n"
        "This can lead to unexpected behavior or errors.\n"
        "\n"
        "Solutions:\n"
        " 1. Rebuild the container to match your code\n"
        " 2. Set NRL_FORCE_REBUILD_VENVS=true to rebuild virtual environments\n"
        " (This forces Ray workers to recreate their venvs with updated dependencies)\n"
        " 3. Update the container fingerprint to match your current code (for local dev):\n"
        " python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint\n"
        " 4. Set NRL_IGNORE_VERSION_MISMATCH=1 to bypass this check (not recommended)\n"
        "\n"
        "Learn more about dependency management:\n"
        " https://github.com/NVIDIA-NeMo/RL/blob/main/docs/design-docs/dependency-management.md\n"
        f"{sep_line}\n"
    )


def _check_container_fingerprint():
    """Check if container dependencies match the current code (container-only).

    The check only runs when ``NRL_CONTAINER`` is set (inside containers). It
    compares the container's fingerprint (computed at build time and stored in
    ``/opt/nemo_rl_container_fingerprint``) with the fingerprint of the current
    code to detect dependency drift.

    The check is skipped when:
      * ``NRL_CONTAINER`` is not set;
      * ``NRL_FORCE_REBUILD_VENVS=true`` is set, since rebuilding the venvs
        makes dependencies consistent regardless of a mismatch;
      * we are inside a uv build-isolation environment (``uv lock``/``uv sync``).

    Failures of the check itself (unreadable or invalid fingerprints, errors
    raised by the fingerprint script, the script calling ``sys.exit``) are
    non-fatal and only logged at debug level.

    Raises:
        RuntimeError: Only when the fingerprints differ and
            ``NRL_IGNORE_VERSION_MISMATCH`` is not set.
    """
    # Skip check if not in container or if we're going to force venv rebuild anyway
    if not os.environ.get("NRL_CONTAINER"):
        return
    if os.environ.get("NRL_FORCE_REBUILD_VENVS", "").lower() == "true":
        logging.info(
            "Skipping container fingerprint check because NRL_FORCE_REBUILD_VENVS=true (venvs will be rebuilt anyway)"
        )
        return

    # Skip check if we're in a build isolation environment (e.g., during uv lock/sync)
    if _is_build_isolation():
        logging.debug(
            "Skipping container fingerprint check because we're in a build isolation environment"
        )
        return

    # Best-effort phase: nothing raised while *computing* the comparison may
    # crash the import. The mismatch error is raised outside this block so an
    # unrelated RuntimeError from the fingerprint script is not mistaken for it.
    try:
        differences = _collect_fingerprint_differences()
    except Exception as e:
        logging.debug(f"Version check failed (non-fatal): {e}")
        return

    if differences is None:
        # Comparison was skipped; a warning has already been logged.
        return
    if not differences:
        logging.debug("Container fingerprint matches code fingerprint")
        return

    warning_msg = _format_fingerprint_mismatch(differences)
    if os.environ.get("NRL_IGNORE_VERSION_MISMATCH"):
        logging.warning(
            warning_msg + "Proceeding anyway (NRL_IGNORE_VERSION_MISMATCH is set)..."
        )
        return
    raise RuntimeError(
        warning_msg + "To proceed anyway, set: export NRL_IGNORE_VERSION_MISMATCH=1"
    )


# Perform container version check as an import-time side effect of this module.
# On a fingerprint mismatch this raises RuntimeError unless
# NRL_IGNORE_VERSION_MISMATCH is set.
_check_container_fingerprint()


def _patch_nsight_file():
"""Patch the nsight.py file to fix the context.py_executable assignment.

Expand Down
96 changes: 96 additions & 0 deletions nemo_rl/utils/prefetch_venvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from pathlib import Path

from nemo_rl.distributed.ray_actor_environment_registry import (
ACTOR_ENVIRONMENT_REGISTRY,
Expand Down Expand Up @@ -52,6 +54,100 @@ def prefetch_venvs():

print("\nVenv prefetching complete!")

# Create convenience python wrapper scripts for frozen environment support (container-only)
create_frozen_environment_symlinks(venv_configs)


def create_frozen_environment_symlinks(venv_configs):
"""Create python-{ClassName} wrapper scripts in /usr/local/bin for frozen environment support.

Only runs in container (when NRL_CONTAINER=1 is set).

Args:
venv_configs: Dictionary mapping py_executable to list of actor FQNs
"""
# Only create wrapper scripts in container
if not os.environ.get("NRL_CONTAINER"):
print(
"\nSkipping frozen environment wrapper script creation (not in container)"
)
return

print("\nCreating frozen environment wrapper scripts...")

# Collect all wrapper mappings: class_name -> venv_path
wrapper_mappings = {}

for py_executable, actor_fqns in venv_configs.items():
for actor_fqn in actor_fqns:
# Extract class name from FQN (last part)
# e.g., "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker" -> "MegatronPolicyWorker"
class_name = actor_fqn.split(".")[-1]

# Get the venv path that was created
try:
python_path = create_local_venv(py_executable, actor_fqn)

# Check for collisions
if class_name in wrapper_mappings:
existing_path = wrapper_mappings[class_name]
if existing_path != python_path:
raise RuntimeError(
f"Collision detected: Multiple venvs want to use name '{class_name}'\n"
f" Existing: {existing_path}\n"
f" New: {python_path}\n"
f"This indicates two different worker classes have the same name."
)
else:
wrapper_mappings[class_name] = python_path
except Exception as e:
print(f" Warning: Could not get venv path for {actor_fqn}: {e}")
continue

# Create wrapper scripts
wrapper_dir = Path("/usr/local/bin")
created_wrappers = []

for class_name, python_path in sorted(wrapper_mappings.items()):
wrapper_name = f"python-{class_name}"
wrapper_path = wrapper_dir / wrapper_name

# Get the venv directory path (parent of bin/python)
venv_path = Path(python_path).parent.parent

# Create wrapper script content
wrapper_content = f"""#!/bin/bash
VENV_PATH="{venv_path}"
export VIRTUAL_ENV="$VENV_PATH"
export PATH="$VENV_PATH/bin:$PATH"
exec "$VENV_PATH/bin/python" "$@"
"""

try:
# Remove existing wrapper if present
if wrapper_path.exists() or wrapper_path.is_symlink():
wrapper_path.unlink()

# Write wrapper script
wrapper_path.write_text(wrapper_content)

# Make executable
wrapper_path.chmod(0o755)

created_wrappers.append(wrapper_name)
print(f" Created: {wrapper_name} -> {python_path}")
except Exception as e:
print(f" Warning: Could not create wrapper script {wrapper_name}: {e}")
continue

if created_wrappers:
print(f"\nCreated {len(created_wrappers)} frozen environment wrapper scripts")
print("Users can now use these python executables directly:")
for name in created_wrappers:
print(f" - {name}")
else:
print("\nNo frozen environment wrapper scripts were created")


if __name__ == "__main__":
prefetch_venvs()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ requires-python = ">=3.12"
license = { text = "Apache 2.0" }
dependencies = [
"setuptools",
"pip", # Required for frozen environments; uv venv --seed may not reliably install pip
"ninja", # for flash-attn parallel build
"torch==2.8.0",
"triton; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')",
Expand Down
4 changes: 4 additions & 0 deletions tests/functional/L1_Functional_Tests_GPU.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
PROJECT_ROOT=$(realpath ${SCRIPT_DIR}/../..)

cd ${PROJECT_ROOT}
# This test is intentionally not run with uv run --no-sync to verify that the frozen environment is working correctly.
time bash ./tests/functional/grpo_frozen_env.sh
time bash ./tests/functional/test_frozen_env.sh

time uv run --no-sync bash ./tests/functional/sft.sh
time uv run --no-sync bash ./tests/functional/grpo.sh
time uv run --no-sync bash ./tests/functional/grpo_async.sh
Expand Down
Loading
Loading