|
16 | 16 | import sys |
17 | 17 | from pathlib import Path |
18 | 18 |
|
# Configure root logging so every record shows its level, logger name, and the
# source location (file name and line number) that emitted it; WARNING is the
# threshold, so INFO/DEBUG records are dropped by default.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so an application that configured logging earlier keeps its setup.
logging.basicConfig(
    format="%(levelname)s:%(name)s:%(filename)s:%(lineno)d: %(message)s",
    level=logging.WARNING,
)
| 24 | + |
19 | 25 | """ |
20 | 26 | This is a work around to ensure whenever NeMo RL is imported, that we |
21 | 27 | add Megatron-LM to the python path. This is because the only sub-package |
|
49 | 55 | os.environ["RAY_ENABLE_UV_RUN_RUNTIME_ENV"] = "0" |
50 | 56 |
|
51 | 57 |
|
def _is_build_isolation():
    """Report whether the interpreter lives in a uv build-isolation environment.

    During `uv lock` / `uv sync`, uv builds packages and introspects metadata
    from a temporary isolated environment under ~/.cache/uv/builds-v*/
    (for example /root/.cache/uv/builds-v0/.tmp*/). The container fingerprint
    check is skipped in that situation because the user is in the middle of
    updating dependencies.

    Returns:
        True when ``sys.prefix`` contains the "/builds-v" path marker,
        False otherwise.
    """
    # The marker is matched anywhere in the prefix, not only at a fixed depth.
    isolation_marker = "/builds-v"
    return sys.prefix.find(isolation_marker) != -1
| 70 | + |
| 71 | + |
def _collect_fingerprint_differences():
    """Compare the current code fingerprint with the container's fingerprint.

    Runs tools/generate_fingerprint.py (located relative to this module),
    parses its stdout as JSON, and compares it key by key with the JSON stored
    in /opt/nemo_rl_container_fingerprint.

    Returns:
        A list of human-readable difference lines (empty when both
        fingerprints match), or None when the comparison had to be skipped
        because the script, its output, or the container fingerprint file is
        unavailable. Every skip is logged as a warning here.
    """
    # Imported lazily so the common non-container import path stays cheap.
    import json
    import runpy
    from contextlib import redirect_stdout
    from io import StringIO

    # Get repo root (relative to this module)
    repo_root = Path(__file__).parent.parent
    fingerprint_script = repo_root / "tools" / "generate_fingerprint.py"

    if not fingerprint_script.exists():
        logging.warning(
            f"Fingerprint script not found at {fingerprint_script}, skipping version check"
        )
        return None

    # Compute current code fingerprint using runpy (cleaner than subprocess).
    # redirect_stdout restores sys.stdout even if the script raises.
    captured_output = StringIO()
    try:
        with redirect_stdout(captured_output):
            runpy.run_path(str(fingerprint_script), run_name="__main__")
    except SystemExit as exit_request:
        # The script runs as "__main__" and may call sys.exit(). SystemExit is
        # not an Exception subclass, so without this handler it would escape
        # and terminate whatever process is importing this package.
        if exit_request.code not in (None, 0):
            logging.warning(
                f"Fingerprint script exited with status {exit_request.code!r}, skipping version check"
            )
            return None

    current_fingerprint_json = captured_output.getvalue().strip()
    if not current_fingerprint_json:
        logging.warning("Failed to compute code fingerprint: empty output")
        return None

    current_fingerprint = json.loads(current_fingerprint_json)

    # Read container fingerprint
    container_fingerprint_file = Path("/opt/nemo_rl_container_fingerprint")
    if not container_fingerprint_file.exists():
        logging.warning("Container fingerprint file not found, skipping version check")
        return None

    container_fingerprint = json.loads(container_fingerprint_file.read_text().strip())

    # Compare fingerprints and find differences
    differences = []
    for key in sorted(set(current_fingerprint.keys()) | set(container_fingerprint.keys())):
        current_val = current_fingerprint.get(key, "missing")
        container_val = container_fingerprint.get(key, "missing")
        # A key present on only one side is always a difference, even if the
        # other side happens to hold the literal string "missing".
        present_on_one_side_only = (key in current_fingerprint) != (
            key in container_fingerprint
        )
        if present_on_one_side_only or current_val != container_val:
            differences.append(f" - {key}:")
            differences.append(f" Container: {container_val}")
            differences.append(f" Current: {current_val}")
    return differences


def _check_container_fingerprint():
    """Check if container dependencies match the current code (container-only).

    This check only runs when NRL_CONTAINER is set (inside containers).
    It compares the container's fingerprint (computed at build time) with
    the current code's fingerprint to detect dependency drift.

    The check is skipped entirely if NRL_FORCE_REBUILD_VENVS=true is set
    (environment rebuilding makes dependencies consistent regardless of a
    mismatch) or when running inside a uv build isolation environment.

    Failures of the check itself (missing files, invalid JSON, errors raised
    by the fingerprint script) are logged and never propagate; only a genuine
    fingerprint mismatch raises.

    Raises:
        RuntimeError: The fingerprints differ and NRL_IGNORE_VERSION_MISMATCH
            is not set.
    """
    # Skip check if not in container or if we're going to force venv rebuild anyway
    if not os.environ.get("NRL_CONTAINER"):
        return
    if os.environ.get("NRL_FORCE_REBUILD_VENVS", "").lower() == "true":
        logging.info(
            "Skipping container fingerprint check because NRL_FORCE_REBUILD_VENVS=true (venvs will be rebuilt anyway)"
        )
        return

    # Skip check if we're in a build isolation environment (e.g., during uv lock/sync)
    if _is_build_isolation():
        logging.debug(
            "Skipping container fingerprint check because we're in a build isolation environment"
        )
        return

    # Only the comparison is best-effort. The mismatch RuntimeError is raised
    # after this try block so that a RuntimeError coming from the fingerprint
    # script (or a subclass such as RecursionError) is not mistaken for a
    # version mismatch and re-raised.
    try:
        differences = _collect_fingerprint_differences()
    except Exception as e:
        # Log other errors but don't crash on version check failures
        logging.debug(f"Version check failed (non-fatal): {e}")
        return

    if differences is None:
        # Comparison was skipped; the helper already logged the reason.
        return
    if not differences:
        logging.debug("Container fingerprint matches code fingerprint")
        return

    diff_text = "\n".join(differences)
    sep_line = "\n" + ("-" * 80)
    warning_msg = (
        f"{sep_line}\n"
        "WARNING: Container/Code Version Mismatch Detected!\n"
        f"{sep_line}\n"
        "Your container's dependencies do not match your current code.\n"
        "\n"
        "Differences found:\n"
        f"{diff_text}\n"
        "\n"
        "This can lead to unexpected behavior or errors.\n"
        "\n"
        "Solutions:\n"
        " 1. Rebuild the container to match your code\n"
        " 2. Set NRL_FORCE_REBUILD_VENVS=true to rebuild virtual environments\n"
        " (This forces Ray workers to recreate their venvs with updated dependencies)\n"
        " 3. Update the container fingerprint to match your current code (for local dev):\n"
        " python tools/generate_fingerprint.py > /opt/nemo_rl_container_fingerprint\n"
        " 4. Set NRL_IGNORE_VERSION_MISMATCH=1 to bypass this check (not recommended)\n"
        "\n"
        "Learn more about dependency management:\n"
        " https://github.com/NVIDIA-NeMo/RL/blob/main/docs/design-docs/dependency-management.md\n"
        f"{sep_line}\n"
    )

    # Check if user wants to ignore the mismatch
    if os.environ.get("NRL_IGNORE_VERSION_MISMATCH"):
        logging.warning(
            warning_msg + "Proceeding anyway (NRL_IGNORE_VERSION_MISMATCH is set)..."
        )
        return

    raise RuntimeError(
        warning_msg + "To proceed anyway, set: export NRL_IGNORE_VERSION_MISMATCH=1"
    )
| 205 | + |
| 206 | + |
# Perform the container version check as soon as this module is executed.
# A fingerprint mismatch raises RuntimeError here unless
# NRL_IGNORE_VERSION_MISMATCH is set.
_check_container_fingerprint()
| 209 | + |
| 210 | + |
52 | 211 | def _patch_nsight_file(): |
53 | 212 | """Patch the nsight.py file to fix the context.py_executable assignment. |
54 | 213 |
|
|
0 commit comments