@@ -73,6 +73,7 @@
 from ray.util.placement_group import PlacementGroup, placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 from rich.pretty import pprint
+from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer, get_scheduler
 from transformers.integrations import HfDeepSpeedConfig
@@ -122,7 +123,6 @@
     is_beaker_job,
     launch_ai2_evals_on_weka,
     maybe_get_beaker_config,
-    maybe_update_beaker_description_with_wandb_url,
     maybe_use_ai2_hf_entity,
     maybe_use_ai2_wandb_entity,
     ray_get_with_progress,
@@ -382,6 +382,8 @@ class Args:
382382 """The beaker evaluation tasks to launch"""
383383 oe_eval_max_length : int = 4096
384384 """the max generation length for evaluation for oe-eval"""
385+ oe_eval_beaker_image : Optional [str ] = None
386+ """the docker image for evaluation for oe-eval"""
385387 eval_priority : Literal ["low" , "normal" , "high" , "urgent" ] = "normal"
386388 """the priority of auto-launched evaluation jobs"""
387389
@@ -1078,6 +1080,7 @@ def launch_ai2_evals_on_weka_wrapper(self, step_dir, leaderboard_name, wandb_url
             args.stop_strings,
             args.gs_bucket_path,
             args.eval_priority,
+            args.oe_eval_beaker_image,
         )
 
 
@@ -1648,15 +1651,21 @@ def setup_experiment_tracking(args: Args, tc: TokenizerConfig, model_config: Mod
         wandb.init(
             project=args.wandb_project_name,
             entity=args.wandb_entity,
+            sync_tensorboard=True,
             config=all_configs,
             name=args.run_name,
             save_code=True,
             tags=[args.exp_name] + get_wandb_tags(),
         )
         wandb_url = wandb.run.get_url()
-        maybe_update_beaker_description_with_wandb_url(wandb_url)
 
-    return beaker_config, wandb_url
+    writer = SummaryWriter(f"runs/{args.run_name}")
+    writer.add_text(
+        "hyperparameters",
+        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
+    )
+
+    return beaker_config, writer, wandb_url
 
 
 def setup_datasets(args: Args, tc: TokenizerConfig, tokenizer: PreTrainedTokenizer):
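
Note: with sync_tensorboard=True, wandb attaches to TensorBoard's event stream, so everything the SummaryWriter records is mirrored into the wandb run without explicit wandb.log calls. A minimal, self-contained sketch of the pattern this hunk adopts (project and run names are illustrative, not from the patch):

import wandb
from torch.utils.tensorboard import SummaryWriter

# Hypothetical demo values; the patch uses args.wandb_project_name / args.run_name.
wandb.init(project="demo", name="demo-run", sync_tensorboard=True)
writer = SummaryWriter("runs/demo-run")  # local TensorBoard event files
for step in range(3):
    writer.add_scalar("train/loss", 1.0 / (step + 1), step)  # shows up in both TB and wandb
writer.close()
wandb.finish()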
@@ -1936,11 +1945,13 @@ def one_training_step(
     collated_data,
     tokenizer,
     data_thread_metrics,
+    average_metrics,
     episode,
     training_step,
     num_total_tokens,
     start_time,
     train_dataset,
+    writer,
     wandb_url,
     chat_template_name,
 ):
@@ -1975,18 +1986,16 @@ def one_training_step(
         **data_thread_metrics,
         **average_metrics,
     }
-    # Print only scalar metrics
-    scalar_metrics = {k: v for k, v in metrics.items() if isinstance(v, (float, int))}
+    scalar_metrics = {}
+    for key, value in metrics.items():
+        if isinstance(value, float) or isinstance(value, int):
+            writer.add_scalar(key, value, episode)
+            scalar_metrics[key] = value
+        if isinstance(value, np.ndarray) or isinstance(value, list):
+            if len(value) > 0:
+                writer.add_histogram(key, value, episode)
     print_rich_single_line_metrics(scalar_metrics)
 
-    if args.with_tracking:
-        # Convert array/list metrics to wandb histograms for logging
-        for key, value in metrics.items():
-            if isinstance(value, np.ndarray) or isinstance(value, list):
-                if len(value) > 0:
-                    metrics[key] = wandb.Histogram(value)
-        wandb.log(metrics, step=episode)
-
     if args.save_freq > 0 and training_step % args.save_freq == 0 and (args.eval_on_step_0 or training_step > 1):
         with Timer("[Main Thread] 🗡️ Saving model"):
             checkpoint_dir = f"{args.output_dir}_checkpoints"
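
The rewritten loop above replaces the wandb-specific histogram conversion with TensorBoard's native type dispatch: scalar metrics go through writer.add_scalar (and are kept for console printing), while non-empty arrays and lists go through writer.add_histogram. A standalone sketch with toy metric names and values (all illustrative):

import numpy as np
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")
metrics = {"loss": 0.42, "lr": 1e-4, "rewards": np.random.randn(128), "kl": [0.1, 0.2, 0.3]}
for key, value in metrics.items():
    if isinstance(value, (float, int)):
        writer.add_scalar(key, value, global_step=0)  # one point per scalar metric
    elif isinstance(value, (np.ndarray, list)) and len(value) > 0:
        writer.add_histogram(key, np.asarray(value), global_step=0)  # distribution per step
writer.close()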
@@ -2036,6 +2045,7 @@ def maybe_evaluate(
     eval_batch: Optional[Batch],
     reward_fn,
     episode,
+    writer,
     eval_pending_queries_map: PendingQueriesMap,
     eval_generation_config,
 ):
@@ -2083,18 +2093,19 @@ def maybe_evaluate(
             **eval_reward_metrics,
         }
         print_rich_single_line_metrics(eval_metrics)
-
+        for key, value in eval_metrics.items():
+            writer.add_scalar(key, value, episode)
         table = {}
         table["prompt"] = tokenizer.batch_decode(eval_batch.queries if eval_batch else [])
         table["response"] = eval_decoded_responses
         table["response"] = [item.replace(tokenizer.pad_token, "") for item in table["response"]]
         table["scores"] = eval_scores
         table["ground_truth"] = eval_batch.ground_truths if eval_batch else []
         df = pd.DataFrame(table)
-
         if args.with_tracking:
-            eval_metrics["sample_completions"] = wandb.Table(dataframe=df)
-            wandb.log(eval_metrics, step=episode)
+            import wandb
+
+            wandb.log({"sample_completions": wandb.Table(dataframe=df)})
         else:
             print_rich_table(df.iloc[:1])
         del table
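
Because scalar eval metrics now reach wandb through the writer (via sync_tensorboard), the only direct wandb call left here is for the sample-completions table, which has no TensorBoard equivalent. A small sketch of that call with placeholder rows:

import pandas as pd
import wandb

wandb.init(project="demo")  # hypothetical project name
df = pd.DataFrame({"prompt": ["2+2="], "response": ["4"], "scores": [1.0], "ground_truth": ["4"]})
wandb.log({"sample_completions": wandb.Table(dataframe=df)})  # interactive table in the run page
wandb.finish()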
@@ -2229,8 +2240,11 @@ async def reward_fn(
 
 def cleanup_judge_clients():
     """Cleans up all LLM judge clients and shutdown Ray."""
-    asyncio.run(cleanup_all_llm_judge_clients())
-    logger.info("✅ LLM judge clients cleaned up")
+    try:
+        asyncio.run(cleanup_all_llm_judge_clients())
+        logger.info("✅ LLM judge clients cleaned up")
+    except Exception as cleanup_error:
+        logger.warning(f"Error during LLM judge cleanup: {cleanup_error}")
     ray.shutdown()
 
 
@@ -2263,7 +2277,12 @@ def cleanup_training_resources(
         queues[0].put(ShutdownSentinel(), timeout=1)
 
     logger.info("Shutting down Ray queues...")
-    [queue.shutdown() for queue in queues]
+    for queue in queues:
+        try:
+            queue.shutdown()
+        except Exception as e:
+            logger.warning(f"Error shutting down Ray queue: {e}")
+
     logger.info("Shutting down thread pool executor...")
     executor.shutdown(wait=True)
 
@@ -2274,7 +2293,7 @@ def cleanup_training_resources(
 def main(args: Args, tc: TokenizerConfig, model_config: ModelConfig, num_eval_samples: int = 32):
     tokenizer = make_tokenizer(tc, model_config)
     args = setup_runtime_variables(args)
-    beaker_config, wandb_url = setup_experiment_tracking(args, tc, model_config)
+    beaker_config, writer, wandb_url = setup_experiment_tracking(args, tc, model_config)
 
     train_dataset, eval_dataset = setup_datasets(args, tc, tokenizer)
     if args.cache_dataset_only:
@@ -2412,11 +2431,13 @@ def main(args: Args, tc: TokenizerConfig, model_config: ModelConfig, num_eval_sa
             collated_data,
             tokenizer,
             data_thread_metrics,
+            {},
             episode,
             training_step,
             num_total_tokens,
             start_time,
             train_dataset,
+            writer,
             wandb_url,
             tc.chat_template_name,
         )
@@ -2429,6 +2450,7 @@ def main(args: Args, tc: TokenizerConfig, model_config: ModelConfig, num_eval_sa
             eval_batch,
             reward_fn,
             episode,
+            writer,
             eval_pending_queries_map,
             generation_configs["eval"],
         )