File tree: 4 files changed, +7 -7
lines changed Original file line number Diff line number Diff line change @@ -361,9 +361,9 @@ def create_vllm_engines(
361361 use_hybrid_engine = pg is not None
362362 num_gpus = int (tensor_parallel_size == 1 )
363363 if use_hybrid_engine and tensor_parallel_size == 1 and single_gpu_mode :
364- # every worker will use 0.5 GPU, so that we can schedule
365- # 2 instances on the same GPUs.
366- num_gpus = 0.5
364+ # every worker will use 0.5 / num_engines of a GPU, so that all engines can be
365+ # scheduled on the same GPU (0.5 in total) while leaving 0.5 for the learner.
366+ num_gpus = 0.5 / num_engines
367367
368368 print (f"num_gpus: { num_gpus } " )
369369
@@ -381,7 +381,7 @@ def create_vllm_engines(
381381 scheduling_strategy = PlacementGroupSchedulingStrategy (
382382 placement_group = pg ,
383383 placement_group_capture_child_tasks = True ,
384- placement_group_bundle_index = i * tensor_parallel_size ,
384+ placement_group_bundle_index = 0 if single_gpu_mode else i * tensor_parallel_size ,
385385 )
386386
387387 additional_kwargs = {}
Original file line number Diff line number Diff line change 3030
3131# Install Python dependencies
3232echo " Installing dependencies with uv..."
33- uv sync --only-group dev
33+ uv sync
3434
3535# Run the provided script
3636bash $1 " $beaker_user /$image_name "
Original file line number Diff line number Diff line change @@ -34,7 +34,7 @@ uv run python mason.py \
3434 --per_device_train_batch_size 1 \
3535 --num_unique_prompts_rollout 8 \
3636 --num_samples_per_prompt_rollout 4 \
37- --model_name_or_path Qwen/Qwen3-1.7B \
37+ --model_name_or_path EleutherAI/pythia-14m \
3838 --stop_strings " </answer>" \
3939 --apply_r1_style_format_reward \
4040 --apply_verifiable_reward true \
@@ -46,6 +46,7 @@ uv run python mason.py \
4646 --deepspeed_stage 2 \
4747 --num_epochs 1 \
4848 --num_learners_per_node 1 \
49+ --vllm_num_engines 2 \
4950 --vllm_tensor_parallel_size 1 \
5051 --beta 0.01 \
5152 --seed 3 \
Original file line number Diff line number Diff line change @@ -47,7 +47,6 @@ uv run python mason.py \
4747 --with_tracking \
4848 --num_epochs 1 \
4949 --num_learners_per_node 1 \
50- --vllm_num_engines 2 \
5150 --vllm_tensor_parallel_size 1 \
5251 --beta 0.01 \
5352 --seed 3 \
You can’t perform that action at this time.
0 commit comments