Skip to content

Commit 6acc9a0

Browse files
authored
Add configs and stuff (huggingface#2)
1 parent a4bf904 commit 6acc9a0

File tree

13 files changed

+287
-182
lines changed

13 files changed

+287
-182
lines changed

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
44
export PYTHONPATH = src
55

6-
check_dirs := src tests scripts
6+
check_dirs := src scripts
77

88
style:
99
black --line-length 119 --target-version py310 $(check_dirs) setup.py
@@ -18,16 +18,16 @@ quality:
1818
# Release stuff
1919

2020
pre-release:
21-
python src/alignment/release.py
21+
python src/open_r1/release.py
2222

2323
pre-patch:
24-
python src/alignment/release.py --patch
24+
python src/open_r1/release.py --patch
2525

2626
post-release:
27-
python src/alignment/release.py --post_release
27+
python src/open_r1/release.py --post_release
2828

2929
post-patch:
30-
python src/alignment/release.py --post_release --patch
30+
python src/open_r1/release.py --post_release --patch
3131

3232
wheels:
3333
python setup.py bdist_wheel && python setup.py sdist

README.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,43 @@
1-
# open_r1
1+
# Open R1
2+
3+
## Installation instructions
4+
5+
To run the code in this project, first create a Python virtual environment, e.g. with Conda:
6+
7+
```shell
8+
conda create -n openr1 python=3.11 && conda activate openr1
9+
```
10+
11+
Next, install vLLM:
12+
13+
```shell
14+
pip install vllm==0.6.6.post1
15+
16+
# For the Hugging Face cluster (which only has CUDA 12.1), pull the matching wheels from the cu121 index
17+
pip install vllm==0.6.6.post1 --extra-index-url https://download.pytorch.org/whl/cu121
18+
```
19+
20+
This will also install PyTorch `v2.5.1`, and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e ".[LIST OF MODES]"` (keep the quotes so your shell does not try to expand the square brackets). For most contributors, we recommend:
21+
22+
```shell
23+
pip install -e ".[dev]"
24+
```
25+
26+
Next, log into your Hugging Face and Weights and Biases accounts as follows:
27+
28+
```shell
29+
huggingface-cli login
30+
wandb login
31+
```
32+
33+
Finally, check that your system has Git LFS installed so that you can load and push models/datasets to the Hugging Face Hub:
34+
35+
```shell
36+
git-lfs --version
37+
```
38+
39+
If it isn't installed, run:
40+
41+
```shell
42+
sudo apt-get install git-lfs
43+
```
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Model arguments
2+
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
3+
model_revision: main
4+
torch_dtype: bfloat16
5+
attn_implementation: flash_attention_2
6+
7+
# Data training arguments
8+
dataset_mixer:
9+
HuggingFaceH4/Bespoke-Stratos-17k: 1.0
10+
dataset_splits:
11+
- train
12+
- test
13+
preprocessing_num_workers: 12
14+
15+
# SFT trainer config
16+
bf16: true
17+
do_eval: true
18+
eval_strategy: epoch
19+
gradient_accumulation_steps: 1
20+
gradient_checkpointing: true
21+
gradient_checkpointing_kwargs:
22+
use_reentrant: False
23+
hub_model_id: HuggingFaceH4/Qwen2.5-1.5B-R1-v00.00
24+
hub_strategy: every_save
25+
learning_rate: 2.0e-05
26+
log_level: info
27+
logging_steps: 5
28+
logging_strategy: steps
29+
lr_scheduler_type: cosine
30+
max_seq_length: 2048
31+
max_steps: -1
32+
num_train_epochs: 1
33+
output_dir: data/Qwen2.5-1.5B-Distill-R1-v00.00
34+
overwrite_output_dir: true
35+
per_device_eval_batch_size: 8
36+
per_device_train_batch_size: 16
37+
push_to_hub: true
38+
remove_unused_columns: true
39+
report_to:
40+
- wandb
41+
save_strategy: "steps"
42+
save_steps: 100
43+
save_total_limit: 1
44+
seed: 42
45+
warmup_ratio: 0.1
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
deepspeed_config:
4+
deepspeed_multinode_launcher: standard
5+
offload_optimizer_device: none
6+
offload_param_device: none
7+
zero3_init_flag: true
8+
zero3_save_16bit_model: true
9+
zero_stage: 3
10+
distributed_type: DEEPSPEED
11+
downcast_bf16: 'no'
12+
machine_rank: 0
13+
main_training_function: main
14+
mixed_precision: bf16
15+
num_machines: 1
16+
num_processes: 8
17+
rdzv_backend: static
18+
same_network: true
19+
tpu_env: []
20+
tpu_use_cluster: false
21+
tpu_use_sudo: false
22+
use_cpu: false

recipes/accelerate_configs/fsdp.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
distributed_type: FSDP
4+
downcast_bf16: 'no'
5+
enable_cpu_affinity: false
6+
fsdp_config:
7+
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
8+
fsdp_backward_prefetch: BACKWARD_PRE
9+
fsdp_cpu_ram_efficient_loading: true
10+
fsdp_forward_prefetch: true
11+
fsdp_offload_params: false
12+
fsdp_sharding_strategy: FULL_SHARD
13+
fsdp_state_dict_type: SHARDED_STATE_DICT
14+
fsdp_sync_module_states: true
15+
fsdp_use_orig_params: true
16+
machine_rank: 0
17+
main_training_function: main
18+
mixed_precision: bf16
19+
num_machines: 1
20+
num_processes: 8
21+
rdzv_backend: static
22+
same_network: true
23+
tpu_env: []
24+
tpu_use_cluster: false
25+
tpu_use_sudo: false
26+
use_cpu: false
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
distributed_type: FSDP
4+
downcast_bf16: 'no'
5+
fsdp_config:
6+
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7+
fsdp_backward_prefetch: BACKWARD_PRE
8+
fsdp_cpu_ram_efficient_loading: true
9+
fsdp_forward_prefetch: false
10+
fsdp_offload_params: true
11+
fsdp_sharding_strategy: FULL_SHARD
12+
fsdp_state_dict_type: SHARDED_STATE_DICT
13+
fsdp_sync_module_states: true
14+
fsdp_use_orig_params: false
15+
machine_rank: 0
16+
main_training_function: main
17+
mixed_precision: 'no'
18+
num_machines: 1
19+
num_processes: 2
20+
rdzv_backend: static
21+
same_network: true
22+
tpu_env: []
23+
tpu_use_cluster: false
24+
tpu_use_sudo: false
25+
use_cpu: false
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
compute_environment: LOCAL_MACHINE
2+
debug: false
3+
distributed_type: MULTI_GPU
4+
downcast_bf16: 'no'
5+
gpu_ids: all
6+
machine_rank: 0
7+
main_training_function: main
8+
mixed_precision: bf16
9+
num_machines: 1
10+
num_processes: 8
11+
rdzv_backend: static
12+
same_network: true
13+
tpu_env: []
14+
tpu_use_cluster: false
15+
tpu_use_sudo: false
16+
use_cpu: false

recipes/launch.slurm

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#!/bin/bash
2+
#SBATCH --ntasks-per-node=1
3+
#SBATCH --exclusive
4+
#SBATCH --gres=gpu:8
5+
#SBATCH --partition=hopper-prod # Adjust this for your cluster
6+
#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
7+
#SBATCH --err=/fsx/h4/logs/%x-%j.err # Adjust this for your cluster
8+
9+
set -x -e
10+
11+
source ~/.bashrc
12+
conda activate openr1
13+
echo "START TIME: $(date)"
14+
15+
MODEL=$1
16+
TASK=$2
17+
PRECISION=$3
18+
ACCELERATOR=$4
19+
OPTIONAL_ARGS=$5
20+
21+
# Training setup
22+
NUM_NODES=$SLURM_NNODES
23+
GPUS_PER_NODE=8
24+
WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
25+
# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
26+
CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
27+
#######################################
# Print the value of the `gradient_accumulation_steps` key from a YAML config.
# Only the first line whose first field is exactly `gradient_accumulation_steps:`
# is used, so commented-out entries, keys that merely contain the name, and
# later duplicates cannot leak into the launcher command line.
# Arguments: $1 - path to the YAML config file
# Outputs:   the value on stdout (nothing if the key is absent)
# Returns:   non-zero if the config file cannot be read
#######################################
read_grad_acc_steps() {
  if [[ ! -r "$1" ]]; then
    printf 'Cannot read config file: %s\n' "$1" >&2
    return 1
  fi
  awk '$1 == "gradient_accumulation_steps:" { print $2; exit }' "$1"
}
# Under `set -e` an unreadable config now aborts the job here instead of
# continuing with an empty value and failing later inside `accelerate launch`.
GRAD_ACC_STEPS=$(read_grad_acc_steps "$CONFIG_FILE")
28+
29+
# Split the string into individual arguments
30+
IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
31+
32+
# A "--gradient_accumulation_steps=N" entry in the optional arguments overrides the value parsed from the config file (only the "--flag=value" form is recognized)
33+
for arg in "${ARGS[@]}"; do
34+
if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
35+
# Extract the value after the equals sign
36+
GRAD_ACC_STEPS="${arg#*=}"
37+
break # Exit the loop once we find the desired argument
38+
fi
39+
done
40+
41+
echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
42+
# Rendezvous address: the first host in the job's node list, so all processes know who to talk to
43+
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
44+
MASTER_PORT=6000
45+
46+
export CMD=" \
47+
scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
48+
"
49+
50+
export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
51+
--config_file recipes/accelerate_configs/$ACCELERATOR.yaml \
52+
--gradient_accumulation_steps $GRAD_ACC_STEPS \
53+
--num_machines $NUM_NODES \
54+
--num_processes $WORLD_SIZE \
55+
--main_process_ip $MASTER_ADDR \
56+
--main_process_port $MASTER_PORT \
57+
--machine_rank \$SLURM_PROCID \
58+
--rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
59+
--max_restarts 1 \
60+
--role \$(hostname -s): \
61+
--tee 3 \
62+
"
63+
64+
# force crashing on nccl issues like hanging broadcast
65+
export NCCL_ASYNC_ERROR_HANDLING=1
66+
# export NCCL_DEBUG=INFO
67+
# export NCCL_DEBUG_SUBSYS=COLL
68+
# export NCCL_SOCKET_NTHREADS=1
69+
# export NCCL_NSOCKS_PERTHREAD=1
70+
# export CUDA_LAUNCH_BLOCKING=1
71+
72+
# Specific configuration optimized for the Hugging Face Compute Cluster
73+
# Be ye warned this may not work on other clusters!
74+
module load cuda/12.1
75+
76+
# srun error handling:
77+
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
78+
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
79+
SRUN_ARGS=" \
80+
--wait=60 \
81+
--kill-on-bad-exit=1 \
82+
"
83+
84+
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
85+
86+
echo "END TIME: $(date)"

0 commit comments

Comments
 (0)