todototry
diff --git a/‎Makefile
Lines changed: 5 additions & 5 deletions b/‎Makefile
Lines changed: 5 additions & 5 deletions
diff --git a/‎README.md
Lines changed: 43 additions & 1 deletion b/‎README.md
Lines changed: 43 additions & 1 deletion
diff --git a/‎recipes/Qwen2.5-1.5B-Instruct/config_v00.00.yaml
Lines changed: 45 additions & 0 deletions b/‎recipes/Qwen2.5-1.5B-Instruct/config_v00.00.yaml
Lines changed: 45 additions & 0 deletions
diff --git a/‎recipes/accelerate_configs/deepspeed_zero3.yaml
Lines changed: 22 additions & 0 deletions b/‎recipes/accelerate_configs/deepspeed_zero3.yaml
Lines changed: 22 additions & 0 deletions
diff --git a/‎recipes/accelerate_configs/fsdp.yaml
Lines changed: 26 additions & 0 deletions b/‎recipes/accelerate_configs/fsdp.yaml
Lines changed: 26 additions & 0 deletions
diff --git a/‎recipes/accelerate_configs/fsdp_qlora.yaml
Lines changed: 25 additions & 0 deletions b/‎recipes/accelerate_configs/fsdp_qlora.yaml
Lines changed: 25 additions & 0 deletions
diff --git a/‎recipes/accelerate_configs/multi_gpu.yaml
Lines changed: 16 additions & 0 deletions b/‎recipes/accelerate_configs/multi_gpu.yaml
Lines changed: 16 additions & 0 deletions
diff --git a/‎recipes/launch.slurm
Lines changed: 86 additions & 0 deletions b/‎recipes/launch.slurm
Lines changed: 86 additions & 0 deletions
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src
 
-check_dirs := src tests scripts
+check_dirs := src scripts
 
 style:
 	black --line-length 119 --target-version py310 $(check_dirs) setup.py
@@ -18,16 +18,16 @@ quality:
 # Release stuff
 
 pre-release:
-	python src/alignment/release.py
+	python src/open_r1/release.py
 
 pre-patch:
-	python src/alignment/release.py --patch
+	python src/open_r1/release.py --patch
 
 post-release:
-	python src/alignment/release.py --post_release
+	python src/open_r1/release.py --post_release
 
 post-patch:
-	python src/alignment/release.py --post_release --patch
+	python src/open_r1/release.py --post_release --patch
 
 wheels:
 	python setup.py bdist_wheel && python setup.py sdist
 
@@ -1 +1,43 @@
-# open_r1
+# Open R1
+
+## Installation instructions
+
+To run the code in this project, first, create a Python virtual environment using e.g. Conda:
+
+```shell
+conda create -n openr1 python=3.11 && conda activate openr1
+```
+
+Next, install vLLM:
+
+```shell
+pip install vllm==0.6.6.post1
+
+# On the Hugging Face cluster (which only has CUDA 12.1), use this command instead:
+pip install vllm==0.6.6.post1 --extra-index-url https://download.pytorch.org/whl/cu121
+```
+
+This will also install PyTorch `v2.5.1` and it is **very important** to use this version since the vLLM binaries are compiled for it. You can then install the remaining dependencies for your specific use case via `pip install -e .[LIST OF MODES]`. For most contributors, we recommend:
+
+```shell
+pip install -e ".[dev]"
+```
+
+Next, log into your Hugging Face and Weights and Biases accounts as follows:
+
+```shell
+huggingface-cli login
+wandb login
+```
+
+Finally, check that your system has Git LFS installed so that you can load and push models/datasets to the Hugging Face Hub:
+
+```shell
+git-lfs --version
+```
+
+If it isn't installed, run:
+
+```shell
+sudo apt-get install git-lfs
+```
@@ -0,0 +1,45 @@
+# Model arguments
+model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_mixer:  # presumably maps dataset name -> fraction of that dataset to use; confirm against the data loader
+  HuggingFaceH4/Bespoke-Stratos-17k: 1.0
+dataset_splits:
+- train
+- test
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: epoch
+gradient_accumulation_steps: 1  # recipes/launch.slurm parses this key from recipes/$MODEL/$TASK/config_$PRECISION.yaml; NOTE(review): this file's path has no $TASK level - confirm
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: HuggingFaceH4/Qwen2.5-1.5B-R1-v00.00  # NOTE(review): output_dir below says "Distill-R1"; confirm the two names are meant to differ
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5  # paired with logging_strategy: steps below
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1  # NOTE(review): presumably "no step cap, run for num_train_epochs" - confirm
+num_train_epochs: 1
+output_dir: data/Qwen2.5-1.5B-Distill-R1-v00.00
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 16
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- wandb
+save_strategy: "steps"
+save_steps: 100  # paired with save_strategy: "steps" above
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE  # Accelerate launch config (recipes/accelerate_configs/deepspeed_zero3.yaml)
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none  # optimizer offload disabled
+  offload_param_device: none  # parameter offload disabled
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0  # recipes/launch.slurm also passes --machine_rank on the command line
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1  # recipes/launch.slurm also passes --num_machines on the command line
+num_processes: 8  # recipes/launch.slurm also passes --num_processes (nodes x 8 GPUs) on the command line
+rdzv_backend: static  # recipes/launch.slurm also passes --rdzv_conf with rdzv_backend=c10d
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
@@ -0,0 +1,26 @@
+compute_environment: LOCAL_MACHINE  # Accelerate launch config (recipes/accelerate_configs/fsdp.yaml)
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: true
+  fsdp_offload_params: false  # parameter offload disabled
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0  # recipes/launch.slurm also passes --machine_rank on the command line
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1  # recipes/launch.slurm also passes --num_machines on the command line
+num_processes: 8  # recipes/launch.slurm also passes --num_processes (nodes x 8 GPUs) on the command line
+rdzv_backend: static  # recipes/launch.slurm also passes --rdzv_conf with rdzv_backend=c10d
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE  # Accelerate launch config (recipes/accelerate_configs/fsdp_qlora.yaml)
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: false  # differs from fsdp.yaml (true there)
+  fsdp_offload_params: true  # differs from fsdp.yaml (false there)
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: false  # differs from fsdp.yaml (true there)
+machine_rank: 0  # recipes/launch.slurm also passes --machine_rank on the command line
+main_training_function: main
+mixed_precision: 'no'  # NOTE(review): the other accelerate configs here use bf16; confirm 'no' is intended
+num_machines: 1  # recipes/launch.slurm also passes --num_machines on the command line
+num_processes: 2  # recipes/launch.slurm also passes --num_processes (nodes x 8 GPUs) on the command line
+rdzv_backend: static  # recipes/launch.slurm also passes --rdzv_conf with rdzv_backend=c10d
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
@@ -0,0 +1,16 @@
+compute_environment: LOCAL_MACHINE  # Accelerate launch config (recipes/accelerate_configs/multi_gpu.yaml)
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0  # recipes/launch.slurm also passes --machine_rank on the command line
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1  # recipes/launch.slurm also passes --num_machines on the command line
+num_processes: 8  # recipes/launch.slurm also passes --num_processes (nodes x 8 GPUs) on the command line
+rdzv_backend: static  # recipes/launch.slurm also passes --rdzv_conf with rdzv_backend=c10d
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
@@ -0,0 +1,86 @@
+#!/bin/bash
+#SBATCH --ntasks-per-node=1
+#SBATCH --exclusive
+#SBATCH --gres=gpu:8
+#SBATCH --partition=hopper-prod  # Adjust this for your cluster
+#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
+#SBATCH --err=/fsx/h4/logs/%x-%j.err    # Adjust this for your cluster
+
+set -x -e
+
+source ~/.bashrc
+conda activate openr1
+echo "START TIME: $(date)"
+
+MODEL=$1
+TASK=$2
+PRECISION=$3
+ACCELERATOR=$4
+OPTIONAL_ARGS=$5
+
+# Training setup
+NUM_NODES=$SLURM_NNODES
+GPUS_PER_NODE=8
+WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
+# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
+CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml
+GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')
+
+# Split the string into individual arguments
+IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"
+
+# Loop through the arguments and find the one with "--gradient_accumulation_steps"
+for arg in "${ARGS[@]}"; do
+    if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then
+        # Extract the value after the equals sign
+        GRAD_ACC_STEPS="${arg#*=}"
+        break  # Exit the loop once we find the desired argument
+    fi
+done
+
+echo "Gradient accumulation steps: $GRAD_ACC_STEPS"
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+
+export CMD=" \
+    scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
+    "
+
+export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
+    --config_file recipes/accelerate_configs/$ACCELERATOR.yaml  \
+    --gradient_accumulation_steps $GRAD_ACC_STEPS \
+    --num_machines $NUM_NODES \
+    --num_processes $WORLD_SIZE \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --machine_rank \$SLURM_PROCID \
+    --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
+    --max_restarts 1 \
+    --role \$(hostname -s): \
+    --tee 3 \
+    "
+
+# force crashing on nccl issues like hanging broadcast
+export NCCL_ASYNC_ERROR_HANDLING=1
+# export NCCL_DEBUG=INFO
+# export NCCL_DEBUG_SUBSYS=COLL
+# export NCCL_SOCKET_NTHREADS=1
+# export NCCL_NSOCKS_PERTHREAD=1
+# export CUDA_LAUNCH_BLOCKING=1
+
+# Specific configuration optimized for the Hugging Face Compute Cluster
+# Be ye warned this may not work on other clusters!
+module load cuda/12.1
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
+
+echo "END TIME: $(date)"