files for multinode dpo #366

Draft · wants to merge 9 commits into main
Changes from all commits
83 changes: 83 additions & 0 deletions configs/beaker_configs/default_dpo_multinode.yaml
@@ -0,0 +1,83 @@
version: v2
description: open-instruct-finetune-multinode
budget: ai2/oe-adapt
tasks:
  - name: open-instruct-finetune-multinode
    replicas: 4
    leaderSelection: true
    hostNetworking: true
    propagateFailure: true
    propagatePreemption: true
    synchronizedStartTimeout: 60m
    image:
      beaker: nathanl/open_instruct_auto
    command: [
      '/bin/sh', '-c'
    ]
    arguments: ['
      unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch
      --mixed_precision bf16
      --num_machines 4
      --num_processes 32
      --machine_rank $BEAKER_REPLICA_RANK
      --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME
      --main_process_port 29400
      --use_deepspeed
      --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf
      --deepspeed_multinode_launcher standard
      open_instruct/dpo_tune_cache.py
      --model_name_or_path /hf_llama_models
      --use_flash_attn
      --max_seq_length 4096
      --preprocessing_num_workers 16
      --per_device_train_batch_size 1
      --gradient_accumulation_steps 4
      --learning_rate 5e-7
      --lr_scheduler_type linear
      --warmup_ratio 0.1
      --weight_decay 0.
      --num_train_epochs 3
      --output_dir /output/
      --with_tracking
      --report_to tensorboard
      --logging_steps 1
    ']
    envVars:
      - name: CUDA_DEVICE_ORDER
        value: PCI_BUS_ID
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
        secret: jacobm_WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
        value: false
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
        secret: jacobm_HF_TOKEN
    # datasets: # example for how to include datasets in mounting
    #   - mountPath: /data
    #     source:
    #       beaker: Yizhongw03/processed_open_instruct_data
    #   - mountPath: /mmlu
    #     source:
    #       beaker: Yizhongw03/mmlu
    #   - mountPath: /hf_llama_models
    #     source:
    #       beaker: Yizhongw03/hf_llama_model_7B
    datasets:
      - mountPath: /oe-adapt-default
        source:
          weka: oe-adapt-default
    result:
      path: /output
    resources:
      gpuCount: 8
    context:
      cluster: ai2/allennlp-cirrascale
      priority: high
      preemptible: false
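For a quick sanity check, the launch flags above imply an effective global batch size of 128. The snippet below just restates that arithmetic in Python; the values are copied by hand from this spec, nothing is queried from Beaker.

```python
# Sketch: effective global batch size implied by the accelerate flags above.
# Values are copied from default_dpo_multinode.yaml by hand.
num_machines = 4           # --num_machines / replicas
gpus_per_machine = 8       # resources.gpuCount
num_processes = num_machines * gpus_per_machine  # --num_processes 32

per_device_train_batch_size = 1
gradient_accumulation_steps = 4

effective_batch_size = (
    num_processes * per_device_train_batch_size * gradient_accumulation_steps
)
print(effective_batch_size)  # 32 * 1 * 4 = 128
```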
4 changes: 2 additions & 2 deletions configs/beaker_configs/default_finetune_multinode.yaml
@@ -51,7 +51,7 @@ tasks:
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
-       secret: WANDB_API_KEY
+       secret: jacobm_WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
@@ -61,7 +61,7 @@
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
-       secret: HF_TOKEN
+       secret: jacobm_HF_TOKEN
    datasets:
      - mountPath: /oe-adapt-default
        source:
29 changes: 29 additions & 0 deletions configs/train_configs/dpo/test_dpo_multinode.yaml
@@ -0,0 +1,29 @@
model_name_or_path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed
tokenizer_name: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed
model_revision: main
use_flash_attn: true
gradient_checkpointing: true
# dataset_name: ai2-adapt-dev/tulu3.4-sft-replica-50k
# dataset_config_name: gpt4-prefs-on-policy
dataset_mixer:
  ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0
  ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0
use_slow_tokenizer: true
max_seq_length: 2048
preprocessing_num_workers: 16
per_device_train_batch_size: 1
gradient_accumulation_steps: 2 # designed for 8 GPUs, so batch size 128
learning_rate: 2.0e-7
lr_scheduler_type: linear
warmup_ratio: 0.1
weight_decay: 0.0
num_train_epochs: 1
output_dir: /output
with_tracking: true
report_to:
- wandb
logging_steps: 1
use_lora: false
dpo_loss_type: dpo_norm
dpo_beta: 5
checkpointing_steps: epoch
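This config selects `dpo_loss_type: dpo_norm` with `dpo_beta: 5`. The sketch below shows the general shape of a length-normalized DPO loss, which is what `dpo_norm` is understood to select here; the function name and the assumption that per-sequence log-probs are already averaged over response tokens are mine, and the authoritative implementation is `open_instruct/dpo_tune_cache.py`.

```python
import torch
import torch.nn.functional as F

def dpo_norm_loss(policy_chosen_logps: torch.Tensor,
                  policy_rejected_logps: torch.Tensor,
                  ref_chosen_logps: torch.Tensor,
                  ref_rejected_logps: torch.Tensor,
                  beta: float = 5.0) -> torch.Tensor:
    """Length-normalized DPO loss (sketch, not the open-instruct implementation).

    Assumption: each *_logps tensor holds one value per example, i.e. the
    sequence log-probability already averaged over response tokens, which is
    the length normalization understood to distinguish dpo_norm from plain dpo.
    """
    # Implicit reward of the policy relative to the reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Bradley-Terry style objective: push the chosen reward above the rejected one.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```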
69 changes: 69 additions & 0 deletions configs/train_configs/sft/llama_3.1_70b-test-math-mixes.yaml
@@ -0,0 +1,69 @@
model_name_or_path: meta-llama/Llama-3.1-70B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-70B
use_slow_tokenizer: true
dataset_mixer:
  # base math datasets:
  natolambert/tulu-v2-sft-mixture-flan: 50000
  natolambert/tulu-v2-sft-mixture-cot: 49747
  ai2-adapt-dev/personahub_math_v4_149975: 149975
  AI-MO/NuminaMath-TIR: 72441

  # v3.4 datasets keeping (for now):
  HuggingFaceH4/no_robots: 9500
  allenai/openassistant-guanaco-reformatted: 7708
  ai2-adapt-dev/tulu_hard_coded_examples: 14
  ai2-adapt-dev/SciRIFF-train-mix-science: 10000
  ai2-adapt-dev/Table-GPT-All-train: 3000

  # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980

  # # other datasets:
  ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
  ai2-adapt-dev/coconot-sft-reformat: 11477
  ai2-adapt-dev/openmath-2-gsm8k: 50000

  # # testing:
  m-a-p/CodeFeedback-Filtered-Instruction: 50000
  ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 100000

  # # safety data:
  # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
  # ai2-adapt-dev/processed-wildjailbreak: 261559

  # # potentially problematic:
  # # m-a-p/CodeFeedback-Filtered-Instruction: 156526
  # # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663

  # # new math:
  # ai2-adapt-dev/test-persona-geometry-10k: 10000 ??? next do ICL

  # ai2-adapt-dev/metamath-qa-reformat: 100000
  # ai2-adapt-dev/WebInstructSub-reformat: 100000

  # # other datasets:
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
  # ai2-adapt-dev/openmath-2-math: 50000

  # removed cuz bad
  # ai2-adapt-dev/aya_dataset-reformat: 100000
  # ai2-adapt-dev/SlimOrca-reformat: 100000
  # ai2-adapt-dev/Daring-Anteater-reformat: 99532
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
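Across these configs, the number after each `dataset_mixer` entry is read as the count of examples to draw from that dataset (values at or below 1.0, as in the DPO config above, act as fractions of the dataset). The snippet below is a rough sketch of that sampling behaviour using the Hugging Face `datasets` library; the real mixing logic lives in open-instruct's training code and handles details not shown here.

```python
# Sketch of dataset_mixer sampling semantics: integer values are example
# counts, values <= 1.0 are fractions. Illustrative only.
from datasets import load_dataset, concatenate_datasets

mixer = {
    "natolambert/tulu-v2-sft-mixture-flan": 50000,
    "AI-MO/NuminaMath-TIR": 72441,
}

parts = []
for name, count in mixer.items():
    ds = load_dataset(name, split="train")
    n = int(count) if count > 1 else int(count * len(ds))
    parts.append(ds.shuffle(seed=42).select(range(min(n, len(ds)))))

mixed = concatenate_datasets(parts).shuffle(seed=42)
print(len(mixed))
```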
25 changes: 25 additions & 0 deletions configs/train_configs/sft/llama_3.1_8b-openmath-2.yaml
@@ -0,0 +1,25 @@
model_name_or_path: meta-llama/Meta-Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Meta-Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
  ai2-adapt-dev/openmath-2-math: 100000
  ai2-adapt-dev/openmath-2-gsm8k: 100000
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
@@ -0,0 +1,52 @@
model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B-Instruct
use_slow_tokenizer: true
dataset_mixer:
  # General datasets:
  ai2-adapt-dev/oasst2_converted: 9091
  ai2-adapt-dev/flan_v2_converted: 89982 # ODC-BY
  ai2-adapt-dev/tulu_hard_coded_repeated_10: 240
  ai2-adapt-dev/no_robots_converted: 9500 # NC
  ai2-adapt-dev/wildchat_gpt4_converted: 100000

  # Math datasets:
  ai2-adapt-dev/personahub_math_v5_regen_149960: 149960
  ai2-adapt-dev/personahub_grade_math_v1_49980: 49980
  ai2-adapt-dev/open_math_2_gsm8k_converted: 50000
  AI-MO/NuminaMath-TIR: 72441 # NC

  # Coding datasets:
  ai2-adapt-dev/evol_codealpaca_converted: 110999
  ai2-adapt-dev/personahub_code_v2_34999: 34999

  # IF datasets:
  ai2-adapt-dev/personahub_ifdata_manual_seed_v2_19891: 19891

  # Safety datasets:
  ai2-adapt-dev/coconot_converted: 10983
  ai2-adapt-dev/processed-wildjailbreak: 50000
  ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000

  # Specialty datasets:
  ai2-adapt-dev/sciriff_converted: 10000 # NC? (I think a subset or two)
  ai2-adapt-dev/table_gpt_converted: 5000
  ai2-adapt-dev/aya_dataset_converted: 100000

max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
75 changes: 75 additions & 0 deletions configs/train_configs/sft/llama_3.1_8b-test-math-mixes.yaml
@@ -0,0 +1,75 @@
model_name_or_path: meta-llama/Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
  # base math datasets:
  natolambert/tulu-v2-sft-mixture-flan: 50000
  natolambert/tulu-v2-sft-mixture-cot: 49747
  ai2-adapt-dev/personahub_math_v4_149975: 149975
  # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960
  AI-MO/NuminaMath-TIR: 72441

  # v3.4 datasets keeping (for now):
  HuggingFaceH4/no_robots: 9500
  allenai/openassistant-guanaco-reformatted: 7708
  ai2-adapt-dev/tulu_hard_coded_examples: 14
  ai2-adapt-dev/SciRIFF-train-mix-science: 10000
  ai2-adapt-dev/Table-GPT-All-train: 3000

  ai2-adapt-dev/personahub_grade_math_v1_49980: 49980

  # # other datasets:
  ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
  ai2-adapt-dev/coconot-sft-reformat: 11477
  ai2-adapt-dev/openmath-2-gsm8k: 50000

  # # testing:
  ai2-adapt-dev/evol_codealpaca_converted: 110999
  # m-a-p/CodeFeedback-Filtered-Instruction: 50000
  ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 100000

  # # safety data:
  # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
  # ai2-adapt-dev/processed-wildjailbreak: 261559
  ai2-adapt-dev/processed-wildjailbreak: 50000
  ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000

  ai2-adapt-dev/personahub_code_v1_21699: 21699

  # # potentially problematic:
  # # m-a-p/CodeFeedback-Filtered-Instruction: 156526
  # # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663

  # # new math:
  # ai2-adapt-dev/test-persona-geometry-10k: 10000 ??? next do ICL

  # ai2-adapt-dev/metamath-qa-reformat: 100000
  # ai2-adapt-dev/WebInstructSub-reformat: 100000

  # # other datasets:
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
  # ai2-adapt-dev/openmath-2-math: 50000

  # removed cuz bad
  # ai2-adapt-dev/aya_dataset-reformat: 100000
  # ai2-adapt-dev/SlimOrca-reformat: 100000
  # ai2-adapt-dev/Daring-Anteater-reformat: 99532
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
23 changes: 23 additions & 0 deletions configs/train_configs/sft/llama_3_8b-sciriff.yaml
@@ -0,0 +1,23 @@
model_name_or_path: Qwen/Qwen2-1.5B
model_revision: main
use_flash_attn: true
tokenizer_name: Qwen/Qwen2-1.5B
use_slow_tokenizer: true
train_file: /oe-adapt-default/jacobm/sciriff-data/tulu_none_science_1000_eval_no.jsonl
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
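Unlike the configs above, this one reads local data through `train_file` rather than `dataset_mixer`. The snippet below sketches the JSONL layout such a file is generally expected to follow in open-instruct (one JSON object per line carrying a `messages` chat transcript); the field names and example content are assumptions for illustration, not taken from the actual sciriff file.

```python
import json

# Hypothetical example rows for a train_file such as
# tulu_none_science_1000_eval_no.jsonl: one JSON object per line with a
# "messages" chat transcript. Field names are an assumption, not read from
# the file in this PR.
rows = [
    {
        "messages": [
            {"role": "user", "content": "Summarize the abstract below ..."},
            {"role": "assistant", "content": "The paper shows ..."},
        ]
    },
]

with open("train_example.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```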