files for multinode dpo #366

Draft · wants to merge 9 commits into main
Changes from all commits
83 changes: 83 additions & 0 deletions configs/beaker_configs/default_dpo_multinode.yaml
@@ -0,0 +1,83 @@
version: v2
description: open-instruct-finetune-multinode
budget: ai2/oe-adapt
tasks:
  - name: open-instruct-finetune-multinode
    replicas: 4
    leaderSelection: true
    hostNetworking: true
    propagateFailure: true
    propagatePreemption: true
    synchronizedStartTimeout: 60m
    image:
      beaker: nathanl/open_instruct_auto
    command: [
      '/bin/sh', '-c'
    ]
    arguments: ['
      unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch
      --mixed_precision bf16
      --num_machines 4
      --num_processes 32
      --machine_rank $BEAKER_REPLICA_RANK
      --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME
      --main_process_port 29400
      --use_deepspeed
      --deepspeed_config_file configs/ds_configs/stage3_offloading_accelerate.conf
      --deepspeed_multinode_launcher standard
      open_instruct/dpo_tune_cache.py
      --model_name_or_path /hf_llama_models
      --use_flash_attn
      --max_seq_length 4096
      --preprocessing_num_workers 16
      --per_device_train_batch_size 1
      --gradient_accumulation_steps 4
      --learning_rate 5e-7
      --lr_scheduler_type linear
      --warmup_ratio 0.1
      --weight_decay 0.
      --num_train_epochs 3
      --output_dir /output/
      --with_tracking
      --report_to tensorboard
      --logging_steps 1
    ']
    envVars:
      - name: CUDA_DEVICE_ORDER
        value: PCI_BUS_ID
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
        secret: jacobm_WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
        value: false
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
        secret: jacobm_HF_TOKEN
    # datasets: # example for how to include datasets in mounting
    #   - mountPath: /data
    #     source:
    #       beaker: Yizhongw03/processed_open_instruct_data
    #   - mountPath: /mmlu
    #     source:
    #       beaker: Yizhongw03/mmlu
    #   - mountPath: /hf_llama_models
    #     source:
    #       beaker: Yizhongw03/hf_llama_model_7B
    datasets:
      - mountPath: /oe-adapt-default
        source:
          weka: oe-adapt-default
    result:
      path: /output
    resources:
      gpuCount: 8
    context:
      cluster: ai2/allennlp-cirrascale
      priority: high
      preemptible: false
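For a quick sanity check, the launch flags above imply an effective global batch size of 128. The snippet below just restates that arithmetic in Python; the values are copied by hand from this spec, nothing is queried from Beaker.

```python
# Sketch: effective global batch size implied by the accelerate flags above.
# Values are copied from default_dpo_multinode.yaml by hand.
num_machines = 4           # --num_machines / replicas
gpus_per_machine = 8       # resources.gpuCount
num_processes = num_machines * gpus_per_machine  # --num_processes 32

per_device_train_batch_size = 1
gradient_accumulation_steps = 4

effective_batch_size = (
    num_processes * per_device_train_batch_size * gradient_accumulation_steps
)
print(effective_batch_size)  # 32 * 1 * 4 = 128
```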
4 changes: 2 additions & 2 deletions configs/beaker_configs/default_finetune_multinode.yaml
@@ -51,7 +51,7 @@ tasks:
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      - name: WANDB_API_KEY
-       secret: WANDB_API_KEY
+       secret: jacobm_WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      - name: WANDB_WATCH
@@ -61,7 +61,7 @@
      - name: WANDB_DISABLED
        value: true
      - name: HF_TOKEN
-       secret: HF_TOKEN
+       secret: jacobm_HF_TOKEN
    datasets:
      - mountPath: /oe-adapt-default
        source:
29 changes: 29 additions & 0 deletions configs/train_configs/dpo/test_dpo_multinode.yaml
@@ -0,0 +1,29 @@
model_name_or_path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed
tokenizer_name: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed
model_revision: main
use_flash_attn: true
gradient_checkpointing: true
# dataset_name: ai2-adapt-dev/tulu3.4-sft-replica-50k
# dataset_config_name: gpt4-prefs-on-policy
dataset_mixer:
  ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0
  ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0
use_slow_tokenizer: true
max_seq_length: 2048
preprocessing_num_workers: 16
per_device_train_batch_size: 1
gradient_accumulation_steps: 2 # designed for 8 GPUs, so batch size 128
learning_rate: 2.0e-7
lr_scheduler_type: linear
warmup_ratio: 0.1
weight_decay: 0.0
num_train_epochs: 1
output_dir: /output
with_tracking: true
report_to:
- wandb
logging_steps: 1
use_lora: false
dpo_loss_type: dpo_norm
dpo_beta: 5
checkpointing_steps: epoch
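This config selects `dpo_loss_type: dpo_norm` with `dpo_beta: 5`. The sketch below shows the general shape of a length-normalized DPO loss, which is what `dpo_norm` is understood to select here; the function name and the assumption that per-sequence log-probs are already averaged over response tokens are mine, and the authoritative implementation is `open_instruct/dpo_tune_cache.py`.

```python
import torch
import torch.nn.functional as F

def dpo_norm_loss(policy_chosen_logps: torch.Tensor,
                  policy_rejected_logps: torch.Tensor,
                  ref_chosen_logps: torch.Tensor,
                  ref_rejected_logps: torch.Tensor,
                  beta: float = 5.0) -> torch.Tensor:
    """Length-normalized DPO loss (sketch, not the open-instruct implementation).

    Assumption: each *_logps tensor holds one value per example, i.e. the
    sequence log-probability already averaged over response tokens, which is
    the length normalization understood to distinguish dpo_norm from plain dpo.
    """
    # Implicit reward of the policy relative to the reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Bradley-Terry style objective: push the chosen reward above the rejected one.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```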
69 changes: 69 additions & 0 deletions configs/train_configs/sft/llama_3.1_70b-test-math-mixes.yaml
@@ -0,0 +1,69 @@
model_name_or_path: meta-llama/Llama-3.1-70B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-70B
use_slow_tokenizer: true
dataset_mixer:
  # base math datasets:
  natolambert/tulu-v2-sft-mixture-flan: 50000
  natolambert/tulu-v2-sft-mixture-cot: 49747
  ai2-adapt-dev/personahub_math_v4_149975: 149975
  AI-MO/NuminaMath-TIR: 72441

  # v3.4 datasets keeping (for now):
  HuggingFaceH4/no_robots: 9500
  allenai/openassistant-guanaco-reformatted: 7708
  ai2-adapt-dev/tulu_hard_coded_examples: 14
  ai2-adapt-dev/SciRIFF-train-mix-science: 10000
  ai2-adapt-dev/Table-GPT-All-train: 3000

  # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980

  # # other datasets:
  ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
  ai2-adapt-dev/coconot-sft-reformat: 11477
  ai2-adapt-dev/openmath-2-gsm8k: 50000

  # # testing:
  m-a-p/CodeFeedback-Filtered-Instruction: 50000
  ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 100000

  # # safety data:
  # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
  # ai2-adapt-dev/processed-wildjailbreak: 261559

  # # potentially problematic:
  # # m-a-p/CodeFeedback-Filtered-Instruction: 156526
  # # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663

  # # new math:
  # ai2-adapt-dev/test-persona-geometry-10k: 10000 ??? next do ICL

  # ai2-adapt-dev/metamath-qa-reformat: 100000
  # ai2-adapt-dev/WebInstructSub-reformat: 100000

  # # other datasets:
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
  # ai2-adapt-dev/openmath-2-math: 50000

  # removed cuz bad
  # ai2-adapt-dev/aya_dataset-reformat: 100000
  # ai2-adapt-dev/SlimOrca-reformat: 100000
  # ai2-adapt-dev/Daring-Anteater-reformat: 99532
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
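Across these configs, the number after each `dataset_mixer` entry is read as the count of examples to draw from that dataset (values at or below 1.0, as in the DPO config above, act as fractions of the dataset). The snippet below is a rough sketch of that sampling behaviour using the Hugging Face `datasets` library; the real mixing logic lives in open-instruct's training code and handles details not shown here.

```python
# Sketch of dataset_mixer sampling semantics: integer values are example
# counts, values <= 1.0 are fractions. Illustrative only.
from datasets import load_dataset, concatenate_datasets

mixer = {
    "natolambert/tulu-v2-sft-mixture-flan": 50000,
    "AI-MO/NuminaMath-TIR": 72441,
}

parts = []
for name, count in mixer.items():
    ds = load_dataset(name, split="train")
    n = int(count) if count > 1 else int(count * len(ds))
    parts.append(ds.shuffle(seed=42).select(range(min(n, len(ds)))))

mixed = concatenate_datasets(parts).shuffle(seed=42)
print(len(mixed))
```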
25 changes: 25 additions & 0 deletions configs/train_configs/sft/llama_3.1_8b-openmath-2.yaml
@@ -0,0 +1,25 @@
model_name_or_path: meta-llama/Meta-Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Meta-Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
  ai2-adapt-dev/openmath-2-math: 100000
  ai2-adapt-dev/openmath-2-gsm8k: 100000
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
@@ -0,0 +1,52 @@
model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B-Instruct
use_slow_tokenizer: true
dataset_mixer:
  # General datasets:
  ai2-adapt-dev/oasst2_converted: 9091
  ai2-adapt-dev/flan_v2_converted: 89982 # ODC-BY
  ai2-adapt-dev/tulu_hard_coded_repeated_10: 240
  ai2-adapt-dev/no_robots_converted: 9500 # NC
  ai2-adapt-dev/wildchat_gpt4_converted: 100000

  # Math datasets:
  ai2-adapt-dev/personahub_math_v5_regen_149960: 149960
  ai2-adapt-dev/personahub_grade_math_v1_49980: 49980
  ai2-adapt-dev/open_math_2_gsm8k_converted: 50000
  AI-MO/NuminaMath-TIR: 72441 # NC

  # Coding datasets:
  ai2-adapt-dev/evol_codealpaca_converted: 110999
  ai2-adapt-dev/personahub_code_v2_34999: 34999

  # IF datasets:
  ai2-adapt-dev/personahub_ifdata_manual_seed_v2_19891: 19891

  # Safety datasets:
  ai2-adapt-dev/coconot_converted: 10983
  ai2-adapt-dev/processed-wildjailbreak: 50000
  ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000

  # Specialty datasets:
  ai2-adapt-dev/sciriff_converted: 10000 # NC? (I think a subset or two)
  ai2-adapt-dev/table_gpt_converted: 5000
  ai2-adapt-dev/aya_dataset_converted: 100000

max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
75 changes: 75 additions & 0 deletions configs/train_configs/sft/llama_3.1_8b-test-math-mixes.yaml
@@ -0,0 +1,75 @@
model_name_or_path: meta-llama/Llama-3.1-8B
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-3.1-8B
use_slow_tokenizer: true
dataset_mixer:
  # base math datasets:
  natolambert/tulu-v2-sft-mixture-flan: 50000
  natolambert/tulu-v2-sft-mixture-cot: 49747
  ai2-adapt-dev/personahub_math_v4_149975: 149975
  # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960
  AI-MO/NuminaMath-TIR: 72441

  # v3.4 datasets keeping (for now):
  HuggingFaceH4/no_robots: 9500
  allenai/openassistant-guanaco-reformatted: 7708
  ai2-adapt-dev/tulu_hard_coded_examples: 14
  ai2-adapt-dev/SciRIFF-train-mix-science: 10000
  ai2-adapt-dev/Table-GPT-All-train: 3000

  ai2-adapt-dev/personahub_grade_math_v1_49980: 49980

  # # other datasets:
  ai2-adapt-dev/personahub_ifdata_v1_29980: 29980
  ai2-adapt-dev/coconot-sft-reformat: 11477
  ai2-adapt-dev/openmath-2-gsm8k: 50000

  # # testing:
  ai2-adapt-dev/evol_codealpaca_converted: 110999
  # m-a-p/CodeFeedback-Filtered-Instruction: 50000
  ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 100000

  # # safety data:
  # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 86759
  # ai2-adapt-dev/processed-wildjailbreak: 261559
  ai2-adapt-dev/processed-wildjailbreak: 50000
  ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000

  ai2-adapt-dev/personahub_code_v1_21699: 21699

  # # potentially problematic:
  # # m-a-p/CodeFeedback-Filtered-Instruction: 156526
  # # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663

  # # new math:
  # ai2-adapt-dev/test-persona-geometry-10k: 10000 ??? next do ICL

  # ai2-adapt-dev/metamath-qa-reformat: 100000
  # ai2-adapt-dev/WebInstructSub-reformat: 100000

  # # other datasets:
  # WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
  # ai2-adapt-dev/openmath-2-math: 50000

  # removed cuz bad
  # ai2-adapt-dev/aya_dataset-reformat: 100000
  # ai2-adapt-dev/SlimOrca-reformat: 100000
  # ai2-adapt-dev/Daring-Anteater-reformat: 99532
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 2 # effective batch size 128 with 1 node
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
23 changes: 23 additions & 0 deletions configs/train_configs/sft/llama_3_8b-sciriff.yaml
@@ -0,0 +1,23 @@
model_name_or_path: Qwen/Qwen2-1.5B
model_revision: main
use_flash_attn: true
tokenizer_name: Qwen/Qwen2-1.5B
use_slow_tokenizer: true
train_file: /oe-adapt-default/jacobm/sciriff-data/tulu_none_science_1000_eval_no.jsonl
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes
learning_rate: 5.0e-06 # best LR so far
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 2
output_dir: /output/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
dataset_mix_dir: /output/
gradient_checkpointing: true
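Unlike the configs above, this one reads local data through `train_file` rather than `dataset_mixer`. The snippet below sketches the JSONL layout such a file is generally expected to follow in open-instruct (one JSON object per line carrying a `messages` chat transcript); the field names and example content are assumptions for illustration, not taken from the actual sciriff file.

```python
import json

# Hypothetical example rows for a train_file such as
# tulu_none_science_1000_eval_no.jsonl: one JSON object per line with a
# "messages" chat transcript. Field names are an assumption, not read from
# the file in this PR.
rows = [
    {
        "messages": [
            {"role": "user", "content": "Summarize the abstract below ..."},
            {"role": "assistant", "content": "The paper shows ..."},
        ]
    },
]

with open("train_example.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```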