Skip to content

Commit

Permalink
allow training script to automatically start the dht server
Browse files Browse the repository at this point in the history
  • Loading branch information
samsja committed Aug 12, 2024
1 parent f2be2f7 commit 3e1fa45
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions open_diloco/run_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
#
## the command above uses a total of 8 GPUs and creates 4 DiLoCo workers, each training DDP/FSDP-wise on two GPUs


# You can either pass a fixed initial peer, or set it to 'auto' and the script will start a DHT server for you:
## # ./run_training.sh 2 1 auto --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny


# Function to get CUDA devices based on the number of GPUs and index
function get_cuda_devices() {
local num_gpu=$1
Expand All @@ -31,6 +36,25 @@ NUM_GPU=$2
INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
shift 3 # Remove the first three arguments so $@ contains only additional Python arguments

# Create the logs directory up front: the DHT server log (logs/log_dht) is written there when INITIAL_PEER is 'auto'
mkdir -p logs
echo "Initial peer: $INITIAL_PEER"

# Poll a DHT server log until the server announces its address.
# Arguments:
#   $1 - path to the log file the DHT server writes to
#   $2 - PID of the DHT server process
# Outputs:
#   the last field of the "Running a DHT instance" log line on stdout
# Returns:
#   0 once the address is found, 1 if the server process is gone before
#   it announced an address
function wait_for_dht_peer() {
    local log_file=$1
    local dht_pid=$2
    local peer

    while true; do
        peer=$(awk '/Running a DHT instance/ {print $NF}' "$log_file")
        if [ -n "$peer" ]; then
            printf '%s\n' "$peer"
            return 0
        fi
        # The log is checked first so an address written just before the
        # process exited is still picked up. Without this liveness check a
        # failed server start would make the loop spin forever.
        if ! kill -0 "$dht_pid" 2>/dev/null; then
            return 1
        fi
        sleep 1
    done
}

# Check if INITIAL_PEER is set to 'auto' and adjust accordingly
if [ "$INITIAL_PEER" = "auto" ]; then
    # start the dht server in the background, logging to logs/log_dht
    echo "Starting DHT server"
    hivemind-dht --host_maddr /ip4/0.0.0.0/tcp/12345 --identity_path fixed_key.pem > logs/log_dht 2>&1 &
    DHT_PID=$!

    # get the initial peer from the logs; give up if the server died
    if ! INITIAL_PEER=$(wait_for_dht_peer logs/log_dht "$DHT_PID"); then
        echo "DHT server exited before announcing its address; see logs/log_dht" >&2
        exit 1
    fi
fi
# Print the initial peer again: when 'auto' was requested, INITIAL_PEER now holds the value read from logs/log_dht
echo "Initial peer: $INITIAL_PEER"

# Ensure the logs directory exists (mkdir -p does nothing if it is already there)
mkdir -p logs

Expand Down

0 comments on commit 3e1fa45

Please sign in to comment.