From 10ace45d41caa51dc7079c0819e92808230267fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emmanuel=20Fr=C3=A9con?= Date: Thu, 22 Feb 2024 00:00:31 +0100 Subject: [PATCH 1/4] Handle termination signals Implement graceful shutdown of all the runners on termination signals received by the orchestrator. In order to avoid ctrl-c from being sent down the entire tree, a separate process group is created by the runner loop: this prevents abrupt termination of the microVM. When termination is received, a termination file (with the same secret as for the break file) is created. The implementation will react to the termination file and kill and unregister the runner. In order for all termination to be handled in the background, all processes are started in the background and waited for. The library is enhanced with a number of functions to handle processes, these should be portable to macOS. --- lib/common.sh | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ orchestrator.sh | 20 +++++++++++++------ runner.sh | 39 ++++++++++++++++++++++++++++++++++-- runner/runner.sh | 24 +++++++++++++++++++++-- 4 files changed, 124 insertions(+), 10 deletions(-) diff --git a/lib/common.sh b/lib/common.sh index feaa8f6..27f5073 100644 --- a/lib/common.sh +++ b/lib/common.sh @@ -75,6 +75,57 @@ run_krunvm() { buildah unshare krunvm "$@" } +tac() { + awk '{ buffer[NR] = $0; } END { for(i=NR; i>0; i--) { print buffer[i] } }' +} + +# This is the same as pgrep -P, but using ps and awk. The option (and keywords) +# used also exist on macOS, so this implementation should be cross platform. +pgrep_P() { + ps -A -o pid,ppid | awk -v pid="$1" '$2 == pid { print $1 }' +} + +ps_tree() { + if [ -n "$1" ]; then + printf %s\\n "$1" + for _pid in $(pgrep_P "$1"); do + ps_tree "$_pid" + done + fi +} + +running() { + # Construct a comma separated list of pids to wait for + _pidlist= + for _pid; do + _pidlist="${_pidlist},${_pid}" + done + + # Construct the list of those pids that are still running + ps -p "${_pidlist#,}" -o pid= 2>/dev/null | awk '{ print $1 }' +} + +waitpid() { + # Construct the list of those pids that are still running + _running=$(running "$@") + + # If not empty, sleep and try again with the list of running pids (so we avoid + # having the same PID that would reappear -- very unlikely) + if [ -n "$_running" ]; then + sleep 1 + # shellcheck disable=SC2086 # We want to expand the list of pids + waitpid $_running + fi +} + +kill_tree() { + verbose "Killing process tree for $1" + for pid in $(ps_tree "$1"|tac); do + debug "Killing process $pid" + kill -s "${2:-TERM}" -- "$pid" + done +} + find_pattern() { _type=$(to_lower "${2:-f}") diff --git a/orchestrator.sh b/orchestrator.sh index aac0706..10e69df 100755 --- a/orchestrator.sh +++ b/orchestrator.sh @@ -89,7 +89,15 @@ KRUNVM_RUNNER_LOG=$ORCHESTRATOR_LOG KRUNVM_RUNNER_VERBOSE=$ORCHESTRATOR_VERBOSE cleanup() { - trap - INT TERM EXIT + trap '' EXIT + + for pid in $ORCHESTRATOR_PIDS; do + verbose "Killing runner loop $pid" + kill "$pid" + done + verbose "Waiting for runners to die" + # shellcheck disable=SC2086 # We want to wait for all pids + waitpid $ORCHESTRATOR_PIDS if run_krunvm list | grep -qE "^${ORCHESTRATOR_PREFIX}-"; then while IFS= read -r vm; do @@ -118,7 +126,8 @@ if [ "$ORCHESTRATOR_ISOLATION" = 1 ]; then else verbose "Creating $ORCHESTRATOR_RUNNERS insecure runner loops" fi -trap cleanup INT TERM EXIT + +trap cleanup EXIT # Pass essential variables, verbosity and log configuration to main runner # script. @@ -159,7 +168,6 @@ for i in $(seq 1 "$ORCHESTRATOR_RUNNERS"); do fi done -verbose "Waiting for runners to die" -for pid in $ORCHESTRATOR_PIDS; do - wait "$pid" -done +# shellcheck disable=SC2086 # We want to wait for all pids +waitpid $ORCHESTRATOR_PIDS +cleanup diff --git a/runner.sh b/runner.sh index d7e669a..7d1d08d 100755 --- a/runner.sh +++ b/runner.sh @@ -223,7 +223,6 @@ $(printf %s\\n "$RUNNER_MOUNT") EOF fi run_krunvm create "$RUNNER_IMAGE" "$@" - } @@ -263,7 +262,14 @@ EOF done fi verbose "Starting microVM '${RUNNER_PREFIX}-$_id' with entrypoint $RUNNER_ENTRYPOINT" - run_krunvm start "${RUNNER_PREFIX}-$_id" "$RUNNER_ENTRYPOINT" -- "$@" + optstate=$(set +o) + set -m; # Disable job control + run_krunvm start "${RUNNER_PREFIX}-$_id" "$RUNNER_ENTRYPOINT" -- "$@" "${RUNNER_ENVIRONMENT}/${RUNNER_ID}.trm" + if [ "$RUNNER_PID" ]; then + # shellcheck disable=SC2046 # We want to wait for all children + waitpid $(ps_tree "$RUNNER_PID"|tac) + else + warning "No PID to wait for" + fi + elif [ -n "$RUNNER_PID" ]; then + kill_tree "$RUNNER_PID" + # shellcheck disable=SC2046 # We want to wait for all children + waitpid $(ps_tree "$RUNNER_PID"|tac) + fi +} + +cleanup() { + trap '' EXIT + if [ -n "$RUNNER_PID" ]; then + vm_terminate + fi + if [ -n "$RUNNER_ID" ]; then + vm_delete "$RUNNER_ID" + fi +} + +trap cleanup EXIT + iteration=0 while true; do @@ -285,6 +319,7 @@ while true; do vm_create "${RUNNER_ID}" vm_start "${RUNNER_ID}" vm_delete "${RUNNER_ID}" + RUNNER_ID= if [ "$RUNNER_REPEAT" -gt 0 ]; then iteration=$((iteration+1)) diff --git a/runner/runner.sh b/runner/runner.sh index 37c4d07..7fd9751 100755 --- a/runner/runner.sh +++ b/runner/runner.sh @@ -420,10 +420,12 @@ fi # Start the runner. verbose "Starting runner as user '$RUNNER_USER' (current user=$(id -un)): $*" +RUNNER_PID= case "$RUNNER_USER" in root) if [ "$(id -u)" = "0" ]; then - "$@" + "$@" & + RUNNER_PID="$!" else error "Cannot start runner as root from non-root user" fi @@ -433,7 +435,8 @@ case "$RUNNER_USER" in if [ "$(id -u)" = "0" ]; then verbose "Starting runner as $RUNNER_USER" chown -R "$RUNNER_USER" "$RUNNER_WORKDIR" - runas "$@" + runas "$@" & + RUNNER_PID="$!" elif [ "$(id -un)" = "$RUNNER_USER" ]; then "$@" else @@ -444,3 +447,20 @@ case "$RUNNER_USER" in fi ;; esac + +if [ -n "$RUNNER_PID" ]; then + while [ -n "$(running "$RUNNER_PID")" ]; do + if [ -n "${RUNNER_TOKENFILE:-}" ] && [ -n "${RUNNER_SECRET:-}" ]; then + if [ -f "${RUNNER_TOKENFILE%.*}.trm" ]; then + break=$(cat "${RUNNER_TOKENFILE%.*}.trm") + if [ "$break" = "$RUNNER_SECRET" ]; then + verbose "Termination file found, stopping runner" + kill "$RUNNER_PID" + runner_unregister 1 + else + warning "Termination found at ${RUNNER_TOKENFILE%.*}.trm, but it does not match the secret" + fi + fi + fi + done +fi From f9a97d48bfafef58d385583693254da3acc280f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emmanuel=20Fr=C3=A9con?= Date: Thu, 22 Feb 2024 00:01:00 +0100 Subject: [PATCH 2/4] Fix call to adapt to new CLI format --- demo/demo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/demo.sh b/demo/demo.sh index 54aa86d..d6340ab 100755 --- a/demo/demo.sh +++ b/demo/demo.sh @@ -15,4 +15,4 @@ clear pe "# For the sake of the asciicast: PAT is present in RUNNER_PAT environment variable" pe "# -r 1: to run once only, the default is to run forever" -pe "./orchestrator.sh -v -r 1 -p efrecon/gh-runner-krunvm -- 1" +pe "./orchestrator.sh -v -- -r 1 -p efrecon/gh-runner-krunvm" From 7412f21e2b8334737fb98f9d3278cb102bff95d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emmanuel=20Fr=C3=A9con?= Date: Thu, 22 Feb 2024 00:19:30 +0100 Subject: [PATCH 3/4] Use presence of token file during termination When the token file is present, remove through creating a termination file. Otherwise, kill the entire process tree directly, as the runner has not (yet) been registered at GitHub. --- lib/common.sh | 2 +- runner.sh | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lib/common.sh b/lib/common.sh index 27f5073..ded9f90 100644 --- a/lib/common.sh +++ b/lib/common.sh @@ -122,7 +122,7 @@ kill_tree() { verbose "Killing process tree for $1" for pid in $(ps_tree "$1"|tac); do debug "Killing process $pid" - kill -s "${2:-TERM}" -- "$pid" + kill -s "${2:-TERM}" -- "$pid" 2>/dev/null done } diff --git a/runner.sh b/runner.sh index 7d1d08d..23f2fde 100755 --- a/runner.sh +++ b/runner.sh @@ -285,13 +285,24 @@ vm_delete() { } vm_terminate() { - if [ -n "$RUNNER_ENVIRONMENT" ] && [ -n "${RUNNER_SECRET:-}" ]; then - printf %s\\n "$RUNNER_SECRET" > "${RUNNER_ENVIRONMENT}/${RUNNER_ID}.trm" - if [ "$RUNNER_PID" ]; then + if [ -n "$RUNNER_ENVIRONMENT" ]; then + if [ -f "${RUNNER_ENVIRONMENT}/${RUNNER_ID}.tkn" ]; then + if [ -n "${RUNNER_SECRET:-}" ]; then + verbose "Requesting termination via ${RUNNER_ENVIRONMENT}/${RUNNER_ID}.trm" + printf %s\\n "$RUNNER_SECRET" > "${RUNNER_ENVIRONMENT}/${RUNNER_ID}.trm" + elif [ -n "$RUNNER_PID" ]; then + kill_tree "$RUNNER_PID" + fi + if [ "$RUNNER_PID" ]; then + # shellcheck disable=SC2046 # We want to wait for all children + waitpid $(ps_tree "$RUNNER_PID"|tac) + else + warning "No PID to wait for" + fi + elif [ -n "$RUNNER_PID" ]; then + kill_tree "$RUNNER_PID" # shellcheck disable=SC2046 # We want to wait for all children waitpid $(ps_tree "$RUNNER_PID"|tac) - else - warning "No PID to wait for" fi elif [ -n "$RUNNER_PID" ]; then kill_tree "$RUNNER_PID" From b344d5f50ffc74923753b01f42a38b05c6d6a96c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emmanuel=20Fr=C3=A9con?= Date: Thu, 22 Feb 2024 00:25:25 +0100 Subject: [PATCH 4/4] Add sleep to loop --- runner/runner.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runner/runner.sh b/runner/runner.sh index 7fd9751..b4c313a 100755 --- a/runner/runner.sh +++ b/runner/runner.sh @@ -462,5 +462,6 @@ if [ -n "$RUNNER_PID" ]; then fi fi fi + sleep 1 done fi